In [1]:
import numpy as np
from sklearn.linear_model import LinearRegression
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score

In [2]:
# Read the insurance data set, then pull out the two predictor columns and the target.
insure = pd.read_csv("C:/Python/Datasets/Insure_auto.csv")
X, y = insure[['Home','Automobile']], insure['Operating_Cost']

In [3]:
# Hold out 30% of the rows; the fixed random_state keeps the partition repeatable.
train, test = train_test_split(
    insure,
    test_size=0.3,
    random_state=25,
)
# Display the dimensions of both partitions.
(train.shape, test.shape)

((7, 4), (3, 4))

In [4]:
# Separate predictors and target for each partition of the whole-frame split.
X_train, y_train = train[['Home','Automobile']], train['Operating_Cost']
X_test, y_test = test[['Home','Automobile']], test['Operating_Cost']

In [5]:
# fit() returns the estimator itself, so construction and fitting can be chained.
lr = LinearRegression().fit(X_train, y_train)
# Display the learned slopes (one per predictor column) and the intercept.
(lr.coef_, lr.intercept_)

(array([164.34828889,  62.45260525]), -17048.63483953383)

In [6]:
# Reproduce lr.predict(X_test) by hand: slope * feature for each column, plus the intercept.
# The parameters are read from the fitted estimator rather than pasted in as literals,
# so this check stays correct if the data, the split or the seed changes.
# lr.coef_ follows the column order used for fitting: ['Home', 'Automobile'].
X_test['Home']*lr.coef_[0] + X_test['Automobile']*lr.coef_[1] + lr.intercept_

5     78273.628188
1     62956.204162
3    226844.685722
dtype: float64

In [7]:
# Score the held-out rows with the fitted linear model.
y_pred = lr.predict(X=X_test)

In [8]:
# Root mean squared error on the held-out rows (same units as the target).
root_mean_squared_error(y_true=y_test, y_pred=y_pred)

6756.6852583055315

In [9]:
# Mean absolute error on the held-out rows.
mean_absolute_error(y_true=y_test, y_pred=y_pred)

6387.369915021186

In [10]:
# Coefficient of determination on the held-out rows.
r2_score(y_true=y_test, y_pred=y_pred)

0.9903677053376259

In [11]:
# Target first, then the two-column feature matrix, both taken from the full insurance frame.
y = insure['Operating_Cost']
X = insure[['Home','Automobile']]

In [12]:
# Split X and y directly (rather than the whole frame), fit a fresh linear model,
# and report the mean absolute error on the 30% hold-out.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=25
)
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X=X_test)
mean_absolute_error(y_true=y_test, y_pred=y_pred)

6387.369915021186

#### Polynomial Regression

In [13]:
# Expand the predictors into all polynomial terms up to degree 2,
# split with the same seed, and refit the existing estimator on the expanded features.
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit(X).transform(X)
X_train, X_test, y_train, y_test = train_test_split(
    X_poly, y, test_size=0.3, random_state=25
)
# fit() returns the estimator, so fitting and predicting chain into one expression.
y_pred = lr.fit(X_train, y_train).predict(X_test)
mean_absolute_error(y_true=y_test, y_pred=y_pred)

45762.68316546028

In [14]:
# Same experiment with polynomial terms up to degree 3.
poly = PolynomialFeatures(degree=3)
X_poly = poly.fit(X).transform(X)
X_train, X_test, y_train, y_test = train_test_split(
    X_poly, y, test_size=0.3, random_state=25
)
# Refit the existing estimator on the expanded training rows and score the hold-out.
y_pred = lr.fit(X_train, y_train).predict(X_test)
mean_absolute_error(y_true=y_test, y_pred=y_pred)

343561.7745664336

#### Boston Housing

In [15]:
# Load the Boston data; 'medv' is the regression target, every other column is a predictor.
boston = pd.read_csv("C:/Python/Datasets/Boston.csv")
y = boston['medv']
X = boston.drop(columns='medv')

In [16]:
# Baseline for this data set: plain linear regression, 70/30 split, hold-out MAE.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=25
)
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X=X_test)
mean_absolute_error(y_true=y_test, y_pred=y_pred)

3.1175694194800814

In [17]:
# Degree-2 polynomial expansion of the predictors, evaluated on the same seeded split.
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit(X).transform(X)
X_train, X_test, y_train, y_test = train_test_split(
    X_poly, y, test_size=0.3, random_state=25
)
# Refit the existing estimator on the expanded training rows and score the hold-out.
y_pred = lr.fit(X_train, y_train).predict(X_test)
mean_absolute_error(y_true=y_test, y_pred=y_pred)

2.663548550881642

In [18]:
# Degree-3 polynomial expansion of the predictors, evaluated on the same seeded split.
poly = PolynomialFeatures(degree=3)
X_poly = poly.fit(X).transform(X)
X_train, X_test, y_train, y_test = train_test_split(
    X_poly, y, test_size=0.3, random_state=25
)
# Refit the existing estimator on the expanded training rows and score the hold-out.
y_pred = lr.fit(X_train, y_train).predict(X_test)
mean_absolute_error(y_true=y_test, y_pred=y_pred)

193.4522211819452

#### Housing

In [19]:
housing = pd.read_csv("C:/Python/Datasets/Housing.csv")
# Dummy-encode the frame; drop_first=True omits the first level of each encoded column.
dum_hous = pd.get_dummies(housing, drop_first=True)
# 'price' is the target; all remaining (encoded) columns are predictors.
y = dum_hous['price']
X = dum_hous.drop(columns='price')

In [20]:
# Baseline for this data set: plain linear regression, 70/30 split, hold-out MAE.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=25
)
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X=X_test)
mean_absolute_error(y_true=y_test, y_pred=y_pred)

11600.941705536447

In [21]:
# Degree-2 polynomial expansion of the encoded predictors, same seeded split.
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit(X).transform(X)
X_train, X_test, y_train, y_test = train_test_split(
    X_poly, y, test_size=0.3, random_state=25
)
# Refit the existing estimator on the expanded training rows and score the hold-out.
y_pred = lr.fit(X_train, y_train).predict(X_test)
mean_absolute_error(y_true=y_test, y_pred=y_pred)

12967.811671676362

In [22]:
# Degree-3 polynomial expansion of the encoded predictors, same seeded split.
poly = PolynomialFeatures(degree=3)
X_poly = poly.fit(X).transform(X)
X_train, X_test, y_train, y_test = train_test_split(
    X_poly, y, test_size=0.3, random_state=25
)
# Refit the existing estimator on the expanded training rows and score the hold-out.
y_pred = lr.fit(X_train, y_train).predict(X_test)
mean_absolute_error(y_true=y_test, y_pred=y_pred)

18604.82147393365

#### Concrete Strength

In [23]:
# Load the concrete data; 'Strength' is the target, every other column is a predictor.
concrete = pd.read_csv("C:/Python/Cases/Concrete_Strength/Concrete_Data.csv")
y = concrete['Strength']
X = concrete.drop(columns='Strength')

In [24]:
# Baseline for this data set: plain linear regression, 70/30 split, hold-out MAE.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=25
)
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X=X_test)
mean_absolute_error(y_true=y_test, y_pred=y_pred)

7.70818272427273

In [25]:
# Degree-2 expansion; set_output(transform='pandas') makes the transformer return a DataFrame.
poly = PolynomialFeatures(degree=2).set_output(transform='pandas')
X_poly = poly.fit(X).transform(X)
X_train, X_test, y_train, y_test = train_test_split(
    X_poly, y, test_size=0.3, random_state=25
)
# Refit the existing estimator on the expanded training rows and score the hold-out.
y_pred = lr.fit(X_train, y_train).predict(X_test)
mean_absolute_error(y_true=y_test, y_pred=y_pred)

5.929922276497001

In [26]:
# Degree-3 expansion; set_output(transform='pandas') makes the transformer return a DataFrame.
poly = PolynomialFeatures(degree=3).set_output(transform='pandas')
X_poly = poly.fit(X).transform(X)
X_train, X_test, y_train, y_test = train_test_split(
    X_poly, y, test_size=0.3, random_state=25
)
# Refit the existing estimator on the expanded training rows and score the hold-out.
y_pred = lr.fit(X_train, y_train).predict(X_test)
mean_absolute_error(y_true=y_test, y_pred=y_pred)

5.186723830672629

In [27]:
# Degree-4 expansion (default array output this time), evaluated on the same seeded split.
poly = PolynomialFeatures(degree=4)
X_poly = poly.fit(X).transform(X)
X_train, X_test, y_train, y_test = train_test_split(
    X_poly, y, test_size=0.3, random_state=25
)
# Refit the existing estimator on the expanded training rows and score the hold-out.
y_pred = lr.fit(X_train, y_train).predict(X_test)
mean_absolute_error(y_true=y_test, y_pred=y_pred)

22.562456055380327

In [28]:
# Show the installed scikit-learn version so the results above can be tied to a release.
import sklearn 
sklearn.__version__

'1.7.2'

Build the best model (degree-3 polynomial regression, which had the lowest test MAE above) on the whole data

In [None]:
# Degree 3 gave the lowest hold-out MAE of the degrees tried on this data set,
# so refit that specification on every labelled row (no train/test split here).
poly = PolynomialFeatures(degree=3).set_output(transform='pandas')
X_poly = poly.fit(X).transform(X)
lr.fit(X=X_poly, y=y)

##### Unlabelled Data

In [34]:
# Read the unlabelled rows and expand them with the already-fitted transformer.
# Only transform() is called here, so the expansion matches the one the model was fitted on.
tst = pd.read_csv("C:/Python/Cases/Concrete_Strength/testConcrete.csv")
tst_poly = poly.transform(X=tst)

Inference: predict the target for the unlabelled data

In [35]:
# Predict the target for the expanded unlabelled rows.
# NOTE(review): the saved output contains negative values and values above 1000, while the
# hold-out MAE for this specification was about 5. The fitting cell shows no execution
# count, so confirm which fitted state of `lr`/`poly` produced these numbers and whether
# the unlabelled rows lie outside the range of the training data.
lr.predict(X=tst_poly)

array([  39.11058774,   -6.21571688,  107.15130399, -165.56534867,
        156.18911773, 1044.41069946,  110.16996572,  142.06361488,
        342.41500504,  879.42685499,   62.44554159,  448.72604923,
       -116.58724094,  477.85645771])