In [1]:
import pandas as pd

## Model Training

In [2]:
# Load the dataset from the local data folder and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='./data/boston.csv')
df.head(n=5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [3]:
## Separate the predictors (X) from the target (Y)

# Every column except MEDV is used as a model input.
X = df.drop(columns=['MEDV'])
# Selecting with a list keeps the target as a single-column DataFrame, not a Series.
Y = df.loc[:, ['MEDV']]

In [4]:
# Display the target frame (the single MEDV column selected above).
Y

Unnamed: 0,MEDV
0,24.0
1,21.6
2,34.7
3,33.4
4,36.2
...,...
501,22.4
502,20.6
503,23.9
504,22.0


In [5]:
# Split the feature names by dtype: object columns are treated as categorical
# (candidates for ordinal encoding); all remaining columns are treated as numerical.
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(exclude=['object']).columns

# Show both column groups, one per line.
print(categorical_cols, numerical_cols, sep='\n')

Index([], dtype='object')
Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT'],
      dtype='object')


In [None]:
# No categorical columns are present, so no custom category ranking is needed for ordinal encoding.

In [8]:
from sklearn.impute import SimpleImputer ## HAndling Missing Values
from sklearn.preprocessing import StandardScaler # HAndling Feature Scaling
from sklearn.preprocessing import OrdinalEncoder # Ordinal Encoding
## pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [9]:
## Numerical pipeline: fill missing values with the column median, then standardize.
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

## Categorical pipeline: fill missing values with the most frequent value,
## ordinal-encode, then standardize.
# categories='auto' lets the encoder learn the categories from the data. An empty
# list here would raise a shape-mismatch error as soon as a categorical column is
# routed through this pipeline. Pass explicit ordered lists (one per column)
# instead if a specific category ranking is required.
cat_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ordinalencoder', OrdinalEncoder(categories='auto')),
        ('scaler', StandardScaler()),
    ]
)

## Preprocessor: send each column group through its own pipeline.
preprocessor = ColumnTransformer([
    ('num_pipeline', num_pipeline, numerical_cols),
    ('cat_pipeline', cat_pipeline, categorical_cols),
])

In [10]:
## Train/test split

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=30,
)

In [11]:
# Fit the preprocessor on the training split only, then reuse the fitted
# transformer on the test split so no test statistics leak into the scaling.
# NOTE: this cell overwrites X_train / X_test with their transformed versions,
# so re-run the train/test split cell before running this cell again.
train_matrix = preprocessor.fit_transform(X_train)
feature_names = preprocessor.get_feature_names_out()
X_train = pd.DataFrame(train_matrix, columns=feature_names)

test_matrix = preprocessor.transform(X_test)
X_test = pd.DataFrame(test_matrix, columns=feature_names)

In [12]:
# Preview the transformed training features.
X_train.head()

Unnamed: 0,num_pipeline__CRIM,num_pipeline__ZN,num_pipeline__INDUS,num_pipeline__CHAS,num_pipeline__NOX,num_pipeline__RM,num_pipeline__AGE,num_pipeline__DIS,num_pipeline__RAD,num_pipeline__TAX,num_pipeline__PTRATIO,num_pipeline__B,num_pipeline__LSTAT
0,0.029277,-0.493046,1.026512,-0.267615,-0.188792,-0.086058,0.79905,-0.335773,1.655948,1.519548,0.811111,0.423524,0.04036
1,1.156184,-0.493046,1.026512,-0.267615,1.618559,0.238413,0.891001,-0.865244,1.655948,1.519548,0.811111,-3.519694,0.767601
2,0.390575,-0.493046,1.026512,-0.267615,1.088518,-0.136406,0.353445,-0.89759,1.655948,1.519548,0.811111,-2.777446,1.254769
3,-0.390053,-0.493046,-0.75159,-0.267615,-0.475535,-0.453884,-1.340563,0.02535,-0.518741,-0.761065,0.355608,0.403156,-0.34432
4,-0.276245,-0.493046,1.243282,-0.267615,0.445519,0.912528,1.035999,-0.925737,-0.518741,-0.030326,-1.694155,0.081558,-1.122103


In [13]:
# Preview the transformed test features.
X_test.head()

Unnamed: 0,num_pipeline__CRIM,num_pipeline__ZN,num_pipeline__INDUS,num_pipeline__CHAS,num_pipeline__NOX,num_pipeline__RM,num_pipeline__AGE,num_pipeline__DIS,num_pipeline__RAD,num_pipeline__TAX,num_pipeline__PTRATIO,num_pipeline__B,num_pipeline__LSTAT
0,-0.316572,-0.493046,-0.432293,-0.267615,-0.136657,-0.455283,0.746002,0.10501,-0.633198,-0.596059,1.175513,0.393508,0.175138
1,1.984378,-0.493046,1.026512,-0.267615,1.618559,-0.660874,0.859172,-0.931096,1.655948,1.519548,0.811111,0.38193,1.337601
2,-0.386084,-0.493046,-0.370777,-0.267615,-0.293062,-0.214727,0.682344,-0.521452,-0.518741,-0.142294,1.129963,0.414304,0.120384
3,-0.401843,-0.493046,-1.031341,-0.267615,-0.379954,0.796446,0.222593,-0.424606,-0.518741,-0.660883,-0.828699,0.380001,-0.794985
4,-0.27528,-0.493046,-0.432293,-0.267615,-0.136657,-0.207734,0.834416,0.087919,-0.633198,-0.596059,1.175513,0.440354,0.861665


In [14]:
## Model Training

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [15]:
# Fit a plain linear regression on the preprocessed training split.
regression = LinearRegression()
regression.fit(X=X_train, y=y_train)

In [16]:
# Coefficients learned by the fitted linear regression.
regression.coef_

array([[-1.01543013,  0.97720472,  0.12314546,  0.81051159, -2.15335944,
         2.67736416,  0.16150204, -3.15969207,  2.80619022, -2.26408701,
        -2.14526254,  0.68631966, -3.99645528]])

In [17]:
# Intercept term learned by the fitted linear regression.
regression.intercept_

array([22.53762376])

In [18]:
import numpy as np

def evaluate_model(true, predicted):
    """Score predictions against the observed target values.

    Args:
        true: Observed target values.
        predicted: Model predictions, aligned element-wise with ``true``.

    Returns:
        tuple: ``(mae, rmse, r2_square)`` -- mean absolute error, root mean
        squared error, and the R^2 score, in that order.
    """
    mae = mean_absolute_error(true, predicted)
    # Compute the MSE once and derive the RMSE from it rather than recomputing it.
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [20]:
## Train each candidate model and report its metrics on the held-out test split

models = {
    'LinearRegression': LinearRegression()
}

trained_model_list = []  # initialised here; nothing in this cell populates it
model_list = []
r2_list = []

# Iterate over (name, estimator) pairs directly instead of rebuilding the
# key/value lists and indexing into them on every access.
for model_name, model in models.items():
    model.fit(X_train, y_train)

    # Make predictions on the test split
    y_pred = model.predict(X_test)

    mae, rmse, r2_square = evaluate_model(y_test, y_pred)

    print(model_name)
    model_list.append(model_name)

    # These metrics are computed from X_test / y_test, so they are labelled as
    # test performance rather than training performance.
    print('Model Test Performance')
    print("RMSE:", rmse)
    print("MAE:", mae)
    print("R2 score", r2_square*100)

    r2_list.append(r2_square)

    print('='*35)
    print('\n')

LinearRegression
Model Training Performance
RMSE: 4.2347422812609405
MAE: 3.2254104069763287
R2 score 71.81173900062088




In [21]:
# Names of the models evaluated in the loop above.
model_list

['LinearRegression']