This notebook is for OPPE-1 (January 2025).

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Read the first dataset (used for the preprocessing questions) into `df`.
preprocessing_csv_path = './datasets/NPPE1_Preprocessing1.csv'
df = pd.read_csv(preprocessing_csv_path)

In [3]:
# How many samples are there in the dataset?
# `shape` is (rows, columns); the first entry is the number of samples.
df.shape

(4000, 14)

In [4]:
# Summarise column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   CRIM          4000 non-null   float64
 1   ZN            4000 non-null   float64
 2   INDUS         4000 non-null   float64
 3   POLINDEX      4000 non-null   float64
 4   RM            4000 non-null   float64
 5   AGE           4000 non-null   float64
 6   DIS           4000 non-null   float64
 7   HIGHWAYCOUNT  4000 non-null   int64  
 8   TAX           4000 non-null   float64
 9   PTRATIO       4000 non-null   float64
 10  IMM           4000 non-null   float64
 11  BPL           4000 non-null   float64
 12  PRICE         4000 non-null   float64
 13  RIVERSIDE     4000 non-null   object 
dtypes: float64(12), int64(1), object(1)
memory usage: 437.6+ KB


In [None]:
# Average house price in lakhs (mean of the PRICE column).
df['PRICE'].mean()

24.355923220694248

In [8]:
# How many houses have 5 or more bedrooms?
# Build a boolean mask on RM and count its True entries.
has_five_plus_rooms = df['RM'] >= 5
int(has_five_plus_rooms.sum())

3953

In [15]:
# Average price of the ten most expensive houses:
# order by PRICE (highest first), keep the first ten rows, then average.
top_ten_df = df.sort_values(by='PRICE', ascending=False).iloc[:10]
top_ten_df.loc[:, 'PRICE'].mean()

52.36590175716407

In [20]:
# Total number of missing values in the number-of-rooms feature.
# Missing entries in RM are counted via the placeholder value -1.
rm_missing_count = (df['RM'] == -1).sum()
print('The distinct values in #rooms', df['RM'].unique())
print('The number of missing values in #rooms', rm_missing_count)

The distinct values in #rooms [ 6.  7.  8.  9. 10. -1.  5.  4. 11.]
The number of missing values in #rooms 40


In [22]:
# Total number of missing values in the age feature.
# Missing entries in AGE are counted via the placeholder value -2.
age_missing_count = (df['AGE'] == -2).sum()
print('The distinct values in age', df['AGE'].unique())
print('The number of missing values in age', age_missing_count)

The distinct values in age [ 42.  63.  -2.   9.  20.  95.  35.  97.  36.  78.  99.  96.  19.  73.
  86.  37.  76.  50.  32.  92. 101.   8.  94.  84.  47.  43.  65.  46.
  59.  74.  93.  80.  98. 102.  56. 100.  89.  53.  18.  71.  62.  83.
  66.   7.  77.  91.  90.  58.  57.  60.  87.  75.  34.  85.  48.  33.
  38.  30.  23.  55.  11.  14.  45.  41.  88.  16.  12.  79.  67.  39.
  49.  61.  70.  54.  10.  22.  72.  15.  51.  81.  31.  52.  44.  28.
  17.  40.  29.  69.  64.  24.  82.  26.  68.  27.  13. 103.  21.  25.
   6.   5.   3.   4.]
The number of missing values in age 50


In [24]:
# Total number of missing values in the riverside feature.
# Missing entries in RIVERSIDE are counted via the label 'UNKNOWN'.
riverside_missing_count = (df['RIVERSIDE'] == 'UNKNOWN').sum()
print('The distinct values in riverside', df['RIVERSIDE'].unique())
print('The number of missing values in riverside', riverside_missing_count)

The distinct values in riverside ['NO' 'UNKNOWN' 'YES']
The number of missing values in riverside 88


In [26]:
# Total number of houses that are on the riverside and are at most 50 years old.
# Rows with an 'UNKNOWN' riverside flag or the -2 age placeholder are removed
# first; without that, AGE == -2 would also satisfy `AGE <= 50`.
known_rows = df['RIVERSIDE'].ne('UNKNOWN') & df['AGE'].ne(-2)
temp_df = df.loc[known_rows].copy()

condition = temp_df['RIVERSIDE'].eq('YES') & temp_df['AGE'].le(50)
len(temp_df.loc[condition])

44

In [28]:
# How many houses are near exactly 6, 7 or 8 highways?
target_highway_counts = [6, 7, 8]
near_target_highways = df['HIGHWAYCOUNT'].isin(target_highway_counts)
print('The distinct values in highways', df['HIGHWAYCOUNT'].unique())
print('The number of houses near to 6,7,8 highways', int(near_target_highways.sum()))

The distinct values in highways [ 5  8 25  4  1  6  7  2 24  3 26 10  9 11 27]
The number of houses near to 6,7,8 highways 1211


In [30]:
# CREATING AN ADDITIONAL FEATURE CATEGORY

def my_func(price):
    """Map a price to one of five category labels.

    Prices below 10 give 'category 1'; the bands ``[10, 20)``, ``[20, 30)``
    and ``[30, 40)`` give 'category 2' to 'category 4'. Anything else
    (40 and above, or a value for which every comparison is false, such
    as NaN) gives 'category 5'.
    """
    labelled_upper_bounds = (
        (10, 'category 1'),
        (20, 'category 2'),
        (30, 'category 3'),
        (40, 'category 4'),
    )
    # Bounds are ascending, so the first bound the price falls below decides.
    for upper_bound, label in labelled_upper_bounds:
        if price < upper_bound:
            return label
    return 'category 5'

# Label every row with its price category (element-wise call of my_func).
df['CATEGORY'] = df['PRICE'].map(my_func)

In [31]:
# Number of houses in each price category, most frequent first.
df['CATEGORY'].value_counts()

CATEGORY
category 3    2028
category 2    1158
category 4     503
category 5     268
category 1      43
Name: count, dtype: int64

In [33]:
# Convert the placeholder codes for missing data into NaN.
# Nothing is imputed here; this only marks the entries as missing so that
# NaN-aware tools can handle them afterwards.
missing_placeholders = {'RIVERSIDE': 'UNKNOWN', 'AGE': -2, 'RM': -1}
for column_name, placeholder in missing_placeholders.items():
    df[column_name] = df[column_name].replace(placeholder, np.nan)

In [34]:
# Separate the target (PRICE) from the features, then hold out 30% for testing.
from sklearn.model_selection import train_test_split

y = df['PRICE']
X = df.drop(columns='PRICE')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

In [37]:
# Number of samples in the training set (first entry of the shape tuple).
X_train.shape

(2800, 14)

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# SUB PIPELINES
# Each one imputes missing values first, then scales or encodes the column.
rm_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler()),
])

age_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler()),
])

riverside_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Column groups handled by the plain scalers.
minmax_columns_a = ['CRIM', 'ZN']
standard_columns = ['INDUS']
minmax_columns_b = ['POLINDEX', 'DIS', 'HIGHWAYCOUNT', 'TAX', 'PTRATIO', 'IMM', 'BPL']

# PIPELINE
# CATEGORY is dropped explicitly; the remaining columns are routed to their
# scaler or sub-pipeline in the order listed below.
pipeline = ColumnTransformer(transformers=[
    ('drop', 'drop', ['CATEGORY']),
    ('scale', MinMaxScaler(), minmax_columns_a),
    ('scale 2', StandardScaler(), standard_columns),
    ('scale 3', MinMaxScaler(), minmax_columns_b),
    ('rm_pipeline', rm_pipeline, ['RM']),
    ('age_pipeline', age_pipeline, ['AGE']),
    ('riverside_pipeline', riverside_pipeline, ['RIVERSIDE']),
])

# Fit on the training split only, and transform it in the same call.
X_train_prepared = pipeline.fit_transform(X_train)

X_train_prepared.shape

(2800, 14)

In [50]:
# Transformed test data
# Only `transform` is called here: the transformer keeps the statistics it
# learned from the training split and is not re-fitted on the test split.
X_test_prepared = pipeline.transform(X_test)
# A bare `X_test_prepared.shape` in the middle of a cell is never displayed,
# so the shape is checked explicitly against the prepared training data.
assert X_test_prepared.shape[1] == X_train_prepared.shape[1], (
    "train and test must have the same number of prepared columns"
)
print("Mean of the transformed test data", X_test_prepared.mean())

Mean of the transformed test data 0.3861745655097562


# Another Dataset

In [57]:
# Loading the dataset
# NOTE: this rebinds `df`; from here on it refers to the model-building data.
model_building_csv_path = './datasets/NPPE1_ModelBuilding3.csv'
df = pd.read_csv(model_building_csv_path)

In [58]:
# Preview the first five rows of the newly loaded frame.
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,0.087472,0.002829,1.768235,0.188396,0.117071,0.148148,0.478165,0.720443,0.967195,0.281563,0.428571,0.959596,1.0,0.0,23.358097
1,0.378379,0.022079,1.115629,0.091974,0.066089,0.851852,0.911759,0.785321,0.885001,0.424648,0.285714,0.868687,0.0,1.0,17.268768
2,0.066901,0.003828,-0.536262,0.221188,0.255671,0.296296,0.228024,0.406472,0.980184,0.274376,0.428571,0.767677,0.0,1.0,27.776974
3,0.140645,0.011132,1.323366,0.422514,0.153103,0.148148,0.410679,0.200319,0.861371,0.305006,0.142857,0.848485,0.0,1.0,16.12196
4,0.144225,0.204918,-0.93079,0.148694,0.17749,0.259259,0.146832,0.111429,0.983448,0.286322,0.285714,0.616162,1.0,0.0,23.129426


In [59]:
# Column names, non-null counts and dtypes of the model-building frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       4000 non-null   float64
 1   1       4000 non-null   float64
 2   2       4000 non-null   float64
 3   3       4000 non-null   float64
 4   4       4000 non-null   float64
 5   5       4000 non-null   float64
 6   6       4000 non-null   float64
 7   7       4000 non-null   float64
 8   8       4000 non-null   float64
 9   9       4000 non-null   float64
 10  10      4000 non-null   float64
 11  11      4000 non-null   float64
 12  12      4000 non-null   float64
 13  13      4000 non-null   float64
 14  14      4000 non-null   float64
dtypes: float64(15)
memory usage: 468.9 KB


In [66]:
# train test split
# Every column except the last is a feature; the last column is the target.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score

X, y = df.iloc[:, :-1], df.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit ridge regression on the training split and predict the held-out split.
ridge = Ridge(alpha=10, solver='saga', tol=1e-4, random_state=42).fit(X_train, y_train)
y_pred = ridge.predict(X_test)

# R2 score on the test split.
r2_score(y_test, y_pred)

0.6613547575262211

In [67]:
# Which feature is the most important ?
# Rank by absolute coefficient size: the sign only gives the direction of the
# effect, so a large negative weight counts as much as a large positive one.
# NOTE(review): comparing coefficients like this assumes the features are on
# comparable scales — confirm for this dataset.
most_important_feature = int(np.argmax(np.abs(ridge.coef_)))
print('Index of the feature with the largest |coefficient|:', most_important_feature)
ridge.coef_

array([ -0.4825441 ,   3.74601838,  -0.73583331,   0.54199933,
        -9.89014109,   5.80114296,  -5.06099736,  -9.45015598,
         4.73124885, -23.51321982,  11.31863371,   0.49450664,
        -0.89196134,   0.89196134])

In [72]:
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid: 5 alphas x 4 tolerances x 2 penalties = 40 candidates.
param_grid = dict(
    alpha=[1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
    tol=[1e-4, 1e-3, 1e-2, 1e-1],
    penalty=['l1', 'l2'],
)

# 5-fold cross-validated search, scored by negated mean absolute error.
sgd = SGDRegressor(random_state=42)
grid_search = GridSearchCV(
    estimator=sgd,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_absolute_error',
)

grid_search.fit(X_train, y_train)

In [73]:
# Parameter combination with the best mean cross-validated score.
grid_search.best_params_

{'alpha': 0.001, 'penalty': 'l2', 'tol': 0.0001}

In [74]:
# Value of mean absolute error on test dataset
from sklearn.metrics import mean_absolute_error

# Predict the held-out split through the fitted grid search object.
y_pred = grid_search.predict(X_test)
mean_absolute_error(y_test, y_pred)

3.8131121797994014

In [77]:
# Create a pipeline of PCA as transformer and Lasso as estimator
from sklearn.decomposition import PCA
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

pipeline = Pipeline(steps=[('pca', PCA()), ('lasso', Lasso())])

# Keys use the `<step name>__<parameter>` form so each value list is routed
# to the matching step of the pipeline.
param_grid = dict(
    pca__n_components=[0.9, 0.95],
    lasso__alpha=[10, 1, 0.01, 0.001],
)

# 5-fold search scored by negated MAE; n_jobs=-1 runs candidates in parallel.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)
print('best params', grid_search.best_params_)

best_model = grid_search.best_estimator_
print('best_model', best_model)

best params {'lasso__alpha': 0.01, 'pca__n_components': 0.95}
best_model Pipeline(steps=[('pca', PCA(n_components=0.95)), ('lasso', Lasso(alpha=0.01))])


In [78]:
# Calculate the r2 score
from sklearn.metrics import r2_score

# Score the best pipeline found by the grid search on the held-out split.
y_pred = best_model.predict(X_test)
r2_score(y_true=y_test, y_pred=y_pred)

0.6288625430197549

In [80]:
# How much variance explained by the first principal component ?
# `explained_variance_` holds one absolute variance per kept component; the
# first entry belongs to the first principal component. The fraction of total
# variance is available separately as `explained_variance_ratio_`.
best_pca = best_model.named_steps['pca']
best_pca.explained_variance_


array([1.16350757, 0.15109745, 0.11321585, 0.07019734, 0.03866388,
       0.02770669, 0.02515772])

In [82]:
# Create a pipeline of the PolynomialFeatures as transformer and Lasso as an estimator with the following parameters:
# - For PolynomialFeatures:
#     - interaction_only = False
#     - degree = 2
# - For Lasso:
#     - alpha = 1
#     - warm_start = True
#     - random state as 0

from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

# Build the two steps separately, then chain them.
poly_step = PolynomialFeatures(degree=2, interaction_only=False)
lasso_step = Lasso(alpha=1, warm_start=True, random_state=0)
pipeline = Pipeline(steps=[('poly', poly_step), ('lasso', lasso_step)])

# Fit on the training split and predict the held-out split in one chain.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# R2 score on the test split.
r2_score(y_test, y_pred)

0.157678032410551

In [85]:
# If you eliminate 1 feature with recursive feature elimination, which feature will be eliminated?
# Type the index of the eliminated feature (index starts from 0).
# Use a LinearRegression model with default parameters as the estimator.
# Use processed training data.
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
# Keep all but one feature, so exactly one feature is eliminated.
rfe = RFE(estimator=lr, n_features_to_select=X_train.shape[1]-1)
rfe.fit(X_train, y_train)

# Selected features get rank 1; any rank above 1 marks an eliminated feature.
eliminated_features = np.flatnonzero(rfe.ranking_ > 1)
print('Index of the eliminated feature:', eliminated_features)
rfe.ranking_

array([1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])