## Discussion 9.1 notebook

In [42]:
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, LeaveOneOut
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn import set_config
set_config(display="diagram")

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [52]:
# Read the kidney-stone CSV into a DataFrame, using the first CSV column as the row index.
DATA_PATH = "data/kidney-stone-dataset.csv"
primary_ds = pd.read_csv(DATA_PATH, index_col=[0])

In [53]:
# Preview the first five rows of the dataset.
primary_ds.head(n=5)


Unnamed: 0,gravity,ph,osmo,cond,urea,calc,target
0,1.021,4.91,725,14.0,443,2.45,0
1,1.017,5.74,577,20.0,296,4.49,0
2,1.008,7.2,321,14.9,101,2.36,0
3,1.011,5.51,408,12.6,224,2.15,0
4,1.005,6.52,187,7.5,91,1.16,0


In [54]:
# Report the dimensions of the loaded dataset.
row_count, column_count = primary_ds.shape
print('Total number of rows=', row_count)
print('Total number of columns=', column_count)

Total number of rows= 90
Total number of columns= 7


In [55]:
# Separate the response column ('target') from the predictor columns.
y = primary_ds['target']
X = primary_ds.drop(columns='target')

In [56]:
# Count how many samples fall into each value of the target variable.
class_counts = y.value_counts()
print(class_counts)

0    45
1    45
Name: target, dtype: int64


In [46]:
# Estimate the out-of-sample MSE of a linear regression with 10-fold cross validation.
logs = []
# A fixed seed keeps the shuffled fold assignment, and therefore the reported
# per-fold MSEs, reproducible across kernel restarts.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
for train_index, test_index in kf.split(X):
    lr = LinearRegression()
    # KFold.split yields positional row indices, so select with .iloc.
    # .loc would treat them as index labels and is only correct when the
    # DataFrame index happens to be exactly 0..n-1.
    X_train = X.iloc[train_index]
    X_test = X.iloc[test_index]
    y_train = y.iloc[train_index]
    y_test = y.iloc[test_index]
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)
    logs.append(mean_squared_error(y_test, y_pred))
# Iterate over the collected scores rather than a hard-coded fold count so the
# report stays correct if n_splits is changed.
for i, fold_mse in enumerate(logs):
    print("Mean squared error for KFold[",i,"]=",round(fold_mse,4))
print("Average MSE through KFold CV=",round(np.mean(logs),4))

Mean squared error for KFold[ 0 ]= 0.2343
Mean squared error for KFold[ 1 ]= 0.1189
Mean squared error for KFold[ 2 ]= 0.1665
Mean squared error for KFold[ 3 ]= 0.2515
Mean squared error for KFold[ 4 ]= 0.2319
Mean squared error for KFold[ 5 ]= 0.1391
Mean squared error for KFold[ 6 ]= 0.1584
Mean squared error for KFold[ 7 ]= 0.1825
Mean squared error for KFold[ 8 ]= 0.1465
Mean squared error for KFold[ 9 ]= 0.1125
Average MSE through KFold CV= 0.1742


In [39]:
# Estimate the MSE of a linear regression with a single hold-out split
# (30% of the rows are held out for testing).
# A fixed seed makes the random split, and therefore the reported MSE,
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
MSE = mean_squared_error(y_test, y_pred)
print("Mean squared error for hold-out cross validation=",round(MSE,4))

Mean squared error for hold-out cross validation= 0.1603


In [47]:
# Estimate the MSE of a linear regression with leave-one-out (LOO) cross validation.
# Each iteration trains on all rows but one and scores the single held-out row,
# so every entry in `logs` is the squared error of one prediction.
logs = []
leaveCV = LeaveOneOut()
for train_index, test_index in leaveCV.split(X):
    lr = LinearRegression()
    # LeaveOneOut.split yields positional row indices, so select with .iloc.
    # .loc would treat them as index labels and is only correct when the
    # DataFrame index happens to be exactly 0..n-1.
    X_train = X.iloc[train_index]
    X_test = X.iloc[test_index]
    y_train = y.iloc[train_index]
    y_test = y.iloc[test_index]
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)
    logs.append(mean_squared_error(y_test, y_pred))
for i, sample_mse in enumerate(logs):
    print("Mean squared error for [",i,"]=",round(sample_mse,4))
print("Average MSE through LOO CV=",round(np.mean(logs),4))

Mean squared error for [ 0 ]= 0.0816
Mean squared error for [ 1 ]= 0.2356
Mean squared error for [ 2 ]= 0.0886
Mean squared error for [ 3 ]= 0.046
Mean squared error for [ 4 ]= 0.0353
Mean squared error for [ 5 ]= 0.2237
Mean squared error for [ 6 ]= 0.0237
Mean squared error for [ 7 ]= 0.4849
Mean squared error for [ 8 ]= 0.0495
Mean squared error for [ 9 ]= 0.0443
Mean squared error for [ 10 ]= 0.1212
Mean squared error for [ 11 ]= 0.0199
Mean squared error for [ 12 ]= 0.0399
Mean squared error for [ 13 ]= 0.0191
Mean squared error for [ 14 ]= 0.0206
Mean squared error for [ 15 ]= 0.2129
Mean squared error for [ 16 ]= 0.0335
Mean squared error for [ 17 ]= 0.0554
Mean squared error for [ 18 ]= 0.517
Mean squared error for [ 19 ]= 0.0162
Mean squared error for [ 20 ]= 0.2796
Mean squared error for [ 21 ]= 0.0138
Mean squared error for [ 22 ]= 0.0601
Mean squared error for [ 23 ]= 0.0934
Mean squared error for [ 24 ]= 0.1771
Mean squared error for [ 25 ]= 0.1798
Mean squared error for [