# K-Fold

In [138]:
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.neighbors import KNeighborsRegressor
import pandas as pd
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.metrics import f1_score, r2_score, accuracy_score
from sklearn.svm import  SVC
from sklearn.linear_model import  Ridge, ElasticNet
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
import warnings
warnings.simplefilter('ignore')



## Pizza Datset

In [42]:
pizza = pd.read_csv("./Datasets/pizza.csv")
pizza.head()

Unnamed: 0,Promote,Sales
0,23,554
1,56,1339
2,34,815
3,25,609
4,67,1600


## Concrete datasets

In [43]:
concrete = pd.read_csv("./Cases/Concrete Strength/Concrete_Data.csv")
concrete.head()

Unnamed: 0,Cement,Blast,Fly,Water,Superplasticizer,Coarse,Fine,Age,Strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [44]:

X = concrete.drop('Strength', axis=1)
y = concrete['Strength']

In [45]:
alpha = [0.001,0.1,0.5,1.5,2]
scores = []
for i in alpha:
    ridge = Ridge(alpha=i)
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    results = cross_val_score(ridge, X, y , cv=kf, scoring='r2')
    scores.append(results.mean())

i_max = np.argmax(scores)

print("Best feature: ", alpha[i_max])
print("Best Score: ", scores[i_max])

Best feature:  2
Best Score:  0.6006966418487563


In [46]:
scores

[0.6006959623676316,
 0.6006959960330011,
 0.6006961320395934,
 0.6006964719501772,
 0.6006966418487563]

# GridSearchCV

In [49]:
params = {'alpha': [0.001,0.1,0.5,1.5,2]}
ridge = Ridge()
kf = KFold(n_splits=5, shuffle=True, random_state=42)
gcv = GridSearchCV(ridge, param_grid=params, cv=kf, scoring='r2')
gcv.fit(X,y)

In [50]:
print(gcv.best_params_, gcv.best_score_)

{'alpha': 2} 0.6006966418487563


In [54]:

el = ElasticNet()
params = {'alpha': [0.001,0.1,0.5,1.5,2], 'l1_ratio': [0.001,0.1,0.3,0.5,0.8]}
kf = KFold(n_splits=5, shuffle=True, random_state=42)
gcv = GridSearchCV(el, param_grid=params, cv=kf, scoring='r2')
gcv.fit(X,y)

In [55]:
print(gcv.best_params_, gcv.best_score_)


{'alpha': 2, 'l1_ratio': 0.001} 0.6010949290257621


In [69]:
tst = pd.read_csv("./Cases/Concrete Strength/testConcrete.csv")
tst.head()

Unnamed: 0,Cement,Blast,Fly,Water,Superplasticizer,Coarse,Fine,Age
0,495,120,0,155,5,866,884,75
1,262,129,0,271,2,808,787,174
2,201,48,1,215,5,807,839,113
3,329,141,0,286,1,881,823,229
4,354,14,0,129,2,839,847,210


In [70]:
el = ElasticNet(alpha=2, l1_ratio=0.001)
el.fit(X,y)
y_pred = el.predict(tst)

In [71]:
y_pred

array([69.02486164, 31.47254969, 19.40715451, 46.38618642, 58.61341639,
       16.72167375, 49.79136534, 80.92865824, 31.77125463, 42.36527234,
       43.72208943, 61.48938727, 51.86924716, 15.36309918])

In [72]:
bm = gcv.best_estimator_
bm.fit(X, y)
y_pred = bm.predict(tst)

In [73]:
y_pred

array([69.02486164, 31.47254969, 19.40715451, 46.38618642, 58.61341639,
       16.72167375, 49.79136534, 80.92865824, 31.77125463, 42.36527234,
       43.72208943, 61.48938727, 51.86924716, 15.36309918])

## Kyphosis Dataset

In [74]:
kyp = pd.read_csv("./Cases/Kyphosis/Kyphosis.csv")
kyp.head()

Unnamed: 0,Kyphosis,Age,Number,Start
0,absent,71,3,5
1,absent,158,3,14
2,present,128,4,5
3,absent,2,5,1
4,absent,1,4,15


In [75]:
X = kyp.drop('Kyphosis', axis = 1)
y = kyp['Kyphosis']

In [83]:
LogisticRegression().get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'deprecated',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [92]:

params = {'penalty': ['l2',None,'l1','elasticnet'],'l1_ratio': np.linspace(0.001,1,5)}
lr = LogisticRegression()
kf = KFold(n_splits=5, shuffle=True, random_state=42)
gcv = GridSearchCV(lr, param_grid=params, cv=kf, scoring='accuracy')
gcv.fit(X,y)


In [93]:
print(gcv.best_params_, gcv.best_score_)


{'l1_ratio': 0.001, 'penalty': 'l2'} 0.8272058823529411


## Concrete Dataset (randomforest)

In [94]:
concrete = pd.read_csv("./Cases/Concrete Strength/Concrete_Data.csv")
X = concrete.drop('Strength', axis=1)
y = concrete['Strength']

In [100]:
max_depth = [2,5,7]
min_sample_split = [2,5,10,20]
min_sample_leaf = [1,5,10,20]
max_features = [3,4,5,6]
params = {'max_depth': max_depth,'min_samples_split': min_sample_split, 'min_samples_leaf': min_sample_leaf, 'max_features': max_features }
rf = RandomForestRegressor()
kf = KFold(n_splits=5, shuffle=True, random_state=42)
gcv = GridSearchCV(rf, param_grid=params, cv=kf, scoring='r2', n_jobs=-1)
gcv.fit(X,y)
print(gcv.best_params_, gcv.best_score_)

{'max_depth': 7, 'max_features': 6, 'min_samples_leaf': 1, 'min_samples_split': 2} 0.8832170599171792


In [103]:
df_cv = pd.DataFrame(gcv.cv_results_)
df_cv.sort_values('rank_test_score', ascending=True).head(1)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_max_features,param_min_samples_leaf,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
176,0.453531,0.023679,0.015791,0.000748,7,6,1,2,"{'max_depth': 7, 'max_features': 6, 'min_sampl...",0.855906,0.883319,0.891793,0.887609,0.897459,0.883217,0.014433,1


# GridSearch with Pipeline

In [104]:
concrete = pd.read_csv("./Cases/Concrete Strength/Concrete_Data.csv")
concrete.head()

Unnamed: 0,Cement,Blast,Fly,Water,Superplasticizer,Coarse,Fine,Age,Strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [105]:
X = concrete.drop('Strength', axis=1)
y = concrete['Strength']

`StandardScaler()`

In [113]:
scaler = StandardScaler()
knn = KNeighborsRegressor()
kf = KFold(n_splits=5, shuffle=True, random_state=42)
pipe = Pipeline([('SCL',scaler),('KNN',knn)])
params = {'KNN__n_neighbors':[1,2,3,4,5,6,7,8]}
gcv_knn = GridSearchCV(pipe, param_grid=params, cv=kf, scoring='r2', n_jobs=-1)
gcv_knn.fit(X,y)
print(gcv_knn.best_params_, gcv_knn.best_score_)

{'KNN__n_neighbors': 4} 0.7208563697569562


In [114]:
pipe.get_params()

{'memory': None,
 'steps': [('SCL', StandardScaler()), ('KNN', KNeighborsRegressor())],
 'verbose': False,
 'SCL': StandardScaler(),
 'KNN': KNeighborsRegressor(),
 'SCL__copy': True,
 'SCL__with_mean': True,
 'SCL__with_std': True,
 'KNN__algorithm': 'auto',
 'KNN__leaf_size': 30,
 'KNN__metric': 'minkowski',
 'KNN__metric_params': None,
 'KNN__n_jobs': None,
 'KNN__n_neighbors': 5,
 'KNN__p': 2,
 'KNN__weights': 'uniform'}

`MinMaxScaler()`

In [116]:
scaler = MinMaxScaler()
knn = KNeighborsRegressor()
kf = KFold(n_splits=5, shuffle=True, random_state=42)
pipe = Pipeline([('SCL',scaler),('KNN',knn)])
params = {'KNN__n_neighbors':[1,2,3,4,5,6,7,8]}
gcv_knn = GridSearchCV(pipe, param_grid=params, cv=kf, scoring='r2', n_jobs=-1)
gcv_knn.fit(X,y)
print(gcv_knn.best_params_, gcv_knn.best_score_)

{'KNN__n_neighbors': 5} 0.6986003851456717


`without scaling`

In [117]:

knn = KNeighborsRegressor()
kf = KFold(n_splits=5, shuffle=True, random_state=42)
pipe = Pipeline([('KNN',knn)])
params = {'KNN__n_neighbors':[1,2,3,4,5,6,7,8]}
gcv_knn = GridSearchCV(pipe, param_grid=params, cv=kf, scoring='r2', n_jobs=-1)
gcv_knn.fit(X,y)
print(gcv_knn.best_params_, gcv_knn.best_score_)

{'KNN__n_neighbors': 4} 0.7142754651802192


`scaling and Pipeline mixture`

In [119]:
scl_std, scl_mm = StandardScaler(), MinMaxScaler()
knn = KNeighborsRegressor()
kf = KFold(n_splits=5, shuffle=True, random_state=24)
pipe = Pipeline([('SCL',None),('KNN',knn)])
params = {'KNN__n_neighbors':[1,2,3,4,5,6,7,8], 'SCL': [None, scl_mm, scl_std]}
gcv_knn = GridSearchCV(pipe, param_grid=params, cv=kf, scoring='r2', n_jobs=-1)
gcv_knn.fit(X,y)
print(gcv_knn.best_params_, gcv_knn.best_score_)

{'KNN__n_neighbors': 3, 'SCL': StandardScaler()} 0.7162924205032816


## Glass Dataset

In [125]:
glass = pd.read_csv("./Cases/Glass Identification/Glass.csv")
glass.head()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,building_windows_float_processed
1,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,building_windows_float_processed
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,building_windows_float_processed
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,building_windows_float_processed
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,building_windows_float_processed


In [126]:
X = glass.drop('Type', axis=1)
y = glass['Type']

In [122]:
pipe.get_params()

{'memory': None,
 'steps': [('SCL', None), ('KNN', KNeighborsRegressor())],
 'verbose': False,
 'SCL': None,
 'KNN': KNeighborsRegressor(),
 'KNN__algorithm': 'auto',
 'KNN__leaf_size': 30,
 'KNN__metric': 'minkowski',
 'KNN__metric_params': None,
 'KNN__n_jobs': None,
 'KNN__n_neighbors': 5,
 'KNN__p': 2,
 'KNN__weights': 'uniform'}

In [127]:
scaler = StandardScaler()
penalities = ['l2',None,'l1','elasticnet' ]
l1_ratios = np.linspace(0.001,1,5)
multi = ['ovr','multinomial']

In [143]:
scl_std, scl_mm = StandardScaler(), MinMaxScaler()
lr = LogisticRegression(solver='saga', random_state=24)
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=24)
pipe = Pipeline([('SCL',None),('LR',lr)])
params = {'LR__l1_ratio':l1_ratios, 'SCL': [None, scl_mm, scl_std], 'LR__penalty':penalities, 'LR__multi_class': multi}
gcv_lr = GridSearchCV(pipe, param_grid=params, cv=kf, scoring='accuracy', n_jobs=-1)
gcv_lr.fit(X,y)
print(gcv_lr.best_params_, gcv_lr.best_score_)

{'LR__l1_ratio': 0.001, 'LR__multi_class': 'multinomial', 'LR__penalty': None, 'SCL': StandardScaler()} 0.6451827242524917


while doing the classification problem always use `StratifiedKFold`

In [133]:
glass_df = pd.DataFrame(gcv_lr.cv_results_)
glass_df.sort_values('mean_test_score', ascending=False)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_LR__l1_ratio,param_LR__multi_class,param_LR__penalty,param_SCL,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
64,0.022987,0.000894,0.003998,0.000632,0.50050,multinomial,,MinMaxScaler(),"{'LR__l1_ratio': 0.5005, 'LR__multi_class': 'm...",0.604651,0.581395,0.744186,0.465116,0.690476,0.617165,0.095983,1
40,0.023386,0.002415,0.004997,0.001999,0.25075,multinomial,,MinMaxScaler(),"{'LR__l1_ratio': 0.25075, 'LR__multi_class': '...",0.604651,0.581395,0.744186,0.465116,0.690476,0.617165,0.095983,1
16,0.024985,0.002279,0.003798,0.000400,0.00100,multinomial,,MinMaxScaler(),"{'LR__l1_ratio': 0.001, 'LR__multi_class': 'mu...",0.604651,0.581395,0.744186,0.465116,0.690476,0.617165,0.095983,1
112,0.026784,0.003542,0.005797,0.003653,1.00000,multinomial,,MinMaxScaler(),"{'LR__l1_ratio': 1.0, 'LR__multi_class': 'mult...",0.604651,0.581395,0.744186,0.465116,0.690476,0.617165,0.095983,1
88,0.027984,0.004424,0.003598,0.000490,0.75025,multinomial,,MinMaxScaler(),"{'LR__l1_ratio': 0.75025, 'LR__multi_class': '...",0.604651,0.581395,0.744186,0.465116,0.690476,0.617165,0.095983,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45,0.050971,0.011586,0.005197,0.003427,0.25075,multinomial,elasticnet,,"{'LR__l1_ratio': 0.25075, 'LR__multi_class': '...",0.534884,0.348837,0.511628,0.372093,0.452381,0.443965,0.073662,112
12,0.021586,0.001625,0.003799,0.000400,0.00100,multinomial,l2,,"{'LR__l1_ratio': 0.001, 'LR__multi_class': 'mu...",0.534884,0.348837,0.511628,0.372093,0.452381,0.443965,0.073662,112
69,0.040776,0.006427,0.004598,0.001495,0.50050,multinomial,elasticnet,,"{'LR__l1_ratio': 0.5005, 'LR__multi_class': 'm...",0.534884,0.348837,0.511628,0.372093,0.452381,0.443965,0.073662,112
93,0.032581,0.005120,0.009794,0.005976,0.75025,multinomial,elasticnet,,"{'LR__l1_ratio': 0.75025, 'LR__multi_class': '...",0.534884,0.348837,0.511628,0.372093,0.452381,0.443965,0.073662,112
