In [3]:
import pandas as pd
import numpy as np 
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
import os 
# Switch the process-wide working directory so the relative "pizza.csv" read
# in the next cell resolves inside this folder.
# NOTE(review): hard-coded absolute Windows path; the notebook will fail on any
# machine without this directory.
os.chdir("C:/Python/Datasets/")

In [4]:
# Load the pizza data set; the file name is relative to the current working
# directory.
pizza = pd.read_csv("pizza.csv")

# The predictor is kept as a one-column DataFrame (2-D) because scikit-learn
# estimators expect a 2-D feature matrix; the target is a plain Series.
X = pizza.loc[:, ['Promote']]
y = pizza.loc[:, 'Sales']

In [7]:
# Five shuffled folds; the fixed seed makes the train/test splits repeatable.
kfold = KFold(
    n_splits=5,
    shuffle=True,
    random_state=25,
)

# Plain linear regression, refit on each training split in the cells below.
lr = LinearRegression()

In [9]:
# Peek at the first (train_indices, test_indices) pair. `split` already yields
# an iterator, so it can be advanced directly.
next(kfold.split(pizza))

(array([ 0,  1,  2,  3,  4,  5,  6,  8,  9, 11, 12, 13, 15, 16, 18]),
 array([ 7, 10, 14, 17]))

In [11]:
# Manual K-fold cross-validation: for every split, refit the linear model on
# the training rows, predict the held-out rows, and record the fold's R^2.
scores = []
for i, (train_index, test_index) in enumerate(kfold.split(pizza)):
    print(f"Fold {i}:")
    print(f"  Train: index={train_index}")
    # The split yields positional indices, hence .iloc rather than .loc.
    X_train = X.iloc[train_index]
    y_train = y.iloc[train_index]
    lr.fit(X_train, y_train)

    print(f"  Test:  index={test_index}")
    X_test = X.iloc[test_index]
    y_test = y.iloc[test_index]
    y_pred = lr.predict(X_test)
    print("Predictions:",y_pred)

    # Evaluate the metric once and reuse the value for both the printout and
    # the running list, instead of calling r2_score twice on the same inputs.
    fold_r2 = r2_score(y_true=y_test, y_pred=y_pred)
    print(fold_r2)
    scores.append(fold_r2)

Fold 0:
  Train: index=[ 0  1  2  3  4  5  6  8  9 11 12 13 15 16 18]
  Test:  index=[ 7 10 14 17]
Predictions: [ 332.00348235 1244.62508209 1580.85409253 2157.24668184]
0.977595082312369
Fold 1:
  Train: index=[ 2  4  5  7  8  9 10 11 12 13 14 15 16 17 18]
  Test:  index=[0 1 3 6]
Predictions: [ 549.45961696 1324.89864507  596.4559217  1089.9171214 ]
0.9790917952286047
Fold 2:
  Train: index=[ 0  1  2  3  4  6  7  8 10 12 13 14 15 16 17]
  Test:  index=[ 5  9 11 18]
Predictions: [1934.75259821 1105.87481441 2076.84593257  513.81925456]
0.9910691519887425
Fold 3:
  Train: index=[ 0  1  3  4  5  6  7  9 10 11 12 14 15 17 18]
  Test:  index=[ 2  8 13 16]
Predictions: [ 801.40307593  155.23096247 1816.81639708 1586.04064228]
0.9833261216932678
Fold 4:
  Train: index=[ 0  1  2  3  5  6  7  8  9 10 11 13 14 16 17 18]
  Test:  index=[ 4 12 15]
Predictions: [1573.60253204 1666.84821223 1037.43987098]
0.9713879680534088


In [12]:
# Average of the per-fold R^2 values collected by the manual K-fold loop.
np.mean(scores)

0.9804940238552785

In [14]:
# Display the individual per-fold R^2 values (one entry per split).
scores

[0.977595082312369,
 0.9790917952286047,
 0.9910691519887425,
 0.9833261216932678,
 0.9713879680534088]

In [15]:
# Same evaluation through the library helper. With no `scoring` argument the
# estimator's own default metric is used, and the printed values match the
# manual loop's per-fold R^2 scores.
results = cross_val_score(estimator=lr, X=X, y=y, cv=kfold)
print(results)
print(results.mean())

[0.97759508 0.9790918  0.99106915 0.98332612 0.97138797]
0.9804940238552785


#### Breast Cancer

In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score

In [16]:
# Load the breast-cancer data; the first CSV column becomes the row index.
wisconsin = pd.read_csv(
    "C:/Python/Cases/Wisconsin/BreastCancer.csv",
    index_col=0,
)

# Everything except the label column is a feature.
X = wisconsin.drop(columns='Class')
y = wisconsin['Class']

In [None]:
# Stratified, shuffled 5-fold splitter with a fixed seed for repeatability.
kfold = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=25,
)
lr = LogisticRegression()

# No `scoring` given, so the classifier's default metric (accuracy) is used.
results = cross_val_score(estimator=lr, X=X, y=y, cv=kfold)
print(results.mean())

0.9613874614594039


In [38]:
# Re-score the same model on the same folds by area under the ROC curve.
results = cross_val_score(
    estimator=lr,
    X=X,
    y=y,
    scoring='roc_auc',
    cv=kfold,
)
print(results.mean())

0.9944343145762844


In [39]:
# Score by log loss. scikit-learn reports it negated ('neg_log_loss') so that
# larger values are better, which is why the printed mean is below zero.
results = cross_val_score(
    estimator=lr,
    X=X,
    y=y,
    scoring='neg_log_loss',
    cv=kfold,
)
print(results.mean())

-0.09600334555767301


### Hyper-Parameter Tuning

In [53]:
# Hand-rolled grid search: cross-validate every (solver, C) pair and keep the
# mean ROC AUC of each combination.
solvers = [
    'lbfgs', 'liblinear', 'newton-cg',
    'newton-cholesky', 'sag', 'saga',
]
Cs = np.linspace(start=0.001, stop=4, num=20)
scores = []
for s in solvers:
    for c in Cs:
        # `lr` is deliberately rebound at notebook level: the search cells
        # further down pass this same variable in as their base estimator.
        lr = LogisticRegression(C=c, solver=s, random_state=25)
        results = cross_val_score(
            estimator=lr, X=X, y=y, scoring='roc_auc', cv=kfold
        )
        mean_auc = np.mean(results)
        scores.append([s, c, mean_auc])



In [54]:
# One row per (solver, C) combination, shown best-first by mean ROC AUC.
df_scores = pd.DataFrame(data=scores, columns=['solver','C','score'])
df_scores.sort_values(by='score', ascending=False)

Unnamed: 0,solver,C,score
56,newton-cg,3.368579,0.995573
36,liblinear,3.368579,0.995448
45,newton-cg,1.053368,0.995337
35,liblinear,3.158105,0.995291
97,sag,3.579053,0.995152
...,...,...,...
52,newton-cg,2.526684,0.992051
22,liblinear,0.421947,0.991681
110,saga,2.105737,0.991443
21,liblinear,0.211474,0.986086


### Grid Search CV

In [55]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Exhaustive search over the same solver x C grid, scored by ROC AUC on the
# stratified folds. `lr` is whatever estimator the previous cells left bound.
params = {
    'solver': ['lbfgs','liblinear','newton-cg','newton-cholesky','sag','saga'],
    'C': np.linspace(0.001, 4, 20),
}
gcv = GridSearchCV(
    estimator=lr,
    param_grid=params,
    scoring='roc_auc',
    cv=kfold,
)
gcv.fit(X, y)

In [57]:
# Winning parameter combination and its mean cross-validated score, one per line.
print(gcv.best_params_, gcv.best_score_, sep="\n")

{'C': 0.001, 'solver': 'lbfgs'}
0.9947864053648215


In [60]:
# Tabulate the full search log: one row per candidate with fit/score timings,
# per-split test scores, their mean/std, and the resulting rank.
df_cv = pd.DataFrame( gcv.cv_results_ )
df_cv

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_solver,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.005704,0.000544,0.003295,0.000237,0.001,lbfgs,"{'C': 0.001, 'solver': 'lbfgs'}",0.993659,0.996603,0.995018,0.995290,0.993361,0.994786,0.001176,1
1,0.003029,0.000564,0.003094,0.000623,0.001,liblinear,"{'C': 0.001, 'solver': 'liblinear'}",0.951313,0.916440,0.973732,0.963893,0.972070,0.955490,0.021074,120
2,0.006654,0.000927,0.002757,0.000321,0.001,newton-cg,"{'C': 0.001, 'solver': 'newton-cg'}",0.993659,0.996603,0.995018,0.995290,0.993361,0.994786,0.001176,1
3,0.004801,0.000886,0.002667,0.000506,0.001,newton-cholesky,"{'C': 0.001, 'solver': 'newton-cholesky'}",0.993659,0.996603,0.995018,0.995290,0.993361,0.994786,0.001176,1
4,0.006755,0.000573,0.002610,0.000493,0.001,sag,"{'C': 0.001, 'solver': 'sag'}",0.993659,0.996603,0.995018,0.995290,0.993361,0.994786,0.001176,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,0.003391,0.000665,0.003153,0.000767,4.000,liblinear,"{'C': 4.0, 'solver': 'liblinear'}",0.992074,0.994565,0.996150,0.994618,0.993132,0.994108,0.001395,58
116,0.009802,0.001035,0.003209,0.000403,4.000,newton-cg,"{'C': 4.0, 'solver': 'newton-cg'}",0.992301,0.995245,0.996377,0.994618,0.991987,0.994105,0.001701,60
117,0.004491,0.000531,0.002916,0.000479,4.000,newton-cholesky,"{'C': 4.0, 'solver': 'newton-cholesky'}",0.992301,0.995245,0.996377,0.994618,0.991987,0.994105,0.001701,60
118,0.009340,0.000887,0.002907,0.000919,4.000,sag,"{'C': 4.0, 'solver': 'sag'}",0.992754,0.993433,0.996150,0.994842,0.993361,0.994108,0.001229,42


### Randomized Search CV

In [61]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Randomized search: instead of trying every combination, draw 20 candidates
# from the same solver x C space (seeded so the draw is repeatable).
params = {
    'solver': ['lbfgs','liblinear','newton-cg','newton-cholesky','sag','saga'],
    'C': np.linspace(0.001, 4, 20),
}
rgcv = RandomizedSearchCV(
    estimator=lr,
    param_distributions=params,
    n_iter=20,
    scoring='roc_auc',
    cv=kfold,
    verbose=3,
    random_state=25,
)
rgcv.fit(X, y)

In [66]:
# Best sampled combination and its mean cross-validated score, one per line.
print(rgcv.best_params_, rgcv.best_score_, sep="\n")

{'solver': 'newton-cholesky', 'C': 0.6324210526315789}
0.9946069116848844


In [67]:
# Tabulate the randomized-search log: one row per sampled candidate with
# timings, per-split scores, mean/std, and rank.
df_rcv = pd.DataFrame( rgcv.cv_results_ )
df_rcv

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_solver,param_C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.005924,0.000991,0.00411,0.000653,newton-cholesky,0.632421,"{'solver': 'newton-cholesky', 'C': 0.632421052...",0.991395,0.991848,0.995697,0.997757,0.996337,0.994607,0.002531,1
1,0.017466,0.006113,0.005298,0.003457,saga,1.263842,"{'solver': 'saga', 'C': 1.2638421052631577}",0.98596,0.993207,0.995245,0.996187,0.993819,0.992883,0.003617,18
2,0.009233,0.000177,0.002579,0.000337,saga,2.526684,"{'solver': 'saga', 'C': 2.5266842105263154}",0.98596,0.993207,0.995018,0.996187,0.993819,0.992838,0.003588,19
3,0.009017,0.000634,0.002247,0.000379,sag,1.053368,"{'solver': 'sag', 'C': 1.0533684210526315}",0.990489,0.992301,0.995471,0.99686,0.995879,0.9942,0.002404,12
4,0.004339,0.000206,0.003285,0.000781,newton-cholesky,1.053368,"{'solver': 'newton-cholesky', 'C': 1.053368421...",0.991395,0.991395,0.995697,0.997757,0.996337,0.994516,0.002634,5
5,0.005446,0.001535,0.003019,0.000544,newton-cholesky,0.842895,"{'solver': 'newton-cholesky', 'C': 0.842894736...",0.991395,0.991621,0.995697,0.997757,0.996337,0.994562,0.002582,3
6,0.009708,0.000612,0.002688,0.000582,sag,2.526684,"{'solver': 'sag', 'C': 2.5266842105263154}",0.990263,0.992301,0.995471,0.99686,0.995879,0.994155,0.002474,15
7,0.006402,0.000616,0.002554,0.000585,lbfgs,0.211474,"{'solver': 'lbfgs', 'C': 0.2114736842105263}",0.991395,0.991621,0.995697,0.997309,0.996337,0.994472,0.002475,6
8,0.008709,0.000585,0.003088,0.000787,newton-cg,3.789526,"{'solver': 'newton-cg', 'C': 3.7895263157894736}",0.991395,0.991395,0.995245,0.997757,0.996337,0.994426,0.0026,9
9,0.008013,0.000692,0.002361,0.000543,newton-cg,0.632421,"{'solver': 'newton-cg', 'C': 0.6324210526315789}",0.991395,0.991848,0.995697,0.997757,0.996337,0.994607,0.002531,1


#### Concrete Strength

In [70]:
from sklearn.ensemble import RandomForestRegressor

In [72]:
# Load the concrete data set and separate the target column from the features.
concrete = pd.read_csv("C:/Python/Cases/Concrete_Strength/Concrete_Data.csv")
y = concrete['Strength']
X = concrete.drop(columns='Strength')

In [None]:
# Seeded random forest used as the base estimator for the grid search below.
rf = RandomForestRegressor(random_state=25)
# List every tunable hyper-parameter and its current value.
rf.get_params()

In [73]:
# Grid for the random forest: 5 x 4 x 3 x 4 = 240 combinations, each
# cross-validated on 5 folds (1200 fits in total), scored by R^2.
params = {
    'max_features': [3,4,5,6,7],
    'min_samples_split': [2,5,10,20],
    'max_depth': [None, 3, 5],
    'min_samples_leaf': [1,5,10,20],
}
kfold = KFold(n_splits=5, shuffle=True, random_state=25)
gcv = GridSearchCV(
    estimator=rf,
    param_grid=params,
    scoring='r2',
    cv=kfold,
    verbose=3,
)
gcv.fit(X, y)

Fitting 5 folds for each of 240 candidates, totalling 1200 fits
[CV 1/5] END max_depth=None, max_features=3, min_samples_leaf=1, min_samples_split=2;, score=0.903 total time=   0.2s
[CV 2/5] END max_depth=None, max_features=3, min_samples_leaf=1, min_samples_split=2;, score=0.873 total time=   0.2s
[CV 3/5] END max_depth=None, max_features=3, min_samples_leaf=1, min_samples_split=2;, score=0.924 total time=   0.2s
[CV 4/5] END max_depth=None, max_features=3, min_samples_leaf=1, min_samples_split=2;, score=0.940 total time=   0.1s
[CV 5/5] END max_depth=None, max_features=3, min_samples_leaf=1, min_samples_split=2;, score=0.917 total time=   0.1s
[CV 1/5] END max_depth=None, max_features=3, min_samples_leaf=1, min_samples_split=5;, score=0.898 total time=   0.1s
[CV 2/5] END max_depth=None, max_features=3, min_samples_leaf=1, min_samples_split=5;, score=0.863 total time=   0.1s
[CV 3/5] END max_depth=None, max_features=3, min_samples_leaf=1, min_samples_split=5;, score=0.915 total time=

0,1,2
,estimator,RandomForestR...ndom_state=25)
,param_grid,"{'max_depth': [None, 3, ...], 'max_features': [3, 4, ...], 'min_samples_leaf': [1, 5, ...], 'min_samples_split': [2, 5, ...]}"
,scoring,'r2'
,n_jobs,
,refit,True
,cv,KFold(n_split... shuffle=True)
,verbose,3
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,4
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [74]:
# Winning forest configuration and its mean cross-validated R^2, one per line.
print(gcv.best_params_, gcv.best_score_, sep="\n")

{'max_depth': None, 'max_features': 4, 'min_samples_leaf': 1, 'min_samples_split': 2}
0.914172402081719


In [75]:
# Rebuild the winning configuration straight from the search result instead of
# re-typing the printed values, so this cell cannot drift from what
# GridSearchCV actually selected if the grid or the data changes. The seed
# matches the one used for the searched estimator. Then fit on the full data.
best_model = RandomForestRegressor(random_state=25, **gcv.best_params_)
best_model.fit(X, y)

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,4
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [77]:
# Score the separate, unlabelled file with the manually refit model.
# NOTE(review): assumes testConcrete.csv has exactly the feature columns of X,
# in the same order — confirm.
tst = pd.read_csv("C:/Python/Cases/Concrete_Strength/testConcrete.csv")
best_model.predict( tst )

array([63.4894    , 40.6652    , 36.4088    , 43.2591    , 59.89651167,
       30.3139    , 50.965     , 61.06578667, 53.7163    , 48.1584    ,
       47.962     , 50.0491    , 49.4003    , 37.1643    ])

In [78]:
# The search was run with refit enabled, so it already holds the best
# configuration fitted on all of X, y; its predictions match the manual refit.
bm = gcv.best_estimator_
bm.predict(tst)

array([63.4894    , 40.6652    , 36.4088    , 43.2591    , 59.89651167,
       30.3139    , 50.965     , 61.06578667, 53.7163    , 48.1584    ,
       47.962     , 50.0491    , 49.4003    , 37.1643    ])

In [80]:
# Display the refit best estimator and its hyper-parameters.
bm

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,4
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [79]:
import pickle

In [81]:
# Persist the tuned forest to disk. The context manager guarantees the file
# handle is flushed and closed even if pickle.dump raises part-way through;
# the original open()/close() pair leaked the handle in that case.
with open('C:/Python/Cases/Concrete_Strength/rf_conc_311.pkl', 'wb') as pkfile:
    pickle.dump(bm, pkfile)