## Hyper parameter tuning

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [15]:
iris = sns.load_dataset('iris')
iris
X = iris.drop('species', axis=1).values
y = iris['species'].values

### Logistic Regression model

In [16]:
from sklearn.metrics import accuracy_score

In [17]:
from sklearn.linear_model import LogisticRegression
clf_LR = LogisticRegression(solver='liblinear')

In [18]:
clf_LR.fit(X, y)
y_pred = clf_LR.predict(X)

In [19]:
accuracy_score(y, y_pred)

0.96

In [8]:
def fxGridSearch(df, target, classifier, params=[], GSopt="RandomGridSearch", scale=None, 
         simplifiedrpt=True, sortrank=True, cv=5, n_iter=2, rtnTS=False):
    
    # classifier: the trained Classifier
    # cv: cross-validation
    # n_inter: number of iteration
    # sortrank: sort the performance by rank
    # GSopt: Grid Search option, "GridSearch" or "RandomGridSearch" to perform Hyper-parameter tuning
    #        "RandomGridSearch" just randomly pick few parameters from the given sets, which is much faster
    # scale: 'minmax', 'standard', default is none
    # rtnTS: return train score
    
    import warnings
    from sklearn.preprocessing import StandardScaler
    from sklearn.metrics import accuracy_score, confusion_matrix, plot_confusion_matrix, classification_report    
    
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import seaborn as sns
    import json
    
    warnings.filterwarnings('ignore', category=UserWarning) 
    
    classifiername = type(classifier).__name__
    hyperparam_opt, docref = '', ''
    webref = 'https://machinelearningmastery.com/hyperparameters-for-classification-machine-learning-algorithms/'
    
    if not params: # no parameters specified, do recommendation
        helperlist=[]
        
        if classifiername=='LogisticRegression':
            hyperparam = {'C': [100, 10, 1.0, 0.1, 0.01], 'solver': ['lbfgs', 'liblinear', 'saga', 'sag'], 'l1_ratio':[-1, -0.5, 0, 0.5, 1]}
            hyperparam_opt = "'penalty': ['none', 'l1', 'l2', 'elasticnet']" + "  * Not all solvers support all regularization terms."                             
            docref = 'https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html'
        elif classifiername=='DecisionTreeClassifier':
            hyperparam = {'criterion': ['gini', 'entropy'], 'max_depth': [2, 3, 4, 5] }
            docref = 'https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html'
        elif classifiername=='KNeighborsClassifier':
            hyperparam = {'weights': ['uniform', 'distance'], 'metric': ['euclidean', 'manhattan', 'minkowski'] }
            hyperparam_opt = "'n_neighbors': [2, 3, 5, 7, 9]" + " * if not optimized with elbow method"
            docref = 'https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html'
        elif classifiername=='RandomForestClassifier':
            hyperparam = {'max_features':['sqrt', 'log2'], 'n_estimators':[10, 100, 1000] }
            docref = 'https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html'
        elif classifiername=='SVC':
            hyperparam = {'kernel':['linear', 'rbf'], 'C':[0.1, 0.01] }
            docref = 'https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html'
        elif classifiername=='XGBClassifier':
            hyperparam = {'learning_rate': [0.01, 0.1], 'n_estimators': [100, 1000], 
                          'max_depth': [3, 4, 5], 'subsample': [0.8, 0.1], 'colsample_bytree': [0.3, 0.8],
                          'gamma': [0, 1]}
            docref = 'https://xgboost.readthedocs.io/en/latest/parameter.html'
            webref = 'https://towardsdatascience.com/fine-tuning-xgboost-in-python-like-a-boss-b4543ed8b1e'

        # ******** generate helper string ************
        helperlist.append(type(classifier).__name__ + ', recommended hyper parameters: \n')
        helperlist.append(json.dumps(hyperparam))
        
        if hyperparam_opt!='': helperlist.append('optional parameters: ' + hyperparam_opt)
        if docref!='': helperlist.append('doc ref: ' + docref)
        if webref!='': helperlist.append('web ref: ' + webref)
        
        helperstr = '\n'.join(helperlist)
        print(helperstr)
        return hyperparam
    
    dfX = df.drop(target, 1)
    X = dfX.values
    y = df[target].values
    #xcols = dfX.columns.tolist()
    
    if scale=='standard':
        from sklearn.preprocessing import StandardScaler
        dfXs = pd.DataFrame(StandardScaler().fit_transform(dfX.values), columns=dfX.columns)
        X = dfXs.values
    elif scale=='minmax':
        from sklearn.preprocessing import MinMaxScaler
        dfXs = pd.DataFrame(MinMaxScaler().fit_transform(dfX.values), columns=dfX.columns)
        X = dfXs.values
        
    if GSopt=="GridSearch":
        # Using stanrd grid search
        from sklearn.model_selection import GridSearchCV
        clf_GSCV = GridSearchCV(classifier, params, cv=cv, return_train_score = rtnTS)
    elif GSopt=="RandomGridSearch":
        from sklearn.model_selection import RandomizedSearchCV
        clf_GSCV = RandomizedSearchCV(classifier, params, cv=cv, return_train_score = rtnTS, n_iter=n_iter)
        
            
    clf_GSCV.fit(X, y)
    result = clf_GSCV.cv_results_
    df_result = pd.DataFrame(result)
    
    if simplifiedrpt == True:
        cvcols = df_result.columns.tolist()
        cvcols_sel = []

        for paramstr in cvcols:
            if "param_" in paramstr:
                cvcols_sel.append(paramstr)
        cvcols_sel.append('mean_test_score')
        cvcols_sel.append('rank_test_score')
        
        df_result = df_result[cvcols_sel]
        colrev = df_result.columns.tolist()
        colrev = [item.replace("param_", "") for item in colrev]
        df_result.columns = colrev
        
        if sortrank==True:
            df_result.sort_values(by="rank_test_score", inplace=True)

    return df_result

In [22]:
params_LR = fxGridSearch(iris, 'species', clf_LR)

LogisticRegression, recommended hyper parameters: 

{"C": [100, 10, 1.0, 0.1, 0.01], "solver": ["lbfgs", "liblinear", "saga", "sag"], "l1_ratio": [-1, -0.5, 0, 0.5, 1]}
optional parameters: 'penalty': ['none', 'l1', 'l2', 'elasticnet']  * Not all solvers support all regularization terms.
doc ref: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
web ref: https://machinelearningmastery.com/hyperparameters-for-classification-machine-learning-algorithms/


In [24]:
fxGridSearch(iris, 'species', clf_LR, params=params_LR, GSopt="GridSearch")

Unnamed: 0,C,l1_ratio,solver,mean_test_score,rank_test_score
54,1,0.5,saga,0.986667,1
50,1,0,saga,0.986667,1
46,1,-0.5,saga,0.986667,1
58,1,1,saga,0.986667,1
0,100,-1,lbfgs,0.980000,5
...,...,...,...,...,...
89,0.01,0,liblinear,0.666667,96
81,0.01,-1,liblinear,0.666667,96
93,0.01,0.5,liblinear,0.666667,96
97,0.01,1,liblinear,0.666667,96


#### Observations:
+ using `saga` solver with C=1, l1_ratio: 0.5, 1, -0.5 giving higher accuracy

### Using Decision Tree classifier

In [33]:
from sklearn.tree import DecisionTreeClassifier
clf_DT = DecisionTreeClassifier()
clf_DT.fit(X, y)

DecisionTreeClassifier()

In [34]:
y_pred = clf_DT.predict(X)
accuracy_score(y, y_pred)

1.0

In [31]:
params_DT = fxGridSearch(iris, 'species', clf_DT)

DecisionTreeClassifier, recommended hyper parameters: 

{"criterion": ["gini", "entropy"], "max_depth": [2, 3, 4, 5]}
doc ref: https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
web ref: https://machinelearningmastery.com/hyperparameters-for-classification-machine-learning-algorithms/


In [32]:
fxGridSearch(iris, 'species', clf_DT, params=params_DT, GSopt="GridSearch")

Unnamed: 0,criterion,max_depth,mean_test_score,rank_test_score
2,gini,4,0.966667,1
3,gini,5,0.966667,1
1,gini,3,0.96,3
5,entropy,3,0.96,3
6,entropy,4,0.953333,5
7,entropy,5,0.953333,5
0,gini,2,0.933333,7
4,entropy,2,0.933333,7


### Using XGB

In [35]:
from xgboost import XGBClassifier
clf_xgb = XGBClassifier()
clf_xgb.fit(X, y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [36]:
y_pred = clf_xgb.predict(X)
accuracy_score(y, y_pred)

1.0

In [38]:
params_xgb = fxGridSearch(iris, 'species', clf_xgb)

XGBClassifier, recommended hyper parameters: 

{"learning_rate": [0.01, 0.1], "n_estimators": [100, 1000], "max_depth": [3, 4, 5], "subsample": [0.8, 0.1], "colsample_bytree": [0.3, 0.8], "gamma": [0, 1]}
doc ref: https://xgboost.readthedocs.io/en/latest/parameter.html
web ref: https://towardsdatascience.com/fine-tuning-xgboost-in-python-like-a-boss-b4543ed8b1e


In [40]:
fxGridSearch(iris, 'species', clf_xgb, params=params_xgb, GSopt="GridSearch")

Unnamed: 0,colsample_bytree,gamma,learning_rate,max_depth,n_estimators,subsample,mean_test_score,rank_test_score
22,0.3,0,0.1,5,1000,0.8,0.966667,1
84,0.8,1,0.1,3,100,0.8,0.966667,1
92,0.8,1,0.1,5,100,0.8,0.966667,1
18,0.3,0,0.1,4,1000,0.8,0.966667,1
14,0.3,0,0.1,3,1000,0.8,0.966667,1
...,...,...,...,...,...,...,...,...
73,0.8,1,0.01,3,100,0.1,0.940000,79
41,0.3,1,0.1,4,100,0.1,0.940000,79
63,0.8,0,0.1,3,1000,0.1,0.940000,94
67,0.8,0,0.1,4,1000,0.1,0.940000,94
