# Hyperparameter tuning 

In [1]:
import pandas as pd

In [2]:
# Read the salaries CSV from the working directory and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="salaries.csv")
df.head(n=5)

Unnamed: 0,company,job,degree,salary_more_then_100k
0,google,sales executive,bachelors,0
1,google,sales executive,masters,0
2,google,business manager,bachelors,1
3,google,business manager,masters,1
4,google,computer programmer,bachelors,0


In [3]:
# Feature frame: every column of df except the label column.
inputs = df.drop(columns='salary_more_then_100k')

In [4]:
# Label series: the salary_more_then_100k column of df.
target = df.loc[:, 'salary_more_then_100k']

In [5]:
from sklearn.preprocessing import LabelEncoder
# One independent encoder per categorical column, so each keeps its own
# fitted class-to-integer mapping.
le_company, le_job, le_degree = (LabelEncoder() for _ in range(3))

In [6]:
# Fit each encoder on its text column and store the resulting integer codes
# in a new "<column>_n" column of inputs (added in company, job, degree order).
for column, encoder in (
    ('company', le_company),
    ('job', le_job),
    ('degree', le_degree),
):
    inputs[f'{column}_n'] = encoder.fit_transform(inputs[column])

In [7]:
# Show the feature frame: the original text columns plus the encoded *_n columns added above.
inputs

Unnamed: 0,company,job,degree,company_n,job_n,degree_n
0,google,sales executive,bachelors,2,2,0
1,google,sales executive,masters,2,2,1
2,google,business manager,bachelors,2,0,0
3,google,business manager,masters,2,0,1
4,google,computer programmer,bachelors,2,1,0
5,google,computer programmer,masters,2,1,1
6,abc pharma,sales executive,masters,0,2,1
7,abc pharma,computer programmer,bachelors,0,1,0
8,abc pharma,business manager,bachelors,0,0,0
9,abc pharma,business manager,masters,0,0,1


In [8]:
# Keep only the integer-encoded columns by dropping the original text columns.
inputs_n = inputs.drop(columns=['company', 'job', 'degree'])

In [9]:
# Show the numeric-only feature frame (company_n, job_n, degree_n) that is passed to the models.
inputs_n

Unnamed: 0,company_n,job_n,degree_n
0,2,2,0
1,2,2,1
2,2,0,0
3,2,0,1
4,2,1,0
5,2,1,1
6,0,2,1
7,0,1,0
8,0,0,0
9,0,0,1


In [10]:
# Show the label series taken from the salary_more_then_100k column.
target

0     0
1     0
2     1
3     1
4     0
5     1
6     0
7     0
8     0
9     1
10    1
11    1
12    1
13    1
14    1
15    1
Name: salary_more_then_100k, dtype: int64

In [11]:
from sklearn import svm

In [12]:
from sklearn.model_selection import GridSearchCV
# Exhaustive search over every C / kernel combination for an SVC, scored with
# 5-fold cross-validation on the encoded features.
svm_param_grid = {
    'C': [1, 10, 20],
    'kernel': ['rbf', 'linear'],
}
clf = GridSearchCV(
    estimator=svm.SVC(gamma='auto'),
    param_grid=svm_param_grid,
    cv=5,
    return_train_score=False,
)
clf.fit(inputs_n, target)
# Raw per-combination results (timings, parameters, per-split scores).
clf.cv_results_

{'mean_fit_time': array([0.00715089, 0.00312572, 0.00422678, 0.006249  , 0.00645146,
        0.00312428]),
 'std_fit_time': array([0.00887377, 0.00625143, 0.00608429, 0.00765343, 0.00790791,
        0.00624857]),
 'mean_score_time': array([0.00230236, 0.00312438, 0.        , 0.00312552, 0.00312428,
        0.00312595]),
 'std_score_time': array([0.00460472, 0.00624876, 0.        , 0.00625105, 0.00624857,
        0.00625191]),
 'param_C': masked_array(data=[1, 1, 10, 10, 20, 20],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['rbf', 'linear', 'rbf', 'linear', 'rbf', 'linear'],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 1, 'kernel': 'rbf'},
  {'C': 1, 'kernel': 'linear'},
  {'C': 10, 'kernel': 'rbf'},
  {'C': 10, 'kernel': 'linear'},
  {'C': 20, 'kernel': 'rbf'},
  {'C': 20, 'kernel': 'linear'}],


In [13]:
# Tabulate the grid-search results: one row per parameter combination.
gv = pd.DataFrame(data=clf.cv_results_)
gv

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.007151,0.008874,0.002302,0.004605,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.5,0.666667,0.0,0.666667,0.666667,0.5,0.258199,6
1,0.003126,0.006251,0.003124,0.006249,1,linear,"{'C': 1, 'kernel': 'linear'}",0.5,0.666667,0.333333,0.666667,0.333333,0.5,0.149071,5
2,0.004227,0.006084,0.0,0.0,10,rbf,"{'C': 10, 'kernel': 'rbf'}",0.5,0.333333,0.666667,1.0,1.0,0.7,0.266667,1
3,0.006249,0.007653,0.003126,0.006251,10,linear,"{'C': 10, 'kernel': 'linear'}",0.5,0.666667,0.333333,1.0,0.333333,0.566667,0.249444,3
4,0.006451,0.007908,0.003124,0.006249,20,rbf,"{'C': 20, 'kernel': 'rbf'}",0.5,0.333333,0.666667,1.0,1.0,0.7,0.266667,1
5,0.003124,0.006249,0.003126,0.006252,20,linear,"{'C': 20, 'kernel': 'linear'}",0.5,0.666667,0.333333,1.0,0.333333,0.566667,0.249444,3


In [14]:
# Narrow the results table to the searched parameters and their mean test score.
gv.loc[:, ['param_C', 'param_kernel', 'mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,rbf,0.5
1,1,linear,0.5
2,10,rbf,0.7
3,10,linear,0.566667
4,20,rbf,0.7
5,20,linear,0.566667


In [15]:
# Parameter combination with the highest mean cross-validated score in the grid search.
clf.best_params_

{'C': 10, 'kernel': 'rbf'}

In [16]:
# Mean cross-validated score achieved by the best parameter combination of the grid search.
clf.best_score_

0.7

In [17]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Candidate estimators and the hyperparameter grid to search for each one.
# Each entry maps a display name to {'model': estimator, 'params': grid}.
model_params = {
    'svm': {
        'model': svm.SVC(gamma='auto'),
        'params': {
            'C': [1, 10, 20],
            'kernel': ['rbf', 'linear']
        }
    },
    'random_forest': {
        # Fixed seed: a random forest is stochastic, so without it the best
        # score and best n_estimators can change on every run of the notebook.
        'model': RandomForestClassifier(random_state=0),
        'params': {
            'n_estimators': [1, 5, 10]
        }
    },
    'logistic_regression': {
        # multi_class is intentionally not passed: 'auto' is the default in
        # current scikit-learn and the argument is deprecated (since 1.5), so
        # passing it only triggers a FutureWarning and will eventually fail.
        'model': LogisticRegression(solver='liblinear'),
        'params': {
            'C': [1, 5, 10]
        }
    }
}

In [18]:
# Run one grid search per candidate model and record each model's best mean
# cross-validated score together with the parameters that achieved it.
scores = []

for model_name, mp in model_params.items():
    # Loop-local name: rebinding `clf` here would replace the SVM search that
    # the earlier cells (cv_results_, best_params_, best_score_) inspect, so
    # re-running them would silently report the last model tried instead.
    search = GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    search.fit(inputs_n, target)
    scores.append({
        'model': model_name,
        'best_score': search.best_score_,
        'best_params': search.best_params_
    })

# Keep the summary in its own variable: storing it in `df` would overwrite the
# raw salaries frame, and the earlier cells that read columns from `df` would
# fail if re-run afterwards.
scores_df = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
scores_df

Unnamed: 0,model,best_score,best_params
0,svm,0.7,"{'C': 10, 'kernel': 'rbf'}"
1,random_forest,0.566667,{'n_estimators': 1}
2,logistic_regression,0.566667,{'C': 1}
