In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [79]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV
from sklearn.preprocessing import LabelEncoder, normalize, MinMaxScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from bayes_opt import BayesianOptimization

In [3]:
data = pd.read_csv(r'C:\Users\durga\Desktop\uOttawa\Term1\ML_HernaViktor\Assignments\Assignment2\labor-neg.csv')
data = data.replace('?', np.nan)
original = data.copy()
mask = data.isnull()

In [4]:
data.isnull().sum()
#predict if more than 50%(>28 missing) missing else use statistical methods to fill
#features_to_predict = ['wage3.wage','pension','stby_pay','educ_allw.boolean','lngterm_disabil.boolean']

Duration                   1
wage1.wage                 1
wage2.wage                11
wage3.wage                42
cola                      20
hours.hrs                  6
pension                   30
stby_pay                  48
shift_diff                26
educ_allw.boolean         35
holidays                   4
vacation                   6
lngtrm_disabil.boolean    29
dntl_ins                  20
bereavement.boolea        27
empl_hplan                20
Agreement                  0
dtype: int64

In [5]:
#categorical to numerical
cat_columns = ['cola','pension', 'educ_allw.boolean', 'vacation', 'lngtrm_disabil.boolean', 'dntl_ins', 'bereavement.boolea', 'empl_hplan', 'Agreement',]
for column in cat_columns:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
data = data.where(~mask, original)
data.dtypes

Duration                  object
wage1.wage                object
wage2.wage                object
wage3.wage                object
cola                      object
hours.hrs                 object
pension                   object
stby_pay                  object
shift_diff                object
educ_allw.boolean         object
holidays                  object
vacation                  object
lngtrm_disabil.boolean    object
dntl_ins                  object
bereavement.boolea        object
empl_hplan                object
Agreement                  int32
dtype: object

In [6]:
#convert numeric data to float datatype
data[['Duration','wage1.wage', 'wage2.wage','wage3.wage','hours.hrs','stby_pay','shift_diff','holidays']] = data[['Duration','wage1.wage', 'wage2.wage','wage3.wage','hours.hrs','stby_pay','shift_diff','holidays']].astype(str).astype(float)
data.dtypes

Duration                  float64
wage1.wage                float64
wage2.wage                float64
wage3.wage                float64
cola                       object
hours.hrs                 float64
pension                    object
stby_pay                  float64
shift_diff                float64
educ_allw.boolean          object
holidays                  float64
vacation                   object
lngtrm_disabil.boolean     object
dntl_ins                   object
bereavement.boolea         object
empl_hplan                 object
Agreement                   int32
dtype: object

In [7]:
data.var(numeric_only=True)

Duration       0.500974
wage1.wage     1.878532
wage2.wage     1.354961
wage3.wage     1.701238
hours.hrs      6.278431
stby_pay      25.277778
shift_diff    20.649462
holidays       1.587083
Agreement      0.231830
dtype: float64

In [8]:
#numeric values filling using statistics
data.fillna({"Duration":data['Duration'].mode()[0],
            "wage1.wage":data['wage1.wage'].mean(),
            "wage2.wage":data['wage2.wage'].mean(),
            "hours.hrs": data['hours.hrs'].median(),
            "shift_diff": data['shift_diff'].mean(),
            "holidays": data['holidays'].mean()}, inplace=True)
data.head()

Unnamed: 0,Duration,wage1.wage,wage2.wage,wage3.wage,cola,hours.hrs,pension,stby_pay,shift_diff,educ_allw.boolean,holidays,vacation,lngtrm_disabil.boolean,dntl_ins,bereavement.boolea,empl_hplan,Agreement
0,1.0,5.0,3.971739,,,40.0,,,2.0,,11.0,0.0,,,1.0,,1
1,2.0,4.5,5.8,,,35.0,2.0,,4.870968,1.0,11.0,1.0,,0.0,,0.0,1
2,2.0,3.803571,3.971739,,,38.0,0.0,,5.0,,11.0,2.0,1.0,1.0,1.0,1.0,1
3,3.0,3.7,4.0,5.0,1.0,38.0,,,4.870968,1.0,11.09434,,,,1.0,,1
4,3.0,4.5,4.5,5.0,,40.0,,,4.870968,,12.0,0.0,,1.0,1.0,1.0,1


In [9]:
data.fillna({"cola":data['cola'].mode()[0],
            "vacation":data['vacation'].mode()[0],
            "dntl_ins":data['dntl_ins'].mode()[0],
            "bereavement.boolea":data['bereavement.boolea'].mode()[0],
            "empl_hplan":data['empl_hplan'].mode()[0]}, inplace=True)
data.head()

Unnamed: 0,Duration,wage1.wage,wage2.wage,wage3.wage,cola,hours.hrs,pension,stby_pay,shift_diff,educ_allw.boolean,holidays,vacation,lngtrm_disabil.boolean,dntl_ins,bereavement.boolea,empl_hplan,Agreement
0,1.0,5.0,3.971739,,0,40.0,,,2.0,,11.0,0,,1,1,0,1
1,2.0,4.5,5.8,,0,35.0,2.0,,4.870968,1.0,11.0,1,,0,1,0,1
2,2.0,3.803571,3.971739,,0,38.0,0.0,,5.0,,11.0,2,1.0,1,1,1,1
3,3.0,3.7,4.0,5.0,1,38.0,,,4.870968,1.0,11.09434,1,,1,1,0,1
4,3.0,4.5,4.5,5.0,0,40.0,,,4.870968,,12.0,0,,1,1,1,1


In [10]:
data.isnull().sum()
#features_to_predict = ['wage3.wage','pension','stby_pay','educ_allw.boolean','lngterm_disabil.boolean']

Duration                   0
wage1.wage                 0
wage2.wage                 0
wage3.wage                42
cola                       0
hours.hrs                  0
pension                   30
stby_pay                  48
shift_diff                 0
educ_allw.boolean         35
holidays                   0
vacation                   0
lngtrm_disabil.boolean    29
dntl_ins                   0
bereavement.boolea         0
empl_hplan                 0
Agreement                  0
dtype: int64

In [11]:
#training data = data without null values of lngterm bool
data_wonull = data[~data['lngtrm_disabil.boolean'].isnull()]
#testing data = data with null of lngtermbool
data_wnull = data[data['lngtrm_disabil.boolean'].isnull()]

In [12]:
X_train = data_wonull.drop(columns=['wage3.wage', 'pension','stby_pay','educ_allw.boolean','lngtrm_disabil.boolean'])
Y_train = data_wonull['lngtrm_disabil.boolean']
Y_train = Y_train.astype('int')
X_test = data_wnull.drop(columns=['wage3.wage', 'pension','stby_pay','educ_allw.boolean','lngtrm_disabil.boolean'])

In [13]:
lr = LogisticRegression()
lr.fit(X_train,Y_train)

In [14]:
y_test = lr.predict(X_test)

In [15]:
y_test_df = pd.DataFrame(data = y_test, columns = ['lngtrm_disabil.boolean'], index = X_test.index.copy())
df_out = pd.merge(data, y_test_df, how = 'left', left_index = True, right_index = True)

In [16]:
df_out['lngtrm_disabil.boolean'] = df_out['lngtrm_disabil.boolean_x'].fillna(df_out['lngtrm_disabil.boolean_y'])
data = df_out.drop(columns=['lngtrm_disabil.boolean_x','lngtrm_disabil.boolean_y'])

In [17]:
#training data = data without null values of pension
data_wonull = data[~data['pension'].isnull()]
#testing data = data with null of pension
data_wnull = data[data['pension'].isnull()]

In [18]:
X_train = data_wonull.drop(columns=['wage3.wage', 'pension','stby_pay','educ_allw.boolean'])
Y_train = data_wonull['pension']
Y_train = Y_train.astype('int')
X_test = data_wnull.drop(columns=['wage3.wage', 'pension','stby_pay','educ_allw.boolean'])

In [19]:
lr = LogisticRegression(max_iter=400)
lr.fit(X_train,Y_train)

In [20]:
y_test = lr.predict(X_test)

In [21]:
y_test_df = pd.DataFrame(data = y_test, columns = ['pension'], index = X_test.index.copy())
df_out = pd.merge(data, y_test_df, how = 'left', left_index = True, right_index = True)

In [22]:
df_out['pension'] = df_out['pension_x'].fillna(df_out['pension_y'])
data = df_out.drop(columns=['pension_x','pension_y'])

In [23]:
#training data = data without null values of educ_allw.boolean
data_wonull = data[~data['educ_allw.boolean'].isnull()]
#testing data = data with null of educ_allw.boolean
data_wnull = data[data['educ_allw.boolean'].isnull()]

In [24]:
X_train = data_wonull.drop(columns=['wage3.wage','stby_pay','educ_allw.boolean'])
Y_train = data_wonull['educ_allw.boolean']
Y_train = Y_train.astype('int')
X_test = data_wnull.drop(columns=['wage3.wage','stby_pay','educ_allw.boolean'])

In [25]:
lr = LogisticRegression()
lr.fit(X_train,Y_train)

In [26]:
y_test = lr.predict(X_test)

In [27]:
y_test_df = pd.DataFrame(data = y_test, columns = ['educ_allw.boolean'], index = X_test.index.copy())
df_out = pd.merge(data, y_test_df, how = 'left', left_index = True, right_index = True)

In [28]:
df_out['educ_allw.boolean'] = df_out['educ_allw.boolean_x'].fillna(df_out['educ_allw.boolean_y'])
data = df_out.drop(columns=['educ_allw.boolean_x','educ_allw.boolean_y'])

In [29]:
#training data = data without null values of educ_allw.boolean
data_wonull = data[~data['wage3.wage'].isnull()]
#testing data = data with null of educ_allw.boolean
data_wnull = data[data['wage3.wage'].isnull()]

In [30]:
X_train = data_wonull.drop(columns=['wage3.wage','stby_pay'])
Y_train = data_wonull['wage3.wage']
Y_train = Y_train.astype('int')
X_test = data_wnull.drop(columns=['wage3.wage','stby_pay'])

In [31]:
lr = LogisticRegression(max_iter=500)
lr.fit(X_train,Y_train)

In [32]:
y_test = lr.predict(X_test)

In [33]:
y_test_df = pd.DataFrame(data = y_test, columns = ['wage3.wage'], index = X_test.index.copy())
df_out = pd.merge(data, y_test_df, how = 'left', left_index = True, right_index = True)

In [34]:
df_out['wage3.wage'] = df_out['wage3.wage_x'].fillna(df_out['wage3.wage_y'])
data = df_out.drop(columns=['wage3.wage_x','wage3.wage_y'])

In [35]:
#training data = data without null values of educ_allw.boolean
data_wonull = data[~data['stby_pay'].isnull()]
#testing data = data with null of educ_allw.boolean
data_wnull = data[data['stby_pay'].isnull()]

In [36]:
X_train = data_wonull.drop(columns=['stby_pay'])
Y_train = data_wonull['stby_pay']
Y_train = Y_train.astype('int')
X_test = data_wnull.drop(columns=['stby_pay'])

In [37]:
lr = LogisticRegression(max_iter=300)
lr.fit(X_train,Y_train)

In [38]:
y_test = lr.predict(X_test)

In [39]:
y_test_df = pd.DataFrame(data = y_test, columns = ['stby_pay'], index = X_test.index.copy())
df_out = pd.merge(data, y_test_df, how = 'left', left_index = True, right_index = True)

In [40]:
df_out['stby_pay'] = df_out['stby_pay_x'].fillna(df_out['stby_pay_y'])
data = df_out.drop(columns=['stby_pay_x','stby_pay_y'])
data.isnull().sum()

Duration                  0
wage1.wage                0
wage2.wage                0
cola                      0
hours.hrs                 0
shift_diff                0
holidays                  0
vacation                  0
dntl_ins                  0
bereavement.boolea        0
empl_hplan                0
Agreement                 0
lngtrm_disabil.boolean    0
pension                   0
educ_allw.boolean         0
wage3.wage                0
stby_pay                  0
dtype: int64

In [41]:
df1 = data.pop('Agreement')
data['Agreement'] = df1
data

Unnamed: 0,Duration,wage1.wage,wage2.wage,cola,hours.hrs,shift_diff,holidays,vacation,dntl_ins,bereavement.boolea,empl_hplan,lngtrm_disabil.boolean,pension,educ_allw.boolean,wage3.wage,stby_pay,Agreement
0,1.0,5.0,3.971739,0,40.0,2.0,11.0,0,1,1,0,0.0,2.0,0.0,2.0,8.0,1
1,2.0,4.5,5.8,0,35.0,4.870968,11.0,1,0,1,0,1.0,2.0,1.0,4.0,14.0,1
2,2.0,3.803571,3.971739,0,38.0,5.0,11.0,2,1,1,1,1.0,0.0,0.0,5.0,10.0,1
3,3.0,3.7,4.0,1,38.0,4.870968,11.09434,1,1,1,0,1.0,0.0,1.0,5.0,14.0,1
4,3.0,4.5,4.5,0,40.0,4.870968,12.0,0,1,1,1,1.0,0.0,0.0,5.0,14.0,1
5,2.0,2.0,2.5,0,35.0,6.0,12.0,0,1,1,0,1.0,0.0,1.0,4.0,2.0,1
6,3.0,4.0,5.0,1,38.0,4.870968,12.0,2,2,1,1,1.0,0.0,1.0,5.0,10.0,1
7,3.0,6.9,4.8,0,40.0,3.0,12.0,1,1,1,0,0.0,2.0,0.0,2.3,8.0,1
8,2.0,3.0,7.0,0,38.0,25.0,11.0,1,1,1,0,1.0,0.0,1.0,5.0,12.0,1
9,1.0,5.7,3.971739,0,40.0,4.0,11.0,2,0,1,0,1.0,0.0,0.0,5.0,14.0,1


In [42]:
#feature selection - remove least variance
L = pd.DataFrame(data.drop(columns='Agreement').var())
L = L.sort_values(by=[0], ascending=False)
L

Unnamed: 0,0
stby_pay,29.054511
shift_diff,11.062212
hours.hrs,5.60589
wage3.wage,1.869273
wage1.wage,1.844987
holidays,1.47372
wage2.wage,1.088808
vacation,0.588972
pension,0.539474
empl_hplan,0.536341


In [43]:
#dropping values with variance < 0.15(variance < 0.15 means the data is constant for more than 85% of the data)
data = data.drop(columns=['lngtrm_disabil.boolean','bereavement.boolea'])
data

Unnamed: 0,Duration,wage1.wage,wage2.wage,cola,hours.hrs,shift_diff,holidays,vacation,dntl_ins,empl_hplan,pension,educ_allw.boolean,wage3.wage,stby_pay,Agreement
0,1.0,5.0,3.971739,0,40.0,2.0,11.0,0,1,0,2.0,0.0,2.0,8.0,1
1,2.0,4.5,5.8,0,35.0,4.870968,11.0,1,0,0,2.0,1.0,4.0,14.0,1
2,2.0,3.803571,3.971739,0,38.0,5.0,11.0,2,1,1,0.0,0.0,5.0,10.0,1
3,3.0,3.7,4.0,1,38.0,4.870968,11.09434,1,1,0,0.0,1.0,5.0,14.0,1
4,3.0,4.5,4.5,0,40.0,4.870968,12.0,0,1,1,0.0,0.0,5.0,14.0,1
5,2.0,2.0,2.5,0,35.0,6.0,12.0,0,1,0,0.0,1.0,4.0,2.0,1
6,3.0,4.0,5.0,1,38.0,4.870968,12.0,2,2,1,0.0,1.0,5.0,10.0,1
7,3.0,6.9,4.8,0,40.0,3.0,12.0,1,1,0,2.0,0.0,2.3,8.0,1
8,2.0,3.0,7.0,0,38.0,25.0,11.0,1,1,0,0.0,1.0,5.0,12.0,1
9,1.0,5.7,3.971739,0,40.0,4.0,11.0,2,0,0,0.0,0.0,5.0,14.0,1


In [44]:
#feature scaling
scaler = MinMaxScaler()
scaler.fit(data)
D_transformed = scaler.transform(data)

In [45]:
D_transformed = pd.DataFrame(D_transformed, columns=list(data.columns.values))
D_transformed

Unnamed: 0,Duration,wage1.wage,wage2.wage,cola,hours.hrs,shift_diff,holidays,vacation,dntl_ins,empl_hplan,pension,educ_allw.boolean,wage3.wage,stby_pay,Agreement
0,0.0,0.6,0.394348,0.0,1.0,0.08,0.333333,0.0,0.5,0.0,1.0,0.0,0.0,0.5,1.0
1,0.5,0.5,0.76,0.0,0.615385,0.194839,0.333333,0.5,0.0,0.0,1.0,1.0,0.645161,1.0,1.0
2,0.5,0.360714,0.394348,0.0,0.846154,0.2,0.333333,1.0,0.5,0.5,0.0,0.0,0.967742,0.666667,1.0
3,1.0,0.34,0.4,0.5,0.846154,0.194839,0.349057,0.5,0.5,0.0,0.0,1.0,0.967742,1.0,1.0
4,1.0,0.5,0.5,0.0,1.0,0.194839,0.5,0.0,0.5,0.5,0.0,0.0,0.967742,1.0,1.0
5,0.5,0.0,0.1,0.0,0.615385,0.24,0.5,0.0,0.5,0.0,0.0,1.0,0.645161,0.0,1.0
6,1.0,0.4,0.6,0.5,0.846154,0.194839,0.5,1.0,1.0,0.5,0.0,1.0,0.967742,0.666667,1.0
7,1.0,0.98,0.56,0.0,1.0,0.12,0.5,0.5,0.5,0.0,1.0,0.0,0.096774,0.5,1.0
8,0.5,0.2,1.0,0.0,0.846154,1.0,0.333333,0.5,0.5,0.0,0.0,1.0,0.967742,0.833333,1.0
9,0.0,0.74,0.394348,0.0,1.0,0.16,0.333333,1.0,0.0,0.0,0.0,0.0,0.967742,1.0,1.0


In [46]:
X = D_transformed.drop(columns=['Agreement']).values
y = D_transformed['Agreement'].values

In [52]:
#SVM
clf = SVC(kernel="rbf", C=1,random_state=0)
scores = cross_val_score(clf, X, y, cv=10)
scores
#finding default gamma
gamma = 1 / (X.shape[1] * X.var())
gamma

0.49840619749805587

In [53]:
#The mean score and the standard deviation are hence given by:
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.93 accuracy with a standard deviation of 0.09


In [62]:
hparams = {"C": (0.01, 100), "gamma": (0.001, 1)}
def estimator(C, gamma):
    # initialize model
    model = SVC(C=C, gamma=gamma, degree=1, random_state=0)
    # set in cross-validation
    result = cross_validate(model, X, y, cv=10)
    # result is mean of test_score
    return np.mean(result['test_score'])
from bayes_opt import BayesianOptimization
# give model and hyperparameter to optmizer
svc_bayesopt = BayesianOptimization(estimator, hparams)

In [68]:
svc_bayesopt.maximize(init_points=5, n_iter=50, acq='ucb')
svc_bayesopt.max

|   iter    |  target   |     C     |   gamma   |
-------------------------------------------------
| [0m221      [0m | [0m0.8733   [0m | [0m95.95    [0m | [0m0.8181   [0m |
| [0m222      [0m | [0m0.8733   [0m | [0m70.53    [0m | [0m0.8457   [0m |
| [0m223      [0m | [0m0.8933   [0m | [0m79.57    [0m | [0m0.5203   [0m |
| [0m224      [0m | [0m0.8733   [0m | [0m27.15    [0m | [0m0.3981   [0m |
| [0m225      [0m | [0m0.89     [0m | [0m5.477    [0m | [0m0.25     [0m |
| [0m226      [0m | [0m0.9267   [0m | [0m1.456    [0m | [0m0.1618   [0m |
| [0m227      [0m | [0m0.91     [0m | [0m1.537    [0m | [0m0.06811  [0m |
| [0m228      [0m | [0m0.9267   [0m | [0m1.32     [0m | [0m0.5518   [0m |
| [0m229      [0m | [0m0.9267   [0m | [0m0.9377   [0m | [0m0.6229   [0m |
| [0m230      [0m | [0m0.91     [0m | [0m1.478    [0m | [0m0.2893   [0m |
| [0m231      [0m | [0m0.91     [0m | [0m4.173    [0m | [0m0.3844   [0m 

{'target': 0.9466666666666667,
 'params': {'C': 1.0912305920913907, 'gamma': 0.20025095351096645}}

In [69]:
#SVM
#bestparms: C=1, gamma=0.2167860675976215
clf = SVC(kernel="rbf", C=1,random_state=0, gamma=0.2167860675976215)
scores = cross_val_score(clf, X, y, cv=10)
scores
#finding default gamma
gamma = 1 / (X.shape[1] * X.var())
gamma

0.49840619749805587

In [70]:
#The mean score and the standard deviation are hence given by:
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.95 accuracy with a standard deviation of 0.08


In [71]:
#DT
clf = DecisionTreeClassifier(criterion='gini',max_depth=4,random_state=0)
scores = cross_val_score(clf, X, y, cv=10)
scores

array([1.        , 0.83333333, 0.83333333, 0.83333333, 0.83333333,
       1.        , 0.66666667, 0.8       , 0.8       , 0.8       ])

In [72]:
#The mean score and the standard deviation are hence given by:
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.84 accuracy with a standard deviation of 0.09


In [73]:
hparams = {"min_samples_leaf": (0.1, 0.5), "min_samples_split": (0.1, 1.0)}
def estimator(min_samples_leaf, min_samples_split):
    # initialize model
    model = DecisionTreeClassifier(min_samples_leaf=min_samples_leaf, min_samples_split=min_samples_split)
    # set in cross-validation
    result = cross_validate(model, X, y, cv=10)
    # result is mean of test_score
    return np.mean(result['test_score'])
from bayes_opt import BayesianOptimization
# give model and hyperparameter to optmizer
dt_bayesopt = BayesianOptimization(estimator, hparams)

In [74]:
dt_bayesopt.maximize(init_points=5, n_iter=50, acq='ucb')
dt_bayesopt.max

|   iter    |  target   | min_sa... | min_sa... |
-------------------------------------------------
| [0m1        [0m | [0m0.91     [0m | [0m0.4063   [0m | [0m0.2951   [0m |
| [0m2        [0m | [0m0.91     [0m | [0m0.4051   [0m | [0m0.6459   [0m |
| [0m3        [0m | [0m0.79     [0m | [0m0.4761   [0m | [0m0.6006   [0m |
| [95m4        [0m | [95m0.9467   [0m | [95m0.3023   [0m | [95m0.5558   [0m |
| [0m5        [0m | [0m0.9467   [0m | [0m0.1901   [0m | [0m0.1006   [0m |
| [0m6        [0m | [0m0.9467   [0m | [0m0.2747   [0m | [0m0.476    [0m |
| [0m7        [0m | [0m0.9467   [0m | [0m0.2442   [0m | [0m0.5312   [0m |
| [0m8        [0m | [0m0.86     [0m | [0m0.4564   [0m | [0m0.5968   [0m |
| [0m9        [0m | [0m0.9467   [0m | [0m0.3014   [0m | [0m0.5524   [0m |
| [0m10       [0m | [0m0.84     [0m | [0m0.4691   [0m | [0m0.1011   [0m |
| [0m11       [0m | [0m0.9467   [0m | [0m0.2305   [0m | [0m0.4866   

{'target': 0.9466666666666667,
 'params': {'min_samples_leaf': 0.3022996207416178,
  'min_samples_split': 0.5557506017577213}}

In [77]:
clf_t = DecisionTreeClassifier()
hparams = {"criterion": ["gini", "entropy", "log_loss"], 
           "max_depth": [1,2,8,16,32],
          "min_samples_split": [0.1,0.5557506017577213,0.5],
          "min_samples_leaf":[0.1,0.3022996207416178,0.5]}

In [80]:
GS_estimator = GridSearchCV(clf_t, hparams, cv=10, scoring="accuracy")
GS_estimator.fit(X, y)
GS_estimator.best_params_

{'criterion': 'gini',
 'max_depth': 1,
 'min_samples_leaf': 0.1,
 'min_samples_split': 0.1}

In [83]:
#DT
#{'criterion': 'gini','max_depth': 1,'min_samples_leaf': 0.1,'min_samples_split': 0.1}
clf = DecisionTreeClassifier(criterion='gini',max_depth=1,random_state=0, min_samples_leaf=0.1, min_samples_split=0.1)
scores = cross_val_score(clf, X, y, cv=10)
scores

array([1.        , 0.83333333, 1.        , 1.        , 0.83333333,
       1.        , 1.        , 1.        , 0.8       , 1.        ])

In [84]:
#The mean score and the standard deviation are hence given by:
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.95 accuracy with a standard deviation of 0.08


In [85]:
#k-neighbors
clf = KNeighborsClassifier()
scores = cross_val_score(clf, X, y, cv=10)
scores

array([0.83333333, 0.83333333, 1.        , 1.        , 0.83333333,
       1.        , 1.        , 0.6       , 0.8       , 1.        ])

In [86]:
#The mean score and the standard deviation are hence given by:
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.89 accuracy with a standard deviation of 0.13


In [87]:
#List Hyperparameters that we want to tune.
leaf_size = [1,5,10,15,20,25,30,35,40,45,50]
n_neighbors = [1,5,10,15,20,25,30]
p=[1,2]
weights = ["uniform", "distance"]
algorithm = ["auto", "ball_tree", "kd_tree", "brute"]
#Convert to dictionary
hyperparameters = dict(leaf_size=leaf_size, n_neighbors=n_neighbors, p=p, weights=weights, algorithm=algorithm)
#Create new KNN object
knn_2 = KNeighborsClassifier()
#Use GridSearch
clf_t = GridSearchCV(knn_2, hyperparameters, cv=10)

In [88]:
#Fit the model
best_model = clf_t.fit(X,y)

In [89]:
print("leaf-size:", best_model.best_estimator_.get_params()['leaf_size'],
     "\np:", best_model.best_estimator_.get_params()['p'],
     "\nn_neighbors:", best_model.best_estimator_.get_params()['n_neighbors'],
     "\nweights: ", best_model.best_estimator_.get_params()['weights'],
     "\nalgorithm: ",best_model.best_estimator_.get_params()['algorithm']) 

leaf-size: 1 
p: 1 
n_neighbors: 20 
weights:  uniform 
algorithm:  auto


In [93]:
#k-neighbors
clf = KNeighborsClassifier(leaf_size=1, p=1, n_neighbors=20, weights='uniform', algorithm='auto')
scores = cross_val_score(clf, X, y, cv=10)
scores

array([1. , 1. , 1. , 1. , 1. , 1. , 1. , 0.8, 0.8, 1. ])

In [94]:
#The mean score and the standard deviation are hence given by:
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.96 accuracy with a standard deviation of 0.08


In [96]:
#Random forest
clf = RandomForestClassifier()
scores = cross_val_score(clf, X, y, cv=10)
scores

array([1.        , 0.83333333, 1.        , 1.        , 0.83333333,
       1.        , 1.        , 0.8       , 0.8       , 1.        ])

In [97]:
#The mean score and the standard deviation are hence given by:
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.93 accuracy with a standard deviation of 0.09


In [98]:
hparams = {"min_samples_leaf": (0.1, 0.5), "min_samples_split": (0.1, 1.0)}
def estimator(min_samples_leaf, min_samples_split):
    # initialize model
    model = RandomForestClassifier(min_samples_leaf=min_samples_leaf, min_samples_split=min_samples_split)
    # set in cross-validation
    result = cross_validate(model, X, y, cv=10)
    # result is mean of test_score
    return np.mean(result['test_score'])
from bayes_opt import BayesianOptimization
# give model and hyperparameter to optmizer
rf_bayesopt = BayesianOptimization(estimator, hparams)

In [99]:
rf_bayesopt.maximize(init_points=5, n_iter=50, acq='ucb')
rf_bayesopt.max

|   iter    |  target   | min_sa... | min_sa... |
-------------------------------------------------
| [0m1        [0m | [0m0.6467   [0m | [0m0.3686   [0m | [0m0.4418   [0m |
| [95m2        [0m | [95m0.92     [0m | [95m0.2027   [0m | [95m0.2054   [0m |
| [0m3        [0m | [0m0.6467   [0m | [0m0.2891   [0m | [0m0.7767   [0m |
| [0m4        [0m | [0m0.6467   [0m | [0m0.3772   [0m | [0m0.8703   [0m |
| [0m5        [0m | [0m0.6467   [0m | [0m0.3812   [0m | [0m0.2124   [0m |
| [0m6        [0m | [0m0.91     [0m | [0m0.1599   [0m | [0m0.2026   [0m |
| [95m7        [0m | [95m0.96     [0m | [95m0.1914   [0m | [95m0.1      [0m |
| [0m8        [0m | [0m0.8733   [0m | [0m0.2416   [0m | [0m0.5685   [0m |
| [0m9        [0m | [0m0.6467   [0m | [0m0.4936   [0m | [0m0.9718   [0m |
| [0m10       [0m | [0m0.6467   [0m | [0m0.38     [0m | [0m0.1774   [0m |
| [0m11       [0m | [0m0.8867   [0m | [0m0.1923   [0m | [0m0.1027

{'target': 0.9666666666666668,
 'params': {'min_samples_leaf': 0.16189615898299237,
  'min_samples_split': 0.201222813897077}}

In [108]:
#{'target': 0.9666666666666668,'params': {'min_samples_leaf': 0.16189615898299237,'min_samples_split': 0.201222813897077}}
clf_rf = RandomForestClassifier()
hparams_rf = {"criterion": ["gini", "entropy", "log_loss"], 
           "max_depth": [1,2,8,16,32],
          "min_samples_split": [0.1,0.201222813897077,0.5],
          "min_samples_leaf":[0.1,0.16189615898299237,0.5]}

In [109]:
GS_estimator = GridSearchCV(clf_rf, hparams_rf, cv=10, scoring="accuracy")
GS_estimator.fit(X, y)
print(GS_estimator.best_params_)

{'criterion': 'log_loss', 'max_depth': 8, 'min_samples_leaf': 0.16189615898299237, 'min_samples_split': 0.5}


In [111]:
#Random forest
#{'criterion': 'log_loss', 'max_depth': 8, 'min_samples_leaf': 0.16189615898299237, 'min_samples_split': 0.5}
#
clf = RandomForestClassifier(criterion='log_loss',max_depth=8,min_samples_leaf=0.16189615898299237, min_samples_split=0.5)
scores = cross_val_score(clf, X, y, cv=10)
scores

array([1.        , 1.        , 1.        , 1.        , 0.83333333,
       1.        , 1.        , 0.8       , 0.8       , 1.        ])

In [112]:
#The mean score and the standard deviation are hence given by:
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.94 accuracy with a standard deviation of 0.09


In [118]:
#MLP
clf = MLPClassifier(max_iter=700)
scores = cross_val_score(clf, X, y, cv=10)
scores

array([0.83333333, 0.83333333, 0.83333333, 1.        , 0.83333333,
       1.        , 1.        , 0.8       , 0.8       , 1.        ])

In [119]:
#The mean score and the standard deviation are hence given by:
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.89 accuracy with a standard deviation of 0.09


In [127]:
mlp = MLPClassifier(max_iter=3000)
parameter_space = {
    'activation': ['logistic','tanh', 'relu'],
    'solver': ['lbfgs','sgd', 'adam'],
    'alpha': [0.0001, 0.05, 0.01],
    'learning_rate': ['constant','adaptive'],
}
clf_t = GridSearchCV(mlp, parameter_space, n_jobs=-1, cv=10)
clf_t.fit(X, y)
clf_t.best_params_

{'activation': 'relu',
 'alpha': 0.0001,
 'learning_rate': 'adaptive',
 'solver': 'lbfgs'}

In [130]:
#MLP
clf = MLPClassifier(max_iter=3000, activation='relu', alpha=0.0001, learning_rate='adaptive',solver='sgd')
scores = cross_val_score(clf, X, y, cv=10)
scores

array([1.        , 0.83333333, 1.        , 1.        , 0.83333333,
       1.        , 1.        , 0.8       , 0.8       , 1.        ])

In [131]:
#The mean score and the standard deviation are hence given by:
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.93 accuracy with a standard deviation of 0.09


In [135]:
#GradientBoostingClassifier
clf = GradientBoostingClassifier()
scores = cross_val_score(clf, X, y, cv=10)
scores

array([1.        , 0.83333333, 0.83333333, 0.66666667, 0.83333333,
       1.        , 0.83333333, 0.8       , 0.8       , 0.8       ])

In [136]:
#The mean score and the standard deviation are hence given by:
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.84 accuracy with a standard deviation of 0.09


In [134]:
gbc = GradientBoostingClassifier()
parameters = {
    "n_estimators":[5,50,250,500],
    "max_depth":[1,3,5,7,9],
    "learning_rate":[0.01,0.1,1,10,100]
}
clf_t = GridSearchCV(gbc,parameters,cv=10)
clf_t.fit(X,y)
clf_t.best_params_

{'learning_rate': 0.01, 'max_depth': 1, 'n_estimators': 50}

In [137]:
#GradientBoostingClassifier
clf = GradientBoostingClassifier(n_estimators=50, learning_rate=0.01,max_depth=1, random_state=0)
scores = cross_val_score(clf, X, y, cv=10)
scores

array([1.        , 0.83333333, 1.        , 1.        , 0.83333333,
       1.        , 1.        , 1.        , 0.8       , 1.        ])

In [138]:
#The mean score and the standard deviation are hence given by:
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.95 accuracy with a standard deviation of 0.08
