## Machine Learning Example - BayesSearchCV with Titanic Dataset

- Name: Germán Hilgert
- LinkedIn: https://www.linkedin.com/in/german-hilgert/
- Date: 23/10/22

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [26]:
# https://www.kaggle.com/code/ldfreeman3/a-data-science-framework-to-achieve-99-accuracy
df = sns.load_dataset('titanic')
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [28]:
#limpieza
df['age'].fillna(df['age'].median(), inplace = True)
df['embarked'].fillna(df['embarked'].mode()[0], inplace = True)
df['fare'].fillna(df['fare'].median(), inplace = True)

In [30]:
# ingenieria
df['FamilySize'] = df['sibsp'] + df['parch'] + 1
df['IsAlone'] = 1
df['IsAlone'].loc[df['FamilySize'] > 1] = 0

df['FareBin'] = pd.qcut(df['fare'], 4)
df['AgeBin'] = pd.cut(df['age'].astype(int), 5)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['IsAlone'].loc[df['FamilySize'] > 1] = 0


In [34]:
# codificación
from sklearn.preprocessing import LabelEncoder
label = LabelEncoder()
   
df['Sex_Code'] = label.fit_transform(df['sex'])
df['Embarked_Code'] = label.fit_transform(df['embarked'])
df['AgeBin_Code'] = label.fit_transform(df['AgeBin'])
df['FareBin_Code'] = label.fit_transform(df['FareBin'])

In [35]:
df.shape

(891, 23)

In [36]:
df.isnull().sum()

survived           0
pclass             0
sex                0
age                0
sibsp              0
parch              0
fare               0
embarked           0
class              0
who                0
adult_male         0
deck             688
embark_town        2
alive              0
alone              0
FamilySize         0
IsAlone            0
FareBin            0
AgeBin             0
Sex_Code           0
Embarked_Code      0
AgeBin_Code        0
FareBin_Code       0
dtype: int64

In [37]:
df.drop(columns=['deck', 'embark_town'], inplace=True)

In [38]:
df.isnull().sum()

survived         0
pclass           0
sex              0
age              0
sibsp            0
parch            0
fare             0
embarked         0
class            0
who              0
adult_male       0
alive            0
alone            0
FamilySize       0
IsAlone          0
FareBin          0
AgeBin           0
Sex_Code         0
Embarked_Code    0
AgeBin_Code      0
FareBin_Code     0
dtype: int64

In [39]:
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,...,alive,alone,FamilySize,IsAlone,FareBin,AgeBin,Sex_Code,Embarked_Code,AgeBin_Code,FareBin_Code
0,0,3,male,22.0,1,0,7.2500,S,Third,man,...,no,False,2,0,"(-0.001, 7.91]","(16.0, 32.0]",1,2,1,0
1,1,1,female,38.0,1,0,71.2833,C,First,woman,...,yes,False,2,0,"(31.0, 512.329]","(32.0, 48.0]",0,0,2,3
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,...,yes,True,1,1,"(7.91, 14.454]","(16.0, 32.0]",0,2,1,1
3,1,1,female,35.0,1,0,53.1000,S,First,woman,...,yes,False,2,0,"(31.0, 512.329]","(32.0, 48.0]",0,2,2,3
4,0,3,male,35.0,0,0,8.0500,S,Third,man,...,no,True,1,1,"(7.91, 14.454]","(32.0, 48.0]",1,2,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,...,no,True,1,1,"(7.91, 14.454]","(16.0, 32.0]",1,2,1,1
887,1,1,female,19.0,0,0,30.0000,S,First,woman,...,yes,True,1,1,"(14.454, 31.0]","(16.0, 32.0]",0,2,1,2
888,0,3,female,28.0,1,2,23.4500,S,Third,woman,...,no,False,4,0,"(14.454, 31.0]","(16.0, 32.0]",0,2,1,2
889,1,1,male,26.0,0,0,30.0000,C,First,man,...,yes,True,1,1,"(14.454, 31.0]","(16.0, 32.0]",1,0,1,2


In [40]:
df.columns

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'alive', 'alone',
       'FamilySize', 'IsAlone', 'FareBin', 'AgeBin', 'Sex_Code',
       'Embarked_Code', 'AgeBin_Code', 'FareBin_Code'],
      dtype='object')

In [52]:
X = df[['pclass', 'Sex_Code', 'age', 'Embarked_Code', 'AgeBin_Code', 'FareBin_Code', 'IsAlone', 'fare']]
y = df['survived']

In [53]:
from sklearn.model_selection import train_test_split
# Dividimos los datos en Train y Test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [54]:
from lightgbm import LGBMClassifier
lgb = LGBMClassifier()

In [55]:
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

In [56]:
espacio = {"n_estimators": Integer(10, 3000),
           'max_depth':Integer(1, 40),
           'num_leaves': Integer(2, 500),
           'learning_rate': Real(0.0001, 0.3, prior='uniform'),
           'lambda_l1': Real(1e-8, 10.0, prior='log-uniform'),
           'lambda_l2': Real(1e-8, 10.0, prior='log-uniform'),
           'num_leaves': Integer(2, 256),
           'feature_fraction': Real(0.4, 1.0, prior='uniform'),
           'bagging_fraction': Real(0.4, 1.0, prior='uniform'),
           'bagging_freq': Integer(1, 7),
           'min_child_samples': Integer(5, 100),
           'subsample':Real(0.2, 1, prior='uniform')}

In [64]:
model = BayesSearchCV(lgb, search_spaces=espacio, n_iter=50, n_points=5, n_jobs=-1, cv=5, verbose=0)

In [65]:
%%time
model.fit(X_train, y_train)

CPU times: total: 3min 1s
Wall time: 43 s


BayesSearchCV(cv=5, estimator=LGBMClassifier(), n_jobs=-1, n_points=5,
              search_spaces={'bagging_fraction': Real(low=0.4, high=1.0, prior='uniform', transform='normalize'),
                             'bagging_freq': Integer(low=1, high=7, prior='uniform', transform='normalize'),
                             'feature_fraction': Real(low=0.4, high=1.0, prior='uniform', transform='normalize'),
                             'lambda_l1': Real(low=1e-08, high=1...
                             'max_depth': Integer(low=1, high=40, prior='uniform', transform='normalize'),
                             'min_child_samples': Integer(low=5, high=100, prior='uniform', transform='normalize'),
                             'n_estimators': Integer(low=10, high=3000, prior='uniform', transform='normalize'),
                             'num_leaves': Integer(low=2, high=256, prior='uniform', transform='normalize'),
                             'subsample': Real(low=0.2, high=1, prior='uniform'

In [66]:
model.best_params_

OrderedDict([('bagging_fraction', 0.7566955049260979),
             ('bagging_freq', 5),
             ('feature_fraction', 0.4344150893988227),
             ('lambda_l1', 1.4757253809467351e-06),
             ('lambda_l2', 1.3225339682617491e-07),
             ('learning_rate', 0.1516599246227219),
             ('max_depth', 3),
             ('min_child_samples', 53),
             ('n_estimators', 456),
             ('num_leaves', 34),
             ('subsample', 0.2336392046170124)])

In [67]:
model.best_score_

0.8338233643811019

In [68]:
clf = model.best_estimator_

In [69]:
y_pred = clf.predict(X_test)

In [70]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.8116591928251121