In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

In [2]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [3]:
train = pd.read_csv('/content/drive/MyDrive/DeepLearningCourse/titanic_train.csv')
test = pd.read_csv('/content/drive/MyDrive/DeepLearningCourse/titanic_reserved.csv')

In [4]:
train = train.loc[:, train.columns != 'ticket']
test = test.loc[:, test.columns != 'ticket']

In [5]:
ed_train = pd.DataFrame()
rows_num = train.shape[0] / 3

for col in train.columns:
  if train[col].isna().sum() <= rows_num:
    ed_train[col] = train[col]

ed_test = pd.DataFrame()
rows_num = test.shape[0] / 3

for col in test.columns:
  if test[col].isna().sum() <= rows_num:
    ed_test[col] = test[col]

In [6]:
ed_train['fam_size'] = ed_train['sibsp'] + ed_train['parch']

ed_train = ed_train.loc[:, ed_train.columns != 'sibsp']
ed_train = ed_train.loc[:, ed_train.columns != 'parch']

ed_test['fam_size'] = ed_test['sibsp'] + ed_test['parch']

ed_test = ed_test.loc[:, ed_test.columns != 'sibsp']
ed_test = ed_test.loc[:, ed_test.columns != 'parch']

In [7]:
ed_train['honorific'] = list(ed_train['name'].str.extract('([A-Za-z]+)\.')[0])

ed_test['honorific'] = list(ed_test['name'].str.extract('([A-Za-z]+)\.')[0])

In [8]:
ed_train['honorific'].\
replace(['Mlle', 'Rev', 'Ms' ,  'Col', 'Dona', 'Dr', 'Countess', 'Major', 'Don', 'Capt'],\
        ['Miss', 'Mr' , 'Miss', 'Mr' ,  'Mrs' ,  'Mr',    'Mrs'  ,  'Mr',  'Mr',  'Mr' ], inplace=True)

ed_test['honorific'].\
replace(['Mlle', 'Rev', 'Ms' ,  'Col', 'Dona', 'Dr', 'Countess', 'Major', 'Don', 'Capt', 'Sir', 'Lady', 'Mme', 'Jonkheer'],\
        ['Miss', 'Mr' , 'Miss', 'Mr' ,  'Mrs' ,  'Mr',    'Mrs'  ,  'Mr',  'Mr',  'Mr',  'Mr' ,  'Mrs', 'Mrs', 'Mr'], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  ed_train['honorific'].\
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  ed_test['honorific'].\


In [9]:
ed_train['honorific'].unique()

array(['Miss', 'Mrs', 'Mr', 'Master'], dtype=object)

In [10]:
ed_test['honorific'].unique()

array(['Mr', 'Miss', 'Master', 'Mrs'], dtype=object)

In [11]:
age_train = dict(ed_train.groupby('honorific')['age'].mean())

age_test = dict(ed_test.groupby('honorific')['age'].mean())

In [12]:
age_train

{'Master': 5.128205128205129,
 'Miss': 22.005765408805033,
 'Mr': 32.90043763676149,
 'Mrs': 36.98425196850393}

In [13]:
age_test

{'Master': 6.470235714285714,
 'Miss': 21.290124074074075,
 'Mr': 32.53691275167785,
 'Mrs': 36.93617021276596}

In [14]:
train_hmeans = ed_train.copy()
train_hmeans.loc[(train_hmeans['age'].isna()) & (train_hmeans['honorific'] == 'Master'), 'age'] = age_train['Master']
train_hmeans.loc[(train_hmeans['age'].isna()) & (train_hmeans['honorific'] == 'Miss'), 'age'] = age_train['Miss']
train_hmeans.loc[(train_hmeans['age'].isna()) & (train_hmeans['honorific'] == 'Mr'), 'age'] = age_train['Mr']
train_hmeans.loc[(train_hmeans['age'].isna()) & (train_hmeans['honorific'] == 'Mrs'), 'age'] = age_train['Mrs']

In [15]:
test_hmeans = ed_test.copy()
test_hmeans.loc[(test_hmeans['age'].isna()) & (test_hmeans['honorific'] == 'Master'), 'age'] = age_test['Master']
test_hmeans.loc[(test_hmeans['age'].isna()) & (test_hmeans['honorific'] == 'Miss'), 'age'] = age_test['Miss']
test_hmeans.loc[(test_hmeans['age'].isna()) & (test_hmeans['honorific'] == 'Mr'), 'age'] = age_test['Mr']
test_hmeans.loc[(test_hmeans['age'].isna()) & (test_hmeans['honorific'] == 'Mrs'), 'age'] = age_test['Mrs']

In [16]:
surv = train_hmeans['survived']

train_hmeans = train_hmeans.drop(columns=['name', 'honorific', 'survived'])
train_hmeans = pd.get_dummies(train_hmeans, drop_first=True)

test_hmeans = test_hmeans.drop(columns=['name', 'honorific'])
test_hmeans = pd.get_dummies(test_hmeans, drop_first=True)

In [17]:
log_reg = LogisticRegression(max_iter=3000, penalty='l2', random_state=13).fit(train_hmeans, surv)

In [18]:
from sklearn.model_selection import GridSearchCV

In [19]:
parameter_grid = {
    'C': np.logspace(-5, 1),
    'penalty': ['l1', 'l2']
}
grid_searcher = GridSearchCV(estimator = LogisticRegression(solver='liblinear', max_iter=3000, random_state=13),
                             param_grid=parameter_grid,
                             cv=5,
                             scoring='f1_micro',
                             n_jobs=-1
                            )

In [20]:
%%time
grid_searcher.fit(train_hmeans, surv)

CPU times: user 513 ms, sys: 106 ms, total: 619 ms
Wall time: 8.85 s


Можно увидеть все результаты поиска по сетке

In [21]:
pd.DataFrame(grid_searcher.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_penalty,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.007586,0.000955,0.008865,0.000647,0.000010,l1,"{'C': 1e-05, 'penalty': 'l1'}",0.619289,0.617347,0.617347,0.617347,0.617347,0.617735,0.000777,87
1,0.007221,0.000181,0.008331,0.000418,0.000010,l2,"{'C': 1e-05, 'penalty': 'l2'}",0.644670,0.663265,0.678571,0.678571,0.632653,0.659546,0.018343,71
2,0.006947,0.000648,0.008195,0.000420,0.000013,l1,"{'C': 1.3257113655901082e-05, 'penalty': 'l1'}",0.619289,0.617347,0.617347,0.617347,0.617347,0.617735,0.000777,87
3,0.007157,0.000735,0.008545,0.000671,0.000013,l2,"{'C': 1.3257113655901082e-05, 'penalty': 'l2'}",0.644670,0.663265,0.673469,0.678571,0.632653,0.658526,0.017372,78
4,0.007045,0.000335,0.009189,0.001315,0.000018,l1,"{'C': 1.757510624854793e-05, 'penalty': 'l1'}",0.619289,0.617347,0.617347,0.617347,0.617347,0.617735,0.000777,87
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.009826,0.002453,0.009816,0.003177,5.689866,l2,"{'C': 5.689866029018293, 'penalty': 'l2'}",0.822335,0.760204,0.765306,0.795918,0.760204,0.780794,0.024666,8
96,0.020588,0.004665,0.014277,0.004177,7.543120,l1,"{'C': 7.543120063354607, 'penalty': 'l1'}",0.822335,0.760204,0.760204,0.795918,0.755102,0.778753,0.026236,18
97,0.009689,0.003157,0.007261,0.000269,7.543120,l2,"{'C': 7.543120063354607, 'penalty': 'l2'}",0.822335,0.760204,0.765306,0.795918,0.755102,0.779773,0.025585,14
98,0.019144,0.008275,0.011073,0.004587,10.000000,l1,"{'C': 10.0, 'penalty': 'l1'}",0.822335,0.760204,0.760204,0.795918,0.755102,0.778753,0.026236,18


Лучшие параметры модели:

In [22]:
grid_searcher.best_params_

{'C': 1.0481131341546852, 'penalty': 'l1'}

Лучший скор на кросс-валидации:

In [23]:
grid_searcher.best_score_

0.7828291722780483

Лучшая модель

In [24]:
lr = grid_searcher.best_estimator_
lr

In [27]:
print(list(lr.predict(test_hmeans)))

[0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0]
