# Выжившие на Титанике

Гибель «Титаника» — одно из самых печально известных кораблекрушений в истории.

15 апреля 1912 года во время своего первого рейса «Титаник», считавшийся «непотопляемым», затонул после столкновения с айсбергом. К сожалению, спасательных шлюпок на всех на борту не хватило, в результате чего из 2224 пассажиров и членов экипажа погибли 1502 человека.

Хотя в выживании был некоторый элемент удачи, кажется, что у некоторых групп людей было больше шансов выжить, чем у других.

Используя данные о пассажирах, построим прогностическую модель, отвечающую на вопрос: «У каких людей больше шансов выжить?».

# Этап 1

---



### Подготовка


In [7]:
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [8]:
df_train = pd.read_csv('/content/train.csv')
df_train.info()
display(df_train.describe())
display(df_train.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [9]:
df_test0 = pd.read_csv('/content/test.csv')
df_test0.info()
display(df_test0.describe())
display(df_test0.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,3.0,39.0,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [10]:
df_train.columns = df_train.columns.str.lower()

In [11]:
df_train.columns

Index(['passengerid', 'survived', 'pclass', 'name', 'sex', 'age', 'sibsp',
       'parch', 'ticket', 'fare', 'cabin', 'embarked'],
      dtype='object')

In [12]:
df_test0.columns = df_test0.columns.str.lower()

### Удалим ненужные столбцы

Стоблцы 'passengerid', 'name', 'ticket', 'fare', 'embarked' не несут никакой информации для нашей модели. Столбец же 'cabin' нам нужен, однако все его значения уникальны, закодировать его не получится, но и агрегировать в более дифференцированные столбцы не выйдет. Удалим.

In [13]:
dropped_columns = ['passengerid', 'name', 'ticket', 'fare', 'embarked', 'cabin']
df_train = df_train.drop(dropped_columns, axis=1)
df_test = df_test0.drop(dropped_columns, axis=1)

In [14]:
df_train.columns

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch'], dtype='object')

### Обработка пропусков

In [15]:
skip_train = df_train.isna().sum()
display(skip_train)

survived      0
pclass        0
sex           0
age         177
sibsp         0
parch         0
dtype: int64

In [16]:
skip_train / df_train.shape[0]

survived    0.000000
pclass      0.000000
sex         0.000000
age         0.198653
sibsp       0.000000
parch       0.000000
dtype: float64

Взять данные, чтобы заполнить пропуски неоткуда, но и удалить я их не могу, так как их слишком много. Заполню медианным значением столбец 'age'.

In [17]:
df_train['age'] = df_train['age'].fillna(df_train['age'].median())

In [18]:
df_train.isna().sum()

survived    0
pclass      0
sex         0
age         0
sibsp       0
parch       0
dtype: int64

In [19]:
skip_test = df_test.isna().sum()
display(skip_test)

pclass     0
sex        0
age       86
sibsp      0
parch      0
dtype: int64

In [20]:
skip_test / df_test.shape[0]

pclass    0.000000
sex       0.000000
age       0.205742
sibsp     0.000000
parch     0.000000
dtype: float64

Заполним пропуски тем же способом.

In [21]:
df_test['age'] = df_test['age'].fillna(df_test['age'].median())

In [22]:
df_test.isna().sum()

pclass    0
sex       0
age       0
sibsp     0
parch     0
dtype: int64

### Обработка дубликатов

In [23]:
df_train.duplicated().sum()

372

In [24]:
df_test.duplicated().sum()

154

Дубликаты вполне допустимы, так как я удалил имена пассажиров.

 ### Промежуточный вывод

Изучил данные, аномалий обнаружено не было. Обработал пропуски и дубликаты.

# Этап 2

---



### Порядковое кодирование

In [25]:
columms_ordinal = ['sex']
encoder = OrdinalEncoder()
df_train[columms_ordinal] = pd.DataFrame(encoder.fit_transform(df_train[columms_ordinal]), columns=df_train[columms_ordinal].columns)
df_test[columms_ordinal] = pd.DataFrame(encoder.fit_transform(df_test[columms_ordinal]), columns=df_test[columms_ordinal].columns)

In [26]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   survived  891 non-null    int64  
 1   pclass    891 non-null    int64  
 2   sex       891 non-null    float64
 3   age       891 non-null    float64
 4   sibsp     891 non-null    int64  
 5   parch     891 non-null    int64  
dtypes: float64(2), int64(4)
memory usage: 41.9 KB


### Деление на выборки

In [27]:
STATE = 2802

In [28]:
features_train = df_train.drop('survived', axis=1)
target_train = df_train['survived']
features_train, features_valid, target_train, target_valid = train_test_split(features_train, target_train, test_size=0.2, random_state=STATE)
display(features_train.shape)
display(target_train.shape)
display(features_valid.shape)
display(target_valid.shape)

(712, 5)

(712,)

(179, 5)

(179,)

In [29]:
features_test = df_test[df_test.columns]
display(features_test.shape)

(418, 5)

### Стандартизация данных

In [30]:
numeric = ['pclass', 'age', 'sibsp', 'parch']

scaler = StandardScaler()
scaler.fit(features_train[numeric])
features_train = scaler.transform(features_train[numeric])
features_valid = scaler.transform(features_valid[numeric])
features_test = scaler.transform(features_test[numeric])

# Этап 3

---



## Модели машинного обучения

#### Логистическая регрессия

In [31]:
%%time

lr = LogisticRegression(random_state=STATE)

parametrs = {'C': range (1, 10),
              'max_iter': range (100, 1001, 50)}

rs = RandomizedSearchCV(lr, parametrs, random_state=STATE)
rs.fit(features_train, target_train)
display(rs.best_params_)

{'max_iter': 800, 'C': 4}

CPU times: user 359 ms, sys: 7.12 ms, total: 366 ms
Wall time: 397 ms


In [32]:
lr_best = LogisticRegression(max_iter=800, C=4, random_state=STATE)

In [33]:
lr_best.fit(features_train, target_train)
predictions = lr_best.predict(features_valid)
print('f1', f1_score(target_valid, predictions))

f1 0.5573770491803278


#### Дерево решений

In [34]:
%%time

dtc = DecisionTreeClassifier(random_state=STATE)

parametrs = {'criterion': ['gini', 'entropy', 'log_loss'],
             'max_depth': range(1, 51),
             'min_samples_leaf': range(1,11)}

rs = RandomizedSearchCV(dtc, parametrs, random_state=STATE)
rs.fit(features_train, target_train)
display(rs.best_params_)

{'min_samples_leaf': 6, 'max_depth': 28, 'criterion': 'gini'}

CPU times: user 217 ms, sys: 0 ns, total: 217 ms
Wall time: 263 ms


In [35]:
dtc_best = DecisionTreeClassifier(criterion='gini', max_depth=23, min_samples_split=4, random_state=STATE)

In [36]:
dtc_best.fit(features_train, target_train)
predictions = dtc_best.predict(features_valid)
print('f1', f1_score(target_valid, predictions))

f1 0.4793388429752066


#### Случайный лес

In [37]:
%%time

rfc = RandomForestClassifier(random_state=STATE)

parametrs = {'n_estimators': range(1, 1001, 50),
             'criterion': ['gini', 'entropy', 'log_loss'],
             'max_depth': range(1, 51),
             'min_samples_leaf': range(1,11)}

rs = RandomizedSearchCV(rfc, parametrs, random_state=STATE)
rs.fit(features_train, target_train)
display(rs.best_params_)

{'n_estimators': 901,
 'min_samples_leaf': 3,
 'max_depth': 30,
 'criterion': 'gini'}

CPU times: user 50.7 s, sys: 268 ms, total: 50.9 s
Wall time: 51.9 s


In [38]:
rfc_best = RandomForestClassifier(n_estimators=901, criterion='gini', max_depth=30, min_samples_leaf=3, random_state=STATE)

In [39]:
rfc_best.fit(features_train, target_train)
predictions = rfc_best.predict(features_valid)
print('f1', f1_score(target_valid, predictions))

f1 0.5511811023622047


#### Стохастический градиентный спуск

In [40]:
%%time

sgd = SGDClassifier(random_state=STATE)

parametrs = {'loss': ['hinge', 'log_loss', 'modified_huber', 'squared_hinge', 'perceptron', 'squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
             'penalty': ['l2', 'l1', 'elasticnet', None],
             'max_iter': range (100, 1001, 50)}

rs = RandomizedSearchCV(sgd, parametrs, random_state=STATE)
rs.fit(features_train, target_train)
display(rs.best_params_)

{'penalty': 'elasticnet', 'max_iter': 650, 'loss': 'huber'}

CPU times: user 316 ms, sys: 2.04 ms, total: 318 ms
Wall time: 317 ms


In [41]:
sgd_best = SGDClassifier(loss='huber', penalty='elasticnet', max_iter=650, random_state=STATE)

In [42]:
sgd_best.fit(features_train, target_train)
predictions = rfc_best.predict(features_valid)
print('f1', f1_score(target_valid, predictions))

f1 0.5511811023622047


#### LGBM

In [43]:
%%time

lgbm = LGBMClassifier()

parametrs = { 'n_estimators': range (100, 1001, 50),
              'max_depth': range (1, 51, 2),
              'max_leaves': range (2, 9, 2) }

rs = RandomizedSearchCV(lgbm, parametrs, random_state=STATE)
rs.fit(features_train, target_train)
display(rs.best_params_)

[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m
[LightGBM] [Info] Number of positive: 216, number of negative: 353
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 77
[LightGBM] [Info] Number of data points in the train set: 569, number of used features: 4
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.379613 -> initscore=-0.491190
[LightGBM] [Info] Start training from score -0.491190
[LightGBM] [Info] Number of positive: 216, number of negative: 353
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 75
[LightGBM] [Info] Number of data points in the train set: 569, number of used features: 4
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.379613 -> initscore=-0.491190
[LightGBM] [Info] Start training from score -0.491190
[LightGBM] [Info] Number of positive: 216, number of negative: 354
You can set `force_row_wi

{'n_estimators': 100, 'max_leaves': 4, 'max_depth': 3}

CPU times: user 25.2 s, sys: 2 s, total: 27.2 s
Wall time: 27.3 s


In [44]:
lgbm_best = LGBMClassifier(n_estimators=100, max_leaves=4, max_depth=3, random_state=STATE)

In [45]:
lgbm_best.fit(features_train, target_train)
predictions = rfc_best.predict(features_valid)
print('f1', f1_score(target_valid, predictions))

[LightGBM] [Info] Number of positive: 270, number of negative: 442
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 81
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 4
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.379213 -> initscore=-0.492888
[LightGBM] [Info] Start training from score -0.492888
f1 0.5511811023622047


Лучший показатель f1 у логистической регрессии, также результат получился за короткое время.

# Этап 4

---

In [46]:
lr_best.fit(features_train, target_train)
predictions = lr_best.predict(features_test)

# Этап 5

---

In [None]:
pred = pd.DataFrame({'Passengerid': df_test0['passengerid'], 'Survived': predictions})
pred.head()

In [None]:
pred.to_csv('pred.csv')