# Импорты

In [164]:
import pandas as pd
import numpy as np
import xgboost as xgb

from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone
from sklearn.ensemble import (RandomForestClassifier, 
                              ExtraTreesClassifier,
                              AdaBoostClassifier,
                              GradientBoostingClassifier)

# Установим сиды для рандома

In [165]:
SEED = 42

np.random.seed(SEED)

# Загружаем данные

In [166]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

Проверим, что все открылось корректно

In [167]:
print(train_df.shape)
train_df.head()

(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [168]:
print(test_df.shape)
test_df.head()

(418, 11)


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


Посмотрим пропуски/шум в данных

In [169]:
train_df.info()
print('_' * 40)
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
________________________________________
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Passenger

Числовые признаки

In [170]:
train_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [171]:
test_df.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,3.0,39.0,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


Категориальные

In [172]:
train_df.describe(include='O')

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Dooley, Mr. Patrick",male,347082,G6,S
freq,1,577,7,4,644


In [173]:
test_df.describe(include='O')

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,418,418,418,91,418
unique,418,2,363,76,3
top,"Peter, Master. Michael J",male,PC 17608,B57 B59 B63 B66,S
freq,1,266,5,3,270


Поищем корреляции

In [174]:
train_df[['Pclass', 'Survived']].groupby(['Pclass'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,Pclass,Survived
0,1,0.62963
1,2,0.472826
2,3,0.242363


In [175]:
train_df[["Sex", "Survived"]].groupby(['Sex'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,Sex,Survived
0,female,0.742038
1,male,0.188908


In [176]:
train_df[["SibSp", "Survived"]].groupby(['SibSp'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,SibSp,Survived
1,1,0.535885
2,2,0.464286
0,0,0.345395
3,3,0.25
4,4,0.166667
5,5,0.0
6,8,0.0


In [177]:
train_df[["Parch", "Survived"]].groupby(['Parch'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,Parch,Survived
3,3,0.6
1,1,0.550847
2,2,0.5
0,0,0.343658
5,5,0.2
4,4,0.0
6,6,0.0


# Обработка данных

Отбросим данные, которые не влияют на выживаемость

In [178]:
print('До', train_df.shape, test_df.shape)

train_df = train_df.drop(['Ticket', 'Cabin'], axis=1)
test_df = test_df.drop(['Ticket', 'Cabin'], axis=1)
combine_df = [train_df, test_df]  # Чтобы легче обрабатывать

print('После', train_df.shape, test_df.shape, combine_df[0].shape, combine_df[1].shape)

До (891, 12) (418, 11)
После (891, 10) (418, 9) (891, 10) (418, 9)


Вычленим статусы пассажиров

In [179]:
for dataset in combine_df:
    dataset['Title'] = dataset.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)

In [180]:
pd.crosstab(train_df['Title'], train_df['Sex'])

Sex,female,male
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Capt,0,1
Col,0,2
Countess,1,0
Don,0,1
Dr,1,6
Jonkheer,0,1
Lady,1,0
Major,0,2
Master,0,40
Miss,182,0


In [181]:
pd.crosstab(test_df['Title'], test_df['Sex'])

Sex,female,male
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Col,0,2
Dona,1,0
Dr,0,1
Master,0,21
Miss,78,0
Mr,0,240
Mrs,72,0
Ms,1,0
Rev,0,2


Уберем "редкие" звания

In [182]:
for dataset in combine_df:
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col',\
 	'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')

In [183]:
train_df[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()

Unnamed: 0,Title,Survived
0,Master,0.575
1,Miss,0.702703
2,Mr,0.156673
3,Mrs,0.793651
4,Rare,0.347826


Сохранив статус пассажира удалим информацию о его имени. И айди заодно

In [184]:
train_df = train_df.drop(['Name', 'PassengerId'], axis=1)
test_df = test_df.drop(['Name'], axis=1)
combine_df = [train_df, test_df]

train_df.shape, test_df.shape

((891, 9), (418, 9))

Переведем некоторые категориальные признаки в числовые

In [185]:
title_mapping = {'Mr': 1, 'Miss': 2, 'Mrs': 3, 'Master': 4, 'Rare': 5}

for dataset in combine_df:
    dataset['Sex'] = dataset['Sex'].map({'female': 0, 'male': 1}).astype(int)
    dataset['Title'] = dataset['Title'].map(title_mapping)
    
train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,0,3,1,22.0,1,0,7.25,S,1
1,1,1,0,38.0,1,0,71.2833,C,3
2,1,3,0,26.0,0,0,7.925,S,2
3,1,1,0,35.0,1,0,53.1,S,3
4,0,3,1,35.0,0,0,8.05,S,1


Заполним пропуски возраста

In [186]:
guess_ages = np.zeros((2,3))

for dataset in combine_df:
    for sex in range(2):
        for pclass in range(3):
            guess_df = dataset[(dataset['Sex'] == sex) & (dataset['Pclass'] == pclass + 1)]['Age'].dropna()
            
            age_guess = guess_df.median()
            
            guess_ages[sex, pclass] = int(age_guess)
            
    for sex in range(2):
        for pclass in range(3):
            dataset.loc[(dataset.Age.isnull()) & (dataset.Sex == sex) & (dataset.Pclass == pclass + 1), 'Age'] = guess_ages[sex, pclass]
    
    dataset['Age'] = dataset['Age'].astype(int)

print(train_df.Age.shape)
train_df.head()

(891,)


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,0,3,1,22,1,0,7.25,S,1
1,1,1,0,38,1,0,71.2833,C,3
2,1,3,0,26,0,0,7.925,S,2
3,1,1,0,35,1,0,53.1,S,3
4,0,3,1,35,0,0,8.05,S,1


Образуем группы по возрастам и сгруппируем 

In [187]:
train_df['AgeBand'] = pd.cut(train_df['Age'], 5)
train_df[['AgeBand', 'Survived']].groupby('AgeBand', as_index=False, observed=False).mean().sort_values(by='AgeBand', ascending=True)

Unnamed: 0,AgeBand,Survived
0,"(-0.08, 16.0]",0.55
1,"(16.0, 32.0]",0.337374
2,"(32.0, 48.0]",0.412037
3,"(48.0, 64.0]",0.434783
4,"(64.0, 80.0]",0.090909


Изменим возраст на возрастные группы

In [188]:
for dataset in combine_df:    
    dataset.loc[dataset['Age'] <= 16, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
    dataset.loc[dataset['Age'] > 64, 'Age'] = 4
    
train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,AgeBand
0,0,3,1,1,1,0,7.25,S,1,"(16.0, 32.0]"
1,1,1,0,2,1,0,71.2833,C,3,"(32.0, 48.0]"
2,1,3,0,1,0,0,7.925,S,2,"(16.0, 32.0]"
3,1,1,0,2,1,0,53.1,S,3,"(32.0, 48.0]"
4,0,3,1,2,0,0,8.05,S,1,"(32.0, 48.0]"


Удалим вспомогательный столбец

In [189]:
train_df = train_df.drop(['AgeBand'], axis=1)
combine_df = [train_df, test_df]

train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,0,3,1,1,1,0,7.25,S,1
1,1,1,0,2,1,0,71.2833,C,3
2,1,3,0,1,0,0,7.925,S,2
3,1,1,0,2,1,0,53.1,S,3
4,0,3,1,2,0,0,8.05,S,1


Добавим признак размер семьи

In [190]:
for dataset in combine_df: 
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
    
train_df[['FamilySize', 'Survived']].groupby(['FamilySize'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,FamilySize,Survived
3,4,0.724138
2,3,0.578431
1,2,0.552795
6,7,0.333333
0,1,0.303538
4,5,0.2
5,6,0.136364
7,8,0.0
8,11,0.0


Одинокий?

In [191]:
for dataset in combine_df:
    dataset['IsAlone'] = 0
    dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1
    
train_df[['IsAlone', 'Survived']].groupby(['IsAlone'], as_index=False).mean()   

Unnamed: 0,IsAlone,Survived
0,0,0.50565
1,1,0.303538


Удалим все, что связанно с семьей и оставим IsAlone

In [192]:
train_df = train_df.drop(['Parch', 'SibSp', 'FamilySize'], axis=1)
test_df = test_df.drop(['Parch', 'SibSp', 'FamilySize'], axis=1)
combine_df = [train_df, test_df]

train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,IsAlone
0,0,3,1,1,7.25,S,1,0
1,1,1,0,2,71.2833,C,3,0
2,1,3,0,1,7.925,S,2,1
3,1,1,0,2,53.1,S,3,0
4,0,3,1,2,8.05,S,1,1


Создадим еще один искусственный признак

In [193]:
for dataset in combine_df:
    dataset['Age*Class'] = dataset.Age * dataset.Pclass

train_df.loc[:, ['Age*Class', 'Age', 'Pclass']].head(10)

Unnamed: 0,Age*Class,Age,Pclass
0,3,1,3
1,2,2,1
2,3,1,3
3,2,2,1
4,6,2,3
5,3,1,3
6,3,3,1
7,0,0,3
8,3,1,3
9,0,0,2


Заполним два пропуска в поле Embarked

In [194]:
freq_port = dataset.Embarked.mode().item()
train_df['Embarked'].fillna(freq_port, inplace=True)
print(freq_port)
train_df.describe(include='O')

S


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['Embarked'].fillna(freq_port, inplace=True)


Unnamed: 0,Embarked
count,891
unique,3
top,S
freq,646


In [195]:
train_df[['Embarked', 'Survived']].groupby(['Embarked'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,Embarked,Survived
0,C,0.553571
1,Q,0.38961
2,S,0.339009


Преобразуем категориальный Embarked в числовой

In [196]:
for dataset in combine_df:
    dataset['Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)

train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,IsAlone,Age*Class
0,0,3,1,1,7.25,0,1,0,3
1,1,1,0,2,71.2833,1,3,0,2
2,1,3,0,1,7.925,0,2,1,3
3,1,1,0,2,53.1,0,3,0,2
4,0,3,1,2,8.05,0,1,1,6


Заполним последний пропуск

In [197]:
test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].median())

Создадим группы по стоимости проезда

In [198]:
train_df['FareBand'] = pd.qcut(train_df['Fare'], 4)
train_df[['FareBand', 'Survived']].groupby(['FareBand'], as_index=False).mean().sort_values(by='FareBand', ascending=True)

  train_df[['FareBand', 'Survived']].groupby(['FareBand'], as_index=False).mean().sort_values(by='FareBand', ascending=True)


Unnamed: 0,FareBand,Survived
0,"(-0.001, 7.91]",0.197309
1,"(7.91, 14.454]",0.303571
2,"(14.454, 31.0]",0.454955
3,"(31.0, 512.329]",0.581081


Сделаем, как сделали с возрастом и удалим ненужное

In [199]:
for dataset in combine_df:
    dataset.loc[ dataset['Fare'] <= 7.91, 'Fare'] = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare']   = 2
    dataset.loc[ dataset['Fare'] > 31, 'Fare'] = 3
    dataset['Fare'] = dataset['Fare'].astype(int)

train_df = train_df.drop(['FareBand'], axis=1)
combine_df = [train_df, test_df]
    
train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,IsAlone,Age*Class
0,0,3,1,1,0,0,1,0,3
1,1,1,0,2,3,1,3,0,2
2,1,3,0,1,1,0,2,1,3
3,1,1,0,2,3,0,3,0,2
4,0,3,1,2,1,0,1,1,6


# Модели

Подготавливаем данные для обучения

In [200]:
Y_train = train_df['Survived'] 
X_train = train_df.drop('Survived', axis=1)
X_test = test_df.drop('PassengerId', axis=1).copy()
X_train.shape, Y_train.shape, X_test.shape

((891, 8), (891,), (418, 8))

Параметры моделей

In [201]:
# Random Forest
rf_params = {
    'n_jobs': -1,
    'n_estimators': 500,
    'warm_start': True, 
    'max_depth': 6,
    'min_samples_leaf': 2,
    'max_features' : 'sqrt',
    'verbose': 0
}

# Extra Trees
et_params = {
    'n_jobs': -1,
    'n_estimators':500,
    'max_depth': 8,
    'min_samples_leaf': 2,
    'verbose': 0
}

# AdaBoost
ada_params = {
    'n_estimators': 500,
    'learning_rate' : 0.75
}

# Gradient Boosting
gb_params = {
    'n_estimators': 500,
    'max_depth': 5,
    'min_samples_leaf': 2,
    'verbose': 0
}

Теперь сами модели по отдельности

In [202]:
# Random Forest

random_forest = RandomForestClassifier(**rf_params)
random_forest.fit(X_train, Y_train)
Y_pred_rf = random_forest.predict(X_test)
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
acc_random_forest

84.29

In [203]:
# Extra Trees

extra_trees = ExtraTreesClassifier(**et_params)
extra_trees.fit(X_train, Y_train)
Y_pred_et = extra_trees.predict(X_test)
acc_extra_trees = round(extra_trees.score(X_train, Y_train) * 100, 2)
acc_extra_trees

85.07

In [204]:
# AdaBoost

adaboost = AdaBoostClassifier(**ada_params)
adaboost.fit(X_train, Y_train)
Y_pred_ada = adaboost.predict(X_test)
acc_adaboost = round(adaboost.score(X_train, Y_train) * 100, 2)
acc_adaboost

80.13

In [205]:
# Gradient Boosting

gboost = GradientBoostingClassifier(**gb_params)
gboost.fit(X_train, Y_train)
Y_pred_gb = gboost.predict(X_test)
acc_gboost = round(gboost.score(X_train, Y_train) * 100, 2)
acc_gboost

86.64

## Стекинг при помощи XGB

Вспомогательная функция, реализующая kfold

In [206]:
def get_oof_proba(estimator, X_train, y_train, X_test, n_folds=5, random_state=42):
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_state)
    oof_train = np.zeros(X_train.shape[0])
    oof_test_folds = np.zeros((n_folds, X_test.shape[0]))

    for i, (tr_idx, va_idx) in enumerate(skf.split(X_train, y_train)):
        est = clone(estimator)

        X_tr = X_train.iloc[tr_idx] if hasattr(X_train, "iloc") else X_train[tr_idx]
        y_tr = y_train.iloc[tr_idx] if hasattr(y_train, "iloc") else y_train[tr_idx]
        X_va = X_train.iloc[va_idx] if hasattr(X_train, "iloc") else X_train[va_idx]

        est.fit(X_tr, y_tr)
        oof_train[va_idx] = est.predict_proba(X_va)[:, 1]
        oof_test_folds[i, :] = est.predict_proba(X_test)[:, 1]

    oof_test = oof_test_folds.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

Составление моделей, их ответов и сам бустинг-агрегатор

In [216]:
et  = ExtraTreesClassifier(**et_params)
ada = AdaBoostClassifier(**ada_params)
gb  = GradientBoostingClassifier(**gb_params)

et_tr,  et_te  = get_oof_proba(et,  X_train, Y_train, X_test, n_folds=5)
ada_tr, ada_te = get_oof_proba(ada, X_train, Y_train, X_test, n_folds=5)
gb_tr,  gb_te  = get_oof_proba(gb,  X_train, Y_train, X_test, n_folds=5)

x_meta_train = np.hstack([et_tr, ada_tr, gb_tr])
x_meta_test  = np.hstack([et_te, ada_te, gb_te])
y_meta = Y_train

gbm = xgb.XGBClassifier(
    n_estimators=2000,
    max_depth=4,
    min_child_weight=2,
    gamma=0.9,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=-1,
    scale_pos_weight=1,
    learning_rate=0.02  # по желанию
).fit(x_meta_train, y_meta)

# Предсказания классов для сабмита
Y_pred = gbm.predict(x_meta_test)
Y_pred[:10]

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0])

# Формирования файла с ответами

In [208]:
submission = pd.DataFrame({
        "PassengerId": test_df["PassengerId"],
        "Survived": Y_pred
    })

In [209]:
submission.to_csv('submission.csv', index=False)

In [212]:
result = pd.read_csv('submission.csv')
result.head(10)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
5,897,0
6,898,1
7,899,0
8,900,1
9,901,0
