In [181]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
import seaborn as sns

In [182]:
# Read the training and test splits from CSV files in the working directory.
TRAIN_CSV, TEST_CSV = 'train.csv', 'test.csv'
train_data = pd.read_csv(TRAIN_CSV)
test_data = pd.read_csv(TEST_CSV)

In [183]:
def crossValidate(model, X, y, n_splits=10, random_state=13):
    """Estimate mean train and validation accuracy with shuffled K-fold CV.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
        Refitted in place on every fold; after the call it is left fitted on
        the *last* fold's training subset only, not on all of ``X``.
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : pandas.Series or array-like
        Targets, in the same row order as ``X``.
    n_splits : int, default 10
        Number of folds.
    random_state : int, default 13
        Seed for the fold shuffling.

    Returns
    -------
    tuple of float
        ``(mean train accuracy, mean validation accuracy)`` over the folds.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    # KFold yields positional indices, so the targets must be sliced by
    # position too. Plain ``y[idx]`` on a Series is label-based and only
    # agrees with positions when the index is the default 0..n-1 range.
    y_by_position = y.iloc if hasattr(y, 'iloc') else np.asarray(y)
    train_acc_sum, test_acc_sum = 0, 0
    for train_index, test_index in kf.split(X):
        X_train_cv, X_test_cv = X.iloc[train_index], X.iloc[test_index]
        y_train_cv = y_by_position[train_index]
        y_test_cv = y_by_position[test_index]
        model.fit(X_train_cv, y_train_cv)
        train_acc_sum += accuracy_score(y_train_cv, model.predict(X_train_cv))
        test_acc_sum += accuracy_score(y_test_cv, model.predict(X_test_cv))
    return train_acc_sum / n_splits, test_acc_sum / n_splits

def out(pred, path='out.csv'):
    """Write predictions next to the test-set passenger ids as a CSV file.

    Parameters
    ----------
    pred : array-like
        One prediction per row of the global ``test_data``, in the same order.
    path : str, default 'out.csv'
        Destination file; an existing file is overwritten.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of test rows.
    """
    # Drop both indexes so the two columns are paired strictly by position;
    # pd.concat(axis=1) would otherwise align on index labels and silently
    # insert NaN rows when they do not match.
    ids = test_data['PassengerId'].reset_index(drop=True)
    pred = pd.Series(pred, name='Survived').reset_index(drop=True)
    if len(pred) != len(ids):
        raise ValueError(
            f'expected {len(ids)} predictions, got {len(pred)}'
        )
    out_data = pd.concat([ids, pred], axis=1)
    out_data.to_csv(path, index=False, sep=',')

In [184]:
# PassengerId is removed from the feature frames. test_data itself keeps the
# column because out() reads the ids from it.
ID_COLUMN = 'PassengerId'
train_data = train_data.drop(ID_COLUMN, axis=1)
X_test = test_data.drop(ID_COLUMN, axis=1)

In [185]:
# Separate the target column from the training features.
TARGET_COLUMN = 'Survived'
y_train = train_data[TARGET_COLUMN]
X_train = train_data.drop(TARGET_COLUMN, axis=1)

In [186]:
def extract_titles(names):
    """Return the word immediately before the first '.' of every name.

    For "Braund, Mr. Owen Harris" this yields "Mr". When a name contains no
    '.', the last space-separated word of the whole name is returned.

    Parameters
    ----------
    names : pandas.Series of str

    Returns
    -------
    pandas.Series
        Unnamed Series of titles carrying the same index as ``names``, so it
        can be used directly as a boolean-mask source for the originating frame.
    """
    def title_of(name):
        before_dot = name.partition('.')[0]
        return before_dot.rsplit(' ', 1)[-1]

    return pd.Series([title_of(name) for name in names], index=names.index)


# z / v hold the extracted titles for the training / test passengers.
z = extract_titles(X_train['Name'])
v = extract_titles(X_test['Name'])
v.value_counts()

Mr        240
Miss       78
Mrs        72
Master     21
Col         2
Rev         2
Ms          1
Dr          1
Dona        1
Name: count, dtype: int64

In [187]:
# Add one 0/1 indicator column per frequent title to both frames. Passengers
# with any other title get 0 in all four columns.
for title in ('Mr', 'Miss', 'Mrs', 'Master'):
    flag_column = f'is_{title}.'
    for frame, titles in ((X_train, z), (X_test, v)):
        frame[flag_column] = 0
        frame.loc[titles == title, flag_column] = 1

# The raw name is no longer needed once the title flags exist.
X_train = X_train.drop(columns=['Name'])
X_test = X_test.drop(columns=['Name'])
X_test

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,is_Mr.,is_Miss.,is_Mrs.,is_Master.
0,3,male,34.5,0,0,330911,7.8292,,Q,1,0,0,0
1,3,female,47.0,1,0,363272,7.0000,,S,0,0,1,0
2,2,male,62.0,0,0,240276,9.6875,,Q,1,0,0,0
3,3,male,27.0,0,0,315154,8.6625,,S,1,0,0,0
4,3,female,22.0,1,1,3101298,12.2875,,S,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,3,male,,0,0,A.5. 3236,8.0500,,S,1,0,0,0
414,1,female,39.0,0,0,PC 17758,108.9000,C105,C,0,0,0,0
415,3,male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S,1,0,0,0
416,3,male,,0,0,359309,8.0500,,S,1,0,0,0


In [188]:
# Fill missing ports with 'S' in BOTH frames so train and test are treated
# identically (presumably 'S' is the most frequent port -- TODO confirm).
for frame in (X_train, X_test):
    frame['Embarked'] = frame['Embarked'].fillna('S')

train_embarked_ohe = pd.get_dummies(X_train['Embarked'], prefix='Embarked', dtype=int)
train_sex_ohe = pd.get_dummies(X_train['Sex'], prefix='Sex', dtype=int)

# Encode the test frame against the training columns: a category missing from
# test becomes an all-zero column, and one unseen in train is dropped, so the
# model always receives the same feature layout.
test_embarked_ohe = (
    pd.get_dummies(X_test['Embarked'], prefix='Embarked', dtype=int)
    .reindex(columns=train_embarked_ohe.columns, fill_value=0)
)
test_sex_ohe = (
    pd.get_dummies(X_test['Sex'], prefix='Sex', dtype=int)
    .reindex(columns=train_sex_ohe.columns, fill_value=0)
)

X_train = pd.concat([X_train.drop(columns=['Embarked', 'Sex']), train_embarked_ohe, train_sex_ohe], axis=1)
X_test = pd.concat([X_test.drop(columns=['Embarked', 'Sex']), test_embarked_ohe, test_sex_ohe], axis=1)

In [189]:
# Inspect the training features now that Embarked and Sex are one-hot encoded.
X_train

Unnamed: 0,Pclass,Age,SibSp,Parch,Ticket,Fare,Cabin,is_Mr.,is_Miss.,is_Mrs.,is_Master.,Embarked_C,Embarked_Q,Embarked_S,Sex_female,Sex_male
0,3,22.0,1,0,A/5 21171,7.2500,,1,0,0,0,0,0,1,0,1
1,1,38.0,1,0,PC 17599,71.2833,C85,0,0,1,0,1,0,0,1,0
2,3,26.0,0,0,STON/O2. 3101282,7.9250,,0,1,0,0,0,0,1,1,0
3,1,35.0,1,0,113803,53.1000,C123,0,0,1,0,0,0,1,1,0
4,3,35.0,0,0,373450,8.0500,,1,0,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,2,27.0,0,0,211536,13.0000,,0,0,0,0,0,0,1,0,1
887,1,19.0,0,0,112053,30.0000,B42,0,1,0,0,0,0,1,1,0
888,3,,1,2,W./C. 6607,23.4500,,0,1,0,0,0,0,1,1,0
889,1,26.0,0,0,111369,30.0000,C148,1,0,0,0,1,0,0,0,1


In [190]:
# Ticket and Cabin are removed from both feature frames.
UNUSED_COLUMNS = ['Ticket', 'Cabin']
X_train = X_train.drop(UNUSED_COLUMNS, axis=1)
X_test = X_test.drop(UNUSED_COLUMNS, axis=1)

In [191]:
# Inspect the test features after Ticket and Cabin were dropped.
X_test

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,is_Mr.,is_Miss.,is_Mrs.,is_Master.,Embarked_C,Embarked_Q,Embarked_S,Sex_female,Sex_male
0,3,34.5,0,0,7.8292,1,0,0,0,0,1,0,0,1
1,3,47.0,1,0,7.0000,0,0,1,0,0,0,1,1,0
2,2,62.0,0,0,9.6875,1,0,0,0,0,1,0,0,1
3,3,27.0,0,0,8.6625,1,0,0,0,0,0,1,0,1
4,3,22.0,1,1,12.2875,0,0,1,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,3,,0,0,8.0500,1,0,0,0,0,0,1,0,1
414,1,39.0,0,0,108.9000,0,0,0,0,1,0,0,1,0
415,3,38.5,0,0,7.2500,1,0,0,0,0,0,1,0,1
416,3,,0,0,8.0500,1,0,0,0,0,0,1,0,1


In [192]:
# Mean passenger age in the training set (Series.mean skips missing values).
X_train['Age'].mean()

29.69911764705882

In [193]:
# Baseline imputation: fill every missing age with the mean age learned from
# the training set. The same training statistic is applied to the test set so
# both frames are preprocessed identically.
train_age_mean = X_train['Age'].mean()
X_train['Age'] = X_train['Age'].fillna(train_age_mean)
X_test['Age'] = X_test['Age'].fillna(train_age_mean)

In [194]:
# Baseline model: min_child_weight=35, gamma not set (library default).
model = XGBClassifier(min_child_weight=35, random_state=13)
# The displayed tuple is (mean train accuracy, mean validation accuracy).
crossValidate(model, X_train, y_train)

(0.8220466312371381, 0.8147690387016231)

In [219]:
# Put the raw Age values (missing entries included) back into both feature
# frames, discarding the earlier mean fill.
for features, raw in ((X_train, train_data), (X_test, test_data)):
    features['Age'] = raw['Age']

In [220]:
# Count missing values per training column.
X_train.isna().sum()

Pclass          0
Age           177
SibSp           0
Parch           0
Fare            0
is_Mr.          0
is_Miss.        0
is_Mrs.         0
is_Master.      0
Embarked_C      0
Embarked_Q      0
Embarked_S      0
Sex_female      0
Sex_male        0
dtype: int64

In [221]:
def _mean_age_where(flag_column):
    """Mean Age of the training rows whose ``flag_column`` equals 1.

    Returns a one-element Series indexed by 'Age', because the column is
    selected with a list.
    """
    flagged = X_train[flag_column] == 1
    return X_train.loc[flagged, ['Age']].mean()


mr_age_mean = _mean_age_where('is_Mr.')
miss_age_mean = _mean_age_where('is_Miss.')
master_age_mean = _mean_age_where('is_Master.')
mrs_age_mean = _mean_age_where('is_Mrs.')
mr_age_mean, miss_age_mean, master_age_mean, mrs_age_mean

(Age    32.36809
 dtype: float64,
 Age    21.773973
 dtype: float64,
 Age    4.574167
 dtype: float64,
 Age    35.898148
 dtype: float64)

In [222]:
# Impute missing ages with the mean age of passengers sharing the same title.
# The means are computed from the training set here, before anything is
# filled, instead of being pasted in as rounded constants, and the same
# training statistics are applied to both frames.
TITLE_FLAG_COLUMNS = ('is_Mr.', 'is_Miss.', 'is_Master.', 'is_Mrs.')
title_age_means = {
    flag: X_train.loc[X_train[flag] == 1, 'Age'].mean()
    for flag in TITLE_FLAG_COLUMNS
}

for frame in (X_train, X_test):
    for flag, age_mean in title_age_means.items():
        needs_fill = (frame[flag] == 1) & frame['Age'].isna()
        frame.loc[needs_fill, 'Age'] = age_mean

# Passengers with none of the four titles fall back to the overall training
# mean, taken after the title-based fill.
fallback_age = X_train['Age'].mean()
X_train['Age'] = X_train['Age'].fillna(fallback_age)
X_test['Age'] = X_test['Age'].fillna(fallback_age)

In [228]:
# Fill missing test fares with the mean fare of the passenger's own class,
# learned from the training set. This no longer assumes that every passenger
# with a missing fare travelled in class 3.
class_fare_means = X_train.groupby('Pclass')['Fare'].mean()
X_test['Fare'] = X_test['Fare'].fillna(X_test['Pclass'].map(class_fare_means))

In [233]:
# Re-evaluate with the title-based Age imputation; min_child_weight is 36 here.
model = XGBClassifier(min_child_weight=36, random_state=13)
# The displayed tuple is (mean train accuracy, mean validation accuracy).
crossValidate(model, X_train, y_train)

(0.8236680458653615, 0.8181148564294632)

In [234]:
# crossValidate refits `model` once per fold, so at this point it has only
# seen the last fold's training subset. Refit on the full training set before
# producing the predictions that are written out.
model.fit(X_train, y_train)
out(model.predict(X_test))

In [686]:
# Build a second training frame carrying an extra feature: the plain sum of
# passenger class and age. X_train itself is left untouched.
pclass_plus_age = X_train['Pclass'] + X_train['Age']
X_train1 = X_train.assign(**{'Pclass_+_Age': pclass_plus_age})
X_train1

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,is_Mr.,is_Miss.,is_Mrs.,is_Master.,Embarked_C,Embarked_Q,Embarked_S,Sex_female,Sex_male,Pclass_+_Age
0,3,22.000000,1,0,7.2500,1,0,0,0,0,0,1,0,1,25.000000
1,1,38.000000,1,0,71.2833,0,0,1,0,1,0,0,1,0,39.000000
2,3,26.000000,0,0,7.9250,0,1,0,0,0,0,1,1,0,29.000000
3,1,35.000000,1,0,53.1000,0,0,1,0,0,0,1,1,0,36.000000
4,3,35.000000,0,0,8.0500,1,0,0,0,0,0,1,0,1,38.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,2,27.000000,0,0,13.0000,0,0,0,0,0,0,1,0,1,29.000000
887,1,19.000000,0,0,30.0000,0,1,0,0,0,0,1,1,0,20.000000
888,3,21.773973,1,2,23.4500,0,1,0,0,0,0,1,1,0,24.773973
889,1,26.000000,0,0,30.0000,1,0,0,0,1,0,0,0,1,27.000000


In [717]:
# Evaluate on the frame with the extra Pclass_+_Age feature, now with gamma=0.15.
model = XGBClassifier(gamma=0.15, min_child_weight=35, random_state=13)
# The displayed tuple is (mean train accuracy, mean validation accuracy).
crossValidate(model, X_train1, y_train)

(0.8269105637902745, 0.8226092384519351)

In [718]:
# crossValidate leaves `model` fitted on the last fold's training subset only,
# so refit it on the complete training frame before predicting.
model.fit(X_train1, y_train)

# Give the test frame the same extra feature the model was trained with.
X_test1 = X_test.copy()
X_test1['Pclass_+_Age'] = X_test1['Pclass'] + X_test1['Age']

# out() writes to 'out.csv', replacing any file already at that path.
out(model.predict(X_test1))