In [1]:
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings('ignore')

train_data = pd.read_csv("../data/titanic/train.csv")
test_data = pd.read_csv("../data/titanic/test.csv")

features = train_data.columns.values.tolist()
features.remove("Survived")
train_x = train_data[features]
train_y = train_data["Survived"]

full_x = [train_x, test_data]
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [2]:
from sklearn import preprocessing
def transform_feature(x,feature):
    le = preprocessing.LabelEncoder()
    le = le.fit(x[feature])
    x[feature] = le.fit_transform(x[feature])
    

In [3]:
from sklearn import preprocessing
for data in full_x:
    age_filled = data.Age.dropna().median()
    data.Age = data.Age.fillna(age_filled)
    cabin_filled = data.Cabin.value_counts().index[0]
    data.Cabin = data.Cabin.fillna(cabin_filled)
    fare_filled = data.Fare.dropna().median()
    data.Fare = data.Fare.fillna(fare_filled)
    embarked_filled = data.Embarked.value_counts().index[0]
    data.Embarked = data.Embarked.fillna(embarked_filled)
    data['FamilySize'] = data['SibSp'] + data['Parch'] + 1
    data['IsAlone'] = 0
    data['IsAlone'][data['FamilySize'] != 0] = 1
    transform_feature(data,'Sex')
    transform_feature(data,'Cabin')
    transform_feature(data,'Embarked')
    transform_feature(data,'Ticket')
    data.drop(['SibSp','Parch','Ticket'],axis = 1,inplace=True)
train_x.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,Fare,Cabin,Embarked,FamilySize,IsAlone
0,1,3,"Braund, Mr. Owen Harris",1,22.0,7.25,145,2,2,1
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,71.2833,81,0,2,1
2,3,3,"Heikkinen, Miss. Laina",0,26.0,7.925,145,2,1,1
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,53.1,55,2,2,1
4,5,3,"Allen, Mr. William Henry",1,35.0,8.05,145,2,1,1


In [4]:
train_data['AgeBand'] = pd.cut(train_data['Age'], 8)
train_data[['AgeBand', 'Survived']].groupby(['AgeBand'], as_index=False).mean().sort_values(by='AgeBand', ascending=True)
train_x.Age[train_x.Age <= 10] = 0
train_x.Age[(train_x.Age > 10) & (train_x.Age <= 20)] = 1
train_x.Age[(train_x.Age <= 30) & (train_x.Age > 20)] = 2
train_x.Age[(train_x.Age <= 40) & (train_x.Age > 30)] = 3
train_x.Age[(train_x.Age <= 50) & (train_x.Age > 40)] = 4
train_x.Age[(train_x.Age <= 60) & (train_x.Age > 50)] = 5
train_x.Age[(train_x.Age <= 70) & (train_x.Age > 60)] = 6
train_x.Age[train_x.Age > 70] = 7

In [5]:
test_data.Age[test_data.Age <= 10] = 0
test_data.Age[(test_data.Age > 10) & (test_data.Age <= 20)] = 1
test_data.Age[(test_data.Age <= 30) & (test_data.Age > 20)] = 2
test_data.Age[(test_data.Age <= 40) & (test_data.Age > 30)] = 3
test_data.Age[(test_data.Age <= 50) & (test_data.Age > 40)] = 4
test_data.Age[(test_data.Age <= 60) & (test_data.Age > 50)] = 5
test_data.Age[(test_data.Age <= 70) & (test_data.Age > 60)] = 6
test_data.Age[test_data.Age > 70] = 7

In [6]:
train_data['FareBand'] = pd.cut(train_data['Fare'], 8)
train_data[['FareBand', 'Survived']].groupby(['FareBand'], as_index=False).mean().sort_values(by='FareBand', ascending=True)

bins = (-1, 0, 8, 15, 31, 1000)
group_names = ['Unknown', '1_quartile', '2_quartile', '3_quartile', '4_quartile']
categories = pd.cut(train_x.Fare, bins, labels=group_names)

train_x.Fare = categories
transform_feature(train_x,'Fare')
train_x.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,Fare,Cabin,Embarked,FamilySize,IsAlone
0,1,3,"Braund, Mr. Owen Harris",1,2.0,0,145,2,2,1
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,3.0,3,81,0,2,1
2,3,3,"Heikkinen, Miss. Laina",0,2.0,0,145,2,1,1
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,3.0,3,55,2,2,1
4,5,3,"Allen, Mr. William Henry",1,3.0,1,145,2,1,1


In [7]:
bins = (-1, 0, 8, 15, 31, 1000)
group_names = ['Unknown', '1_quartile', '2_quartile', '3_quartile', '4_quartile']
categories = pd.cut(test_data.Fare, bins, labels=group_names)

test_data.Fare = categories
test_data.Fare.value_counts()
transform_feature(test_data,'Fare')

In [8]:
combine = [train_x, test_data]
for dataset in combine:
    dataset['Title'] = dataset.Name.str.extract(' ([A-Za-z]+)\.', expand=False)

for dataset in combine:
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col',\
 	'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')
    
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
for dataset in combine:
    dataset['Title'] = dataset['Title'].map(title_mapping)

In [9]:
train_x = pd.concat([train_x,pd.get_dummies(train_x.Pclass,prefix="Pclass")],axis=1)
test_data = pd.concat([test_data,pd.get_dummies(test_data.Pclass,prefix="Pclass")],axis=1)
train_x = pd.concat([train_x,pd.get_dummies(train_x.Embarked,prefix="Embarked")],axis=1)
test_data = pd.concat([test_data,pd.get_dummies(test_data.Embarked,prefix="Embarked")],axis=1)
train_x.drop(['Pclass','Embarked','Name','Title'],axis=1,inplace=True)
test_data.drop(['Pclass','Embarked','Name','Title'],axis=1,inplace=True)

In [10]:
from sklearn.model_selection import train_test_split

train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, random_state = 1)

In [11]:
import xgboost as xgb

gbm = xgb.XGBClassifier(max_depth=3, n_estimators=600, learning_rate=0.05).fit(train_x, train_y)
score = gbm.score(val_x,val_y)
print(score)
test_y = gbm.predict(test_data)

0.7937219730941704


  if diff:
  if diff:


In [12]:
gbm.feature_importances_

array([0.4691358 , 0.05630834, 0.12827462, 0.04727492, 0.12496236,
       0.07949413, 0.        , 0.00813008, 0.00963565, 0.02860584,
       0.01746462, 0.00782897, 0.02288467], dtype=float32)

In [14]:
result = pd.DataFrame({'PassengerId':test_data.PassengerId,'Survived':test_y})
result.to_csv("submission.csv",index=False)