In [1]:
import pandas as pd
import math as ma
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

#read both splits and discard the 'Ticket' and 'Cabin' columns straight away
df_train = pd.read_csv("train.csv").drop(columns=['Ticket', 'Cabin'])
df_test = pd.read_csv("test.csv").drop(columns=['Ticket', 'Cabin'])

#the later cells loop over this list to apply the same step to both frames
combine = [df_train, df_test]

#presumably the header of the raw train.csv -- note that it still lists the
#'Ticket' and 'Cabin' columns dropped above
train_header = [
    "PassengerId", "Survived", "Pclass", "Name", "Sex", "Age",
    "SibSp", "Parch", "Ticket", "Fare", "Cabin", "Embarked",
]

In [2]:
#produce estimates for missing 'Age' values based on the sex and passenger class (Pclass) of the passengers
def makeOrdinal(df,label,show_dict):
    """Replace the values of column ``label`` with integer codes, in place.

    The distinct non-missing values of the column are sorted and numbered
    0, 1, 2, ... in that order; every cell is then replaced by the code of
    its value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is modified in place.
    label : str
        Name of the column to encode.
    show_dict : bool
        When true, print the value -> code dictionary that was used.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    Missing values get no code and stay missing (NaN), so the column has a
    numeric dtype afterwards. The codes depend only on the values present in
    ``df`` itself: two frames share a coding only if they contain the same
    set of values.
    """
    #sorted distinct values, missing entries excluded
    categories = sorted(df[label].dropna().unique())
    strcat_dict = {category: code for code, category in enumerate(categories)}

    if show_dict:
        print(strcat_dict)

    #one vectorised lookup instead of a Python-level loop over every row
    df[label] = df[label].map(strcat_dict)

    return df

#encode 'Sex' in place on both frames so it can be used as a model feature
for df in combine:
    makeOrdinal(df, "Sex", False)

#per frame: the rows where 'Sex', 'Age' and 'Pclass' are all present
age_comb_dropped = [frame[['Sex', 'Age', 'Pclass']].dropna() for frame in combine]

#only the first frame (the training data) is used to fit the age model
age_known_train = age_comb_dropped[0]
X = age_known_train[['Sex', 'Pclass']]
y = age_known_train['Age']

#the model is fit on the training part of this split only; the validation part
#is not used in the cells visible here
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

rf_model_age = RandomForestRegressor(n_estimators=10, random_state=1)
rf_model_age.fit(train_X, train_y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=1, verbose=0, warm_start=False)

In [3]:
#integer code per title group; titles outside this mapping end up as 0 below
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}

#uncommon titles like Rev or Don that are collapsed into the single 'Rare' group
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
#french / alternative spellings folded into their common equivalent
title_aliases = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}

for df in combine:
    #male = 1, female = 0
    #'Sex' was already encoded when the age model was trained; encoding it again
    #keeps the 0/1 codes as long as both values occur in the frame
    makeOrdinal(df,"Sex",False)
    #S = 2, C = 0, Q = 1
    makeOrdinal(df,"Embarked",False)

    #create an 'engineered feature', by using regex to extract their Title:
    #the run of letters that follows a space and ends with a period
    #(raw string, so that '\.' is not treated as a string escape sequence)
    df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

    #group the rare titles, normalise the aliases, then convert to integer codes;
    #anything that is missing or not in title_mapping becomes 0
    df['Title'] = df['Title'].replace(rare_titles, 'Rare').replace(title_aliases)
    df['Title'] = df['Title'].map(title_mapping).fillna(0).astype(int)

#'Name' is no longer needed once the title is extracted; 'PassengerId' is only
#dropped from the training frame here
df_train = df_train.drop(['Name', 'PassengerId'], axis=1)
df_test = df_test.drop(['Name'], axis=1)

#the drops above created new frames, so rebuild the list
combine = [df_train,df_test]

In [4]:
#sex: female = 0, male = 1
def guessAge(sex,pclass):
    """Predict an age with ``rf_model_age`` for one passenger.

    Parameters
    ----------
    sex : encoded 'Sex' value of the passenger
    pclass : 'Pclass' value of the passenger

    Returns
    -------
    int
        The model prediction truncated to a whole number.
    """
    #use the same column names the model was fitted with, and take the single
    #prediction out of the result array explicitly before converting to int
    features = pd.DataFrame([[sex, pclass]], columns=['Sex', 'Pclass'])
    return int(rf_model_age.predict(features)[0])

for df in combine:
    #fill every missing 'Age' of the frame with one batched prediction
    missing_age = df['Age'].isna()
    if missing_age.any():
        predicted = rf_model_age.predict(df.loc[missing_age, ['Sex', 'Pclass']])
        #truncate to whole years, like guessAge does
        df.loc[missing_age, 'Age'] = predicted.astype(int)
combine = [df_train,df_test]

In [5]:
#five equal-width age bands on the training data, used to look at the survival rate per band
#(the table below is not the last expression of the cell, so it is computed but not displayed)
df_train['AgeBand'] = pd.cut(df_train['Age'], 5)
df_train[['AgeBand', 'Survived']].groupby(['AgeBand'], as_index=False).mean().sort_values(
    by='AgeBand', ascending=True)

#change the age values in the combined data set using the categories:
#  <=16 -> 0, (16,32] -> 1, (32,48] -> 2, (48,64] -> 3, >64 -> 4
#every band is assigned, including the one above 64
age_band_edges = [-np.inf, 16, 32, 48, 64, np.inf]
for dataset in combine:
    #labels=False returns the band number; keep the column as float like before
    dataset['Age'] = pd.cut(dataset['Age'], bins=age_band_edges, labels=False).astype(float)

#the helper column is only needed for the table above
df_train = df_train.drop(['AgeBand'], axis=1)
combine = [df_train, df_test]
df_train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,0,3,1,1.0,1,0,7.25,2,1
1,1,1,0,2.0,1,0,71.2833,0,3
2,1,3,0,1.0,0,0,7.925,2,2
3,1,1,0,2.0,1,0,53.1,2,3
4,0,3,1,2.0,0,0,8.05,2,1


In [6]:
#add the product of the 'Age' and 'Pclass' columns as a new feature on both frames
for dataset in combine:
    dataset['Age*Class'] = dataset['Age'].mul(dataset['Pclass'])

#show the new column next to its two inputs
df_train[['Age*Class', 'Age', 'Pclass']].head(10)

Unnamed: 0,Age*Class,Age,Pclass
0,3.0,1.0,3
1,2.0,2.0,1
2,3.0,1.0,3
3,2.0,2.0,1
4,6.0,2.0,3
5,3.0,1.0,3
6,3.0,3.0,1
7,0.0,0.0,3
8,3.0,1.0,3
9,0.0,0.0,2


In [7]:
#most frequent 'Embarked' value among the training rows that have one
freq_port = df_train['Embarked'].dropna().mode()[0]

#use it to fill the missing 'Embarked' entries of both frames
for dataset in combine:
    dataset['Embarked'] = dataset['Embarked'].fillna(freq_port)

#mean survival per 'Embarked' value, highest first
embarked_survival = df_train[['Embarked', 'Survived']].groupby(['Embarked'], as_index=False).mean()
embarked_survival.sort_values(by='Survived', ascending=False)

Unnamed: 0,Embarked,Survived
0,0,0.553571
1,1,0.38961
2,2,0.339009


In [10]:
#fill the missing test fares with the median of the known test fares (median skips missing values)
#the result is assigned back to the column: an inplace fillna on a selected column is
#chained assignment and does not reliably update df_test on recent pandas versions
df_test['Fare'] = df_test['Fare'].fillna(df_test['Fare'].median())
df_test.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,Age*Class
0,0,3,1,1.0,1,0,7.25,2,1,3.0
1,1,1,0,2.0,1,0,71.2833,0,3,2.0
2,1,3,0,1.0,0,0,7.925,2,2,3.0
3,1,1,0,2.0,1,0,53.1,2,3,2.0
4,0,3,1,2.0,0,0,8.05,2,1,6.0


In [12]:
#quartile bands of the training fares, used to look at the survival rate per band
#NOTE: this cell overwrites 'Fare' with band numbers, so it must run exactly once per
#kernel session -- on already banded fares pd.qcut raises "Bin edges must be unique"
df_train['FareBand'] = pd.qcut(df_train['Fare'], 4)
df_train[['FareBand', 'Survived']].groupby(['FareBand'], as_index=False).mean().sort_values(
    by='FareBand', ascending=True)

#replace each fare by the number of its band:
#  <=7.91 -> 0, (7.91,14.454] -> 1, (14.454,31] -> 2, >31 -> 3
fare_band_edges = [float('-inf'), 7.91, 14.454, 31, float('inf')]
for dataset in combine:
    #intervals are closed on the right, matching the <= comparisons listed above
    fare_band = pd.cut(dataset['Fare'], bins=fare_band_edges, labels=False)
    dataset['Fare'] = fare_band.astype(int)

#the helper column is only needed for the table above
df_train = df_train.drop(['FareBand'], axis=1)
combine = [df_train, df_test]

ValueError: Bin edges must be unique: array([0. , 0.5, 2. , 2. , 3. ]).
You can drop duplicate edges by setting the 'duplicates' kwarg

In [9]:
#target and features of the training frame
Y_train = df_train["Survived"]
X_train = df_train.drop(columns=["Survived"])

#test features: everything except the passenger id
X_test = df_test.drop(columns=["PassengerId"]).copy()

X_train.shape, Y_train.shape, X_test.shape

((891, 9), (891,), (418, 9))