In [1]:
import pandas as pd
import math as ma
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Load both CSVs and immediately discard the 'Ticket' and 'Cabin' columns,
# which are not used by any later cell shown here.
df_train = pd.read_csv("train.csv").drop(columns=['Ticket', 'Cabin'])
df_test = pd.read_csv("test.csv").drop(columns=['Ticket', 'Cabin'])

# Keep both frames in one list so preprocessing steps can loop over them.
combine = [df_train, df_test]

# NOTE(review): presumably the column layout of the raw train.csv; it still
# lists 'Ticket' and 'Cabin', which were dropped from the frames above.
train_header = [
    "PassengerId", "Survived", "Pclass", "Name", "Sex", "Age",
    "SibSp", "Parch", "Ticket", "Fare", "Cabin", "Embarked",
]

In [2]:
#produce estimates for missing 'Age' values based on the sex and passenger class of the passengers
def makeOrdinal(df,label,show_dict):
    """Replace the values of column ``label`` with integer rank codes, in place.

    The distinct non-missing values of ``df[label]`` are sorted ascending and
    numbered 0, 1, 2, ...; every cell is then replaced by the number assigned
    to its value. Cells holding a missing value become ``None``. Calling the
    function again on an already encoded column leaves the codes unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode; it is modified in place.
    label : str
        Name of the column to encode.
    show_dict : bool
        When true, print the value -> code dictionary.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # Sorted distinct values of the column; missing values get no code.
    categories = df[label].dropna().sort_values().unique().tolist()
    strcat_dict = {value: code for code, value in enumerate(categories)}

    if show_dict:
        print(strcat_dict)

    # Build the encoded column in a single pass and assign it as a whole.
    # Writing cell by cell is slow, breaks on a non-unique index, and writes
    # ints into a column that still holds strings. The explicit object dtype
    # keeps ints as ints even when some cells are None.
    df[label] = pd.Series(
        [strcat_dict.get(value) for value in df[label]],
        index=df.index,
        dtype=object,
    )

    return df

#don't mess with the actual data: makeOrdinal edits its argument in place,
#so the age model is prepared on copies rather than on the frames in `combine`
age_comb = [frame.copy() for frame in combine]
age_comb_dropped = []

for df in age_comb:
    makeOrdinal(df,"Sex",False)
    #keep only rows where the target ('Age') and both predictors are present
    df_dropped = df[['Sex','Age','Pclass']].dropna()
    age_comb_dropped.append(df_dropped)

#only the first frame (the training data) is used to fit the age model
X = age_comb_dropped[0][['Sex','Pclass']]
y = age_comb_dropped[0]['Age']

#NOTE(review): val_X / val_y are produced here but not used in the cells shown
train_X, val_X, train_y, val_y = train_test_split(X,y,random_state=1)

rf_model_age = RandomForestRegressor(random_state=1,n_estimators=10)
rf_model_age.fit(train_X,train_y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=1, verbose=0, warm_start=False)

In [3]:
#numeric code for each title group; any title outside this table ends up as 0
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}

#uncommon titles such as Rev or Don are folded into a single 'Rare' group
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major',
               'Rev', 'Sir', 'Jonkheer', 'Dona']
#french / alternative spellings mapped onto their common equivalent
title_synonyms = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}

for df in combine:
    #male = 1, female = 0
    makeOrdinal(df,"Sex",False)
    #S = 2, C = 0, Q = 1
    makeOrdinal(df,"Embarked",False)

    #create an 'engineered feature': the title is the first run of letters in
    #'Name' that follows a space and is followed by a period. The pattern is a
    #raw string so that '\.' reaches the regex engine without an
    #invalid-escape warning from the Python parser.
    titles = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

    titles = titles.replace(rare_titles, 'Rare').replace(title_synonyms)

    #titles missing from title_mapping map to NaN and are then filled with 0
    df['Title'] = titles.map(title_mapping).fillna(0)

#drop() returns new frames, so the names are rebound and `combine` is rebuilt
#to point at them; df_test keeps 'PassengerId' at this stage
df_train = df_train.drop(['Name', 'PassengerId'], axis=1)
df_test = df_test.drop(['Name'], axis=1)

combine = [df_train,df_test]

In [4]:
#sex: female = 0, male = 1
def guessAge(sex,pclass):
    """Estimate a passenger's age with the fitted ``rf_model_age`` regressor.

    Parameters
    ----------
    sex : encoded sex value, passed to the model as the first feature.
    pclass : passenger class, passed to the model as the second feature.

    Returns
    -------
    int
        The model's prediction for the single (sex, pclass) row, truncated
        toward zero.
    """
    prediction = rf_model_age.predict(np.array([[sex,pclass]]))
    # predict() returns a one-element array; take the element explicitly,
    # because calling int() on an array is deprecated in recent NumPy.
    return int(prediction[0])

#fill every missing 'Age' with the model's estimate, truncated to a whole number
for df in combine:
    missing_age = df['Age'].isna()
    #predict() rejects an empty input, so skip frames with nothing to fill
    if missing_age.any():
        #one batch prediction per frame instead of one model call per row
        estimates = rf_model_age.predict(df.loc[missing_age, ['Sex','Pclass']])
        df.loc[missing_age,'Age'] = np.asarray(estimates).astype(int)
combine = [df_train,df_test]


In [7]:
#preview the first rows only instead of rendering the whole frame
df_train.head(10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,0,3,1,22.0,1,0,7.2500,2,1
1,1,1,0,38.0,1,0,71.2833,0,3
2,1,3,0,26.0,0,0,7.9250,2,2
3,1,1,0,35.0,1,0,53.1000,2,3
4,0,3,1,35.0,0,0,8.0500,2,1
5,0,3,1,26.0,0,0,8.4583,1,1
6,0,1,1,54.0,0,0,51.8625,2,1
7,0,3,1,2.0,3,1,21.0750,2,4
8,1,3,0,27.0,0,2,11.1333,2,3
9,1,2,0,14.0,1,0,30.0708,0,3


In [6]:
#split the training frame into target and predictors; the test predictors are
#df_test without its 'PassengerId' column (drop() already returns a new frame)
Y_train = df_train["Survived"]
X_train = df_train.drop(columns=["Survived"])
X_test = df_test.drop(columns=["PassengerId"])

#show the three shapes as the cell's output
(X_train.shape, Y_train.shape, X_test.shape)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,0,3,1,22.0,1,0,7.25,2,1
1,1,1,0,38.0,1,0,71.2833,0,3
2,1,3,0,26.0,0,0,7.925,2,2
3,1,1,0,35.0,1,0,53.1,2,3
4,0,3,1,35.0,0,0,8.05,2,1
