In [1]:
# Import all the required packages first

import pandas as pd
import numpy as np
from numpy.random import randn
from pandas import Series,DataFrame

from scipy import stats

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
train_final = pd.read_csv('train_final.csv')

train_final.head()

Unnamed: 0.1,Unnamed: 0,Survived,Pclass,Age,Embarked,Age_was_missing,person,alone
0,0,0,3,22.0,S,False,male,0
1,1,1,1,38.0,C,False,female,0
2,2,1,3,26.0,S,False,female,1
3,3,1,1,35.0,S,False,female,0
4,4,0,3,35.0,S,False,male,1


In [3]:
train_final.drop('Unnamed: 0',axis=1,inplace=True)

train_final.head()

Unnamed: 0,Survived,Pclass,Age,Embarked,Age_was_missing,person,alone
0,0,3,22.0,S,False,male,0
1,1,1,38.0,C,False,female,0
2,1,3,26.0,S,False,female,1
3,1,1,35.0,S,False,female,0
4,0,3,35.0,S,False,male,1


In [4]:
# We need to change the columns containing objects into float64, let us check which columns they are

object_cols = [col for col in train_final.columns if train_final[col].dtype == "object"]

In [5]:
object_cols

['Embarked', 'person']

In [7]:
# There is also the age missing which contains booleans, which we will deal with later

# There are different ways to solve this problem, the first is to use LabelEncoder from
# sklearn.preprocessing. Here I will use a simplier method.

emb_dummies = pd.get_dummies(train_final['Embarked'])

emb_dummies.head()

Unnamed: 0,C,Q,S
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1


In [8]:
train_final=pd.concat([train_final,emb_dummies],axis=1)

train_final.head()


Unnamed: 0,Survived,Pclass,Age,Embarked,Age_was_missing,person,alone,C,Q,S
0,0,3,22.0,S,False,male,0,0,0,1
1,1,1,38.0,C,False,female,0,1,0,0
2,1,3,26.0,S,False,female,1,0,0,1
3,1,1,35.0,S,False,female,0,0,0,1
4,0,3,35.0,S,False,male,1,0,0,1


In [9]:
train_final.drop('Embarked',axis=1,inplace=True)

per_dummies = pd.get_dummies(train_final['person'])

per_dummies.head()

Unnamed: 0,child,female,male
0,0,0,1
1,0,1,0
2,0,1,0
3,0,1,0
4,0,0,1


In [10]:
train_final=pd.concat([train_final,per_dummies],axis=1)

train_final.drop('person',axis=1,inplace=True)

train_final.head()

Unnamed: 0,Survived,Pclass,Age,Age_was_missing,alone,C,Q,S,child,female,male
0,0,3,22.0,False,0,0,0,1,0,0,1
1,1,1,38.0,False,0,1,0,0,0,1,0
2,1,3,26.0,False,1,0,0,1,0,1,0
3,1,1,35.0,False,0,0,0,1,0,1,0
4,0,3,35.0,False,1,0,0,1,0,0,1


In [None]:
# There is now a problem as if Q and S are equal to zero then C is equal to one and it is similar to
# the person entries. This is collinearity. To remove this from the data we will drop two columns

train_final.drop(['C','child'],axis=1,inplace=True)

In [13]:
train_final.head()

Unnamed: 0,Survived,Pclass,Age,Age_was_missing,alone,Q,S,female,male
0,0,3,22.0,False,0,0,1,0,1
1,1,1,38.0,False,0,0,0,1,0
2,1,3,26.0,False,1,0,1,1,0
3,1,1,35.0,False,0,0,1,1,0
4,0,3,35.0,False,1,0,1,0,1


In [18]:
# We are now ready to split the data
from sklearn.model_selection import train_test_split

y = train_final.Survived

X = train_final.drop('Survived',axis=1)

X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                                      random_state=0)

X_train.head()

Unnamed: 0,Pclass,Age,Age_was_missing,alone,Q,S,female,male
140,3,29.699118,True,0,0,0,1,0
439,2,31.0,False,1,0,1,0,1
817,2,31.0,False,0,0,0,0,1
378,3,20.0,False,1,0,0,0,1
491,3,21.0,False,1,0,1,0,1


In [19]:
# Time to run the test

from sklearn.ensemble import RandomForestRegressor

model_1 = RandomForestRegressor(n_estimators=50, random_state=0)
model_2 = RandomForestRegressor(n_estimators=100, random_state=0)
model_3 = RandomForestRegressor(n_estimators=100, criterion='mae', random_state=0)
model_4 = RandomForestRegressor(n_estimators=200, min_samples_split=20, random_state=0)
model_5 = RandomForestRegressor(n_estimators=100, max_depth=7, random_state=0)

models = [model_1, model_2, model_3, model_4, model_5]

In [20]:
from sklearn.metrics import mean_absolute_error

In [21]:
def score_model(model, X_t=X_train, X_v=X_valid, y_t=y_train, y_v=y_valid):
    model.fit(X_t, y_t)
    preds = model.predict(X_v)
    return mean_absolute_error(y_v, preds)

for i in range(0, len(models)):
    mae = score_model(models[i])
    print("Model %d MAE: %d" % (i+1, mae))

Model 1 MAE: 0
Model 2 MAE: 0
Model 3 MAE: 0
Model 4 MAE: 0
Model 5 MAE: 0


In [24]:
y_train.head(10)

140    0
439    0
817    0
378    0
491    0
331    0
588    0
358    1
674    0
162    0
Name: Survived, dtype: int64

In [26]:
y_valid.head(10)

495    0
648    0
278    0
31     1
255    1
298    1
609    1
318    1
484    1
367    1
Name: Survived, dtype: int64

In [27]:
y_train.value_counts()

0    439
1    273
Name: Survived, dtype: int64

In [28]:
model_1.fit(X_train,y_train)

preds = model_1.predict(X_valid)

In [29]:
mean_absolute_error(y_valid, preds)

0.22345450305968134

In [30]:
model_2.fit(X_train,y_train)

preds = model_2.predict(X_valid)

mean_absolute_error(y_valid, preds)

0.22243869359601653

In [31]:
model_3.fit(X_train,y_train)

preds = model_3.predict(X_valid)

mean_absolute_error(y_valid, preds)

0.2067877094972067

In [32]:
model_4.fit(X_train,y_train)

preds = model_4.predict(X_valid)

mean_absolute_error(y_valid, preds)

0.24221331235682583

In [33]:
model_5.fit(X_train,y_train)

preds = model_5.predict(X_valid)

mean_absolute_error(y_valid, preds)

0.2363063428836576

In [34]:
# The best model is model 3 but its mean absolute error is still big, lets see if we can improve it
# slightly

model_6 = RandomForestRegressor(n_estimators=200, criterion='mae',max_depth=7, random_state=0)

model_6.fit(X_train,y_train)

preds = model_6.predict(X_valid)

mean_absolute_error(y_valid, preds)

0.19509776536312848

In [38]:
# Another problem is we need to have answers to be one or zero

preds[0:10]

array([0.    , 0.    , 0.285 , 0.99  , 0.5775, 0.0025, 1.    , 1.    ,
       0.36  , 0.725 ])

In [40]:
# We have to use logical regression

from sklearn.linear_model import LogisticRegression

logmodel = LogisticRegression()
logmodel.fit(X_train, y_train)

predictions=logmodel.predict(X_valid)



In [41]:
mean_absolute_error(y_valid, predictions)

0.18435754189944134

In [42]:
predictions

array([0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1,
       0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 0], dtype=int64)

In [None]:
# There are many improvements that can be made but let us now see what results we get with this method
# on the test set