# Reading raw data and cleaning

In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [56]:
from fastai import *
from fastai.structured import *
from pandas import pandas as pd
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn import metrics

In [3]:
PATH = "data/titanic/"
df_raw = pd.read_csv(f'{PATH}train.csv')

In [4]:
df_raw.shape

(891, 12)

In [7]:
df_raw

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [8]:
df_raw.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

## Replacing strings by categories

In [9]:
train_cats(df_raw)

In [10]:
df_raw.dtypes

PassengerId       int64
Survived          int64
Pclass            int64
Name           category
Sex            category
Age             float64
SibSp             int64
Parch             int64
Ticket         category
Fare            float64
Cabin          category
Embarked       category
dtype: object

## Replacing nulls by values

In [11]:
df_raw.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

**NOTE**: only 'Age' will need to be processed for missing values, the two others 'Cabin' and 'Embarked' will either way have to be replaced by numerics after being turned to categories, so the missing values will be a separate number category of their own

### Let's save the cleaned up raw training data to be able to retrieve next time.

In [None]:
os.makedirs('tmp')

In [12]:
df_raw.to_feather('tmp/titanic-raw')

# Preprocessing of data

In [13]:
df_raw = pd.read_feather('tmp/titanic-raw')

  return feather.read_dataframe(path, nthreads=nthreads)


In [14]:
type(df_raw)

pandas.core.frame.DataFrame

## Separating the dependent variable 'Survived'

Note: the proc_df function also takes care of missing values (replacing them with the median, see effect on 'Age')

In [15]:
df, y, nas = proc_df(df=df_raw, y_fld='Survived')

In [16]:
df

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_na
0,1,3,109,2,22.0,1,0,524,7.2500,0,3,False
1,2,1,191,1,38.0,1,0,597,71.2833,82,1,False
2,3,3,354,1,26.0,0,0,670,7.9250,0,3,False
3,4,1,273,1,35.0,1,0,50,53.1000,56,3,False
4,5,3,16,2,35.0,0,0,473,8.0500,0,3,False
5,6,3,555,2,28.0,0,0,276,8.4583,0,2,True
6,7,1,516,2,54.0,0,0,86,51.8625,130,3,False
7,8,3,625,2,2.0,3,1,396,21.0750,0,3,False
8,9,3,413,1,27.0,0,2,345,11.1333,0,3,False
9,10,2,577,1,14.0,1,0,133,30.0708,0,1,False


In [29]:
df.shape

(891, 12)

In [27]:
y

array([0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1,
       1, 1, 0, 1, 

In [28]:
y.shape

(891,)

In [17]:
# all the missing values for age (originally set as 0) were replaced by the median age 28.0
nas

{'Age': 28.0}

### (Or) If going the manual way without proc_df

In [18]:
y_fld = df_raw['Survived'].values
y_fld.shape

(891,)

In [19]:
y_fld

array([0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1,
       1, 1, 0, 1, 

In [20]:
df_raw_x = df_raw.drop(columns='Survived')

In [21]:
df_raw_x.shape

(891, 11)

**NOTE**: if going this manual route without proc_df, we still need to take care of categories and numericalize them

## Splitting training and validation sets

In [36]:
n_train = len(y)//2
# n_valid = len(y)-n_train

In [37]:
y_train = y[:n_train]
y_valid = y[n_train:]
y_train.shape, y_valid.shape

((445,), (446,))

In [38]:
df_train = df[:n_train]
df_valid = df[n_train:]
df_train.shape, df_valid.shape

((445, 12), (446, 12))

In [39]:
df_train

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_na
0,1,3,109,2,22.0,1,0,524,7.2500,0,3,False
1,2,1,191,1,38.0,1,0,597,71.2833,82,1,False
2,3,3,354,1,26.0,0,0,670,7.9250,0,3,False
3,4,1,273,1,35.0,1,0,50,53.1000,56,3,False
4,5,3,16,2,35.0,0,0,473,8.0500,0,3,False
5,6,3,555,2,28.0,0,0,276,8.4583,0,2,True
6,7,1,516,2,54.0,0,0,86,51.8625,130,3,False
7,8,3,625,2,2.0,3,1,396,21.0750,0,3,False
8,9,3,413,1,27.0,0,2,345,11.1333,0,3,False
9,10,2,577,1,14.0,1,0,133,30.0708,0,1,False


In [40]:
df_valid

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_na
445,446,1,217,2,4.00,0,2,290,81.8583,11,3,False
446,447,2,530,1,13.00,0,1,163,19.5000,0,3,False
447,448,1,736,2,34.00,0,0,46,26.5500,0,3,False
448,449,3,56,1,5.00,2,1,195,19.2583,0,1,False
449,450,1,662,2,52.00,0,0,41,30.5000,51,3,False
450,451,2,855,2,36.00,1,2,562,27.7500,0,3,False
451,452,3,317,2,28.00,1,0,498,19.9667,0,3,True
452,453,1,258,2,30.00,0,0,23,27.7500,54,1,False
453,454,1,291,2,49.00,1,0,85,89.1042,87,1,False
454,455,3,645,2,28.00,0,0,529,8.0500,0,3,True


# Training the Random Forest model

First looking at the training set score:

In [46]:
m = RandomForestRegressor(n_jobs=-1)
%time m.fit(df_train,y_train)
m.score(df_train,y_train)

CPU times: user 41.5 ms, sys: 5.12 ms, total: 46.6 ms
Wall time: 110 ms


0.8813748568520169

Now let's try the score on the validation set:

In [47]:
m = RandomForestRegressor(n_jobs=-1)
%time m.fit(df_valid,y_valid)
m.score(df_valid,y_valid)

CPU times: user 41.7 ms, sys: 1.89 ms, total: 43.6 ms
Wall time: 110 ms


0.8911356628982527

# Preparing submission to Kaggle

Read the test file for kg and prepare submission.

In [113]:
test_X_raw = pd.read_csv(f'{PATH}test.csv')

In [170]:
test_X_raw

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
5,897,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.2250,,S
6,898,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,,Q
7,899,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,248738,29.0000,,S
8,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,2657,7.2292,,C
9,901,3,"Davies, Mr. John Samuel",male,21.0,2,0,A/4 48871,24.1500,,S


In [115]:
test_X_raw.dtypes

PassengerId      int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

Need to clean up test_X_raw from missing values and convert to categories in the same way so our model can use it.

In [116]:
train_cats(test_X_raw)

In [111]:
test_X_raw.dtypes

PassengerId       int64
Pclass            int64
Name           category
Sex            category
Age             float64
SibSp             int64
Parch             int64
Ticket         category
Fare            float64
Cabin          category
Embarked       category
dtype: object

In [163]:
test_X_raw.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

Note: the 'Fare' in the test set has a missing value, whereas it didn't in the training set.

In [174]:
test_X,_,test_NA = proc_df(df=test_X_raw)

In [193]:
test_X.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked', 'Age_na', 'Fare_na'],
      dtype='object')

In [192]:
test_X_raw[test_X_raw['Fare'].isnull()]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
152,1044,3,"Storey, Mr. Thomas",male,60.5,0,0,3701,,,S


In [188]:
## *NOTE*: I used the 'dontcare' variables ,_, to capture the only return variables I need (no need to remove a col).
## Another way of doing it is to retrieve the whole in one list xyz then take the first element only.
## In that case xyz[0] would be the dataframe I need

# test_XYZ = proc_df(df=test_X_raw)
# test_X = test_XYZ[0]

### Let's save the test_X_raw that was cleaned up.

In [212]:
test_X_raw.to_feather('tmp/titanic-raw-test')

Now we'll try removing the 'Fare_na' column which only has 1 missing value.

In [196]:
test_X_reduced = test_X.drop(columns='Fare_na')
test_X_reduced

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_na
0,892,3,207,2,34.5,0,0,153,7.8292,0,2,False
1,893,3,404,1,47.0,1,0,222,7.0000,0,3,False
2,894,2,270,2,62.0,0,0,74,9.6875,0,2,False
3,895,3,409,2,27.0,0,0,148,8.6625,0,3,False
4,896,3,179,1,22.0,1,1,139,12.2875,0,3,False
5,897,3,367,2,14.0,0,0,262,9.2250,0,3,False
6,898,3,85,1,30.0,0,0,159,7.6292,0,2,False
7,899,2,58,2,26.0,1,1,85,29.0000,0,3,False
8,900,3,5,1,18.0,0,0,101,7.2292,0,1,False
9,901,3,104,2,21.0,2,0,270,24.1500,0,3,False


Predict values on test set using the trained model.
We need to round the result values so we get 0 or 1.
**NOTE**: converting from float to bool instead of rounding does not work because any non-zero value becomes 1!

In [None]:
test_Y_reduced = m.predict(test_X_reduced)
test_Y_reduced = np.around(test_Y_reduced).astype(int)
test_Y_reduced

In [203]:
res_kg_reduced = pd.DataFrame({'PassengerId':test_X_reduced['PassengerId'], 'Survived':test_Y_reduced})
res_kg_reduced

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
5,897,0
6,898,1
7,899,0
8,900,1
9,901,0


In [205]:
res_kg_reduced.to_csv('results/titanic-results-reduced.csv', index=False)

#### Trying the same but with the model trained on the entire set they provided

In [211]:
# combine both training and validation set and train the model on the whole thing before predicting test set.
df_comb = df_train.append(df_valid)
df_comb

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_na
0,1,3,109,2,22.0,1,0,524,7.2500,0,3,False
1,2,1,191,1,38.0,1,0,597,71.2833,82,1,False
2,3,3,354,1,26.0,0,0,670,7.9250,0,3,False
3,4,1,273,1,35.0,1,0,50,53.1000,56,3,False
4,5,3,16,2,35.0,0,0,473,8.0500,0,3,False
5,6,3,555,2,28.0,0,0,276,8.4583,0,2,True
6,7,1,516,2,54.0,0,0,86,51.8625,130,3,False
7,8,3,625,2,2.0,3,1,396,21.0750,0,3,False
8,9,3,413,1,27.0,0,2,345,11.1333,0,3,False
9,10,2,577,1,14.0,1,0,133,30.0708,0,1,False


In [217]:
y_comb = np.append(y_train, y_valid)
y_comb

array([0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1,
       1, 1, 0, 1, 

In [218]:
len(df_comb),len(y_comb)

(891, 891)

In [219]:
m = RandomForestRegressor(n_jobs=-1)
%time m.fit(df_comb,y_comb)
m.score(df_comb,y_comb)

CPU times: user 78.3 ms, sys: 0 ns, total: 78.3 ms
Wall time: 111 ms


0.9024331320103537

In [220]:
test_comb_reduced = m.predict(test_X_reduced)
test_comb_reduced = np.around(test_comb_reduced).astype(int)
test_comb_reduced

array([0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0,
       1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 0, 

In [221]:
res_kg_comb_reduced = pd.DataFrame({'PassengerId':test_X_reduced['PassengerId'], 'Survived':test_comb_reduced})
res_kg_comb_reduced

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
5,897,0
6,898,0
7,899,1
8,900,1
9,901,0


In [223]:
res_kg_comb_reduced.to_csv('results/titanic-results-reduced-combined.csv', index=False)

#### Now try to keep the 'Fare_na' column in and instead add it to the model itself (no missing values for 'Fare' in training set so let's put an extra column for training with all False in 'Fare_na'). We'll look at validation errors of both models but will probably submit both to Public LB and see.

In [231]:
fare_na_col = df_comb.isnull()['Fare']
fare_na_col

0      False
1      False
2      False
3      False
4      False
5      False
6      False
7      False
8      False
9      False
10     False
11     False
12     False
13     False
14     False
15     False
16     False
17     False
18     False
19     False
20     False
21     False
22     False
23     False
24     False
25     False
26     False
27     False
28     False
29     False
       ...  
861    False
862    False
863    False
864    False
865    False
866    False
867    False
868    False
869    False
870    False
871    False
872    False
873    False
874    False
875    False
876    False
877    False
878    False
879    False
880    False
881    False
882    False
883    False
884    False
885    False
886    False
887    False
888    False
889    False
890    False
Name: Fare, Length: 891, dtype: bool

In [235]:
len(df_comb.columns)

12

In [241]:
df_comb.insert(column='Fare_na',loc=len(df_comb.columns),value=fare_na_col)

ValueError: cannot insert Fare_na, already exists

In [244]:
df_comb.shape

(891, 13)

In [245]:
test_X.shape

(418, 13)

In [246]:
m = RandomForestRegressor(n_jobs=-1)
%time m.fit(df_comb,y_comb)
m.score(df_comb,y_comb)

CPU times: user 78.1 ms, sys: 2.47 ms, total: 80.6 ms
Wall time: 111 ms


0.899016393442623

In [247]:
test_comb_aug = m.predict(test_X)
test_comb_aug = np.around(test_comb_aug).astype(int)
test_comb_aug

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 

In [248]:
res_kg_comb_aug = pd.DataFrame({'PassengerId':test_X['PassengerId'], 'Survived':test_comb_aug})
res_kg_comb_aug

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
5,897,0
6,898,0
7,899,0
8,900,1
9,901,0


In [249]:
res_kg_comb_aug.to_csv('results/titanic-results-augmented-combined.csv', index=False)