<a href="https://colab.research.google.com/github/g13e/g13e.github.io/blob/master/nbs/kaggle_titanic_fastai.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Loading Data


In [0]:
#install kaggle API package
import sys
! {sys.executable} -m pip install kaggle --upgrade

#remember to upload the kaggle.json file in the working directory
! mkdir -p ~/.kaggle/
! cp kaggle.json ~/.kaggle/

In [0]:
#data analysis libraries 
import numpy as np
import pandas as pd

#visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [0]:
from fastai.basics import *

#change your dataset name here as needed
dataset='titanic'

path = Path("/content")/dataset
path.mkdir(parents=True, exist_ok=True)

# before you can download the dataset you need to go to the competition page 
# of the dataset and accept the rules (under the "rules" section)
# if you don't do that you will get a 404 when runnig the code below
! kaggle competitions download -c {dataset} -f train.csv -p {path}  
! kaggle competitions download -c {dataset} -f test.csv -p {path}  

In [0]:
#train dataset (include the prediction labels, i.e. "Survived" column 0/1 )
train_raw = pd.read_csv(path/'train.csv')
train=train_raw.copy()

#test dataset 
#(this one doesn't contain prediction labels, so can only be used at the end to generate the result for submission to kaggle)
test_raw = pd.read_csv(path/'test.csv')
test=test_raw.copy()

## Data Exploration

In [0]:
#get a list of the features within the dataset
print(train.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None


In [0]:
#let's check a random sample of the training set
train.sample(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
826,827,0,3,"Lam, Mr. Len",male,,0,0,1601,56.4958,,S
363,364,0,3,"Asim, Mr. Adola",male,35.0,0,0,SOTON/O.Q. 3101310,7.05,,S
213,214,0,2,"Givard, Mr. Hans Kristensen",male,30.0,0,0,250646,13.0,,S
298,299,1,1,"Saalfeld, Mr. Adolphe",male,,0,0,19988,30.5,C106,S
382,383,0,3,"Tikkanen, Mr. Juho",male,32.0,0,0,STON/O 2. 3101293,7.925,,S


In [0]:
test.sample(5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
22,914,1,"Flegenheim, Mrs. Alfred (Antoinette)",female,,0,0,PC 17598,31.6833,,S
344,1236,3,"van Billiard, Master. James William",male,,1,1,A/5. 851,14.5,,S
269,1161,3,"Pokrnic, Mr. Mate",male,17.0,0,0,315095,8.6625,,S
361,1253,2,"Mallet, Mrs. Albert (Antoinette Magnin)",female,24.0,1,1,S.C./PARIS 2079,37.0042,,C
194,1086,2,"Drew, Master. Marshall Brines",male,8.0,0,2,28220,32.5,,S


In [0]:
train.isnull().sum(), test.isnull().sum()

(PassengerId      0
 Survived         0
 Pclass           0
 Name             0
 Sex              0
 Age            177
 SibSp            0
 Parch            0
 Ticket           0
 Fare             0
 Cabin          687
 Embarked         2
 dtype: int64, PassengerId      0
 Pclass           0
 Name             0
 Sex              0
 Age             86
 SibSp            0
 Parch            0
 Ticket           0
 Fare             1
 Cabin          327
 Embarked         0
 dtype: int64)

In [0]:
#cabin column values
train[~train.Cabin.isnull()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7000,G6,S
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.5500,C103,S
...,...,...,...,...,...,...,...,...,...,...,...,...
871,872,1,1,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",female,47.0,1,1,11751,52.5542,D35,S
872,873,0,1,"Carlsson, Mr. Frans Olof",male,33.0,0,0,695,5.0000,B51 B53 B55,S
879,880,1,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",female,56.0,0,1,11767,83.1583,C50,C
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S


#Data preparation

##Feature Engineering

In [0]:
##FEATURES ENGINEERING

#extract a new colum with a person title
for df in dfs:
    #extracting title from the name
    df['Title'] = df.Name.str.extract('([A-Za-z]+)\.', expand=False)
    #simplify titles
    mapping = {'Mlle': 'Miss', 'Major': 'Mr', 'Col': 'Mr', 'Sir': 'Mr', 'Don': 'Mr', 'Mme': 'Miss',
          'Jonkheer': 'Mr', 'Lady': 'Mrs', 'Capt': 'Mr', 'Countess': 'Mrs', 'Ms': 'Miss', 'Dona': 'Mrs'}
    df.replace({'Title': mapping}, inplace=True)
    #varibles to determine if people were alone or with family
    df['FamilySize'] = df ['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = 1 #initialize to yes/1 is alone
    df['IsAlone'].loc[df['FamilySize'] > 1] = 0 # now update to no/0 if family size is greater than 1
    #fill age based on median age per title
    titles = ['Dr', 'Master', 'Miss', 'Mr', 'Mrs', 'Rev']
    for title in titles:
      age_to_impute = df.groupby('Title')['Age'].median()[titles.index(title)]
      df.loc[(df['Age'].isnull()) & (df['Title'] == title), 'Age'] = age_to_impute
    #from the Cabin, extract cabing "letter"
    df['CabinGroup']=df.Cabin.str.extract('([A-Z])')
    df['CabinGroup'].fillna("none")


In [0]:
train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Title', 'FamilySize',
       'IsAlone', 'CabinGroup'],
      dtype='object')

##Cleaning the Data

In [0]:
train.sample(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,FamilySize,IsAlone,CabinGroup
263,264,0,1,"Harrison, Mr. William",male,40.0,0,0,112059,0.0,B94,S,Mr,1,1,B
811,812,0,3,"Lester, Mr. James",male,39.0,0,0,A/4 48871,24.15,,S,Mr,1,1,N
86,87,0,3,"Ford, Mr. William Neal",male,16.0,1,3,W./C. 6608,34.375,,S,Mr,5,0,N
796,797,1,1,"Leader, Dr. Alice (Farnham)",female,49.0,0,0,17465,25.9292,D17,S,Dr,1,1,D
852,853,0,3,"Boulos, Miss. Nourelain",female,9.0,1,1,2678,15.2458,,C,Miss,3,0,N


In [0]:
##CLEANING the data (replace nulls with some values)

dfs=[train,test]

for df in dfs:
  #fill fare NAs with the median of the Fare column values
  df.Fare.fillna(df.Fare.median(),inplace=True)
  #replace missing Embarked with the most common port of embarkment
  df.Embarked.fillna(df.Embarked.mode()[0],inplace=True)
  #replace missing cabin with NA
  df.Cabin.fillna("NA",inplace=True)

## Categorization & Numericalization

In [0]:
train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [0]:
##CATEGORICAL VARIABLES

#let's set all columns (Series) associated to categorical values to pandas categorical type
cat_var=["Sex","Pclass",'Embarked','IsAlone','Title','CabinGroup']

for df in [train,test]:
  for col in cat_var:
    df[col]=df[col].astype('category').cat.as_ordered()

#then let's take the codes for each and replace the column values with the codes
for df in [train,test]:
  for col in cat_var:
    df[col]=df[col].cat.codes

In [0]:
print(train.Embarked)

0      2
1      0
2      2
3      2
4      2
      ..
886    2
887    2
888    2
889    0
890    1
Name: Embarked, Length: 891, dtype: int8


## Dropping unnecessary columns

In [0]:
train.head(1)
todrop=["Name","Ticket","Cabin"]
train=train.drop(columns=todrop)
test=test.drop(columns=todrop)

In [0]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int8
Sex            891 non-null int8
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Fare           891 non-null float64
Embarked       891 non-null int8
Title          891 non-null int8
FamilySize     891 non-null int64
IsAlone        891 non-null int8
CabinGroup     891 non-null int8
dtypes: float64(2), int64(5), int8(6)
memory usage: 54.1 KB


## Final check

In [0]:
#Final check of the data

print('Train columns with null values: \n', train.isnull().sum())
print("-"*10)
print (train.info())
print("-"*10)

print('Test/Validation columns with null values: \n', test.isnull().sum())
print("-"*10)
print (test.info())
print("-"*10)

train.describe(include = 'all')

Train columns with null values: 
 PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
Title          0
FamilySize     0
IsAlone        0
CabinGroup     0
dtype: int64
----------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int8
Sex            891 non-null int8
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Fare           891 non-null float64
Embarked       891 non-null int8
Title          891 non-null int8
FamilySize     891 non-null int64
IsAlone        891 non-null int8
CabinGroup     891 non-null int8
dtypes: float64(2), int64(5), int8(6)
memory usage: 54.1 KB
None
----------
Test/Validation columns with null values: 
 PassengerId    0
Pclass         0
Sex            0
Age  

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,FamilySize,IsAlone,CabinGroup
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,1.308642,0.647587,29.410404,0.523008,0.381594,32.204208,1.536476,2.833895,1.904602,0.602694,5.946128
std,257.353842,0.486592,0.836071,0.47799,13.25289,1.102743,0.806057,49.693429,0.791503,0.775623,1.613459,0.489615,2.062347
min,1.0,0.0,0.0,0.0,0.42,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
25%,223.5,0.0,1.0,0.0,21.5,0.0,0.0,7.9104,1.0,2.0,1.0,0.0,7.0
50%,446.0,0.0,2.0,1.0,30.0,0.0,0.0,14.4542,2.0,3.0,1.0,1.0,7.0
75%,668.5,1.0,2.0,1.0,35.0,1.0,0.0,31.0,2.0,3.0,2.0,1.0,7.0
max,891.0,1.0,2.0,1.0,80.0,8.0,6.0,512.3292,2.0,5.0,11.0,1.0,8.0


#Learners

## Random Forests

### Training

In [0]:
train.head(1)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,FamilySize,IsAlone,CabinGroup
0,1,0,2,1,22.0,1,0,7.25,2,3,2,0,7


In [0]:
plt.hist(train.CabinGroup)

In [0]:
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [0]:
rf = RandomForestClassifier(n_estimators=400,n_jobs=-1)

x_drop=['PassengerId','Title']
x=train.drop(x_drop+['Survived'], axis=1)
y=train.Survived

x_train, x_val,y_train,y_val = train_test_split(x,y,test_size = 0.30)

rf.fit(x_train, y_train)

f'train acc= {rf.score(x_train,y_train)} , test acc={rf.score(x_val,y_val)}'


'train acc= 0.9839486356340289 , test acc=0.8171641791044776'

### Submit result


In [0]:
ids = test['PassengerId']
predictions = rf.predict(test.drop(columns=x_drop))

In [0]:
#set the output as a dataframe and convert to csv file named submission.csv
res_df = pd.DataFrame({ 'PassengerId' : ids, 'Survived': predictions })
res_file='titanic_submission.csv'
res_df.to_csv(res_file, index=False)

In [0]:
! kaggle competitions submit -c {dataset} -f {res_file} -m "Random Forest"

100% 2.77k/2.77k [00:04<00:00, 641B/s]
Successfully submitted to Titanic: Machine Learning from Disaster

##Neural Network 



### Initialization

In [0]:
from fastai.tabular import *

In [0]:
train.columns

In [0]:
#the prediction output, i.e. the model "y"
dep_var = 'Survived'

#the model input (x), split into categorical and continous variables
cat_names = ['Pclass', 'Sex', 'Embarked','IsAlone','Title']
cont_names = ['Age','Fare','FamilySize','SibSp','Parch']
##ignored ['Name','PassengerId','Ticket','Cabin']

#input processing transformations
procs = [Categorify, Normalize]

In [0]:
test_items = TabularList.from_df(test, path=path, cat_names=cat_names, cont_names=cont_names)

train_db = (TabularList.from_df(train, path=path, cat_names=cat_names, cont_names=cont_names,procs=procs)
                           .split_by_rand_pct(0.1)
                           .label_from_df(cols=dep_var)
                           .add_test(test_items,label=0)
                           .databunch())

In [0]:
train_db.show_batch(rows=3)

###Training

In [0]:
learn = tabular_learner(train_db, layers=[50,30],ps=[0.3,0.3],metrics=accuracy)

In [0]:
#learn.summary()

In [0]:
learn.lr_find()
learn.recorder.plot()

In [0]:
learn.fit_one_cycle(20,1e-1)

In [0]:
learn.recorder.plot_losses()

In [0]:
learn.recorder.plot_metrics()

In [0]:
learn.recorder.plot_lr()

In [0]:
#Inreference
predictions, *_ = learn.get_preds(DatasetType.Test)
labels = np.argmax(predictions, 1)

###Submit Result


In [0]:
res_df = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': labels})
res_file='titanic_submission.csv'
res_df.to_csv(res_file, index=False)

In [0]:
#! kaggle competitions submit -c {dataset} -f {res_file} -m "Another submission"

#Credits

Some of the steps in this notebook are inspired by these other notebooks

* https://www.kaggle.com/ldfreeman3/a-data-science-framework-to-achieve-99-accuracy
* https://www.kaggle.com/nadintamer/titanic-survival-predictions-beginner
* https://www.kaggle.com/konstantinmasich/titanic-0-82-0-83