# Titanic prediction using Logistic Regression

## Import data & visualization 

In [1]:
from fastai.imports import *
from fastai.structured import *

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from IPython.display import display

from sklearn import metrics
import datetime

  from numpy.core.umath_tests import inner1d


In [2]:
PATH = "data/titanic/"

In [3]:
os.listdir(PATH)

['gender_submission.csv', 'models', 'test.csv', 'tmp', 'train.csv']

In [42]:
train = pd.read_csv(f'{PATH}train.csv')
test = pd.read_csv(f'{PATH}test.csv')

In [43]:
PassengerId = test['PassengerId']

In [None]:
train.head()

In [None]:
train.shape, test.shape

## Process data

In [None]:
# Change any columns of strings in a panda's dataframe to a column of
#     catagorical values. This applies the changes inplace.

In [None]:
# train_cats(train)
# train_cats(test)

In [4]:
data = [train, test]

In [5]:
# add FamilySize
for df in data:
    df['FamilySize']=df['SibSp']+df['Parch']+1

In [6]:
for df in data:
    age_avg = df['Age'].mean()
    age_std = df['Age'].std()
    age_null_count = df['Age'].isnull().sum()
    age_null_random_list = np.random.randint(age_avg - age_std, age_avg + age_std, size=age_null_count)
    df['Age'][np.isnan(df['Age'])] = age_null_random_list
    df['Age'] = df['Age'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [7]:
# add Fare per person
for df in data:
    df['Fare_Per_Person']=df['Fare']/(df['FamilySize'])
    df['Age*Class']=df['Age']*df['Pclass']

In [8]:
for df in data:
    df['IsAlone'] = 0
    df.loc[df['FamilySize'] == 1, 'IsAlone'] = 1

In [9]:
def get_title(name):
    title_search = re.search(' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""

In [10]:
for df in data:
    df['Title'] = df['Name'].apply(get_title)

In [11]:
for df in data:
    df['Title'] = df['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    df['Title'] = df['Title'].replace('Mlle', 'Miss')
    df['Title'] = df['Title'].replace('Ms', 'Miss')
    df['Title'] = df['Title'].replace('Mme', 'Mrs')

In [12]:
for df in data:
    df['Fare'] = df['Fare'].fillna(train['Fare'].median())
    
    # Mapping titles
    title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
    df['Title'] = df['Title'].map(title_mapping)
    df['Title'] = df['Title'].fillna(0)
    
    # Mapping Fare
    df.loc[ df['Fare'] <= 7.91, 'Fare'] 						        = 0
    df.loc[(df['Fare'] > 7.91) & (df['Fare'] <= 14.454), 'Fare'] = 1
    df.loc[(df['Fare'] > 14.454) & (df['Fare'] <= 31), 'Fare']   = 2
    df.loc[ df['Fare'] > 31, 'Fare'] 							        = 3
    df['Fare'] = df['Fare'].astype(int)
    
    # Mapping Age
    df.loc[ df['Age'] <= 16, 'Age'] 					       = 0
    df.loc[(df['Age'] > 16) & (df['Age'] <= 32), 'Age'] = 1
    df.loc[(df['Age'] > 32) & (df['Age'] <= 48), 'Age'] = 2
    df.loc[(df['Age'] > 48) & (df['Age'] <= 64), 'Age'] = 3
    df.loc[ df['Age'] > 64, 'Age'] = 4 ;

In [13]:
for df in data:
    df['Has_Cabin'] = df["Cabin"].apply(lambda x: 0 if type(x) == float else 1)

In [14]:
to_drop = ['PassengerId', 'Name', 'Ticket', 'Cabin']

In [15]:
train = train.drop(to_drop, axis=1)
test = test.drop(to_drop, axis=1)

In [16]:
train.head(3)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,Fare_Per_Person,Age*Class,IsAlone,Title,Has_Cabin
0,0,3,male,1,1,0,0,S,2,3.625,66,0,1,0
1,1,1,female,2,1,0,3,C,2,35.64165,38,0,3,1
2,1,3,female,1,0,0,1,S,1,7.925,78,1,2,0


In [17]:
# cont_vars = ['SibSp', 'Parch', 'FamilySize', 'Fare_Per_Person', 'Age*Class']
cont_vars = ['Fare_Per_Person', 'Age*Class']
cat_vars = [c for c in test.columns if c not in cont_vars]

In [18]:
train = train[cat_vars+cont_vars+['Survived']].copy()
test = test[cat_vars+cont_vars].copy()
test['Survived'] = 0
train.head(3)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,IsAlone,Title,Has_Cabin,Fare_Per_Person,Age*Class,Survived
0,3,male,1,1,0,0,S,2,0,1,0,3.625,66,0
1,1,female,2,1,0,3,C,2,0,3,1,35.64165,38,1
2,3,female,1,0,0,1,S,1,1,2,0,7.925,78,1


In [19]:
for v in cat_vars: train[v] = train[v].astype('category').cat.as_ordered()

In [20]:
apply_cats(test,train)

In [21]:
for v in cont_vars:
    train[v] = train[v].fillna(0).astype('float32')
    test[v] = test[v].fillna(0).astype('float32')

In [22]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
Pclass             891 non-null category
Sex                891 non-null category
Age                891 non-null category
SibSp              891 non-null category
Parch              891 non-null category
Fare               891 non-null category
Embarked           889 non-null category
FamilySize         891 non-null category
IsAlone            891 non-null category
Title              891 non-null category
Has_Cabin          891 non-null category
Fare_Per_Person    891 non-null float32
Age*Class          891 non-null float32
Survived           891 non-null int64
dtypes: category(11), float32(2), int64(1)
memory usage: 25.8 KB


In [23]:
x, y, nas, mapper = proc_df(train, 'Survived', do_scale=True)
x_test, y_test, nas, mapper = proc_df(test, 'Survived', do_scale=True, mapper=mapper, na_dict=nas)

In [24]:
x.head(3)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,IsAlone,Title,Has_Cabin,Fare_Per_Person,Age*Class
0,3,2,2,2,1,1,3,2,1,1,1,-0.454798,0.064716
1,1,1,3,2,1,4,1,2,1,3,2,0.438994,-0.763252
2,3,1,2,1,1,2,3,1,2,2,1,-0.334757,0.41956


## Train CNN 

In [25]:
n = len(x); n

891

In [26]:
max_y = np.max(y)
y_range = (0, max_y * 1.2)

In [27]:
train_ratio = 0.9
train_size = int(n * train_ratio)
val_idx = list(range(train_size, len(x)))

In [28]:
cat_sz = [(c, len(train[c].cat.categories)) for c in cat_vars]

In [29]:
cat_sz

[('Pclass', 3),
 ('Sex', 2),
 ('Age', 5),
 ('SibSp', 7),
 ('Parch', 7),
 ('Fare', 4),
 ('Embarked', 3),
 ('FamilySize', 9),
 ('IsAlone', 2),
 ('Title', 5),
 ('Has_Cabin', 2)]

In [30]:
emb_szs = [(c, min(50, c)) for _,c in cat_sz]

In [31]:
emb_szs

[(3, 3),
 (2, 2),
 (5, 5),
 (7, 7),
 (7, 7),
 (4, 4),
 (3, 3),
 (9, 9),
 (2, 2),
 (5, 5),
 (2, 2)]

In [32]:
from fastai.column_data import *

In [33]:
md = ColumnarModelData.from_data_frame(PATH, val_idx, x, y.astype(np.float32), cat_flds = cat_vars, bs=128,
                                       test_df=x_test)

In [34]:
m = md.get_learner(emb_szs, len(x.columns)-len(cat_vars),
                   0.05, 1, [1000,500], [0.001,0.01], y_range=y_range)

In [36]:
m.lr_find()

HBox(children=(IntProgress(value=0, description='Epoch', max=1, style=ProgressStyle(description_width='initial…

epoch      trn_loss   val_loss                                                                                         
    0      0.28359    0.367111  



In [35]:
lr = 1e-3

In [39]:
from sklearn.metrics import accuracy_score

In [36]:
m.fit(lr, 5)

HBox(children=(IntProgress(value=0, description='Epoch', max=5, style=ProgressStyle(description_width='initial…

epoch      trn_loss   val_loss                                                                                         
    0      0.196051   0.148534  
    1      0.171851   0.14527                                                                                          
    2      0.15834    0.135107                                                                                         
    3      0.149209   0.129586                                                                                         
    4      0.141435   0.11598                                                                                          



[array([0.11598])]

In [37]:
m.fit(lr, 3)

HBox(children=(IntProgress(value=0, description='Epoch', max=3, style=ProgressStyle(description_width='initial…

epoch      trn_loss   val_loss                                                                                         
    0      0.142146   0.131669  
    1      0.131464   0.123636                                                                                         
    2      0.129334   0.117788                                                                                         



[array([0.11779])]

In [38]:
m.fit(lr, 3)

HBox(children=(IntProgress(value=0, description='Epoch', max=3, style=ProgressStyle(description_width='initial…

epoch      trn_loss   val_loss                                                                                         
    0      0.11975    0.11783   
    1      0.118043   0.112062                                                                                         
    2      0.11712    0.107786                                                                                         



[array([0.10779])]

In [67]:
# prediction
predicted = m.predict(True)

In [56]:
predicted[400][0]

1.1093631

In [79]:
res = []
for p in predicted:
    if p[0] < 0.5:
        res.append(0)
    else:
        res.append(1)
# res

In [80]:
res = np.array(res)

In [75]:
res.shape

(418,)

In [81]:
my_submission = pd.DataFrame({'PassengerId': test.PassengerId, 'Survived': res})

In [82]:
my_submission[my_submission.Survived==0].count()

PassengerId    269
Survived       269
dtype: int64

In [83]:
my_submission.to_csv('titanic_submission_dl.csv', index=False)

Score 0.77033