# Titanic - Machine Learning from Disaster

https://www.kaggle.com/c/titanic

In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

In [2]:
titanic_train = '/home/ubuntu/pCloudDrive/18.Kaggle microcourses/00.Kaggle micro-courses/07.Intro to Machine Learning/02.Kaggle Playground Competition: Titanic/titanic/train.csv'
titanic_data = pd.read_csv(titanic_train)
titanic_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
titanic_data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [4]:
titanic_data.shape

(891, 12)

I see I have to encode the values for Sex so we can use them for the model

I see that 177 out of 891 records have missing data for age <br>
I wonder if we can fill in missing data for age, like is this a safe practice <br>
Three things I can do here
1) remove missing rows from the age column by axis 0 <br>
2) remove missing values from the age column by axis 1 basically remove the entire column entirely <br>
3) try and fill missing values <br>
4) imputation on mean value of age of passengers seems the best so far <br>

I see almost all the values for cabin are missing but I think I won't need this feature anyway, so dont have to worry about it this much rn

# Baseline model sort of 

In [5]:
#skipping PassengerId, name, sex, age, ticket, cabin and embarked for now
y = titanic_data['Survived']
features = ['Pclass', 'SibSp', 'Parch', 'Fare']
X = titanic_data[features]

In [6]:
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 1)

In [7]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(train_X, train_y)

val_predictions = logreg.predict(val_X)
val_predictions.size

223

In [8]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(val_y, val_predictions)
print(cm)
accuracy_score(val_y, val_predictions)

[[105  23]
 [ 51  44]]


0.6681614349775785

In [9]:
105+23+51+44

223

In [11]:
# 0.6681614349775785 not great, not terrible 

# Label encoding the sex column

In [10]:
titanic_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [12]:
#skipping PassengerId, name, age, ticket, cabin and embarked for now
y2 = titanic_data['Survived']
features = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare']
X2 = titanic_data[features]

In [13]:
train_X2, val_X2, train_y2, val_y2 = train_test_split(X2, y2, random_state = 1)

In [15]:
train_X2

Unnamed: 0,Pclass,Sex,SibSp,Parch,Fare
35,1,male,1,0,52.0000
46,3,male,1,0,15.5000
453,1,male,1,0,89.1042
291,1,female,1,0,91.0792
748,1,male,1,0,53.1000
...,...,...,...,...,...
715,3,male,0,0,7.6500
767,3,female,0,0,7.7500
72,2,male,0,0,73.5000
235,3,female,0,0,7.5500


In [29]:
train_X2.iloc[:,1]

35     1
46     1
453    1
291    0
748    1
      ..
715    1
767    0
72     1
235    0
37     1
Name: Sex, Length: 668, dtype: int64

In [16]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

## differentce between fit_transform and transform????

train_X2.iloc[:,1] = label_encoder.fit_transform(train_X2['Sex'])
val_X2.iloc[:,1] = label_encoder.transform(val_X2['Sex'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [17]:
train_X2

Unnamed: 0,Pclass,Sex,SibSp,Parch,Fare
35,1,1,1,0,52.0000
46,3,1,1,0,15.5000
453,1,1,1,0,89.1042
291,1,0,1,0,91.0792
748,1,1,1,0,53.1000
...,...,...,...,...,...
715,3,1,0,0,7.6500
767,3,0,0,0,7.7500
72,2,1,0,0,73.5000
235,3,0,0,0,7.5500


In [18]:
logreg2 = LogisticRegression()
logreg2.fit(train_X2, train_y2)

val_predictions2 = logreg2.predict(val_X2)
val_predictions2.size

223

In [19]:
cm2 = confusion_matrix(val_y2, val_predictions2)
print(cm2)
accuracy_score(val_y2, val_predictions2)

[[114  14]
 [ 34  61]]


0.7847533632286996

In [20]:
114 + 14 + 34 + 61

223

In [21]:
# Surprise surprise adding the age increases the score of the model from 0.6681614349775785 to 7847533632286996

# Filling missing values for age via numpy fillna

In [22]:
titanic_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [25]:
#skipping PassengerId, name, ticket, cabin and embarked for now
y3 = titanic_data['Survived']
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']
X3 = titanic_data[features]

In [30]:
train_X3, val_X3, train_y3, val_y3 = train_test_split(X3, y3, random_state = 1)

In [36]:
val_X3

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
862,1,0,48.0,0,0,25.9292
223,3,1,,0,0,7.8958
84,2,0,17.0,0,0,10.5000
680,3,0,,0,0,8.1375
535,2,0,7.0,0,2,26.2500
...,...,...,...,...,...,...
506,2,0,33.0,0,2,26.0000
467,1,1,56.0,0,0,26.5500
740,1,1,,0,0,30.0000
354,3,1,,0,0,7.2250


In [32]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

## differentce between fit_transform and transform????

train_X3.iloc[:,1] = label_encoder.fit_transform(train_X3['Sex'])
val_X3.iloc[:,1] = label_encoder.transform(val_X3['Sex'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [49]:
# replace all NA's the value that comes directly after it in the same column, 
# then replace all the remaining na's with 0

train_X3.iloc[:,2] = train_X3.iloc[:,2].fillna(method = 'bfill', axis = 0).fillna(0)
val_X3.iloc[:,2] = val_X3.iloc[:,2].fillna(method = 'bfill', axis = 0).fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [54]:
logreg3 = LogisticRegression()
logreg3.fit(train_X3, train_y3)

val_predictions3 = logreg3.predict(val_X3)
val_predictions3.size

223

In [55]:
cm3 = confusion_matrix(val_y3, val_predictions3)
print(cm3)
accuracy_score(val_y3, val_predictions3)

[[114  14]
 [ 28  67]]


0.8116591928251121

# Filling missing values for age via imputation

In [56]:
titanic_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [57]:
#skipping PassengerId, name, ticket, cabin and embarked for now
y4 = titanic_data['Survived']
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']
X4 = titanic_data[features]

In [60]:
train_X4, val_X4, train_y4, val_y4 = train_test_split(X4, y4, random_state = 1)

In [61]:
train_X4

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
35,1,male,42.0,1,0,52.0000
46,3,male,,1,0,15.5000
453,1,male,49.0,1,0,89.1042
291,1,female,19.0,1,0,91.0792
748,1,male,19.0,1,0,53.1000
...,...,...,...,...,...,...
715,3,male,19.0,0,0,7.6500
767,3,female,30.5,0,0,7.7500
72,2,male,21.0,0,0,73.5000
235,3,female,,0,0,7.5500


In [62]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

## differentce between fit_transform and transform????

train_X4.iloc[:,1] = label_encoder.fit_transform(train_X4['Sex'])
val_X4.iloc[:,1] = label_encoder.transform(val_X4['Sex'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [84]:
train_X4.loc[35:38,:]

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
35,1,1,42.00,1,0,52.0000
46,3,1,,1,0,15.5000
453,1,1,49.00,1,0,89.1042
291,1,0,19.00,1,0,91.0792
748,1,1,19.00,1,0,53.1000
...,...,...,...,...,...,...
447,1,1,34.00,0,0,26.5500
122,2,1,32.50,1,0,30.0708
755,2,1,0.67,1,1,14.5000
501,3,0,21.00,0,0,7.7500


In [67]:
# train_X3.iloc[:,2] = train_X3.iloc[:,2].fillna(method = 'bfill', axis = 0).fillna(0)
# val_X3.iloc[:,2] = val_X3.iloc[:,2].fillna(method = 'bfill', axis = 0).fillna(0)

from sklearn.impute import SimpleImputer

my_imputer = SimpleImputer()
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(train_X4))
imputed_X_valid = pd.DataFrame(my_imputer.transform(val_X4))

In [70]:
imputed_X_train.columns = train_X4.columns
imputed_X_valid.columns = val_X4.columns

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
35,2.0,1.0,31.0,1.0,1.0,26.25
36,3.0,0.0,9.0,3.0,2.0,27.9
37,2.0,0.0,13.0,0.0,1.0,19.5
38,3.0,1.0,30.073682,1.0,0.0,15.5
39,3.0,0.0,30.073682,0.0,0.0,7.75
40,2.0,0.0,24.0,0.0,2.0,14.5
41,2.0,1.0,27.0,0.0,0.0,26.0
42,2.0,1.0,33.0,0.0,0.0,12.275
43,1.0,1.0,25.0,1.0,0.0,91.0792
44,1.0,1.0,46.0,1.0,0.0,61.175


In [87]:
logreg4 = LogisticRegression()
logreg4.fit(imputed_X_train, train_y4)

val_predictions4 = logreg4.predict(imputed_X_valid)
val_predictions4.size

223

In [88]:
cm4 = confusion_matrix(val_y4, val_predictions4)
print(cm4)
accuracy_score(val_y4, val_predictions4)

[[111  17]
 [ 26  69]]


0.8071748878923767

# Generating submission for competition

In [89]:
titanic_train = '/home/ubuntu/pCloudDrive/18.Kaggle microcourses/00.Kaggle micro-courses/07.Intro to Machine Learning/02.Kaggle Playground Competition: Titanic/titanic/test.csv'
titanic_data_test = pd.read_csv(titanic_train)
titanic_data_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [92]:
titanic_data_test.shape

(418, 11)

In [113]:
#skipping PassengerId, name, ticket, cabin and embarked for now

features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']
X5 = titanic_data_test[features]

In [132]:
X5

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
0,3,1,34.5,0,0,7.8292
1,3,0,47.0,1,0,7.0000
2,2,1,62.0,0,0,9.6875
3,3,1,27.0,0,0,8.6625
4,3,0,22.0,1,1,12.2875
...,...,...,...,...,...,...
413,3,1,39.0,0,0,8.0500
414,1,0,39.0,0,0,108.9000
415,3,1,38.5,0,0,7.2500
416,3,1,0.0,0,0,8.0500


In [115]:
X5.iloc[:,1]

0        male
1      female
2        male
3        male
4      female
        ...  
413      male
414    female
415      male
416      male
417      male
Name: Sex, Length: 418, dtype: object

In [116]:
X51 = X52 = X5

In [117]:
X5

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
0,3,male,34.5,0,0,7.8292
1,3,female,47.0,1,0,7.0000
2,2,male,62.0,0,0,9.6875
3,3,male,27.0,0,0,8.6625
4,3,female,22.0,1,1,12.2875
...,...,...,...,...,...,...
413,3,male,,0,0,8.0500
414,1,female,39.0,0,0,108.9000
415,3,male,38.5,0,0,7.2500
416,3,male,,0,0,8.0500


In [118]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

## differentce between fit_transform and transform????

#train_X3.iloc[:,1] = label_encoder.fit_transform(train_X3['Sex'])
#val_X3.iloc[:,1] = label_encoder.transform(val_X3['Sex'])

X51.iloc[:,1] = label_encoder.fit_transform(X5.iloc[:,1])
X52.iloc[:,1] = label_encoder.fit_transform(X5.iloc[:,1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [120]:
X51

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
0,3,1,34.5,0,0,7.8292
1,3,0,47.0,1,0,7.0000
2,2,1,62.0,0,0,9.6875
3,3,1,27.0,0,0,8.6625
4,3,0,22.0,1,1,12.2875
...,...,...,...,...,...,...
413,3,1,,0,0,8.0500
414,1,0,39.0,0,0,108.9000
415,3,1,38.5,0,0,7.2500
416,3,1,,0,0,8.0500


In [121]:
X51.iloc[:,2] 

0      34.5
1      47.0
2      62.0
3      27.0
4      22.0
       ... 
413     NaN
414    39.0
415    38.5
416     NaN
417     NaN
Name: Age, Length: 418, dtype: float64

In [122]:
# replace all NA's the value that comes directly after it in the same column, 
# then replace all the remaining na's with 0

X51.iloc[:,2] = X51.iloc[:,2].fillna(method = 'bfill', axis = 0).fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [133]:
X51.isnull().sum()

Pclass    0
Sex       0
Age       0
SibSp     0
Parch     0
Fare      1
dtype: int64

In [140]:
X51.iloc[:,5] = X51.iloc[:,5].fillna(method = 'bfill', axis = 0).fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [141]:
X51.iloc[:,5].isnull().sum()

0

In [142]:
val_predictions5 = logreg3.predict(X51)
val_predictions5.size

418

In [146]:
titanic_data_test.PassengerId

0       892
1       893
2       894
3       895
4       896
       ... 
413    1305
414    1306
415    1307
416    1308
417    1309
Name: PassengerId, Length: 418, dtype: int64

In [151]:
output = pd.DataFrame({'PassengerId': titanic_data_test.PassengerId, 'Survived': val_predictions5})
output.to_csv('titanic/titanic_submission.csv', index=False)