# Previous model

In [56]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier


In [57]:
train = pd.read_csv('titanic/train.csv')
test = pd.read_csv('titanic/test.csv')

In [58]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [59]:
dict_sex = {'male':0, 'female':1}
train['Sex_binary'] = train['Sex'].map(dict_sex)
test['Sex_binary'] = test['Sex'].map(dict_sex)

In [60]:
# only numerical for now
variables = ['Sex_binary', 'Age', 'Pclass', 'SibSp', 'Parch', 'Fare']

X = train[variables]
y = train['Survived']

X_test = test[variables]

# valores nulos
# -1 invalida a linha
X = X.fillna(-1)
X_test = X_test.fillna(-1)

In [61]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.5)

In [62]:
model = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)

cv_results = cross_validate(model, X_train, y_train, cv=10, scoring='accuracy')
cv_results['test_score'].mean()

0.8244949494949496

In [63]:
model.fit(X_train, y_train)

In [64]:
X_val_check = train.loc[X_val.index].copy()

X_val_check['p'] = model.predict(X_val)

X_val_check.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_binary,p
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C,0,1
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S,1,1
15,16,1,2,"Hewlett, Mrs. (Mary D Kingcome)",female,55.0,0,0,248706,16.0,,S,1,1
802,803,1,1,"Carter, Master. William Thornton II",male,11.0,1,2,113760,120.0,B96 B98,S,0,1
422,423,0,3,"Zimmerman, Mr. Leo",male,29.0,0,0,315082,7.875,,S,0,0


In [65]:
errors = X_val_check[X_val_check['Survived'] != X_val_check['p']]

errors.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_binary,p
169,170,0,3,"Ling, Mr. Lee",male,28.0,0,0,1601,56.4958,,S,0,1
621,622,1,1,"Kimball, Mr. Edwin Nelson Jr",male,42.0,1,0,11753,52.5542,D19,S,0,0
100,101,0,3,"Petranec, Miss. Matilda",female,28.0,0,0,349245,7.8958,,S,1,1
724,725,1,1,"Chambers, Mr. Norman Campbell",male,27.0,1,0,113806,53.1,E8,S,0,0
783,784,0,3,"Johnston, Mr. Andrew G",male,,1,2,W./C. 6607,23.45,,S,0,1


In [66]:
women = errors[errors['Sex'] == 'female']
men = errors[errors['Sex'] == 'male']

In [67]:
women.sort_values('Survived')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_binary,p
100,101,0,3,"Petranec, Miss. Matilda",female,28.0,0,0,349245,7.8958,,S,1,1
140,141,0,3,"Boulos, Mrs. Joseph (Sultana)",female,,0,2,2678,15.2458,,C,1,1
578,579,0,3,"Caram, Mrs. Joseph (Maria Elias)",female,,1,0,2689,14.4583,,C,1,1
357,358,0,2,"Funk, Miss. Annie Clemmer",female,38.0,0,0,237671,13.0,,S,1,1
111,112,0,3,"Zabour, Miss. Hileni",female,14.5,1,0,2665,14.4542,,C,1,1
680,681,0,3,"Peters, Miss. Katie",female,,0,0,330935,8.1375,,Q,1,1
415,416,0,3,"Meek, Mrs. Thomas (Annie Louise Rowley)",female,,0,0,343095,8.05,,S,1,1
312,313,0,2,"Lahtinen, Mrs. William (Anna Sylfven)",female,26.0,1,1,250651,26.0,,S,1,1
593,594,0,3,"Bourke, Miss. Mary",female,,0,2,364848,7.75,,Q,1,1
396,397,0,3,"Olsson, Miss. Elina",female,31.0,0,0,350407,7.8542,,S,1,1


In [68]:
men.sort_values('Survived')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_binary,p
169,170,0,3,"Ling, Mr. Lee",male,28.0,0,0,1601,56.4958,,S,0,1
769,770,0,3,"Gronnestad, Mr. Daniel Danielsen",male,32.0,0,0,8471,8.3625,,S,0,1
336,337,0,1,"Pears, Mr. Thomas Clinton",male,29.0,1,0,113776,66.6,C2,S,0,1
438,439,0,1,"Fortune, Mr. Mark",male,64.0,1,4,19950,263.0,C23 C25 C27,S,0,1
528,529,0,3,"Salonen, Mr. Johan Werner",male,39.0,0,0,3101296,7.925,,S,0,1
34,35,0,1,"Meyer, Mr. Edgar Joseph",male,28.0,1,0,PC 17604,82.1708,,C,0,1
30,31,0,1,"Uruchurtu, Don. Manuel E",male,40.0,0,0,PC 17601,27.7208,,C,0,1
145,146,0,2,"Nicholls, Mr. Joseph Charles",male,19.0,1,1,C.A. 33112,36.75,,S,0,1
766,767,0,1,"Brewe, Dr. Arthur Jackson",male,,0,0,112379,39.6,,C,0,1
475,476,0,1,"Clifford, Mr. George Quincy",male,,0,0,110465,52.0,A14,S,0,1


## Strategy

- Entire dataset:
  - Title of each passenger
  - Ticket -> separate the ticket into two groups: numeric and alphanumerical
  - Treat null values for Cabin
- Train set
  - OHE for Embarked, Pclass
  - Scaling numerical features

In [88]:
train['Ticket'].str.isalnum().value_counts(normalize=True)

True     0.746352
False    0.253648
Name: Ticket, dtype: float64

# Feature engineering

## Check if it's balanced

In [70]:
train['Survived'].value_counts(normalize=True)

0    0.616162
1    0.383838
Name: Survived, dtype: float64

## Title of each passenger

In [79]:
def treat_name(name):
  title = str.replace(name.split(' ')[1], '.', '')
  # after the value counts to see the most frequent ones
  most_common = ['Mr', 'Miss', 'Mrs', 'Master']

  if title in most_common:
    return title

  return 'Other'

In [80]:
train['Title'] = train['Name'].apply(treat_name)

## Ticket of the passenger

In [92]:
def binary_ticket(ticket):
  return 1 if ticket.isalnum() else 0

In [94]:
train['Ticket_binary'] = train['Ticket'].apply(binary_ticket)

## Treat null values for cabin

In [125]:
train['Cabin'].isnull().sum() / train.shape[0]

0.7710437710437711

In [131]:
train['Cabin_labeled'] = train['Cabin'].fillna(0).apply(lambda x: 0 if x == 0 else 1)

In [133]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_binary,Title,Ticket_binary,Cabin_labeled
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0,Mr,0,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,Mrs,0,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1,Miss,0,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1,Mrs,1,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0,Mr,1,0
