<a href="https://colab.research.google.com/github/denisecammarota/kaggle-notebooks/blob/main/TitanicCompetition/Kaggle_Titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Loading Libraries

In [72]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from plotly.offline import init_notebook_mode, iplot, plot
import plotly.express as px
import plotly as py
init_notebook_mode(connected=True)
import plotly.graph_objs as go

# Loading Data, taking a look at the data and modifying a few things

In [73]:
# Read the labelled training file into a single DataFrame; the train/test
# split used for modelling is carved out of this frame further down.
train_csv_path = 'train.csv'
total_data = pd.read_csv(train_csv_path)

In [74]:
# Dimensions of the freshly loaded frame as (rows, columns).
total_data.shape

(891, 12)

In [75]:
# Preview the first rows to see the column names and the format of the values.
total_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [76]:
# Same shape check as right after loading; nothing has modified total_data in between.
total_data.shape

(891, 12)

In [77]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
total_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [78]:
# Collect the distinct values of every column (one array per column) and
# print them as a Series indexed by column name.
unique_by_column = total_data.apply(pd.Series.unique)
print(unique_by_column)

PassengerId    [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...
Survived                                                  [0, 1]
Pclass                                                 [3, 1, 2]
Name           [Braund, Mr. Owen Harris, Cumings, Mrs. John B...
Sex                                               [male, female]
Age            [22.0, 38.0, 26.0, 35.0, nan, 54.0, 2.0, 27.0,...
SibSp                                      [1, 0, 3, 4, 2, 5, 8]
Parch                                      [0, 1, 2, 5, 3, 4, 6]
Ticket         [A/5 21171, PC 17599, STON/O2. 3101282, 113803...
Fare           [7.25, 71.2833, 7.925, 53.1, 8.05, 8.4583, 51....
Cabin          [nan, C85, C123, E46, G6, C103, D56, A6, C23 C...
Embarked                                          [S, C, Q, nan]
dtype: object


In [79]:
# Count the missing entries in each column: build the boolean null mask for
# the whole frame once, then sum it column-wise.
missing_per_column = total_data.isnull().sum()
print(missing_per_column)

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


## Train and Test Split 
- `PassengerId`: excluded from the model
- `Name`: excluded from the model
- `Cabin`: excluded from the model because too many values are missing (687 of 891)

In [80]:
# Columns that are not fed to the model: free-text names, the mostly empty
# cabin field and the row identifier.
unused_columns = ['Name', 'Cabin', 'PassengerId']
total_data = total_data.drop(columns=unused_columns)

In [81]:
# Show total_data now that Name, Cabin and PassengerId have been dropped.
total_data

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,0,3,male,22.0,1,0,A/5 21171,7.2500,S
1,1,1,female,38.0,1,0,PC 17599,71.2833,C
2,1,3,female,26.0,0,0,STON/O2. 3101282,7.9250,S
3,1,1,female,35.0,1,0,113803,53.1000,S
4,0,3,male,35.0,0,0,373450,8.0500,S
...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,211536,13.0000,S
887,1,1,female,19.0,0,0,112053,30.0000,S
888,0,3,female,,1,2,W./C. 6607,23.4500,S
889,1,1,male,26.0,0,0,111369,30.0000,C


In [82]:
# The first column (Survived) is the target; every remaining column is a feature.
# Both are converted to plain NumPy arrays for scikit-learn.
target_column = total_data.iloc[:, 0]
feature_columns = total_data.iloc[:, 1:]
x_total = feature_columns.values
y_total = target_column.values

In [83]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    x_total,
    y_total,
    test_size=0.3,
    random_state=1,
)

## Replacing missing values

- age: replace missing values with the mean age, computed on the training split only
- embarked: replace missing values with the most frequent port of embarkation in the training split

### age

In [12]:
imputer_age = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer_age.fit(x_train[:,3:4])
x_train[:,3:4] = imputer_age.transform(x_train[:,3:4])

### embarked

In [13]:
imputer_embarked = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imputer_embarked.fit(x_train[:,7:])
x_train[:,7:] = imputer_embarked.transform(x_train[:,7:])

## Apply to test data

In [85]:
x_test[:,3:4] = imputer_age.transform(x_test[:,3:4])
x_test[:,7:] = imputer_embarked.transform(x_test[:,7:])

# First model: Decision Tree Classifier