# Titanic - Machine Learning from Disaster (parte 2)
## Vamos utilizar os dados disponíveis no Kaggle: https://www.kaggle.com/competitions/titanic
## - É um dataset de competição
## - O resultado é avaliado através da acurácia

## Importações necessárias

In [41]:
# Importando o pandas
import pandas as pd

In [42]:
# Load the training set and preview its first three rows
treino = pd.read_csv(filepath_or_buffer='train.csv')
treino.head(n=3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [43]:
# Load the test set and preview its first three rows
teste = pd.read_csv(filepath_or_buffer='test.csv')
teste.head(n=3)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q


In [44]:
# Drop the high-cardinality text columns from both sets
treino = treino.drop(columns=['Name', 'Ticket', 'Cabin'])
teste = teste.drop(columns=['Name', 'Ticket', 'Cabin'])

# Fill missing ages with the mean age of the respective set
# NOTE(review): the test set is imputed with its own mean, not the training
# mean -- confirm this is intended
treino['Age'] = treino['Age'].fillna(treino['Age'].mean())
teste['Age'] = teste['Age'].fillna(teste['Age'].mean())

# Fill missing Embarked values in the training set with the most frequent value
treino['Embarked'] = treino['Embarked'].fillna(treino['Embarked'].mode()[0])

# Fill missing Fare values in the test set with the mean fare
teste['Fare'] = teste['Fare'].fillna(teste['Fare'].mean())

## Análise das colunas de texto

In [45]:
# List the columns whose dtype is object (the text columns)
treino.loc[:, treino.dtypes == 'object'].columns

Index(['Sex', 'Embarked'], dtype='object')

In [46]:
# Frequency of each category in the Sex and Embarked columns
(treino['Sex'].value_counts(), treino['Embarked'].value_counts())

(Sex
 male      577
 female    314
 Name: count, dtype: int64,
 Embarked
 S    646
 C    168
 Q     77
 Name: count, dtype: int64)

### Tratamento das colunas Sex e Embarked

#### Coluna Sex

In [47]:
# Encode Sex in the training set as a binary flag: 1 for 'male', 0 otherwise
treino['MaleCheck'] = treino['Sex'].eq('male').astype('int64')

In [48]:
# Sanity check: count each (Sex, MaleCheck) combination
treino.value_counts(subset=['Sex', 'MaleCheck'])

Sex     MaleCheck
male    1            577
female  0            314
Name: count, dtype: int64

In [49]:
# Encode Sex in the test set with the same rule used for the training set
teste['MaleCheck'] = teste['Sex'].eq('male').astype('int64')

# Sanity check: count each (Sex, MaleCheck) combination
teste.value_counts(subset=['Sex', 'MaleCheck'])

Sex     MaleCheck
male    1            266
female  0            152
Name: count, dtype: int64

#### Coluna Embarked

In [50]:
# One-hot encode the Embarked column of the training set
from sklearn.preprocessing import OneHotEncoder

# handle_unknown='ignore': categories not seen during fit are encoded as
# all zeros at transform time instead of raising
ohe = OneHotEncoder(handle_unknown='ignore',dtype='int32')

# Learn the categories from the training data
ohe = ohe.fit(treino[['Embarked']])

# Encode once and wrap the dense result in a DataFrame.
# Reusing treino's index keeps the rows aligned when this frame is
# concatenated column-wise with treino (pd.concat aligns on the index).
ohe_df = pd.DataFrame(
    ohe.transform(treino[['Embarked']]).toarray(),
    columns=ohe.get_feature_names_out(),
    index=treino.index,
)
ohe_df.head(3)

Unnamed: 0,Embarked_C,Embarked_Q,Embarked_S
0,0,0,1
1,1,0,0
2,0,0,1


In [51]:
# Append the one-hot indicator columns to the training set
treino = pd.concat(objs=[treino, ohe_df], axis='columns')

# Sanity check: count each combination of Embarked and its indicator columns
treino.value_counts(subset=['Embarked', 'Embarked_C', 'Embarked_Q', 'Embarked_S'])

Embarked  Embarked_C  Embarked_Q  Embarked_S
S         0           0           1             646
C         1           0           0             168
Q         0           1           0              77
Name: count, dtype: int64

In [52]:
# One-hot encode Embarked on the test set with the encoder fitted on the
# training set. Reusing teste's index keeps the rows aligned in the
# column-wise concat below (pd.concat aligns on the index).
ohe_df = pd.DataFrame(
    ohe.transform(teste[['Embarked']]).toarray(),
    columns=ohe.get_feature_names_out(),
    index=teste.index,
)

# Append the indicator columns to the test set
teste = pd.concat([teste,ohe_df],axis=1)

# Sanity check: count each combination of Embarked and its indicator columns
teste[['Embarked','Embarked_C','Embarked_Q','Embarked_S']].value_counts()

Embarked  Embarked_C  Embarked_Q  Embarked_S
S         0           0           1             270
C         1           0           0             102
Q         0           1           0              46
Name: count, dtype: int64

## Utilização da base de dados tratada

In [53]:
# Preview the first three rows of the treated training set
treino.iloc[:3]

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,MaleCheck,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,male,22.0,1,0,7.25,S,1,0,0,1
1,2,1,1,female,38.0,1,0,71.2833,C,0,1,0,0
2,3,1,3,female,26.0,0,0,7.925,S,0,0,0,1


In [54]:
# Preview the first three rows of the treated test set
teste.iloc[:3]

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,MaleCheck,Embarked_C,Embarked_Q,Embarked_S
0,892,3,male,34.5,0,0,7.8292,Q,1,0,1,0
1,893,3,female,47.0,1,0,7.0,S,0,0,0,1
2,894,2,male,62.0,0,0,9.6875,Q,1,0,1,0


In [55]:
# Drop the original text columns from the training set now that they are encoded
treino = treino.drop(columns=['Sex', 'Embarked'])
treino

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,MaleCheck,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,22.000000,1,0,7.2500,1,0,0,1
1,2,1,1,38.000000,1,0,71.2833,0,1,0,0
2,3,1,3,26.000000,0,0,7.9250,0,0,0,1
3,4,1,1,35.000000,1,0,53.1000,0,0,0,1
4,5,0,3,35.000000,0,0,8.0500,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,27.000000,0,0,13.0000,1,0,0,1
887,888,1,1,19.000000,0,0,30.0000,0,0,0,1
888,889,0,3,29.699118,1,2,23.4500,0,0,0,1
889,890,1,1,26.000000,0,0,30.0000,1,1,0,0


In [56]:
# Drop the original text columns from the test set now that they are encoded
teste = teste.drop(columns=['Sex', 'Embarked'])
teste

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,MaleCheck,Embarked_C,Embarked_Q,Embarked_S
0,892,3,34.50000,0,0,7.8292,1,0,1,0
1,893,3,47.00000,1,0,7.0000,0,0,0,1
2,894,2,62.00000,0,0,9.6875,1,0,1,0
3,895,3,27.00000,0,0,8.6625,1,0,0,1
4,896,3,22.00000,1,1,12.2875,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...
413,1305,3,30.27259,0,0,8.0500,1,0,0,1
414,1306,1,39.00000,0,0,108.9000,0,1,0,0
415,1307,3,38.50000,0,0,7.2500,1,0,0,1
416,1308,3,30.27259,0,0,8.0500,1,0,0,1


## Análise preditiva com modelos

### Separar base treino em treino e validação

In [57]:
# Hold-out splitting helper
from sklearn.model_selection import train_test_split

# Target is Survived; features are every column except the id and the target
y = treino['Survived']
X = treino.drop(columns=['PassengerId', 'Survived'])

In [58]:
# Hold out 33% of the labelled rows for validation, with a fixed seed
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

### Árvore de classificação

In [59]:
# Decision tree classifier, seeded so repeated runs build the same tree
from sklearn import tree

clf_ac = tree.DecisionTreeClassifier(
    random_state=42,
)

In [60]:
# Train the decision tree on the training split
clf_ac = clf_ac.fit(
    X_train,
    y_train,
)

# Predict the validation split and display the predicted labels
y_previsao_ac = clf_ac.predict(
    X_val,
)
y_previsao_ac

array([0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
       1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0,
       0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 1, 0, 0, 1, 1, 0], dtype=int64)

### KNN (KNeighbors Classifier)

In [61]:
# k-nearest-neighbours classifier using the 3 closest training samples
# NOTE(review): no feature scaling is applied in the visible cells before this
# distance-based model -- confirm that is intended
from sklearn.neighbors import KNeighborsClassifier

clf_knn = KNeighborsClassifier(
    n_neighbors=3,
)

In [62]:
# Train the KNN model on the training split
clf_knn = clf_knn.fit(
    X_train,
    y_train,
)

# Predict the validation split and display the predicted labels
y_previsao_knn = clf_knn.predict(
    X_val,
)
y_previsao_knn

array([0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 0], dtype=int64)

### Regressão Logística

In [63]:
# Logistic regression, seeded and allowed up to 1000 solver iterations
from sklearn.linear_model import LogisticRegression

clf_rl = LogisticRegression(
    max_iter=1000,
    random_state=42,
)

In [64]:
# Train the logistic regression on the training split
clf_rl = clf_rl.fit(
    X_train,
    y_train,
)

# Predict the validation split and display the predicted labels
y_previsao_rl = clf_rl.predict(
    X_val,
)
y_previsao_rl

array([0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
       1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 1, 0], dtype=int64)

## Avaliação dos modelos

### Avaliando acurácia

In [65]:
# Accuracy metric
from sklearn.metrics import accuracy_score

# Validation accuracy of the decision tree
accuracy_score(y_true=y_val, y_pred=y_previsao_ac)

0.7491525423728813

In [66]:
# Validation accuracy of the KNN model
accuracy_score(y_true=y_val, y_pred=y_previsao_knn)

0.7152542372881356

In [67]:
# Validation accuracy of the logistic regression
accuracy_score(y_true=y_val, y_pred=y_previsao_rl)

0.8169491525423729

### Avaliando matriz de confusão

In [68]:
# Confusion matrix metric
from sklearn.metrics import confusion_matrix

# Confusion matrix of the decision tree on the validation split
confusion_matrix(y_true=y_val, y_pred=y_previsao_ac)

array([[138,  37],
       [ 37,  83]], dtype=int64)

In [69]:
# Confusion matrix of the KNN model on the validation split
confusion_matrix(y_true=y_val, y_pred=y_previsao_knn)

array([[147,  28],
       [ 56,  64]], dtype=int64)

In [70]:
# Confusion matrix of the logistic regression on the validation split
confusion_matrix(y_true=y_val, y_pred=y_previsao_rl)

array([[153,  22],
       [ 32,  88]], dtype=int64)

## Previsão para os dados de teste

In [71]:
# Preview the first three rows of the training features
X_train.iloc[:3]

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,MaleCheck,Embarked_C,Embarked_Q,Embarked_S
6,1,54.0,0,0,51.8625,1,0,0,1
718,3,29.699118,0,0,15.5,1,0,1,0
685,2,25.0,1,2,41.5792,1,1,0,0


In [72]:
# Preview the first three rows of the test set
teste.iloc[:3]

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,MaleCheck,Embarked_C,Embarked_Q,Embarked_S
0,892,3,34.5,0,0,7.8292,1,0,1,0
1,893,3,47.0,1,0,7.0,0,0,0,1
2,894,2,62.0,0,0,9.6875,1,0,1,0


In [73]:
# Select exactly the feature columns the model was trained on, in the same order.
# Picking them by name (instead of dropping PassengerId) keeps this cell
# re-runnable: the 'Survived' column added to teste below would otherwise be
# fed to the model as an extra feature on a second run.
X_teste = teste[X_train.columns]

# Predict the test set with the logistic regression model
y_pred = clf_rl.predict(X_teste)

# Store the predictions in the test set, next to the passenger ids
teste['Survived'] = y_pred
y_pred

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [74]:
# Keep only the passenger id and the prediction for the submission
base_envio = teste.loc[:, ['PassengerId', 'Survived']]

In [75]:
# Write the submission to a CSV file without the index column
base_envio.to_csv(path_or_buf='resultados4.csv', index=False)