In [2]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import binarize, Binarizer

In [4]:
#Importação dos dados de treino e teste
df_titanic_train = pd.read_csv("train.csv")
df_titanic_test = pd.read_csv("test.csv")

In [6]:
#Visualização dos dados de treino
df_titanic_train.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [8]:
#Visualização dos dados de teste
df_titanic_test.head(3)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q


In [10]:
#Verifica a quantidade de valores missing do dataframe
df_titanic_train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [12]:
#Preenche os valores missing da variável idade com a média de idade presente na coluna
df_titanic_train['Age'].fillna(df_titanic_train['Age'].mean(), inplace=True)

In [14]:
#Testa se ainda há valores missing na variável
print(f'Número de variáveis com valor ausente: {df_titanic_train.Age.isna().sum()}')

Número de variáveis com valor ausente: 0


In [16]:
#Binariza a variável sex com 0 para male e 1 para female
for sexo in range(0, len(df_titanic_train.Sex)):
    if df_titanic_train['Sex'][sexo] == 'male':
        df_titanic_train['Sex'][sexo] = 0
    else:
        df_titanic_train['Sex'][sexo] = 1
# sexo for sexo in range(0, len(df_titanic_train.Sex)): if df_titanic_train['Sex'][sexo] == 'male'

In [18]:
#Testa a binarização
df_titanic_train.Sex.value_counts()

1    891
Name: Sex, dtype: int64

In [20]:
#Separação em dados de treino e de validação
X_titanic_train, X_titanic_valid, y_titanic_train, y_titanic_valid = train_test_split(df_titanic_train.drop(['Survived','PassengerId', 'Name', 'Ticket', 'Cabin', 'Embarked'], axis=1), df_titanic_train['Survived'],test_size=0.30, random_state=42)

## Implementa o modelo Random Forest Classifier

In [22]:
#Instancia o classificador
random_forest_classifier = RandomForestClassifier(n_estimators=100,n_jobs=1)

In [24]:
#Treina o modelo
model_random_forest_classifier = random_forest_classifier.fit(X_titanic_train, y_titanic_train)

In [26]:
#Primeira avaliação da classificação
model_random_forest_classifier.score(X_titanic_valid, y_titanic_valid)

0.6604477611940298

In [28]:
#Treino dos valores para validação
y_titanic_predict = model_random_forest_classifier.predict(X_titanic_valid)

In [30]:
#Score com previsão e validação
mean_squared_error(y_titanic_valid, y_titanic_predict)

0.33955223880597013

## Separa os dados de previsão do data set de teste

In [32]:
X_test = df_titanic_test.drop(['PassengerId', 'Name', 'Ticket', 'Cabin', 'Embarked'], axis=1)

In [34]:
X_test

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
0,3,male,34.5,0,0,7.8292
1,3,female,47.0,1,0,7.0000
2,2,male,62.0,0,0,9.6875
3,3,male,27.0,0,0,8.6625
4,3,female,22.0,1,1,12.2875
...,...,...,...,...,...,...
413,3,male,,0,0,8.0500
414,1,female,39.0,0,0,108.9000
415,3,male,38.5,0,0,7.2500
416,3,male,,0,0,8.0500


In [36]:
#Binariza a variável sex com 0 para male e 1 para female
for sexo in range(0, len(X_test.Sex)):
    if X_test['Sex'][sexo] == 'male':
        X_test['Sex'][sexo] = 0
    else:
        X_test['Sex'][sexo] = 1

In [38]:
#Preenche os valores missing da variável idade com a média de idade presente na coluna
X_test['Age'].fillna(X_test['Age'].mean(), inplace=True)
X_test['Fare'].fillna(X_test['Fare'].mean(), inplace=True)

In [40]:
#verifica se ainda há valores ausentes
print(f'Valores ausentes nas variáveis de teste:\n{X_test.isna().sum()}')

Valores ausentes nas variáveis de teste:
Pclass    0
Sex       0
Age       0
SibSp     0
Parch     0
Fare      0
dtype: int64


In [42]:
survived_forest  = model_random_forest_classifier.predict(X_test)

In [44]:
survived_forest

array([0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,

In [46]:
df_survived_predict_test = pd.DataFrame(df_titanic_test.PassengerId)
df_survived_predict_test['Survived'] = pd.DataFrame(survived_forest)

In [48]:
df_survived_predict_test.info()

&lt;class &#39;pandas.core.frame.DataFrame&#39;&gt;
RangeIndex: 418 entries, 0 to 417
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   PassengerId  418 non-null    int64
 1   Survived     418 non-null    int64
dtypes: int64(2)
memory usage: 6.7 KB


In [50]:
#Exportação dos dados para inserção no kaggle
df_survived_predict_test.to_csv('forest_test_20200611.csv', index=False)