<a href="https://colab.research.google.com/github/gizattos/Titanic/blob/master/titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import warnings 

## Preprocessamento
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.svm import SVC


## Modelos
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

## Avaliando nossos modellos
from sklearn.model_selection import cross_val_score, GridSearchCV

## Automatizando tratamento e treinamento
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [3]:
df = pd.read_csv('/content/titanic.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
df.columns = [x.lower() for x in df.columns]
df.head()

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
## porcentagem de valores nulos na coluna cabin
df.cabin.isna().sum() / df.shape[0]

0.7710437710437711

In [0]:
x = df.loc[:,['pclass','sex','age','sibsp','parch','fare','embarked']]
y = df.survived

In [14]:
y.isna().sum()

0

In [13]:
x.isna().sum()

pclass        0
sex           0
age         177
sibsp         0
parch         0
fare          0
embarked      2
dtype: int64

In [0]:
## Separando as features
cat_features = ['pclass','sex','embarked']
num_features = ['age','sibsp','parch','fare']

In [0]:
## criando os passos para features categóricas e númericas
steps_cat = [('imputer_categorica',SimpleImputer(strategy='most_frequent')),('ohe', OneHotEncoder())]
steps_num = [('imputer_numericas',SimpleImputer(strategy='mean'))]

In [0]:
## criando pipelines
pipe_cat = Pipeline(steps_cat)
pipe_num = Pipeline(steps_num)

In [0]:
## criando transformadores
transformers = [('cat',pipe_cat, cat_features),('num',pipe_num, num_features)]
col_transformer = ColumnTransformer(transformers)

In [0]:
pipe_final = Pipeline(steps = [('pre_processing',col_transformer),('random_forest',RandomForestClassifier())])


In [0]:
def compare_models(model1,model2,x,y,cv):
  print('--COMPARANDO MODELOS--')
  print(f'Model1: {cross_val_score(model1,X=x,y=y,cv=cv).mean()}')
  print(f'Model2: {cross_val_score(model2,X=x,y=y,cv=cv).mean()}')

In [43]:
##DummyClassifier(strategy='most_frequent').fit(x,y)
dummy = DummyClassifier(strategy='most_frequent')
compare_models(dummy,pipe_final,x,y,cv=10)

--COMPARANDO MODELOS--
Model1: 0.616167290886392
Model2: 0.8048064918851436


In [55]:
params = {'random_forest__n_estimators': [100,200,300,400,500,600,700],
          'random_forest__max_depth': [5,6,7,8,9,10]}
grid = GridSearchCV(pipe_final,param_grid=params,cv=2)
grid.fit(x,y)

GridSearchCV(cv=2, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('pre_processing',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('cat',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('imputer_categorica',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                                        copy=True,
                       

In [56]:
grid_df = pd.DataFrame(grid.cv_results_)
grid_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_random_forest__max_depth,param_random_forest__n_estimators,params,split0_test_score,split1_test_score,mean_test_score,std_test_score,rank_test_score
0,0.161705,0.002832,0.016964,0.000156,5,100,"{'random_forest__max_depth': 5, 'random_forest...",0.7713,0.797753,0.784527,0.013226,42
1,0.286692,0.00144,0.025617,0.000321,5,200,"{'random_forest__max_depth': 5, 'random_forest...",0.7713,0.804494,0.787897,0.016597,37
2,0.422287,0.004725,0.038973,0.004249,5,300,"{'random_forest__max_depth': 5, 'random_forest...",0.769058,0.802247,0.785653,0.016594,38
3,0.562415,0.009963,0.046725,0.001539,5,400,"{'random_forest__max_depth': 5, 'random_forest...",0.764574,0.804494,0.784534,0.01996,40
4,0.702965,0.005315,0.057611,0.000108,5,500,"{'random_forest__max_depth': 5, 'random_forest...",0.766816,0.802247,0.784532,0.017716,41
5,0.823826,0.00691,0.070495,0.004199,5,600,"{'random_forest__max_depth': 5, 'random_forest...",0.769058,0.808989,0.789024,0.019965,36
6,0.970504,0.007749,0.077887,0.000853,5,700,"{'random_forest__max_depth': 5, 'random_forest...",0.762332,0.806742,0.784537,0.022205,39
7,0.15174,0.003123,0.016471,0.000391,6,100,"{'random_forest__max_depth': 6, 'random_forest...",0.780269,0.81573,0.798,0.017731,29
8,0.291444,0.000471,0.02697,0.000166,6,200,"{'random_forest__max_depth': 6, 'random_forest...",0.775785,0.817978,0.796881,0.021096,32
9,0.422907,0.005083,0.039057,8.9e-05,6,300,"{'random_forest__max_depth': 6, 'random_forest...",0.773543,0.811236,0.792389,0.018847,34


In [57]:
## pegando o melhor resultado
grid.best_params_

{'random_forest__max_depth': 7, 'random_forest__n_estimators': 600}