In [1]:
#Base
import pandas as pd
import numpy as np

#Plot
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

#Modelos
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost.sklearn import XGBClassifier

#Metrics
from sklearn.metrics import make_scorer, accuracy_score

#Model Select
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

#Skopt Lib
from skopt import gp_minimize
from skopt.plots import plot_convergence

import warnings
warnings.filterwarnings("ignore")

  data = yaml.load(f.read()) or {}
  defaults = yaml.load(f)


In [2]:
df_train = pd.read_csv('train.csv')

<div style="text-align:center;">
    <h1>Introdução </h1>
    <p>Tentarei nesse notebook descrever passo a passo da forma que foi pensada para buscar um resultado satisfario do nosso modelo para classificar entre Sobrevivente e pessoas que morreram no incidente do Titanic, alem do codigo tentarei referenciar o maximo possivel de informações para quem esteja começando para usar como base para compreender e se aprofundar nas tecnicas,exemplificando podendo assim tirarem suas proprias hipóteses</p>
</div>

<div style="text-align:center;">
    <h1>Sumário </h1>
</div>
<h3>1 - Exploração dos dados</h3>
<p style="text-indent:3em">1.1 - Verificação de valores nulos e drop de coluna que não iremos utiliza</p>
<p style="text-indent:3em">1.2 - Valores estatisticos da coluna idade e preenchimento dos valores nulos</p>
<p style="text-indent:3em">1.3 - Verificação dos valores nulos e preenchimento da coluna Embarque</p>
<p style="text-indent:3em">1.4 - Verificação dos tipos das colunas</p>
<p style="text-indent:3em">1.5 - Criação de variaveis Dummies</p>
<p style="text-indent:3em">1.6 - Extração com regex e tratamento da coluna</p>
<p style="text-indent:3em">1.7 - Criando Grupo por idade</p>
<p style="text-indent:3em">1.8 - Escolha das Features</p>
<h3>2 - Modelagem</h3>
<p style="text-indent:3em">2.1 - RandomFlorestClassifier</p>
<p style="text-indent:3em">2.2 - LGBMClassifier</p>
<p style="text-indent:3em">2.3 - XGBClassifier</p>
<p style="text-indent:3em">2.4 - XGBC com gp_minimize</p>
<p style="text-indent:3em">2.4 - KNN</p>

<h3>3 - Aplicação</h3>



## Exploração dos dados

In [3]:
df_train.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


#### 1.1 - Verificação de valores nulos

In [4]:
df_train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [None]:
df_train.drop(columns={'Cabin'},inplace=True)

#### 1.2 - Valores estatisticos da coluna idade e preenchimento dos valores nulos

In [5]:
df_train['Age'].describe()

count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64

In [6]:
df_train['Age'].fillna(df_train['Age'].mean(),inplace=True)

#### 1.3 - Verificação dos valores nulos e preenchimento da coluna Embarque

In [7]:
df_train['Embarked'].describe()

count     889
unique      3
top         S
freq      644
Name: Embarked, dtype: object

In [8]:
df_train['Embarked'].fillna('S',inplace=True)

In [10]:
df_train2 = df_train.copy()

#### 1.4 - Verificação dos tipos das colunas

In [12]:
df_train2.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Embarked        object
dtype: object

#### 1.5 - Criação de variaveis Dummies
<p>Variável dummy é uma variável categegórica que foi transformada em numérica.Por exemplo a coluna Sexo, temos "masculino" e "feminino" iremos transformar essas variaveis em numerica.Criando uma nova coluna apenas para Masc. e Fem. onde sera setado 1 para positivo e 0 para negativo</p>

In [13]:
df_train3 = pd.get_dummies(df_train2, columns=['Sex','Embarked'])

In [14]:
df_train3.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,0,1,0,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,1,0,1,0,0


#### 1.6 Extração com regex 
<p>Iremos utilizar Regex(Expressão regular) para extrair o titulo das pessoas baseando na coluna nome e criar assim uma nova coluna</p>

In [15]:
df_train3['Title'] = df_train3['Name'].str.extract(' ([A-Za-z]+)\.', expand = False)
df_train3['Title'].unique().tolist()

['Mr',
 'Mrs',
 'Miss',
 'Master',
 'Don',
 'Rev',
 'Dr',
 'Mme',
 'Ms',
 'Major',
 'Lady',
 'Sir',
 'Mlle',
 'Col',
 'Capt',
 'Countess',
 'Jonkheer']

In [16]:
df_train3['Title'].value_counts(normalize=True)*100

Mr          58.024691
Miss        20.426487
Mrs         14.029181
Master       4.489338
Dr           0.785634
Rev          0.673401
Major        0.224467
Mlle         0.224467
Col          0.224467
Don          0.112233
Countess     0.112233
Sir          0.112233
Lady         0.112233
Jonkheer     0.112233
Mme          0.112233
Ms           0.112233
Capt         0.112233
Name: Title, dtype: float64

In [17]:
mapping = {'Mlle': 'Miss', 'Major': 'Rare', 'Col': 'Rare', 'Sir': 'Rare', 'Don': 'Rare', 'Mme': 'Mrs',
           'Jonkheer': 'Rare', 'Lady': 'Rare', 'Capt': 'Rare', 'Countess': 'Rare', 'Ms': 'Miss', 'Dona': 'Mrs', 'Rev':'Rare', 'Dr':'Rare'}

df_train3.replace({'Title': mapping}, inplace=True)

df_train3['Title'].value_counts(normalize=True)*100

Mr        58.024691
Miss      20.763187
Mrs       14.141414
Master     4.489338
Rare       2.581369
Name: Title, dtype: float64

In [18]:
df_train3['Title'] = df_train3['Title'].map({'Mr':0, 'Miss':1, 'Mrs':2, 'Master':3, 'Rare':4})

#### 1.7 - Criando Grupo por idade
<p>Nesse caso irei utilizar a idade que foi fornecida do nosso dataset para criação dos grupos para saber se o passageiro era uma criança,jovem,adulto e etc.Nesse caso estamos fazendo Feacture Engineer onde transformamos uma coluna para obter uma outra através dela</p>

In [39]:
bins = [-1, 0, 14, 25, 35, 60, np.inf]
labels = ['Unknown', 'Child', 'Teenager', 'Young Adult', 'Adult', 'Senior']
df_train3['AgeGroup'] = pd.cut(df_train3["Age"], bins, labels = labels)
age_mapping = {'Unknown': None,'Child': 1, 'Teenager': 2, 'Young Adult': 3, 'Adult': 4, 'Senior': 5}
df_train3['AgeGroup'] = df_train3['AgeGroup'].map(age_mapping)

In [40]:
df_train3.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Title,AgeGroup
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,0,1,0,0,1,0,2.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,1,0,1,0,0,2,4.0


#### 1.8 Escolha das Features

In [45]:
features = ['Pclass','Sex_female','Sex_male','SibSp','Parch','Fare','Embarked_C','Embarked_Q','Embarked_S','Title','AgeGroup']

### 2.0 Modelagem

In [46]:
X = df_train3[features]
y = df_train3['Survived']

In [47]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.33)

#### 2.1 RandomFlorestClassifier

In [48]:
# Choose the type of classifier. 
clf = RandomForestClassifier()

# Choose some parameter combinations to try
parameters = {'n_estimators': [4, 6, 9,20,50], 
              'max_features': ['log2', 'sqrt','auto'], 
              'criterion': ['entropy', 'gini'],
              'max_depth': [2,5, 10,50,100,150], 
              'min_samples_split': [2, 3, 5],
              'min_samples_leaf': [1,5,8],
              'bootstrap':[True]
             }

# Type of scoring used to compare parameter combinations
acc_scorer = make_scorer(accuracy_score)
n_folds = 5
skf = StratifiedKFold(n_splits=n_folds,shuffle=True)
# Run the grid search
grid_obj = GridSearchCV(clf, parameters, acc_scorer,cv=skf.split(X_train, y_train))
grid_obj = grid_obj.fit(X_train, y_train)

# Set the clf to the best combination of parameters
clf = grid_obj.best_estimator_

# Fit the best algorithm to the data. 
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
print(accuracy_score(y_test, predictions))

0.8135593220338984


Best: Score 0.8379888268156425

### 2.2 LGBMClassifier

In [49]:
lgb = LGBMClassifier()

# Choose some parameter combinations to try
param_grid = {
            'num_leaves': [60, 80],
            'n_estimators': [200, 400], #default class*iteration=2*100
            'bagging_freq': [5],
            'bagging_fraction' : [0.8, 0.9],  # subsample
            'feature_fraction' : [0.8, 0.9], # colsample_bytree
            'reg_alpha': [0.2, 0.6, 0.8],
            'reg_lambda': [0.4, 0.6, 0.8]
            }


# Type of scoring used to compare parameter combinations
acc_scorer = make_scorer(accuracy_score)
n_folds = 5
skf = StratifiedKFold(n_splits=n_folds,shuffle=True)
# Run the grid search
grid_obj3 = GridSearchCV(lgb, param_grid, acc_scorer,cv=skf.split(X_train, y_train))
grid_obj3 = grid_obj3.fit(X_train, y_train)

# Set the clf to the best combination of parameters
clf_3 = grid_obj3.best_estimator_

# Fit the best algorithm to the data. 
clf_3.fit(X_train, y_train)

predictions_2 = clf_3.predict(X_test)
print(accuracy_score(y_test, predictions_2))

0.8203389830508474


### 2.3 XGBC

In [50]:
params_grid = {
'n_estimators': [50, 100, 250, 500],
'learning_rate': [0.01, 0.1, 0.3, 0.5],
'colsample_bytree': [0.3, 0.5, 0.8],
}

n_folds = 5
skf = StratifiedKFold(n_splits=n_folds,shuffle=True)
acc_scorer = make_scorer(accuracy_score)
# create xgboost classifier
xgb = XGBClassifier(objective = 'binary:logistic')

clf_2 = GridSearchCV(xgb, params_grid, acc_scorer,cv=skf.split(X_train, y_train))
clf_2.fit(X_train,y_train)
pred = clf_2.predict(X_test)
print(accuracy_score(y_test, pred))

0.823728813559322


## 2.4 XGBC com gp_minimize

In [128]:
def treinar_modelo_xgbc(params):
    max_depth = params[0]
    learning_rate = params[1]
    min_child_weight = params[2]
    gamma = params[3]
    colsample_bytree = params[4]
    print(params, '\n')
    
    xgb = XGBClassifier(objective = 'binary:logistic',max_depth=max_depth,
                        learning_rate=learning_rate,
                        min_child_weight=min_child_weight,
                       gamma=gamma,
                       colsample_bytree=colsample_bytree,
                       random_state = 0)
    xgb.fit(X_train, y_train)
    
    p = xgb.predict(X_test)
    
    return metrics.accuracy_score(y_test, p)


space = [(1, 250),
         (0.001,1),
         (1,10),
         (0.01,1),
         (0.01,1)
        ]

In [129]:
resultados_gp = gp_minimize(treinar_modelo_xgbc,space,random_state=1, verbose=1, n_calls=40, n_random_starts=10,n_jobs=8)

Iteration No: 1 started. Evaluating function at random point.
[249, 0.9326248019793204, 2, 0.9990501101709035, 0.24372808718245637] 

Iteration No: 1 ended. Evaluation done at random point.
Time taken: 0.0347
Function value obtained: 0.8102
Current minimum: 0.8102
Iteration No: 2 started. Evaluating function at random point.
[100, 0.3885228304208455, 7, 0.9361836800979717, 0.8478478075191571] 

Iteration No: 2 ended. Evaluation done at random point.
Time taken: 0.0417
Function value obtained: 0.8237
Current minimum: 0.8102
Iteration No: 3 started. Evaluating function at random point.
[79, 0.5250236114132986, 5, 0.23728144159252734, 0.539069769857547] 

Iteration No: 3 ended. Evaluation done at random point.
Time taken: 0.0327
Function value obtained: 0.8271
Current minimum: 0.8102
Iteration No: 4 started. Evaluating function at random point.
[229, 0.45774760317900137, 5, 0.9397365115293898, 0.7806053439731684] 

Iteration No: 4 ended. Evaluation done at random point.
Time taken: 0.0362



Iteration No: 23 ended. Search finished for the next optimal point.
Time taken: 0.3249
Function value obtained: 0.7763
Current minimum: 0.7763
Iteration No: 24 started. Searching for the next optimal point.
[215, 0.016486574371321477, 4, 0.045198362873957905, 0.02416326647778661] 

Iteration No: 24 ended. Search finished for the next optimal point.
Time taken: 0.3080
Function value obtained: 0.7763
Current minimum: 0.7763
Iteration No: 25 started. Searching for the next optimal point.
[6, 0.49714188673195897, 2, 0.6268482028559071, 0.010266506285695552] 

Iteration No: 25 ended. Search finished for the next optimal point.
Time taken: 0.3571
Function value obtained: 0.7898
Current minimum: 0.7763
Iteration No: 26 started. Searching for the next optimal point.
[246, 0.03599250635580402, 4, 0.8289302742692076, 0.014902687718494723] 

Iteration No: 26 ended. Search finished for the next optimal point.
Time taken: 0.3338
Function value obtained: 0.7831
Current minimum: 0.7763
Iteration No: 



Iteration No: 32 ended. Search finished for the next optimal point.
Time taken: 0.3665
Function value obtained: 0.7763
Current minimum: 0.7763
Iteration No: 33 started. Searching for the next optimal point.
[1, 0.001, 10, 0.01, 0.01] 





Iteration No: 33 ended. Search finished for the next optimal point.
Time taken: 0.3507
Function value obtained: 0.7763
Current minimum: 0.7763
Iteration No: 34 started. Searching for the next optimal point.
[247, 0.007502613138436302, 6, 0.36000769220883583, 0.027781006814715833] 

Iteration No: 34 ended. Search finished for the next optimal point.
Time taken: 0.3705
Function value obtained: 0.7932
Current minimum: 0.7763
Iteration No: 35 started. Searching for the next optimal point.
[3, 0.003190455255957783, 3, 0.5540801669426648, 0.4316486891820062] 

Iteration No: 35 ended. Search finished for the next optimal point.
Time taken: 0.3968
Function value obtained: 0.8102
Current minimum: 0.7763
Iteration No: 36 started. Searching for the next optimal point.
[246, 0.07969609925447814, 9, 0.06755336629793791, 0.018783531131829484] 

Iteration No: 36 ended. Search finished for the next optimal point.
Time taken: 0.3809
Function value obtained: 0.7932
Current minimum: 0.7763
Iteration No: 



Iteration No: 37 ended. Search finished for the next optimal point.
Time taken: 0.3928
Function value obtained: 0.7763
Current minimum: 0.7763
Iteration No: 38 started. Searching for the next optimal point.
[38, 0.007801732187393861, 10, 0.9065524480681918, 0.011889551962075372] 

Iteration No: 38 ended. Search finished for the next optimal point.
Time taken: 0.3824
Function value obtained: 0.7932
Current minimum: 0.7763
Iteration No: 39 started. Searching for the next optimal point.
[114, 0.04008428345025525, 7, 0.7645361262686431, 0.01037695773779219] 

Iteration No: 39 ended. Search finished for the next optimal point.
Time taken: 0.3824
Function value obtained: 0.7831
Current minimum: 0.7763
Iteration No: 40 started. Searching for the next optimal point.
[1, 0.001, 1, 0.01, 0.01] 

Iteration No: 40 ended. Search finished for the next optimal point.
Time taken: 0.3611
Function value obtained: 0.7763
Current minimum: 0.7763


In [131]:
resultados_gp.x

[1, 0.001, 10, 1.0, 0.01]

In [133]:
plot_convergence(resultados_gp)

<matplotlib.axes._subplots.AxesSubplot at 0x17fed7f05c0>

In [134]:
type(resultados_gp)

scipy.optimize.optimize.OptimizeResult

In [135]:
xgb2 = XGBClassifier(objective = 'binary:logistic',
                     max_depth=resultados_gp.x[0],
                     learning_rate=resultados_gp.x[1],
                     min_child_weight=resultados_gp.x[2],
                     gamma=resultados_gp.x[3],
                     colsample_bytree=resultados_gp.x[4],
                     random_state = 0)
xgb2.fit(X_train, y_train)
    
p2 = xgb2.predict(X_test)
    
print(metrics.accuracy_score(y_test, p2))

0.7762711864406779


### 2.5 KNN

In [53]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
pred3 = knn.predict(X_test)
print(accuracy_score(y_test, pred3))

0.7830508474576271


### Aplicação do modelo no csv test

In [60]:
df_test = pd.read_csv('test.csv')

In [61]:
df_test.head(1)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q


In [62]:
df_test.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [63]:
df_test['Age'].fillna(df_test['Age'].mean(),inplace=True)

In [64]:
df_test['Fare'].fillna(df_test['Fare'].mean(),inplace=True)

In [65]:
df_test['Title'] = df_test['Name'].str.extract(' ([A-Za-z]+)\.', expand = False)
#df_test['Title'].unique().tolist()

In [66]:
mapping = {'Mlle': 'Miss', 'Major': 'Rare', 'Col': 'Rare', 'Sir': 'Rare', 'Don': 'Rare', 'Mme': 'Mrs',
           'Jonkheer': 'Rare', 'Lady': 'Rare', 'Capt': 'Rare', 'Countess': 'Rare', 'Ms': 'Miss', 'Dona': 'Mrs', 'Rev':'Rare', 'Dr':'Rare'}

df_test.replace({'Title': mapping}, inplace=True)

df_test['Title'].value_counts(normalize=True)*100

Mr        57.416268
Miss      18.899522
Mrs       17.464115
Master     5.023923
Rare       1.196172
Name: Title, dtype: float64

In [67]:
df_test['Title'] = df_test['Title'].map({'Mr':0, 'Miss':1, 'Mrs':2, 'Master':3, 'Rare':4})

In [71]:
df_test['AgeGroup'] = pd.cut(df_test["Age"], bins, labels = labels)
df_test['AgeGroup'] = df_test['AgeGroup'].map(age_mapping)

In [69]:
df_test = pd.get_dummies(df_test, columns=['Sex','Embarked'])

In [136]:
f_pred = clf.predict(df_test[features])
f_pre2 = clf_2.predict(df_test[features])
f_pre3 = clf_3.predict(df_test[features])
f_pre4 = xgb2.predict(df_test[features])

In [137]:
Id = df_test.PassengerId
output = pd.DataFrame({'PassengerId': Id, 'Survived':f_pre4})
output.to_csv('submission.csv', index=False)