# Bibliotecas / Pacotes usados

In [198]:
#%pip install pandas
#%pip install seaborn
#%pip install numpy
#%pip install matplotlib.pyplot
#%pip install -U scikit-learn 
#%pip install optuna
#%pip install xgboost

import optuna
import pandas                   as pd
import seaborn                  as sns
import numpy                    as np
import matplotlib.pyplot        as plt
from sklearn.preprocessing      import LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.model_selection    import train_test_split
from sklearn.metrics            import precision_score, f1_score, accuracy_score, auc
from sklearn.linear_model       import LogisticRegression
from sklearn.metrics            import confusion_matrix
from sklearn.metrics            import classification_report
from sklearn.metrics            import confusion_matrix, ConfusionMatrixDisplay
from sklearn.tree               import DecisionTreeClassifier
from sklearn.ensemble           import RandomForestClassifier
from sklearn.model_selection    import StratifiedKFold, cross_validate, cross_val_score
from xgboost                    import XGBRegressor, XGBClassifier
from sklearn.linear_model       import LinearRegression, Lasso
from sklearn.preprocessing      import MinMaxScaler, RobustScaler
from sklearn.metrics            import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score

## Funções

### Função - Configuração dos gráficos

In [199]:
# ===================================================================
# Configura os gráficos
def jupyter_settings():
    %matplotlib inline
    # %pylab inline
    
    plt.style.use('bmh')
    plt.rcParams['figure.figsize'] = [22, 9]
    plt.rcParams['font.size'] = 21

    # display(HTML('<style>.conteiner{width:100% !important;}</style>'))

    pd.options.display.max_columns = None
    pd.options.display.max_rows = None
    pd.set_option('display.expand_frame_repr', False)
    
    # configura o pandas para quantidade de casas decimais
    pd.set_option('display.float_format', lambda x: '%.2f' % x)

    sns.set()

jupyter_settings()

### Função - Arrumando o Banco para Modelagem

In [200]:
def arrumando_banco(data):

    dataframe = data.copy()

    #1 – IDADE :: imputação de missings
    dataframe['Age'].fillna(dataframe['Age'].median(), inplace=True)
    dataframe['Pclass'].fillna(dataframe['Pclass'].mean(), inplace=True)
    dataframe['Fare'].fillna(dataframe['Fare'].mean(), inplace=True)

    #2 – IDADE :: recodificação da variával idade (quanti para quali[str])
    conditions = [
    dataframe['Age'] < 12,
        (dataframe['Age'] >= 12) & (dataframe['Age'] < 22),
        (dataframe['Age'] >= 22) & (dataframe['Age'] < 35),
        dataframe['Age'] >= 35
                ]
    choices = [
            'Age - menores que 12 anos',
            'Age - entre 12 e 22 anos',
            'Age - entre 22 e 35 anos',
            'Age - maiores que o 35 anos'
            ]
    dataframe['Age_rec'] = np.select(conditions, choices)

    #3 SEXO :: Transformando quali[str] em quali[num]
    dataframe = pd.get_dummies(dataframe, columns=['Sex'], prefix=['Sex'], dtype = int, drop_first = True)

     #4 – SEXO * IDADE :: criando um fator de interação
    dataframe['Int_Age_Sex'] = dataframe.apply(lambda x: str(x['Sex_male']) + str(x['Age_rec']), axis = 1 )
    dataframe['Int_Age_Pclass'] = dataframe.apply(lambda x: str(x['Pclass']) + str(x['Age_rec']), axis = 1 )

    #5 – IDADE, SEXO e INTERAÇÃO :: Transformando quali[str] em quali[num]
    label = OrdinalEncoder()
    dataframe[['Age_rec', 'Pclass', 'Int_Age_Pclass']] = label.fit_transform(
        dataframe[['Age_rec', 'Pclass', 'Int_Age_Pclass']])
    
    label = OrdinalEncoder()
    dataframe[['Age_rec', 'Sex_male', 'Int_Age_Sex']] = label.fit_transform(
        dataframe[['Age_rec', 'Sex_male', 'Int_Age_Sex']])
    
       #3.1 Mudando tipo da variável Sexo
    dataframe['Sex_male'] = dataframe['Sex_male'].astype('int64')
    dataframe['Age_rec'] = dataframe['Age_rec'].astype('int64')
    dataframe['Pclass'] = dataframe['Pclass'].astype('int64')
    dataframe['Int_Age_Sex'] = dataframe['Int_Age_Sex'].astype('int64')
    dataframe['Int_Age_Pclass'] = dataframe['Int_Age_Sex'].astype('int64')
    
    return dataframe


### Função - Coletar as Métricas - Treino/Teste

In [201]:
def calcula_metricas( nome, model, y_test, y_pred ):
    # Avaliando o modelo
    precision = precision_score( y_test, y_pred )
    f1 = f1_score( y_test, y_pred )
    accuracy = accuracy_score( y_test, y_pred )
    # auc_ = auc( y_test, y_pred )

    # =======================================
    # Retornando a tabela de métricas
    # =======================================
    linhas = [[ nome, precision, f1, accuracy]]
    colunas = [ "Modelo", "Precision", "F1", "Accuracy" ]

    regressao_metricas =  pd.DataFrame( linhas, columns = colunas )

    #ConfusionMatrixDisplay(confusion_matrix = matriz_confusao1, 
                    #display_labels=model.classes_).plot()
    #plt.grid(False)
    #plt.show()

    return regressao_metricas

### Função - Coletar as Métricas - Cross Validation

In [202]:
def CV_calc_metricas(nome, model, X, y ):

    scoring = {'accuracy': 'accuracy', 'precision': 'precision', 'f1': 'f1'}

    modeloCV = cross_validate( model, X, y, scoring = scoring, cv=5 )

    precision_i = modeloCV['test_precision'].mean()
    f1_i = modeloCV['test_f1'].mean()
    accuracy_i = modeloCV['test_accuracy'].mean()

    # =======================================
    # Retornando a tabela de métricas
    # =======================================
    linhas = [[ nome, precision_i, f1_i, accuracy_i ]]
    colunas = [ "Modelo", "Precision", "F1", "Accuracy" ]

    regressaoCV_metricas = pd.DataFrame( linhas, columns = colunas )

    return regressaoCV_metricas

# Lendo o BANCO - Parte 1

In [203]:
df_raw = pd.read_csv('../banco/train.csv')

In [204]:
df_kaggle_raw = pd.read_csv('../banco/test.csv')

In [205]:
df = df_raw.copy()
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [206]:
df_kaggle = df_kaggle_raw.copy()
df_kaggle.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

# Separando os bancos "y" e "Xs"

## Banco "y"

In [207]:
y = df["Survived"]
y.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [208]:
y.describe()

count   891.00
mean      0.38
std       0.49
min       0.00
25%       0.00
50%       0.00
75%       1.00
max       1.00
Name: Survived, dtype: float64

## Banco "Xs"

In [209]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [210]:
X = df [ ['Age', 'Sex', 'Pclass', 'Fare'] ]

In [211]:
X.describe()

Unnamed: 0,Age,Pclass,Fare
count,714.0,891.0,891.0
mean,29.7,2.31,32.2
std,14.53,0.84,49.69
min,0.42,1.0,0.0
25%,20.12,2.0,7.91
50%,28.0,3.0,14.45
75%,38.0,3.0,31.0
max,80.0,3.0,512.33


## Separando os dados de treino e teste

In [212]:
# Separa os dados com 70% dos dados para treino e 30% dos dados para teste
X_treino, X_teste, y_treino, y_teste = train_test_split( X, y, 
                                                        train_size = 0.7, 
                                                        random_state = 42 )

In [213]:
print( len( X_treino ) )
print( len( X_teste ) )
print( len( y_treino ) )
print( len( y_teste ) )

623
268
623
268


In [214]:
Xa_treino = arrumando_banco( X_treino )

In [215]:
Xa_teste = arrumando_banco( X_teste )

In [216]:
Xa_treino.columns

Index(['Age', 'Pclass', 'Fare', 'Age_rec', 'Sex_male', 'Int_Age_Sex',
       'Int_Age_Pclass'],
      dtype='object')

In [217]:
X1_treino = Xa_treino [[ 'Age', 'Pclass', 'Fare', 'Sex_male' ]]
X2_treino = Xa_treino [[ 'Fare', 'Int_Age_Sex', 'Int_Age_Pclass' ]]
X3_treino = Xa_treino [[ 'Pclass', 'Fare', 'Int_Age_Sex' ]]

#X4_treino = Xa_treino [[ 'Age' ]]
#X5_treino = Xa_treino [[ 'Age', 'Pclass' ]]
#X6_treino = Xa_treino [[ 'Age', 'Pclass', 'Fare']]


In [218]:
X1_teste = Xa_teste [[ 'Age', 'Pclass', 'Fare', 'Sex_male' ]]
X2_teste = Xa_teste [[ 'Fare', 'Int_Age_Sex', 'Int_Age_Pclass' ]]
X3_teste = Xa_teste [[ 'Pclass', 'Fare', 'Int_Age_Sex' ]]


##################################################################################
##################################################################################

# TREINO / TESTE

# Regressão Logística

##################################################################################
##################################################################################

In [219]:
reglog1 = LogisticRegression( random_state=42 )
reglog1.fit( X1_treino, y_treino )
y_pred_reglog1 = reglog1.predict( X1_teste )

reglog2 = LogisticRegression( random_state=42 )
reglog2.fit( X2_treino, y_treino )
y_pred_reglog2 = reglog2.predict( X2_teste )

reglog3 = LogisticRegression( random_state=42 )
reglog3.fit( X3_treino, y_treino )
y_pred_reglog3 = reglog3.predict( X3_teste )

# Decision Tree Classifier

##################################################################################
##################################################################################

In [220]:
classifier_dt1 = DecisionTreeClassifier(random_state=42, criterion='gini', max_depth=3)
classifier_dt1.fit( X1_treino, y_treino )
y_pred_dt1 = classifier_dt1.predict( X1_teste )

classifier_dt2 = DecisionTreeClassifier(random_state=42, criterion='gini', max_depth=3)
classifier_dt2.fit( X2_treino, y_treino )
y_pred_dt2 = classifier_dt2.predict( X2_teste )

classifier_dt3 = DecisionTreeClassifier(random_state=42, criterion='gini', max_depth=3)
classifier_dt3.fit( X3_treino, y_treino )
y_pred_dt3 = classifier_dt3.predict( X3_teste )

# Random Forest Classifier
##################################################################################
##################################################################################

In [221]:
classifier_rf1 = RandomForestClassifier( random_state = 0, 
                                        criterion = 'gini', 
                                        max_depth = 10, 
                                        n_estimators = 50, 
                                        n_jobs = -1 )

classifier_rf1.fit( X1_treino, y_treino )
y_pred_rf1 = classifier_rf1.predict( X1_teste )

classifier_rf2 = RandomForestClassifier( random_state = 0, criterion = 'gini', max_depth = 10, n_estimators = 50, n_jobs = -1 )
classifier_rf2.fit( X2_treino, y_treino )
y_pred_rf2 = classifier_rf2.predict( X2_teste )

classifier_rf3 = RandomForestClassifier( random_state = 0, criterion = 'gini', max_depth = 10, n_estimators = 50, n_jobs = -1 )
classifier_rf3.fit( X3_treino, y_treino )
y_pred_rf3 = classifier_rf3.predict( X3_teste )

# XGBoost Classifier

##################################################################################
##################################################################################

In [222]:
classifier_xgb1 = XGBClassifier(eval_metric='mlogloss', random_state=0)
classifier_xgb1.fit(X1_treino, y_treino)
y_pred_xgb1 = classifier_xgb1.predict(X1_teste)

classifier_xgb2 = XGBClassifier(eval_metric='mlogloss', random_state=0)
classifier_xgb2.fit(X2_treino, y_treino)
y_pred_xgb2 = classifier_xgb2.predict(X2_teste)

classifier_xgb3 = XGBClassifier(eval_metric='mlogloss', random_state=0)
classifier_xgb3.fit(X3_treino, y_treino)
y_pred_xgb3 = classifier_xgb3.predict(X3_teste)

# Métricas

In [223]:
# plota tab 2x2 
matriz_confusao1 = confusion_matrix( y_teste, y_pred_reglog1 )
matriz_confusao2 = confusion_matrix( y_teste, y_pred_reglog2 )
matriz_confusao3 = confusion_matrix( y_teste, y_pred_reglog3 )

In [224]:
# Métricas Regressão Logística
metrica_reglog1 = calcula_metricas( 'Reg. Logistica 1', reglog1, y_teste, y_pred_reglog1 )
metrica_reglog2 = calcula_metricas( 'Reg. Logistica 2', reglog2, y_teste, y_pred_reglog2 )
metrica_reglog3 = calcula_metricas( 'Reg. Logistica 3', reglog3, y_teste, y_pred_reglog3 )
# Métricas Decision Tree
metrica_dt1 = calcula_metricas( 'Decision Tree 1', classifier_dt1, y_teste, y_pred_dt1 )
metrica_dt2 = calcula_metricas( 'Decision Tree 2', classifier_dt2, y_teste, y_pred_dt2 )
metrica_dt3 = calcula_metricas( 'Decision Tree 3', classifier_dt3, y_teste, y_pred_dt3 )
# Métricas Random Forest
metrica_rf1 = calcula_metricas( 'Random Forest 1', classifier_rf1, y_teste, y_pred_rf1 )
metrica_rf2 = calcula_metricas( 'Random Forest 2', classifier_rf2, y_teste, y_pred_rf2 )
metrica_rf3 = calcula_metricas( 'Random Forest 3', classifier_rf3, y_teste, y_pred_rf3 )
# Métricas XGBoost
metrica_xgb1 = calcula_metricas( 'XGBoost Classifier 1', classifier_xgb1, y_teste, y_pred_xgb1 )
metrica_xgb2 = calcula_metricas( 'XGBoost Classifier 2', classifier_xgb2, y_teste, y_pred_xgb2 )
metrica_xgb3 = calcula_metricas( 'XGBoost Classifier 3', classifier_xgb3, y_teste, y_pred_xgb3 )

## CROSS VALIDATION

In [225]:
X_geral = arrumando_banco( X )

X1_geral = X_geral [[ 'Age', 'Pclass','Fare', 'Sex_male' ]]
X2_geral = X_geral [[ 'Int_Age_Sex', 'Int_Age_Pclass' ]]
X3_geral = X_geral [[ 'Pclass', 'Fare', 'Int_Age_Sex' ]]

## Métricas Cross Validation

In [226]:
# Regressão Logística
metrica_CV_reglog1 = CV_calc_metricas( 'CV - Reg. Logistica 1', reglog1, X1_geral, y )
metrica_CV_reglog2 = CV_calc_metricas( 'CV - Reg. Logistica 2', reglog2, X2_geral, y )
metrica_CV_reglog3 = CV_calc_metricas( 'CV - Reg. Logistica 3', reglog3, X3_geral, y )
# Árvore de Decisão
metrica_CV_classifier_dt1 = CV_calc_metricas( 'CV - Decision Tree 1', classifier_dt1, X1_geral, y )
metrica_CV_classifier_dt2 = CV_calc_metricas( 'CV - Decision Tree 2', classifier_dt2, X2_geral, y )
metrica_CV_classifier_dt3 = CV_calc_metricas( 'CV - Decision Tree 3', classifier_dt3, X3_geral, y )
# Random Forest
metrica_CV_classifier_rf1 = CV_calc_metricas( 'CV - Random Forest 1', classifier_dt1, X1_geral, y )
metrica_CV_classifier_rf2 = CV_calc_metricas( 'CV - Random Forest 2', classifier_dt2, X2_geral, y )
metrica_CV_classifier_rf3 = CV_calc_metricas( 'CV - Random Forest 3', classifier_dt3, X3_geral, y )
# XGBoost Classifier
metrica_CV_classifier_xgb1 = CV_calc_metricas( 'CV - XGBoost Classifier 1', classifier_xgb1, X1_geral, y )
metrica_CV_classifier_xgb2 = CV_calc_metricas( 'CV - XGBoost Classifier 2', classifier_xgb2, X2_geral, y )
metrica_CV_classifier_xgb3 = CV_calc_metricas( 'CV - XGBoost Classifier 3', classifier_xgb3, X3_geral, y )

## Tabelas MÉTRICAS / MÉTRICAS CV

In [227]:
table_information = pd.concat([ metrica_reglog1, metrica_reglog2, metrica_reglog3,
                                metrica_dt1, metrica_dt2, metrica_dt3,
                                metrica_rf1, metrica_rf2, metrica_rf3,
                                metrica_xgb1, metrica_xgb2, metrica_xgb3 ])

table_information.sort_values( "Accuracy", ascending = False ).reset_index( drop = True )

Unnamed: 0,Modelo,Precision,F1,Accuracy
0,Random Forest 1,0.86,0.77,0.82
1,XGBoost Classifier 2,0.84,0.75,0.81
2,XGBoost Classifier 3,0.83,0.75,0.81
3,Decision Tree 3,0.82,0.75,0.81
4,Decision Tree 1,0.82,0.75,0.81
5,Random Forest 3,0.81,0.73,0.8
6,Reg. Logistica 1,0.77,0.74,0.79
7,XGBoost Classifier 1,0.77,0.74,0.79
8,Decision Tree 2,0.77,0.74,0.79
9,Reg. Logistica 3,0.79,0.72,0.79


In [228]:
table2_information = pd.concat([ metrica_CV_reglog1, metrica_CV_reglog2, metrica_CV_reglog3,
                                metrica_CV_classifier_dt1, metrica_CV_classifier_dt2, metrica_CV_classifier_dt3, 
                                metrica_CV_classifier_rf1, metrica_CV_classifier_rf2, metrica_CV_classifier_rf3, 
                                metrica_CV_classifier_xgb1, metrica_CV_classifier_xgb2, metrica_CV_classifier_xgb3
                                ])

table2_information.sort_values( "Accuracy", ascending = False ).reset_index( drop = True )

Unnamed: 0,Modelo,Precision,F1,Accuracy
0,CV - XGBoost Classifier 3,0.8,0.75,0.82
1,CV - Decision Tree 3,0.8,0.74,0.81
2,CV - Random Forest 3,0.8,0.74,0.81
3,CV - Decision Tree 1,0.8,0.73,0.81
4,CV - Random Forest 1,0.8,0.73,0.81
5,CV - XGBoost Classifier 1,0.76,0.74,0.81
6,CV - Decision Tree 2,0.72,0.73,0.79
7,CV - Random Forest 2,0.72,0.73,0.79
8,CV - XGBoost Classifier 2,0.72,0.73,0.79
9,CV - Reg. Logistica 1,0.73,0.72,0.79


## Feature Importance

In [315]:
#features_importance = zip(classifier_rf1.feature_importances_, X2_geral)
#for importance, feature in sorted(features_importance, reverse=True):
    #print("%s: %f%%" % (feature, importance*100))

# 7 - Tunagem de hiperparâmetros
## Random Forest

In [127]:
# Configurando o nível de log do Optuna para suprimir saída detalhada
# optuna.logging.set_verbosity(optuna.logging.WARNING)

# Definindo a função objetivo para otimização
def objective(trial):

    # Parâmetros e faixa de valores a serem testados
    max_depth = trial.suggest_int("rf_max_depth", 2, 64)
    n_estimators = trial.suggest_int("rf_n_estimators", 2, 50)
    min_samples_split = trial.suggest_int("rf_min_samples_split", 2, 10)
      
    # Cria um modelo com os hiperparâmetros sugeridos
    rf_model = RandomForestClassifier(
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        n_estimators=n_estimators,
        n_jobs=-1,
        random_state=42)
    
    # Calcula a pontuação média usando validação cruzada
    score = cross_val_score(rf_model, X3_treino, y_treino, cv=5, scoring='accuracy').mean()   
    return score

# Cria uma instância de estudo do Optuna para maximização
study = optuna.create_study(direction="maximize")

# Realiza a otimização da função objetivo com 100 iterações de teste
study.optimize(objective, n_trials=100)

# Obtém a melhor tentativa após a otimização
trial = study.best_trial

# Imprime a melhor pontuação encontrada
print("Best Score: ", trial.value)

# Imprime os hiperparâmetros correspondentes à melhor pontuação
print("Best Params: ")
for key, value in trial.params.items():
    print("  {}: {}".format(key, value))


[I 2023-08-22 22:19:36,297] A new study created in memory with name: no-name-2a99bdda-ce0d-4bc2-a383-ac2ffbbfd6a3


[I 2023-08-22 22:19:37,326] Trial 0 finished with value: 0.7913419354838711 and parameters: {'rf_max_depth': 33, 'rf_n_estimators': 40, 'rf_min_samples_split': 2}. Best is trial 0 with value: 0.7913419354838711.
[I 2023-08-22 22:19:38,531] Trial 1 finished with value: 0.8154322580645161 and parameters: {'rf_max_depth': 51, 'rf_n_estimators': 42, 'rf_min_samples_split': 10}. Best is trial 1 with value: 0.8154322580645161.
[I 2023-08-22 22:19:39,166] Trial 2 finished with value: 0.8009677419354839 and parameters: {'rf_max_depth': 54, 'rf_n_estimators': 26, 'rf_min_samples_split': 2}. Best is trial 1 with value: 0.8154322580645161.
[I 2023-08-22 22:19:40,237] Trial 3 finished with value: 0.812232258064516 and parameters: {'rf_max_depth': 26, 'rf_n_estimators': 15, 'rf_min_samples_split': 4}. Best is trial 1 with value: 0.8154322580645161.
[I 2023-08-22 22:19:41,231] Trial 4 finished with value: 0.7993806451612904 and parameters: {'rf_max_depth': 28, 'rf_n_estimators': 24, 'rf_min_samples_

Best Score:  0.8315096774193549
Best Params: 
  rf_max_depth: 10
  rf_n_estimators: 47
  rf_min_samples_split: 9


## XGBoost

In [133]:
# Configurando o nível de log do Optuna para suprimir saída detalhada
#optuna.logging.set_verbosity(optuna.logging.WARNING)

# Definindo a função objetivo para otimização
def objective(trial):

    # Parâmetros e faixa de valores a serem testados
    #alfa = trial.suggest_int("xgb_alfa", 0, 20)
    #top_k = trial.suggest_int("xgb_top_k", 0, 3)
    eta = trial.suggest_float("xgb_eta", 0.001, 0.9)
    max_depth = trial.suggest_int("xgb_max_depth", 2, 20)
    subsample = trial.suggest_float("xgb_subsample", 0, 1)

    # Cria um modelo com os hiperparâmetros sugeridos
    xgb_model = XGBClassifier(
        eta = eta,                  
        max_depth = max_depth,
        subsample = subsample,
        random_state = 42)
    
    # Calcula a pontuação média usando validação cruzada
    score = cross_val_score(xgb_model, X3_treino, y_treino, cv=5, scoring='accuracy').mean()   
    return score

# Cria uma instância de estudo do Optuna para maximização
study = optuna.create_study(direction="maximize")

# Realiza a otimização da função objetivo com 100 iterações de teste
study.optimize(objective, n_trials=100)

# Obtém a melhor tentativa após a otimização
trial = study.best_trial

# Imprime a melhor pontuação encontrada
print("Best Score: ", trial.value)

# Imprime os hiperparâmetros correspondentes à melhor pontuação
print("Best Params: ")
for key, value in trial.params.items():
    print("  {}: {}".format(key, value))


[I 2023-08-22 22:47:40,024] A new study created in memory with name: no-name-daedb577-e2c3-41b7-87ea-a5efdf8d55f6


[I 2023-08-22 22:47:40,676] Trial 0 finished with value: 0.785058064516129 and parameters: {'xgb_eta': 0.49457660742710696, 'xgb_max_depth': 11, 'xgb_subsample': 0.16048426259771265}. Best is trial 0 with value: 0.785058064516129.
[I 2023-08-22 22:47:42,115] Trial 1 finished with value: 0.7977806451612903 and parameters: {'xgb_eta': 0.26578944982710423, 'xgb_max_depth': 17, 'xgb_subsample': 0.6438184494245921}. Best is trial 1 with value: 0.7977806451612903.
[I 2023-08-22 22:47:43,442] Trial 2 finished with value: 0.8154064516129033 and parameters: {'xgb_eta': 0.1324514971188056, 'xgb_max_depth': 6, 'xgb_subsample': 0.5676577043712346}. Best is trial 2 with value: 0.8154064516129033.
[I 2023-08-22 22:47:44,243] Trial 3 finished with value: 0.8057935483870968 and parameters: {'xgb_eta': 0.4229264201055834, 'xgb_max_depth': 9, 'xgb_subsample': 0.8703506770733256}. Best is trial 2 with value: 0.8154064516129033.
[I 2023-08-22 22:47:44,962] Trial 4 finished with value: 0.7993419354838711 a

Best Score:  0.8314580645161291
Best Params: 
  xgb_eta: 0.07840454913964941
  xgb_max_depth: 2
  xgb_subsample: 0.7690182569038844


# 8 - Modelo final

In [229]:
# Instanciando o melhor modelo com os melhores parâmetros
rf_model = RandomForestClassifier(
    max_depth = 10,
    min_samples_split = 9,
    n_estimators = 47,
    n_jobs=-1,
    random_state=42)

# Treinando o modelo
rf_model.fit(X3_treino, y_treino)

# Fazendo previsões nos dados de teste com o modelo treinado
y_pred_rf = rf_model.predict(X3_teste)

# Verificando a performance finalo do modelo
metricas_rf = calcula_metricas("Random Forest 3", rf_model, y_teste, y_pred_rf)
metricas_rf

Unnamed: 0,Modelo,Precision,F1,Accuracy
0,Random Forest 3,0.86,0.77,0.82


In [230]:
# Instanciando o melhor modelo com os melhores parâmetros
xgb = XGBClassifier(
    eta = 0.0784,
    max_depth = 2,
    subsample = 0.769,
    random_state = 42)

# Treinando o modelo
xgb.fit(X3_treino, y_treino)

# Fazendo previsões nos dados de teste com o modelo treinado
y_pred_xgb = xgb.predict(X3_teste)

# Verificando a performance finalo do modelo
metricas_xgb = calcula_metricas("XGboost 3", xgb, y_teste, y_pred_xgb)
metricas_xgb

Unnamed: 0,Modelo,Precision,F1,Accuracy
0,XGboost 3,0.87,0.77,0.83


# BANCO KAGGLE

In [231]:
df_kaggle.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.83,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.69,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.66,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.29,,S


In [232]:
df_kaggle = df_kaggle[['Age', 'Sex', 'Pclass', 'Fare']]

In [233]:
df_kaggle.head()

Unnamed: 0,Age,Sex,Pclass,Fare
0,34.5,male,3,7.83
1,47.0,female,3,7.0
2,62.0,male,2,9.69
3,27.0,male,3,8.66
4,22.0,female,3,12.29


In [234]:
df_kaggle = arrumando_banco( df_kaggle )
df_kaggle.columns

Index(['Age', 'Pclass', 'Fare', 'Age_rec', 'Sex_male', 'Int_Age_Sex',
       'Int_Age_Pclass'],
      dtype='object')

In [235]:
df_kaggle = df_kaggle [[ 'Pclass', 'Fare', 'Int_Age_Sex' ]]
df_kaggle.columns

Index(['Pclass', 'Fare', 'Int_Age_Sex'], dtype='object')

In [236]:
df_kaggle.isna().sum()

Pclass         0
Fare           0
Int_Age_Sex    0
dtype: int64

In [237]:
previsoes = rf_model.predict( df_kaggle )
previsoes

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [238]:
submission = df_kaggle_raw[['PassengerId']]

In [239]:
submission['Survived'] = previsoes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission['Survived'] = previsoes


In [240]:
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [241]:
submission.to_csv('../banco/submission.csv', index = False)