# Bibliotecas / Pacotes usados

In [27]:
#%pip install pandas
#%pip install seaborn
#%pip install numpy
#%pip install matplotlib
#%pip install -U scikit-learn 

 
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics        import precision_score, f1_score, accuracy_score, auc
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

## Funções

### Configura os gráficos

In [28]:
# ===================================================================
# Configura os gráficos
def jupyter_settings():
    %matplotlib inline
    # %pylab inline
    
    plt.style.use('bmh')
    plt.rcParams['figure.figsize'] = [22, 9]
    plt.rcParams['font.size'] = 21

    # display(HTML('<style>.conteiner{width:100% !important;}</style>'))

    pd.options.display.max_columns = None
    pd.options.display.max_rows = None
    pd.set_option('display.expand_frame_repr', False)
    
    # configura o pandas para quantidade de casas decimais
    pd.set_option('display.float_format', lambda x: '%.2f' % x)

    sns.set()

jupyter_settings()

### Arrumando o Banco para Modelagem

In [29]:
def arrumando_banco(data):
    """Prepare a Titanic feature table for modelling.

    Works on a copy of ``data``; the caller's frame is never modified.

    Steps:
      1. Fill missing ``Age`` with the column median and, when the column is
         present, missing ``Pclass`` with the column mean.
      2. Bucket ``Age`` into four bands and store the band code in ``Age_rec``.
      3. Replace ``Sex`` with the indicator ``Sex_male`` (1 for ``'male'``,
         0 for any other value, including missing ones).
      4. Build ``Int_Age_Sex``, the Sex x Age-band interaction, as
         ``Sex_male * 4 + Age_rec``.

    The codes are fixed rather than learned from the rows passed in, so the
    training split, the test split and the Kaggle file are always encoded the
    same way even when one of them lacks a category:

        Age_rec  0 = 12 <= Age < 22
                 1 = 22 <= Age < 35
                 2 = Age >= 35
                 3 = Age < 12

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``Age`` and ``Sex``. ``Pclass`` is optional. Any other
        column is passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` without ``Sex`` and with the int64 columns
        ``Age_rec``, ``Sex_male`` and ``Int_Age_Sex`` appended in that order.
        ``Pclass``, when present, is cast to int64 (a mean-imputed value is
        truncated by the cast).

    Raises
    ------
    ValueError
        If ``Age`` still has missing values after imputation, i.e. the column
        had no value to take a median from.
    """
    dataframe = data.copy()

    # 1 - Imputation. The result is assigned back to the column: calling
    # fillna(inplace=True) on a column selection is chained assignment and
    # is not guaranteed to reach `dataframe`.
    dataframe['Age'] = dataframe['Age'].fillna(dataframe['Age'].median())
    tem_pclass = 'Pclass' in dataframe.columns
    if tem_pclass:
        dataframe['Pclass'] = dataframe['Pclass'].fillna(dataframe['Pclass'].mean())

    if dataframe['Age'].isna().any():
        raise ValueError(
            "'Age' has no non-missing values, so the median imputation "
            "cannot fill it"
        )

    # 2 - Age band. np.select keeps the first matching condition, so each
    # test only needs the upper bound of its band; ages of 35 and over fall
    # through to the default.
    idade = dataframe['Age']
    dataframe['Age_rec'] = np.select(
        [idade < 12, idade < 22, idade < 35],
        [3, 0, 1],
        default=2,
    ).astype('int64')

    # 3 - Sex indicator, computed directly so the column exists even when
    # the rows passed in contain a single sex.
    sexo_masculino = (dataframe['Sex'] == 'male').astype('int64').to_numpy()
    dataframe = dataframe.drop(columns=['Sex'])
    dataframe['Sex_male'] = sexo_masculino

    # 4 - Interaction: four age bands per sex, females 0-3 and males 4-7.
    dataframe['Int_Age_Sex'] = dataframe['Sex_male'] * 4 + dataframe['Age_rec']

    if tem_pclass:
        dataframe['Pclass'] = dataframe['Pclass'].astype('int64')
    return dataframe


# Lendo o BANCO - Parte 1

In [30]:
# Load the labelled training file; the path is relative to the notebook's working directory.
df_raw = pd.read_csv('../banco/train.csv')

In [31]:
# Load the second file, which has no 'Survived' column; predictions for it are written out at the end.
df_kaggle_raw = pd.read_csv('../banco/test.csv')

In [32]:
# Work on a copy so df_raw stays exactly as loaded.
df = df_raw.copy()
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [33]:
# Work on a copy so df_kaggle_raw stays exactly as loaded.
df_kaggle = df_kaggle_raw.copy()
df_kaggle.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

# Separando os bancos "y" e "Xs"

## Banco "y"

In [34]:
# Target vector: the 'Survived' column.
y = df["Survived"]
y.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [35]:
y.describe()

count   891.00
mean      0.38
std       0.49
min       0.00
25%       0.00
50%       0.00
75%       1.00
max       1.00
Name: Survived, dtype: float64

## Banco "Xs"

In [36]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [37]:
# Candidate predictors, still raw; arrumando_banco cleans and encodes them later.
X = df[['Age', 'Sex', 'Pclass', 'Fare']]

In [38]:
X.describe()

Unnamed: 0,Age,Pclass,Fare
count,714.0,891.0,891.0
mean,29.7,2.31,32.2
std,14.53,0.84,49.69
min,0.42,1.0,0.0
25%,20.12,2.0,7.91
50%,28.0,3.0,14.45
75%,38.0,3.0,31.0
max,80.0,3.0,512.33


## Separando os dados de treino e teste

In [39]:
# Hold out 30% of the rows for testing and train on the remaining 70%.
# random_state pins the shuffle so the split, and every metric computed from
# it further down, is the same on each Restart & Run All.
X_treino, X_teste, y_treino, y_teste = train_test_split(
    X, y, train_size=0.7, random_state=0
)

In [40]:
# Row counts of the four pieces of the split, one per line.
print(len(X_treino), len(X_teste), len(y_treino), len(y_teste), sep='\n')

623
268
623
268


In [41]:
# Clean and encode the training predictors.
Xa_treino = arrumando_banco( X_treino )

In [42]:
# Clean and encode the held-out predictors with the same function.
Xa_teste = arrumando_banco( X_teste )

In [43]:
# Three candidate feature sets for the training split, named by what each
# one keeps rather than by what it drops.
X1_treino = Xa_treino[['Age', 'Fare', 'Sex_male']]
X2_treino = Xa_treino[['Age', 'Pclass', 'Fare', 'Sex_male']]
X3_treino = Xa_treino[['Age', 'Pclass', 'Sex_male']]

In [44]:
Xa_teste.head()

Unnamed: 0,Age,Pclass,Fare,Age_rec,Sex_male,Int_Age_Sex
439,31.0,2,10.5,1,1,5
731,11.0,3,18.79,3,1,7
816,23.0,3,7.92,1,0,1
585,18.0,1,79.65,0,0,0
538,28.0,3,14.5,1,1,5


In [45]:
Xa_teste.dtypes

Age            float64
Pclass           int64
Fare           float64
Age_rec          int64
Sex_male         int64
Int_Age_Sex      int64
dtype: object

In [46]:
# The same three feature sets for the held-out split, in the same column order.
X1_teste = Xa_teste[['Age', 'Fare', 'Sex_male']]
X2_teste = Xa_teste[['Age', 'Pclass', 'Fare', 'Sex_male']]
X3_teste = Xa_teste[['Age', 'Pclass', 'Sex_male']]

##################################################################################
##################################################################################

# BANCO1 TREINO :: Modelando Regressão Logística

##################################################################################
##################################################################################

In [None]:
def calcula_metricas(nome, y_test, y_pred):
    """Score one set of predictions and return the result as a one-row table.

    Parameters
    ----------
    nome : label stored in the "Modelo" column.
    y_test : true labels.
    y_pred : predicted labels, compared against ``y_test``.

    Returns
    -------
    pandas.DataFrame
        A single row with the columns "Modelo", "Precision", "F1" and
        "Accuracy".
    """
    resultado = {
        "Modelo": nome,
        "Precision": precision_score(y_test, y_pred),
        "F1": f1_score(y_test, y_pred),
        "Accuracy": accuracy_score(y_test, y_pred),
    }
    return pd.DataFrame([resultado])

In [47]:
# Fit one logistic regression per feature set, all with the same seed.
regressao1, regressao2, regressao3 = (
    LogisticRegression(random_state=0).fit(X_ajuste, y_treino)
    for X_ajuste in (X1_treino, X2_treino, X3_treino)
)

# Predict the held-out split with each model, using its matching feature set.
y_pred1, y_pred2, y_pred3 = (
    modelo.predict(X_avaliacao)
    for modelo, X_avaliacao in zip(
        (regressao1, regressao2, regressao3),
        (X1_teste, X2_teste, X3_teste),
    )
)

# Métricas

In [48]:
# One confusion matrix per logistic model, all against the same held-out labels.
matriz_confusao1, matriz_confusao2, matriz_confusao3 = (
    confusion_matrix(y_teste, previsto)
    for previsto in (y_pred1, y_pred2, y_pred3)
)

In [49]:
# Classification report of each logistic model as a dict, so individual
# metrics can be looked up by key.
dicionarioA, dicionarioB, dicionarioC = (
    classification_report(y_teste, previsto, output_dict=True)
    for previsto in (y_pred1, y_pred2, y_pred3)
)

In [50]:
# Accuracy of each logistic model as a single-row frame; the explicit index
# gives the rows the labels 0, 1, 2 once they are stacked.
metricasA, metricasB, metricasC = (
    pd.DataFrame({'modelo': rotulo, 'accuracy': relatorio['accuracy']}, index=[linha])
    for linha, (rotulo, relatorio) in enumerate([
        ('Modelo1: Age, Sex, Fare', dicionarioA),
        ('Modelo2: Age, Pclass, Fare, Sex', dicionarioB),
        ('Modelo3: Age, Pclass, Sex', dicionarioC),
    ])
)

metricas_RL = pd.concat([metricasA, metricasB, metricasC])

In [51]:
metricas_RL

Unnamed: 0,modelo,accuracy
0,"Modelo1: Age, Sex, Fare",0.81
1,"Modelo2: Age, Pclass, Fare, Sex",0.79
2,"Modelo3: Age, Pclass, Sex",0.79


##################################################################################
##################################################################################

# Decision Tree Classifier

##################################################################################
##################################################################################

In [52]:
from sklearn.tree import DecisionTreeClassifier

# Shallow decision tree split on entropy, fitted on feature set 1.
classifier_dt = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=3,
    random_state=42,
)
classifier_dt.fit(X1_treino, y_treino)

In [53]:
# Held-out predictions of the decision tree.
# NOTE(review): y_pred_dt is not scored or displayed in any cell visible here.
y_pred_dt = classifier_dt.predict(X1_teste)

## CROSS VALIDATION

In [54]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated accuracy of the decision tree on the training split.
scores_dt = cross_val_score(
    classifier_dt, X1_treino, y_treino, cv=5, scoring='accuracy'
)

print(scores_dt.mean())

0.7815870967741936


## Random Forest Classifier

In [55]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 50 trees (depth capped at 10), scored with 5-fold
# cross-validated accuracy on the training split.
classifier_rf = RandomForestClassifier(
    n_estimators=50,
    max_depth=10,
    criterion='gini',
    random_state=1986,
    n_jobs=-1,
)
scores_rf = cross_val_score(
    classifier_rf, X1_treino, y_treino, cv=5, scoring='accuracy'
)

print(scores_rf.mean())

0.7928774193548389


## Feature Importance

In [None]:
classifier_rf.fit(X1_treino, y_treino)  # fit on the whole training split

# Pair every importance with the column it was computed for. The names come
# from the frame the forest was just fitted on, so the two sequences line up.
features_importance = zip(classifier_rf.feature_importances_, X1_treino.columns)
for importance, feature in sorted(features_importance, reverse=True):
    print("%s: %f%%" % (feature, importance*100))

##################################################################################
##################################################################################
# Random Forest Regressor
##################################################################################
##################################################################################

In [39]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression

In [53]:
# Instantiate the model with 100 decision trees, each limited to depth 2
# NOTE(review): y_treino holds 0/1 survival labels, so a regressor is being
# fitted to a binary target here - confirm this is intended.
rf = RandomForestRegressor(
                        max_depth=2, 
                        criterion='absolute_error', 
                        n_estimators = 100,
                        random_state = 0)

# Train the model on training data
rf.fit(X1_treino, y_treino)

In [57]:
# Use the forest's predict method on the test data
y_pred_rf1 = rf.predict(X1_teste)

# The regressor returns a numeric score while the classification metrics
# need 0/1 labels, so the score is cut at 0.5 before it is evaluated.
y_pred_rf1_classe = (y_pred_rf1 >= 0.5).astype(int)

# Same one-row metrics table the helper builds for any other model.
rf_metricas = calcula_metricas('Random Forest', y_teste, y_pred_rf1_classe)
rf_metricas

# BANCO TESTE

In [None]:
df_kaggle.head()

In [None]:
# Keep only the two raw columns used for the Kaggle predictions.
df_kaggle = df_kaggle[['Age', 'Sex']]

In [None]:
df_kaggle.head()

In [None]:
# Run the Kaggle rows through the same arrumando_banco pipeline used for the
# train/test splits, instead of repeating its steps by hand in separate cells
# (doing both fails: the function removes 'Sex', which the manual steps need).
# The input is re-selected from df_kaggle_raw so the cell can be re-run
# safely. 'Pclass' is included because arrumando_banco imputes and casts it,
# and is dropped afterwards because it is not one of the features kept here.
df_kaggle = (
    arrumando_banco(df_kaggle_raw[['Age', 'Sex', 'Pclass']])
    .drop(columns=['Pclass'])
)

In [None]:
df_kaggle.head()

# VERIFICANDO AS VARIÁVEIS DO BANCO

In [None]:
# Discard the raw age and the interaction term from the Kaggle features.
df_kaggle = df_kaggle.drop(columns=['Age', 'Int_Age_Sex'])

In [None]:
df_kaggle.head()

In [None]:
# Columns left in df_kaggle after the drop above; the model must be fitted on
# exactly these, in this order.
colunas_kaggle = ['Age_rec', 'Sex_male']

# NOTE(review): the original cell called `funcao_logistica` without any cell
# defining it. It is fitted here on the matching training columns so the
# notebook runs from a fresh kernel - confirm this is the intended model.
funcao_logistica = LogisticRegression(random_state=0)
funcao_logistica.fit(Xa_treino[colunas_kaggle], y_treino)

previsoes = funcao_logistica.predict( df_kaggle[colunas_kaggle] )
previsoes

In [None]:
# Take an independent copy: a column is added to `submission` next, and
# assigning into a selection of df_kaggle_raw triggers SettingWithCopyWarning.
submission = df_kaggle_raw[['PassengerId']].copy()

In [None]:
# Attach the predictions as the 'Survived' column, matched to the rows by position.
submission['Survived'] = previsoes

In [None]:
submission.head()

In [None]:
# Write the submission file without the index column.
submission.to_csv('../banco/submission.csv', index = False)