# Bibliotecas / Pacotes usados

In [94]:
#%pip install pandas
#%pip install seaborn
#%pip install numpy
#%pip install matplotlib.pyplot
#%pip install -U scikit-learn 

 
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

## Funções

### Configura os gráficos

In [95]:
# ===================================================================
# Configura os gráficos
def jupyter_settings():
    %matplotlib inline
    # %pylab inline
    
    plt.style.use('bmh')
    plt.rcParams['figure.figsize'] = [22, 9]
    plt.rcParams['font.size'] = 21

    # display(HTML('<style>.conteiner{width:100% !important;}</style>'))

    pd.options.display.max_columns = None
    pd.options.display.max_rows = None
    pd.set_option('display.expand_frame_repr', False)
    
    # configura o pandas para quantidade de casas decimais
    pd.set_option('display.float_format', lambda x: '%.2f' % x)

    sns.set()

jupyter_settings()

### Arrumando o Banco para Modelagem

In [122]:
def arrumando_banco(data):

    dataframe = data.copy()

    #1 – IDADE :: imputação de missings
    dataframe['Age'].fillna(dataframe['Age'].median(), inplace=True)

    #2 – IDADE :: recodificação da variával idade (quanti para quali[str])
    conditions = [
    dataframe['Age'] < 12,
        (dataframe['Age'] >= 12) & (dataframe['Age'] < 22),
        (dataframe['Age'] >= 22) & (dataframe['Age'] < 35),
        dataframe['Age'] >= 35
                ]
    choices = [
            'Age - menores que 12 anos',
            'Age - entre 12 e 22 anos',
            'Age - entre 22 e 35 anos',
            'Age - maiores que o 35 anos'
            ]
    dataframe['Age_rec'] = np.select(conditions, choices)

    #3 SEXO :: Transformando quali[str] em quali[num]
    dataframe = pd.get_dummies(dataframe, columns=['Sex'], prefix=['Sex'], dtype = int, drop_first = True)

    #4 – SEXO * IDADE :: criando um fator de interação
    dataframe['Int_Age_Sex'] = dataframe.apply(lambda x: str(x['Sex_male']) + str(x['Age_rec']), axis = 1 )

    #5 – IDADE, SEXO e INTERAÇÃO :: Transformando quali[str] em quali[num]
    label = OrdinalEncoder()
    dataframe[['Age_rec', 'Sex_male', 'Int_Age_Sex']] = label.fit_transform(
        dataframe[['Age_rec', 'Sex_male', 'Int_Age_Sex']])

    return dataframe


# Lendo o BANCO TREINO - Parte 1

In [96]:
df_raw = pd.read_csv('../banco/train.csv',)

In [97]:
df_raw.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.28,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.92,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [98]:
df4 = df_raw.copy()
df4.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

# Separando os bancos "y" e "Xs"

## Banco "y"

In [99]:
y = df4["Survived"]
y.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

## Banco "Xs"

In [100]:
df4.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [101]:
X = df4 [ ['Age', 'Sex'] ]

## Separando os dados de treino e teste

In [102]:
# Separa os dados com 70% dos dados para treino e 30% dos dados para teste
X1_treino, X2_teste, y1_treino, y2_teste = train_test_split( X, y, train_size=0.7 )

In [103]:
print( len(X1_treino) )
print( len(X2_teste) )
print( len(y1_treino) )
print( len(y2_teste) )

623
268
623
268


In [None]:
X2_treino = arrumando_banco(X1_treino)

## Preenchendo os missings

### Variável Idade

In [104]:
# Preenchendo valores faltantes de uma coluna com valor especificado (mediana)
X1_treino['Age'].fillna(X1_treino['Age'].median(), inplace=True)

## Recodificação das variáveis

### Recode variável :: Idade

In [105]:
q1, q2, q3 = np.percentile(X1_treino['Age'], [25, 50, 75])
print(q1)
print(q2)
print(q3)

22.0
28.25
35.0


In [106]:
conditions = [
   X1_treino['Age'] < 12,
    (X1_treino['Age'] >= 12) & (X1_treino['Age'] < 22),
    (X1_treino['Age'] >= 22) & (X1_treino['Age'] < 35),
    X1_treino['Age'] >= 35
            ]
choices = [
           'Age - menores que 12 anos',
           'Age - entre 12 e 22 anos',
           'Age - entre 22 e 35 anos',
           'Age - maiores que o 35 anos'
           ]

# criar nova coluna "var_category" baseada em condições e escolhas
X1_treino['Age_rec'] = np.select(conditions, choices)

In [107]:
label = OrdinalEncoder()
X1_treino[['Age_rec']] = label.fit_transform(
    X1_treino[['Age_rec']])

### Recode variável :: Tarifa do Passageiro

In [108]:
#q1, q2, q3 = np.percentile(df4['Fare'], [25, 50, 75])
#print(q1)
#print(q2)
#print(q3)

In [109]:
#conditions = [
   #df4['Fare'] < q1,
    #(df4['Fare'] >= q1) & (df4['Fare'] < q2),
    #(df4['Fare'] >= q2) & (df4['Fare'] < q3),
    #df4['Fare'] >= q3
            #]
#choices = [
          # 'Fare - menores que 1º quartil',
          # 'Fare - entre 1º e 2º quartil',
           #'Fare - entre 2º e 3º quartil',
           #'Fare - maiores que o 3º quartil'
           #]

# criar nova coluna "pop_category" baseada em condições e escolhas
#df4['Fare_rec'] = np.select(conditions, choices)

In [110]:
#label = OrdinalEncoder()
#df4[['Fare_rec']] = label.fit_transform(
    #df4[['Fare_rec']])

### Recode variável :: Sexo

In [111]:
X1_treino = pd.get_dummies(X1_treino, columns=["Sex"], prefix=["Sex"], dtype = int, drop_first = True)

In [112]:
X1_treino.head()

Unnamed: 0,Age,Age_rec,Sex_male
563,28.25,1.0,1
626,57.0,2.0,1
306,28.25,1.0,0
152,55.5,2.0,1
196,28.25,1.0,1


## Criando Interação entre variáveis categóricas

### Interação entre faixa_idade e sexo

In [113]:
X1_treino['Int_Age_Sex'] = X1_treino.apply(lambda x: str(x['Sex_male']) + str(x['Age_rec']), axis = 1 )

In [114]:
label = OrdinalEncoder()
X1_treino[['Int_Age_Sex']] = label.fit_transform(
    X1_treino[['Int_Age_Sex']])

In [115]:
X1_treino['Int_Age_Sex'].value_counts()

Int_Age_Sex
5.00    214
6.00    107
1.00    101
4.00     63
2.00     59
7.00     30
0.00     30
3.00     19
Name: count, dtype: int64

### Interação entre tarifa_passageiro e sexo

In [116]:
#df4['Int_Fare_Sex'] = df4.apply(lambda x: x['Sex_male'] * x['Fare_rec'], axis = 1 )

### Interação entre Pclass e sexo

In [117]:
#df4['Int_Pclass_Sex'] = df4.apply(lambda x: x['Sex_male'] * x['Pclass'], axis = 1 )

## Banco1 Xs Final

In [118]:
X1_treino.drop(columns = ['Age', 'Int_Age_Sex'], inplace = True)

In [119]:
X1_treino.head()

Unnamed: 0,Age_rec,Sex_male
563,1.0,1
626,2.0,1
306,1.0,0
152,2.0,1
196,1.0,1


# BANCO1 TREINO :: Modelando Regressão Logística

In [120]:
funcao_logistica = LogisticRegression()
funcao_logistica.fit( X1_treino, y1_treino )


# BANCO2 TESTE ::

In [121]:
X2_teste.head()

Unnamed: 0,Age,Sex
254,41.0,female
724,27.0,male
448,5.0,female
502,,female
845,42.0,male


## PASSO A PASSO - Igualando os bancos Treino e Teste

In [123]:
X3_teste = arrumando_banco(X2_teste)

In [124]:
X3_teste.head()

Unnamed: 0,Age,Age_rec,Sex_male,Int_Age_Sex
254,41.0,2.0,0.0,2.0
724,27.0,1.0,1.0,5.0
448,5.0,3.0,0.0,3.0
502,27.5,1.0,0.0,1.0
845,42.0,2.0,1.0,6.0


In [125]:
X3_teste.drop(columns = ['Age', 'Int_Age_Sex'], inplace = True) #X1, x2, x3


In [126]:
X3_teste.head()

Unnamed: 0,Age_rec,Sex_male
254,2.0,0.0
724,1.0,1.0
448,3.0,0.0
502,1.0,0.0
845,2.0,1.0


In [204]:
X2_teste['Age'].fillna(X2_teste['Age'].median(), inplace=True)

In [205]:
conditions = [
   X2_teste['Age'] < 12,
    (X2_teste['Age'] >= 12) & (X2_teste['Age'] < 22),
    (X2_teste['Age'] >= 22) & (X2_teste['Age'] < 35),
    X2_teste['Age'] >= 35
            ]
choices = [
           'Age - menores que 12 anos',
           'Age - entre 12 e 22 anos',
           'Age - entre 22 e 35 anos',
           'Age - maiores que o 35 anos'
           ]

# criar nova coluna "var_category" baseada em condições e escolhas
X2_teste['Age_rec'] = np.select(conditions, choices)

In [206]:
label = OrdinalEncoder()
X2_teste[['Age_rec']] = label.fit_transform(
    X2_teste[['Age_rec']])

### Recode variável :: Sexo

In [207]:
X2_teste = pd.get_dummies(X2_teste, columns=["Sex"], prefix=["Sex"], dtype = int, drop_first = True)

## Criando Interação entre variáveis categóricas

### Interação entre faixa_idade e sexo

In [208]:
X2_teste['Int_Age_Sex'] = X2_teste.apply(lambda x: str(x['Sex_male']) + str(x['Age_rec']), axis = 1 )

In [209]:
label = OrdinalEncoder()
X2_teste[['Int_Age_Sex']] = label.fit_transform(
    X2_teste[['Int_Age_Sex']])

## Banco2 Xs Final

In [210]:
X2_teste.drop(columns = ['Age', 'Age_rec', 'Sex_male'], inplace = True)

In [211]:
X2_teste.head()

Unnamed: 0,Int_Age_Sex
38,0.0
110,6.0
237,3.0
250,5.0
300,1.0


In [127]:
previsoes = funcao_logistica.predict( X3_teste )
previsoes

array([1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0,
       0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1,
       0, 1, 0, 0], dtype=int64)

In [128]:
confusion_matrix ( y2_teste, previsoes )

array([[133,  26],
       [ 30,  79]], dtype=int64)

In [133]:
dicionario =  classification_report ( y2_teste, previsoes, output_dict = True )

In [142]:
metricas = pd.DataFrame({'modelo': 'y3_teste', 'accuracy': dicionario['accuracy']}, index=[0])
metricas2 = pd.DataFrame({'modelo': 'y4_teste', 'accuracy': dicionario['accuracy']}, index=[1])

metricas3 = pd.concat([metricas, metricas2])

In [143]:
metricas3

Unnamed: 0,modelo,accuracy
0,y3_teste,0.79
1,y4_teste,0.79


In [130]:
y2_teste.head()

254    0
724    1
448    1
502    0
845    0
Name: Survived, dtype: int64

# BANCO TESTE

In [72]:
df_kaggle_raw = pd.read_csv('../banco/test.csv')

In [73]:
df_kaggle = df_kaggle_raw.copy()

In [74]:
df_kaggle.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.83,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.69,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.66,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.29,,S


In [75]:
df_kaggle = df_kaggle[['Age', 'Sex']]

In [76]:
df_kaggle.head()

Unnamed: 0,Age,Sex
0,34.5,male
1,47.0,female
2,62.0,male
3,27.0,male
4,22.0,female


In [77]:
df_kaggle = arrumando_banco(df_kaggle)

In [220]:
# Preenchendo valores faltantes de uma coluna com valor especificado (mediana)
df_kaggle['Age'].fillna(df_kaggle['Age'].median(), inplace=True)

In [221]:
conditions = [
   df_kaggle['Age'] < 12,
    (df_kaggle['Age'] >= 12) & (df_kaggle['Age'] < 22),
    (df_kaggle['Age'] >= 22) & (df_kaggle['Age'] < 35),
    df_kaggle['Age'] >= 35
            ]
choices = [
           'Age - menores que 12 anos',
           'Age - entre 12 e 22 anos',
           'Age - entre 22 e 35 anos',
           'Age - maiores que o 35 anos'
           ]

# criar nova coluna "var_category" baseada em condições e escolhas
df_kaggle['Age_rec'] = np.select(conditions, choices)

In [222]:
label = OrdinalEncoder()
df_kaggle[['Age_rec']] = label.fit_transform(
    df_kaggle[['Age_rec' ]])

In [223]:
df_kaggle = pd.get_dummies(df_kaggle, columns=["Sex"], prefix=["Sex"], dtype = int, drop_first = True)

In [224]:
df_kaggle['Int_Age_Sex'] = df_kaggle.apply(lambda x: str(x['Sex_male']) + str(x['Age_rec']), axis = 1 )

In [225]:
label = OrdinalEncoder()
df_kaggle[['Int_Age_Sex']] = label.fit_transform(
    df_kaggle[['Int_Age_Sex']])

In [230]:
df_kaggle.head()

Unnamed: 0,Age_rec,Sex_male,Int_Age_Sex
0,1.0,1,5.0
1,2.0,0,2.0
2,2.0,1,6.0
3,1.0,1,5.0
4,1.0,0,1.0


# VERIFICANDO AS VARIÁVEIS DO BANCO

In [79]:
df_kaggle.drop(columns = ['Age', 'Int_Age_Sex'], inplace = True)

In [80]:
df_kaggle.head()

Unnamed: 0,Age_rec,Sex_male
0,1.0,1.0
1,2.0,0.0
2,2.0,1.0
3,1.0,1.0
4,1.0,0.0


In [81]:
previsoes = funcao_logistica.predict( df_kaggle )
previsoes

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [82]:
submission = df_kaggle_raw[['PassengerId']]

In [89]:
submission['Survived'] = previsoes

AttributeError: 'numpy.ndarray' object has no attribute 'loc'

In [84]:
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [85]:
submission.to_csv('../banco/submission.csv', index = False)