In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import StandardScaler

In [2]:
df_original = pd.read_csv('train.csv')

In [3]:
df_original.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
PassengerId,891.0,446.0,257.353842,1.0,223.5,446.0,668.5,891.0
Survived,891.0,0.383838,0.486592,0.0,0.0,0.0,1.0,1.0
Pclass,891.0,2.308642,0.836071,1.0,2.0,3.0,3.0,3.0
Age,714.0,29.699118,14.526497,0.42,20.125,28.0,38.0,80.0
SibSp,891.0,0.523008,1.102743,0.0,0.0,0.0,1.0,8.0
Parch,891.0,0.381594,0.806057,0.0,0.0,0.0,0.0,6.0
Fare,891.0,32.204208,49.693429,0.0,7.9104,14.4542,31.0,512.3292


In [4]:
df_original.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [5]:
mediana_classe1 = df_original[df_original['Pclass'] == 1]['Age'].median()
mediana_classe2 = df_original[df_original['Pclass'] == 2]['Age'].median()
mediana_classe3 = df_original[df_original['Pclass'] == 3]['Age'].median()

for index, row in df_original.iterrows():
    if row['Pclass'] == 1 and pd.isnull(row['Age']):
        df_original.at[index, 'Age'] = mediana_classe1
    elif row['Pclass'] == 2 and pd.isnull(row['Age']):
        df_original.at[index, 'Age'] = mediana_classe2
    elif row['Pclass'] == 3 and pd.isnull(row['Age']):
        df_original.at[index, 'Age'] = mediana_classe3

In [6]:
df_original_1 = df_original.drop(['Cabin', 'Name', 'Ticket'], axis=1)

In [7]:
df_numerico = df_original_1.select_dtypes(include='number').drop(['PassengerId', 'Survived', 'Pclass'], axis=1)

In [8]:
#Padronização com Sklearn
standard_data = StandardScaler().fit_transform(df_numerico)
df_standard = pd.DataFrame(standard_data, columns=df_numerico.columns)
df_standard

Unnamed: 0,Age,SibSp,Parch,Fare
0,-0.533834,0.432793,-0.473674,-0.502445
1,0.674891,0.432793,-0.473674,0.786845
2,-0.231653,-0.474545,-0.473674,-0.488854
3,0.448255,0.432793,-0.473674,0.420730
4,0.448255,-0.474545,-0.473674,-0.486337
...,...,...,...,...
886,-0.156107,-0.474545,-0.473674,-0.386671
887,-0.760469,-0.474545,-0.473674,-0.044381
888,-0.382743,0.432793,2.008933,-0.176263
889,-0.231653,-0.474545,-0.473674,-0.044381


In [9]:
df_categorico = df_original_1.drop(columns=df_numerico.columns)
df_categorico

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Embarked
0,1,0,3,male,S
1,2,1,1,female,C
2,3,1,3,female,S
3,4,1,1,female,S
4,5,0,3,male,S
...,...,...,...,...,...
886,887,0,2,male,S
887,888,1,1,female,S
888,889,0,3,female,S
889,890,1,1,male,C


In [10]:
#codificação one-hot

one_hot_encoded_data = pd.get_dummies(df_categorico, columns=['Sex', 'Embarked'])

In [11]:
one_hot_encoded_data

Unnamed: 0,PassengerId,Survived,Pclass,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,0,1,0,0,1
1,2,1,1,1,0,1,0,0
2,3,1,3,1,0,0,0,1
3,4,1,1,1,0,0,0,1
4,5,0,3,0,1,0,0,1
...,...,...,...,...,...,...,...,...
886,887,0,2,0,1,0,0,1
887,888,1,1,1,0,0,0,1
888,889,0,3,1,0,0,0,1
889,890,1,1,0,1,1,0,0


In [12]:
pre_df = df_standard.join(one_hot_encoded_data, how="inner")
pre_df.isnull().sum()

Age            0
SibSp          0
Parch          0
Fare           0
PassengerId    0
Survived       0
Pclass         0
Sex_female     0
Sex_male       0
Embarked_C     0
Embarked_Q     0
Embarked_S     0
dtype: int64

In [13]:
pre_df['Survived'].value_counts()

0    549
1    342
Name: Survived, dtype: int64

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

In [15]:
#divisão 70:30
X_treino, X_teste, y_treino, y_teste = train_test_split(pre_df.drop(['Survived'], axis=1), pre_df['Survived'], test_size=0.3, random_state=0)

In [81]:
#treinar modelo
lr = LogisticRegression(max_iter=3000)
lr.fit(X_treino, y_treino)
predictions = lr.predict(X_teste)

In [82]:
#imprimir relatório
print(classification_report(y_teste, predictions))
print("-----------------------------------------")
print(confusion_matrix(y_teste, predictions))

              precision    recall  f1-score   support

           0       0.84      0.86      0.85       168
           1       0.75      0.72      0.73       100

    accuracy                           0.81       268
   macro avg       0.79      0.79      0.79       268
weighted avg       0.80      0.81      0.81       268

-----------------------------------------
[[144  24]
 [ 28  72]]


In [37]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=2)

X_treino_res, y_treino_res = sm.fit_resample(X_treino, y_treino.ravel())

In [43]:
#Após o Oversampling
print(sum(y_treino_res == 1))
print(sum(y_treino_res == 0))

381
381


In [79]:
#treinar modelo
lr1 = LogisticRegression(max_iter=3000)
lr1.fit(X_treino_res, y_treino_res)
predictions = lr1.predict(X_teste)

In [80]:
#imprimir relatório
print(classification_report(y_teste, predictions))
print("-----------------------------------------")
print(confusion_matrix(y_teste, predictions))

              precision    recall  f1-score   support

           0       0.85      0.83      0.84       168
           1       0.72      0.75      0.74       100

    accuracy                           0.80       268
   macro avg       0.78      0.79      0.79       268
weighted avg       0.80      0.80      0.80       268

-----------------------------------------
[[139  29]
 [ 25  75]]


In [54]:
from imblearn.under_sampling import NearMiss

nr = NearMiss()
X_treino_miss, y_treino_miss = nr.fit_resample(X_treino, y_treino.ravel())

In [56]:
#Apos o Undersampling
print(sum(y_treino_miss == 1))
print(sum(y_treino_miss == 0))

242
242


In [77]:
#treinar modelo
lr2 = LogisticRegression(max_iter=3000)
lr2.fit(X_treino_miss, y_treino_miss)
predictions = lr2.predict(X_teste)

In [78]:
#imprimir relatório
print(classification_report(y_teste, predictions))
print("-----------------------------------------")
print(confusion_matrix(y_teste, predictions))

              precision    recall  f1-score   support

           0       0.85      0.80      0.82       168
           1       0.69      0.77      0.73       100

    accuracy                           0.79       268
   macro avg       0.77      0.78      0.78       268
weighted avg       0.79      0.79      0.79       268

-----------------------------------------
[[134  34]
 [ 23  77]]
