In [22]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

### Construção do Dataframe

In [23]:
# Load the Titanic passenger data. index_col=False tells pandas not to use
# any CSV column as the index, so the frame gets a default RangeIndex.
df = pd.read_csv("titanic.csv", index_col=False)

# Preview the first 3 rows.
df.head(n=3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [24]:
# Show the last 3 rows.
df.tail(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [25]:
# Inspect the row index of the DataFrame.
df.index

RangeIndex(start=0, stop=891, step=1)

In [26]:
# List the column labels.
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [27]:
# Size of the DataFrame as (rows, columns).
df.shape

(891, 12)

In [28]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [29]:
# Per-column dtype and non-null count, plus overall memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [30]:
# Drop the columns that are not used in the rest of the notebook.
# The result is assigned back to `df` instead of using inplace=True: inplace
# gives no performance benefit, prevents method chaining and hides the
# mutation from readers of earlier cells.
df = df.drop(columns=['Name', 'Ticket', 'SibSp', 'Parch', 'Cabin'])

### Padronização com StandardScaler

In [31]:
# Create the StandardScaler that will be used to standardize the fare values.
scaler = preprocessing.StandardScaler()

In [32]:
# Extract the Fare column as a NumPy array; this is the data to be scaled.
fare = df['Fare'].to_numpy()

# Display the raw fares.
fare

array([  7.25  ,  71.2833,   7.925 ,  53.1   ,   8.05  ,   8.4583,
        51.8625,  21.075 ,  11.1333,  30.0708,  16.7   ,  26.55  ,
         8.05  ,  31.275 ,   7.8542,  16.    ,  29.125 ,  13.    ,
        18.    ,   7.225 ,  26.    ,  13.    ,   8.0292,  35.5   ,
        21.075 ,  31.3875,   7.225 , 263.    ,   7.8792,   7.8958,
        27.7208, 146.5208,   7.75  ,  10.5   ,  82.1708,  52.    ,
         7.2292,   8.05  ,  18.    ,  11.2417,   9.475 ,  21.    ,
         7.8958,  41.5792,   7.8792,   8.05  ,  15.5   ,   7.75  ,
        21.6792,  17.8   ,  39.6875,   7.8   ,  76.7292,  26.    ,
        61.9792,  35.5   ,  10.5   ,   7.2292,  27.75  ,  46.9   ,
         7.2292,  80.    ,  83.475 ,  27.9   ,  27.7208,  15.2458,
        10.5   ,   8.1583,   7.925 ,   8.6625,  10.5   ,  46.9   ,
        73.5   ,  14.4542,  56.4958,   7.65  ,   7.8958,   8.05  ,
        29.    ,  12.475 ,   9.    ,   9.5   ,   7.7875,  47.1   ,
        10.5   ,  15.85  ,  34.375 ,   8.05  , 263.    ,   8.0

In [33]:
# Fit the scaler on the fares and transform them in a single step.
# reshape(-1, 1) turns the 1-D fare array into a single-column 2-D array,
# which is the layout passed to fit_transform here.
fare_scaled = scaler.fit_transform(fare.reshape(-1, 1))

fare_scaled

array([[-5.02445171e-01],
       [ 7.86845294e-01],
       [-4.88854258e-01],
       [ 4.20730236e-01],
       [-4.86337422e-01],
       [-4.78116429e-01],
       [ 3.95813561e-01],
       [-2.24083121e-01],
       [-4.24256141e-01],
       [-4.29555021e-02],
       [-3.12172378e-01],
       [-1.13845709e-01],
       [-4.86337422e-01],
       [-1.87093118e-02],
       [-4.90279793e-01],
       [-3.26266659e-01],
       [-6.19988892e-02],
       [-3.86670720e-01],
       [-2.85997284e-01],
       [-5.02948539e-01],
       [-1.24919787e-01],
       [-3.86670720e-01],
       [-4.86756223e-01],
       [ 6.63597416e-02],
       [-2.24083121e-01],
       [-1.64441595e-02],
       [-5.02948539e-01],
       [ 4.64700108e+00],
       [-4.89776426e-01],
       [-4.89442190e-01],
       [-9.02720170e-02],
       [ 2.30172882e+00],
       [-4.92377828e-01],
       [-4.37007438e-01],
       [ 1.00606170e+00],
       [ 3.98582080e-01],
       [-5.02863973e-01],
       [-4.86337422e-01],
       [-2.8

In [34]:
# Add a new DataFrame column holding the standardized fare values.
df['Fare_StandardScaler'] = fare_scaled

df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Fare,Embarked,Fare_StandardScaler
0,1,0,3,male,22.0,7.25,S,-0.502445
1,2,1,1,female,38.0,71.2833,C,0.786845
2,3,1,3,female,26.0,7.925,S,-0.488854
3,4,1,1,female,35.0,53.1,S,0.42073
4,5,0,3,male,35.0,8.05,S,-0.486337


### Normalização com MinMaxScaler

In [35]:
# Create the MinMaxScaler used to normalize the fare values.
# NOTE: this rebinds `scaler`, replacing the StandardScaler created earlier.
scaler = preprocessing.MinMaxScaler()

In [36]:
# Fit the MinMaxScaler on the fares and transform them in a single step.
# NOTE: this overwrites `fare_scaled`, which previously held the
# StandardScaler output; re-running the earlier column-assignment cell after
# this one would store min-max values under the wrong column name.
fare_scaled = scaler.fit_transform(fare.reshape(-1, 1))

In [38]:
# Add a new DataFrame column holding the min-max scaled fare values.
df['Fare_MinMaxScaler'] = fare_scaled

df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Fare,Embarked,Fare_StandardScaler,Fare_MinMaxScaler
0,1,0,3,male,22.0,7.25,S,-0.502445,0.014151
1,2,1,1,female,38.0,71.2833,C,0.786845,0.139136
2,3,1,3,female,26.0,7.925,S,-0.488854,0.015469
3,4,1,1,female,35.0,53.1,S,0.42073,0.103644
4,5,0,3,male,35.0,8.05,S,-0.486337,0.015713


### Transformação de dados categóricos

#### Ordinal Encoding

In [40]:
# Create the OrdinalEncoder used to convert a categorical variable into numeric codes.
encoder = preprocessing.OrdinalEncoder()

In [41]:
# Extract the Sex column as a NumPy array; this is the data to be encoded.
sex = df['Sex'].to_numpy()

# Display the raw category labels.
sex

array(['male', 'female', 'female', 'female', 'male', 'male', 'male',
       'male', 'female', 'female', 'female', 'female', 'male', 'male',
       'female', 'female', 'male', 'male', 'female', 'female', 'male',
       'male', 'female', 'male', 'female', 'female', 'male', 'male',
       'female', 'male', 'male', 'female', 'female', 'male', 'male',
       'male', 'male', 'male', 'female', 'female', 'female', 'female',
       'male', 'female', 'female', 'male', 'male', 'female', 'male',
       'female', 'male', 'male', 'female', 'female', 'male', 'male',
       'female', 'male', 'female', 'male', 'male', 'female', 'male',
       'male', 'male', 'male', 'female', 'male', 'female', 'male', 'male',
       'female', 'male', 'male', 'male', 'male', 'male', 'male', 'male',
       'female', 'male', 'male', 'female', 'male', 'female', 'female',
       'male', 'male', 'female', 'male', 'male', 'male', 'male', 'male',
       'male', 'male', 'male', 'male', 'female', 'male', 'female', 'male',
      

In [42]:
# Fit the encoder on the Sex values and encode them in a single step.
# reshape(-1, 1) turns the 1-D array into a single-column 2-D array.
sex_encoder = encoder.fit_transform(sex.reshape(-1, 1))

In [43]:
# Add a new DataFrame column holding the ordinal-encoded Sex values.
df['Sex_OrdinalEncoder'] = sex_encoder

df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Fare,Embarked,Fare_StandardScaler,Fare_MinMaxScaler,Sex_OrdinalEncoder
0,1,0,3,male,22.0,7.25,S,-0.502445,0.014151,1.0
1,2,1,1,female,38.0,71.2833,C,0.786845,0.139136,0.0
2,3,1,3,female,26.0,7.925,S,-0.488854,0.015469,0.0
3,4,1,1,female,35.0,53.1,S,0.42073,0.103644,0.0
4,5,0,3,male,35.0,8.05,S,-0.486337,0.015713,1.0


#### One hot Encoding

In [44]:
# Create the OneHotEncoder used to convert a categorical variable into indicator columns.
one_hot_encoder = preprocessing.OneHotEncoder()

In [45]:
# Fit the one-hot encoder on the Sex column (as a single-column 2-D array)
# and encode it in a single step. The result is turned into a dense array
# with .toarray() in a later cell.
sex_one_hot_encoder = one_hot_encoder.fit_transform(
    df['Sex'].to_numpy().reshape(-1, 1)
)

In [46]:
# Check the names of the new columns the encoder will produce for 'Sex'.
one_hot_encoder.get_feature_names_out(['Sex'])

array(['Sex_female', 'Sex_male'], dtype=object)

In [47]:
# Convert the encoded result into a dense array and keep it in a variable.
sex_one_hot_encoder_values = sex_one_hot_encoder.toarray()

In [48]:
# Check the size of the encoded array as (rows, columns).
sex_one_hot_encoder_values.shape

(891, 2)

In [49]:
# Create one DataFrame column per encoded category (named by the encoder)
# and fill them with the one-hot values.
df[one_hot_encoder.get_feature_names_out(['Sex'])] = sex_one_hot_encoder_values

df

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Fare,Embarked,Fare_StandardScaler,Fare_MinMaxScaler,Sex_OrdinalEncoder,Sex_female,Sex_male
0,1,0,3,male,22.0,7.2500,S,-0.502445,0.014151,1.0,0.0,1.0
1,2,1,1,female,38.0,71.2833,C,0.786845,0.139136,0.0,1.0,0.0
2,3,1,3,female,26.0,7.9250,S,-0.488854,0.015469,0.0,1.0,0.0
3,4,1,1,female,35.0,53.1000,S,0.420730,0.103644,0.0,1.0,0.0
4,5,0,3,male,35.0,8.0500,S,-0.486337,0.015713,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,male,27.0,13.0000,S,-0.386671,0.025374,1.0,0.0,1.0
887,888,1,1,female,19.0,30.0000,S,-0.044381,0.058556,0.0,1.0,0.0
888,889,0,3,female,,23.4500,S,-0.176263,0.045771,0.0,1.0,0.0
889,890,1,1,male,26.0,30.0000,C,-0.044381,0.058556,1.0,0.0,1.0


### Tratando dados nulos

In [50]:
# Count the missing values in each column of the DataFrame.
df.isna().sum()

PassengerId              0
Survived                 0
Pclass                   0
Sex                      0
Age                    177
Fare                     0
Embarked                 2
Fare_StandardScaler      0
Fare_MinMaxScaler        0
Sex_OrdinalEncoder       0
Sex_female               0
Sex_male                 0
dtype: int64

In [51]:
# Fill the missing Embarked values with 'S' (the rows are kept, not deleted).
# NOTE(review): 'S' is presumably the most frequent port of embarkation -
# confirm with df['Embarked'].value_counts().
# The result is assigned back to the column rather than calling
# df['Embarked'].fillna(..., inplace=True): that chained inplace call acts on
# an intermediate Series, which recent pandas versions warn about and which
# leaves `df` unchanged once Copy-on-Write is enabled.
df['Embarked'] = df['Embarked'].fillna('S')

In [52]:
# Check the mean of Age (the mean is computed over the non-null values).
df['Age'].mean()

29.69911764705882

In [53]:
# Fill the missing Age values with the mean age.
# The result is assigned back to the column rather than calling
# df['Age'].fillna(..., inplace=True): that chained inplace call acts on an
# intermediate Series, which recent pandas versions warn about and which
# leaves `df` unchanged once Copy-on-Write is enabled.
df['Age'] = df['Age'].fillna(df['Age'].mean())

In [54]:
# Re-count the missing values per column to confirm that none remain.
df.isna().sum()

PassengerId            0
Survived               0
Pclass                 0
Sex                    0
Age                    0
Fare                   0
Embarked               0
Fare_StandardScaler    0
Fare_MinMaxScaler      0
Sex_OrdinalEncoder     0
Sex_female             0
Sex_male               0
dtype: int64