In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### 1. Cargamos los datos del set de entrenamiento

In [2]:
# Load the raw training CSV into `dataset` and preview its first five rows.
dataset = pd.read_csv(filepath_or_buffer="../data/raw/train.csv")
dataset.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Remove the PassengerId, Name and Ticket columns.
# Rebinding `dataset` (instead of `inplace=True`) together with
# errors='ignore' keeps this cell safe to re-run: a second execution no longer
# raises KeyError because the columns are already gone.
dataset = dataset.drop(columns=['PassengerId', 'Name', 'Ticket'], errors='ignore')
dataset.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.25,,S
1,1,1,female,38.0,1,0,71.2833,C85,C
2,1,3,female,26.0,0,0,7.925,,S
3,1,1,female,35.0,1,0,53.1,C123,S
4,0,3,male,35.0,0,0,8.05,,S


### 2. Proporción de valores nulos por columna

In [4]:
# Fraction of missing values in each column (mean of the boolean NA mask).
dataset.isna().mean(axis=0)

Survived    0.000000
Pclass      0.000000
Sex         0.000000
Age         0.198653
SibSp       0.000000
Parch       0.000000
Fare        0.000000
Cabin       0.771044
Embarked    0.002245
dtype: float64

    Eliminamos la columna Cabin dado que tiene 77.1% de faltantes.

In [5]:
# Remove the Cabin column. Rebinding with errors='ignore' (rather than
# `inplace=True`) lets this cell be executed again without a KeyError once the
# column has already been dropped.
dataset = dataset.drop(columns=['Cabin'], errors='ignore')

    Imputamos los valores faltantes de la columna Age con la media y convertimos la columna a enteros.

In [6]:
# Mean of the observed ages (NaN values are skipped by `mean`).
media_age = dataset.loc[:, 'Age'].mean()
# Fill the gaps with that mean, then cast to int.
# NOTE: the int cast truncates any fractional part, including that of the
# imputed mean itself.
dataset = dataset.assign(Age=dataset['Age'].fillna(value=media_age).astype(int))

    Imputamos los valores faltantes de Embarked con la categoría más frecuente.

In [7]:
# Row count per Embarked category (missing values are excluded), most frequent first.
dataset.loc[:, 'Embarked'].value_counts(dropna=True)

Embarked
S    644
C    168
Q     77
Name: count, dtype: int64

In [8]:
# Fill missing Embarked values with the most frequent category. The mode is
# computed from the data instead of being hard-coded from the value counts
# displayed above, so the cell stays correct if the input file changes.
modas_embarked = dataset['Embarked'].mode(dropna=True)
# Fall back to the previous literal 'S' when no mode exists (column entirely missing).
moda_embarked = modas_embarked.iloc[0] if not modas_embarked.empty else 'S'
dataset['Embarked'] = dataset['Embarked'].fillna(moda_embarked)

### 3. Codificación de Variables Categóricas 

In [9]:
# Preview the first five rows after imputation.
dataset.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22,1,0,7.25,S
1,1,1,female,38,1,0,71.2833,C
2,1,3,female,26,0,0,7.925,S
3,1,1,female,35,1,0,53.1,S
4,0,3,male,35,0,0,8.05,S


In [10]:
# Row count per Pclass value, most frequent first.
dataset.loc[:, 'Pclass'].value_counts(sort=True, ascending=False)

Pclass
3    491
1    216
2    184
Name: count, dtype: int64

In [11]:
# Row count per SibSp value, most frequent first.
dataset.loc[:, 'SibSp'].value_counts(sort=True, ascending=False)

SibSp
0    608
1    209
2     28
4     18
3     16
8      7
5      5
Name: count, dtype: int64

In [12]:
# Row count per Sex category, most frequent first.
dataset.loc[:, 'Sex'].value_counts(sort=True, ascending=False)

Sex
male      577
female    314
Name: count, dtype: int64

    Codificamos la variable Sex con One Hot Encoding (descartando la primera categoría: male = 1, female = 0).

In [13]:
# Dummy-encode Sex and drop the first category, which leaves a single indicator
# column. That one-column frame is squeezed to a Series and cast to int before
# being written back (a frame with more than one column would not be squeezed
# and the assignment would still fail, as before).
dataset['Sex'] = (
    pd.get_dummies(dataset['Sex'], drop_first=True)
    .squeeze(axis='columns')
    .astype(int)
)

    Codificamos la variable Embarked con Frequency Encoding.

In [14]:
# Frequency encoding: count the rows per Embarked label ...
codificador_embarked = dataset['Embarked'].value_counts(sort=True, ascending=False)
# ... and replace every label by its count.
dataset = dataset.assign(Embarked=dataset['Embarked'].map(codificador_embarked))

In [15]:
# Preview the first five rows after encoding the categorical columns.
dataset.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22,1,0,7.25,646
1,1,1,0,38,1,0,71.2833,168
2,1,3,0,26,0,0,7.925,646
3,1,1,0,35,1,0,53.1,646
4,0,3,1,35,0,0,8.05,646


### 4. Normalización de Variables (escalado Min-Max)

In [16]:
from sklearn.preprocessing import MinMaxScaler

In [17]:
# Min-max scaler with default settings (each column is rescaled to [0, 1]).
mm_scaler = MinMaxScaler()

# Feature matrix: every column except Survived.
X = dataset.drop('Survived', axis=1)
scaled_feature = mm_scaler.fit_transform(X)
# fit_transform returns a bare array, so column names and the row index are
# restored explicitly. Keeping X's index (instead of a fresh 0..n-1 range)
# ensures that label-aligned assignments from `dataset`, such as re-attaching
# Survived, match the right rows even if `dataset` has a non-default index.
scaled_df = pd.DataFrame(scaled_feature, columns=X.columns, index=X.index)

In [18]:
# Re-attach the unscaled Survived column (aligned on the row index) and
# preview the first five rows of the result.
scaled_df = scaled_df.assign(Survived=dataset['Survived'])
scaled_df.head(n=5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
0,1.0,1.0,0.275,0.125,0.0,0.014151,1.0,0
1,0.0,0.0,0.475,0.125,0.0,0.139136,0.15993,1
2,1.0,0.0,0.325,0.0,0.0,0.015469,1.0,1
3,0.0,0.0,0.4375,0.125,0.0,0.103644,1.0,1
4,1.0,1.0,0.4375,0.0,0.0,0.015713,1.0,0


In [19]:
# Write the processed frame to disk. index=True (the pandas default, spelled
# out here) also writes the row index as the first, unnamed CSV column.
scaled_df.to_csv(path_or_buf='../data/processed/features_for_model.csv', index=True)