# Más Modelos de Regresión y Transformers

In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from category_encoders import OrdinalEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler

## Cargando data del Titanic 

In [2]:
df = pd.read_csv('titanic.csv')
df['Fare'] = df.Fare*1000 # Se infla un poquito para ver el efecto de los Transformers

In [3]:
X = df[['Pclass','SibSp','Fare']]
y = df.Age.fillna(df.Age.mean())

X_train, X_test, y_train, y_test  = train_test_split(X, y, test_size = 0.3, random_state = 123)

In [4]:
# creando una función utilizando las métricas más importantes para medir en Regresión
def evaluation(y, y_pred):
    rmse = np.sqrt(mean_squared_error(y,y_pred))
    mae = np.sqrt(mean_absolute_error(y,y_pred))
    r2 = r2_score(y, y_pred)
    
    print('RMSE:',rmse)
    print('MAE:', mae)
    print('R2:', r2)
    print('')
 

## Presentando KNN 

In [5]:
knn = KNeighborsRegressor()
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
y_pred_train = knn.predict(X_train)

evaluation(y_train, y_pred_train)
evaluation(y_test, y_pred)

RMSE: 11.097068145502814
MAE: 2.846933508737516
R2: 0.23770948069151343

RMSE: 13.78107055311991
MAE: 3.289857574753015
R2: -0.022693752495068553



In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 123)

sc = StandardScaler()
X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform(X_test)

knn = KNeighborsRegressor()
knn.fit(X_train_sc, y_train)
y_pred = knn.predict(X_test_sc)

evaluation(y_test, y_pred)

RMSE: 12.834229356766496
MAE: 3.201102442614478
R2: 0.11300885745398226



# Transformers más básicos en Scikit Learn

## Estandarizar

In [7]:
df = pd.read_csv('mtcars.csv', index_col = 0)

In [8]:
# se puede ajustar y luego transformar
sc = StandardScaler()
sc.fit(df)
sc.transform(df)

array([[ 1.53299135e-01, -1.06667720e-01, -5.79750316e-01,
        -5.43654869e-01,  5.76594481e-01, -6.20166581e-01,
        -7.89600577e-01, -8.81917104e-01,  1.20894105e+00,
         4.30331483e-01,  7.46967077e-01],
       [ 1.53299135e-01, -1.06667720e-01, -5.79750316e-01,
        -5.43654869e-01,  5.76594481e-01, -3.55382189e-01,
        -4.71201785e-01, -8.81917104e-01,  1.20894105e+00,
         4.30331483e-01,  7.46967077e-01],
       [ 4.56736599e-01, -1.24445674e+00, -1.00602601e+00,
        -7.95569902e-01,  4.81584062e-01, -9.31677630e-01,
         4.32823359e-01,  1.13389342e+00,  1.20894105e+00,
         4.30331483e-01, -1.14010764e+00],
       [ 2.20729683e-01, -1.06667720e-01,  2.23615417e-01,
        -5.43654869e-01, -9.81576392e-01, -2.33633287e-03,
         9.04735855e-01,  1.13389342e+00, -8.27170192e-01,
        -9.46729262e-01, -1.14010764e+00],
       [-2.34426513e-01,  1.03112130e+00,  1.05977159e+00,
         4.19549669e-01, -8.48561806e-01,  2.31296954e-01,
  

In [9]:
# o hacer ambas cosas a la vez
sc = StandardScaler()
sc.fit_transform(df)

array([[ 1.53299135e-01, -1.06667720e-01, -5.79750316e-01,
        -5.43654869e-01,  5.76594481e-01, -6.20166581e-01,
        -7.89600577e-01, -8.81917104e-01,  1.20894105e+00,
         4.30331483e-01,  7.46967077e-01],
       [ 1.53299135e-01, -1.06667720e-01, -5.79750316e-01,
        -5.43654869e-01,  5.76594481e-01, -3.55382189e-01,
        -4.71201785e-01, -8.81917104e-01,  1.20894105e+00,
         4.30331483e-01,  7.46967077e-01],
       [ 4.56736599e-01, -1.24445674e+00, -1.00602601e+00,
        -7.95569902e-01,  4.81584062e-01, -9.31677630e-01,
         4.32823359e-01,  1.13389342e+00,  1.20894105e+00,
         4.30331483e-01, -1.14010764e+00],
       [ 2.20729683e-01, -1.06667720e-01,  2.23615417e-01,
        -5.43654869e-01, -9.81576392e-01, -2.33633287e-03,
         9.04735855e-01,  1.13389342e+00, -8.27170192e-01,
        -9.46729262e-01, -1.14010764e+00],
       [-2.34426513e-01,  1.03112130e+00,  1.05977159e+00,
         4.19549669e-01, -8.48561806e-01,  2.31296954e-01,
  

# Data leakage 
Ejemplo de Estandarizar de manera incorrecto y mostrar diferencias en las métricas de evaluación.

In [10]:
df = pd.read_csv('titanic.csv')
df['Fare'] = df.Fare*1000

In [11]:
X = df[['Pclass','SibSp','Fare']]
y = df.Age.fillna(df.Age.mean())

## Manera Incorrecta 

In [12]:
sc = StandardScaler()
X_sc = sc.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_sc, y, test_size = 0.3, random_state = 123)


knn = KNeighborsRegressor()
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

evaluation(y_test, y_pred)

RMSE: 12.848766382620424
MAE: 3.2011839337500954
R2: 0.11099837200965901



## Manera Correcta 

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 123)

sc = StandardScaler()
X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform(X_test)

knn = KNeighborsRegressor()
knn.fit(X_train_sc, y_train)
y_pred = knn.predict(X_test_sc)

evaluation(y_test, y_pred)

RMSE: 12.834229356766496
MAE: 3.201102442614478
R2: 0.11300885745398226



## Encoders 

In [14]:
df[['Embarked']]

Unnamed: 0,Embarked
0,S
1,C
2,S
3,S
4,S
...,...
886,S
887,S
888,S
889,C


## Ordinal Encoder

In [15]:
X = df.Embarked

ord_enc = OrdinalEncoder()
ord_enc.fit_transform(X).Embarked.value_counts()

1    644
2    168
3     77
4      2
Name: Embarked, dtype: int64

In [16]:
ord_enc.get_params() # un diccionario que permite obtener los parámetros del encoder

{'cols': ['Embarked'],
 'drop_invariant': False,
 'handle_missing': 'value',
 'handle_unknown': 'value',
 'mapping': [{'col': 'Embarked',
   'mapping': S      1
   C      2
   Q      3
   NaN    4
   dtype: int64,
   'data_type': dtype('O')}],
 'return_df': True,
 'verbose': 0}

## OneHotEncoder 

In [17]:
X = df.Embarked
ord_enc = OneHotEncoder(use_cat_names = True)
ord_enc.fit_transform(X)

Unnamed: 0,Embarked_S,Embarked_C,Embarked_Q,Embarked_nan
0,1,0,0,0
1,0,1,0,0
2,1,0,0,0
3,1,0,0,0
4,1,0,0,0
...,...,...,...,...
886,1,0,0,0
887,1,0,0,0
888,1,0,0,0
889,0,1,0,0


# Ejercicios

Generar un modelo de Regresión que permita predecir la Edad de un pasajero mediante las variables dadas.`.

## KNN + OneHot

In [18]:
df = pd.read_csv('titanic.csv')
df['Fare'] = df.Fare*1000

In [19]:
X = df[['Pclass','SibSp','Fare', 'Embarked']]
y = df.Age.fillna(df.Age.mean())

X_train, X_test, y_train, y_test  = train_test_split(X, y, test_size = 0.3, random_state = 123)

In [20]:
enc = OneHotEncoder()
X_train_enc = enc.fit_transform(X_train)
X_test_enc = enc.transform(X_test)


In [21]:
knn = KNeighborsRegressor()
knn.fit(X_train_enc, y_train)
y_pred = knn.predict(X_test_enc)
y_pred_train = knn.predict(X_train_enc)

evaluation(y_train, y_pred_train)
evaluation(y_test, y_pred)

RMSE: 11.00269224278825
MAE: 2.8408323081786064
R2: 0.2506202670938994

RMSE: 13.689808779188434
MAE: 3.283374226699551
R2: -0.0091935227859401



## KNN + OneHotEncoder + StandardScaler 

In [22]:
sc = StandardScaler()
X_train_sc = sc.fit_transform(X_train_enc)
X_test_sc = sc.transform(X_test_enc)



In [23]:
knn = KNeighborsRegressor()
knn.fit(X_train_sc, y_train)
y_pred = knn.predict(X_test_sc)
y_pred_train = knn.predict(X_train_sc)

print('KNN + OneHot')
evaluation(y_train, y_pred_train)
evaluation(y_test, y_pred)

KNN + OneHot
RMSE: 10.596655261524903
MAE: 2.79805071619602
R2: 0.30490906614089563

RMSE: 12.716024352894255
MAE: 3.1656873129387146
R2: 0.12927223594120496



## Mismo procedimiento anterior con Pipelines 

In [24]:
from sklearn.pipeline import Pipeline

pipe = Pipeline(steps = [
    ('ohe', OneHotEncoder()),
    ('sc', StandardScaler()),
    ('knn', KNeighborsRegressor())
])

pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

print('KNN + OneHot, Test:')
evaluation(y_test, y_pred)



KNN + OneHot, Test:
RMSE: 12.716024352894255
MAE: 3.1656873129387146
R2: 0.12927223594120496



## Repitiendo pero esta vez aplicando LR 

In [25]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

pipe = Pipeline(steps = [
    ('ohe', OneHotEncoder()),
    ('sc', StandardScaler()),
    ('knn', LinearRegression())
])

pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

print('LR + OneHot, Test:')
evaluation(y_test, y_pred)

LR + OneHot, Test:
RMSE: 12.478825703086661
MAE: 3.103176867446452
R2: 0.16145354315592952

