# Modelos de pre-procesamiento

In [86]:
from sklearn import preprocessing

import pandas as pd

In [87]:
df = pd.read_csv('performance.csv')

In [88]:
df.head()

Unnamed: 0,Name,GamesPlayed,MinutesPlayed,PointsPerGame,FieldGoalsMade,FieldGoalsAttempt,FieldGoalPercent,3PointMade,3PointAttempt,3PointPercent,...,FreeThrowAttempt,FreeThrowPercent,OffensiveRebounds,DefensiveRebounds,Rebounds,Assists,Steals,Blocks,Turnovers,Target
0,Brandon Ingram,36,27.4,7.4,2.6,7.6,34.7,0.5,2.1,25.0,...,2.3,69.9,0.7,3.4,4.1,1.9,0.4,0.4,1.3,0
1,Andrew Harrison,35,26.9,7.2,2.0,6.7,29.6,0.7,2.8,23.5,...,3.4,76.5,0.5,2.0,2.4,3.7,1.1,0.5,1.6,0
2,JaKarr Sampson,74,15.3,5.2,2.0,4.7,42.2,0.4,1.7,24.4,...,1.3,67.0,0.5,1.7,2.2,1.0,0.5,0.3,1.0,0
3,Malik Sealy,58,11.6,5.7,2.3,5.5,42.6,0.1,0.5,22.6,...,1.3,68.9,1.0,0.9,1.9,0.8,0.6,0.1,1.0,1
4,Matt Geiger,48,11.5,4.5,1.6,3.0,52.4,0.0,0.1,0.0,...,1.9,67.4,1.0,1.5,2.5,0.3,0.3,0.4,0.8,1


### Label Encoder

Este modelo se entrena para el reconocimiento y la obtención de etiquetas

In [89]:
# 1er paso: Inicializar el modelo
encoder = preprocessing.LabelEncoder()

In [90]:
# 2d. paso: Entrenar el modelo
encoder.fit(df.Name)

In [91]:
# Recuperamos los valores transformados, usando el método transform del modelo
# df.Name = df.Name.map({'Juanito': 0}...... n)
df.Name = encoder.transform(df.Name)

In [92]:
df.head()

Unnamed: 0,Name,GamesPlayed,MinutesPlayed,PointsPerGame,FieldGoalsMade,FieldGoalsAttempt,FieldGoalPercent,3PointMade,3PointAttempt,3PointPercent,...,FreeThrowAttempt,FreeThrowPercent,OffensiveRebounds,DefensiveRebounds,Rebounds,Assists,Steals,Blocks,Turnovers,Target
0,128,36,27.4,7.4,2.6,7.6,34.7,0.5,2.1,25.0,...,2.3,69.9,0.7,3.4,4.1,1.9,0.4,0.4,1.3,0
1,45,35,26.9,7.2,2.0,6.7,29.6,0.7,2.8,23.5,...,3.4,76.5,0.5,2.0,2.4,3.7,1.1,0.5,1.6,0
2,521,74,15.3,5.2,2.0,4.7,42.2,0.4,1.7,24.4,...,1.3,67.0,0.5,1.7,2.2,1.0,0.5,0.3,1.0,0
3,820,58,11.6,5.7,2.3,5.5,42.6,0.1,0.5,22.6,...,1.3,68.9,1.0,0.9,1.9,0.8,0.6,0.1,1.0,1
4,871,48,11.5,4.5,1.6,3.0,52.4,0.0,0.1,0.0,...,1.9,67.4,1.0,1.5,2.5,0.3,0.3,0.4,0.8,1


In [93]:
encoder.classes_

array(['A.C. Green', 'A.J. English', 'A.J. Price', ..., 'Xavier McDaniel',
       'Zach LaVine', 'Zach Randolph'], dtype=object)

In [94]:
# Se puede entrenar y transformar al mismo tiempo
example = encoder.fit_transform(df.Name)

In [95]:
example

array([128,  45, 521, ..., 855, 795, 644], dtype=int64)

In [96]:
# Puedo retornar al texto usando la inversa de la transformación
inverse = encoder.inverse_transform(example)

In [97]:
inverse

array([128,  45, 521, ..., 855, 795, 644])

### Proceso de Estandarización - StandardScaler

Este algoritmo sigue la fórmula de la distribución estándar --> Z = (x - m) / sd

In [98]:
standard = preprocessing.StandardScaler()

In [99]:
standard.fit(df.GamesPlayed.to_numpy().reshape(-1, 1))

In [100]:
# Analizamos el resultado del entrenamiento
standard.mean_

array([60.4141791])

In [101]:
standard.scale_

array([17.42748581])

In [102]:
standarized = standard.transform(df.GamesPlayed.to_numpy().reshape(-1, 1))

In [103]:
standarized

array([[-1.40090082],
       [-1.45828144],
       [ 0.77956287],
       ...,
       [-0.99923645],
       [-0.48281084],
       [-0.76971396]])

In [104]:
standard.transform([[27]])

array([[-1.91732643]])

In [105]:
standard.inverse_transform([[-1.91732643]])

array([[26.99999995]])

#### Este modelo tiene a posibilidad de entrenarse con multiples columnas


In [106]:
multiple = preprocessing.StandardScaler()

In [107]:
test = multiple.fit_transform([df.GamesPlayed, df.MinutesPlayed])

In [108]:
test[0]

array([1., 1., 1., ..., 1., 1., 1.])

### Proceso de Estandarizacion - MinMaxScaler

Se basa en encontrar el valor de X estándar bajo la siguiente fórmula

x_std = ((x - x_min(axis=0))) / (x_max(axis=0) - x_min(axis=0)) 

x_scale = (x_std * (max - min)) + min

In [109]:
min_max = preprocessing.MinMaxScaler()

In [110]:
min_max.fit(df.GamesPlayed.to_numpy().reshape(-1, 1))

In [111]:
min_max.data_min_

array([11.])

In [112]:
min_max.data_max_

array([82.])

In [113]:
minimized = min_max.transform(df.GamesPlayed.to_numpy().reshape(-1, 1))

In [114]:
minimized

array([[0.35211268],
       [0.33802817],
       [0.88732394],
       ...,
       [0.45070423],
       [0.57746479],
       [0.50704225]])

In [115]:
df.describe()

Unnamed: 0,Name,GamesPlayed,MinutesPlayed,PointsPerGame,FieldGoalsMade,FieldGoalsAttempt,FieldGoalPercent,3PointMade,3PointAttempt,3PointPercent,...,FreeThrowAttempt,FreeThrowPercent,OffensiveRebounds,DefensiveRebounds,Rebounds,Assists,Steals,Blocks,Turnovers,Target
count,1340.0,1340.0,1340.0,1340.0,1340.0,1340.0,1340.0,1340.0,1340.0,1329.0,...,1340.0,1340.0,1340.0,1340.0,1340.0,1340.0,1340.0,1340.0,1340.0,1340.0
mean,641.172388,60.414179,17.624627,6.801493,2.629104,5.885299,44.169403,0.247612,0.779179,19.308126,...,1.82194,70.300299,1.009403,2.025746,3.034478,1.550522,0.618507,0.368582,1.193582,0.620149
std,373.264201,17.433992,8.307964,4.357545,1.683555,3.593488,6.137679,0.383688,1.061847,16.022916,...,1.322984,10.578479,0.777119,1.360008,2.057774,1.471169,0.409759,0.429049,0.722541,0.485531
min,0.0,11.0,3.1,0.7,0.3,0.8,23.8,0.0,0.0,0.0,...,0.0,0.0,0.0,0.2,0.3,0.0,0.0,0.0,0.1,0.0
25%,314.75,47.0,10.875,3.7,1.4,3.3,40.2,0.0,0.0,0.0,...,0.9,64.7,0.4,1.0,1.5,0.6,0.3,0.1,0.7,0.0
50%,639.5,63.0,16.1,5.55,2.1,4.8,44.1,0.1,0.3,22.4,...,1.5,71.25,0.8,1.7,2.5,1.1,0.5,0.2,1.0,1.0
75%,961.25,77.0,22.9,8.8,3.4,7.5,47.9,0.4,1.2,32.5,...,2.3,77.6,1.4,2.6,4.0,2.0,0.8,0.5,1.5,1.0
max,1293.0,82.0,40.9,28.2,10.2,19.8,73.7,2.3,6.5,100.0,...,10.2,100.0,5.3,9.6,13.9,10.6,2.5,3.9,4.4,1.0


### Proceso de Estandarización - MaxAbsScaler

In [116]:
# ESTE MODELO NO FUNCIONA CON UNA ÚNICA CARACTERÍSTICA
max_abs_scaler = preprocessing.MaxAbsScaler()

In [117]:
# 2do Paso - Entrenar el Scaler
max_abs_scaler.fit([df.GamesPlayed, df.PointsPerGame])

In [118]:
# Paso opcional. Evaluar los valores máximo absoluto
max_abs_scaler.max_abs_

array([36., 35., 74., ..., 43., 52., 47.])

In [119]:
# 3er. Paso Aplicar el transform
abs_scaled = max_abs_scaler.transform([df.GamesPlayed, df.PointsPerGame])

In [120]:
abs_scaled

array([[1.        , 1.        , 1.        , ..., 1.        , 1.        ,
        1.        ],
       [0.20555556, 0.20571429, 0.07027027, ..., 0.1255814 , 0.08653846,
        0.09361702]])

### Normalización - Normalize

In [121]:
# ESTE MÉTODO NO SE ENTRENA, SIMPLEMENTE SE EJECUTA
normal = preprocessing.normalize([df.GamesPlayed, df.PointsPerGame])

In [122]:
# Evaluamos la data generada
normal

array([[0.01564063, 0.01520617, 0.03215019, ..., 0.01868186, 0.02259202,
        0.02041971],
       [0.02502887, 0.02435241, 0.01758785, ..., 0.01826431, 0.01522026,
        0.01488203]])

### Discretización - K-bins Disccretization

In [123]:
# ESTE MODELO NO SE PUEDE APLICAR SOBRE UNA SOLA COLUMNA 
k_bins = preprocessing.KBinsDiscretizer()

In [124]:
k_bins.fit([df.GamesPlayed, df.PointsPerGame])

In [125]:
# LOS EJES SON LOS RANGOS DE VALORES PERMITIDOS (DISCRETOS)
k_bins.bin_edges_

array([array([ 7.4 , 13.12, 18.84, 24.56, 30.28, 36.  ]),
       array([ 7.2 , 12.76, 18.32, 23.88, 29.44, 35.  ]),
       array([ 5.2 , 18.96, 32.72, 46.48, 60.24, 74.  ]), ...,
       array([ 5.4 , 12.92, 20.44, 27.96, 35.48, 43.  ]),
       array([ 4.5, 14. , 23.5, 33. , 42.5, 52. ]),
       array([ 4.4 , 12.92, 21.44, 29.96, 38.48, 47.  ])], dtype=object)

In [126]:
discrete = k_bins.transform([df.GamesPlayed, df.PointsPerGame])

In [127]:
# EL RESULTADO DE ESTE MODELO ES UNA MATRIZ DE ESPARCIMIENTO
discrete

<2x6700 sparse matrix of type '<class 'numpy.float64'>'
	with 2680 stored elements in Compressed Sparse Row format>

In [128]:
# Por lo cual, se separa los datos en filas y columnas
x1, x2 = discrete.nonzero()

In [129]:
x1

array([0, 0, 0, ..., 1, 1, 1])

In [130]:
x2

array([   4,    9,   14, ..., 6685, 6690, 6695])