### Dado un dataset de estudiantes con distintas características, vamos a generar 3 modelos para predecir el puntaje de los exámenes de matemáticas, lectura y escritura.

## 1.- Preprocesamiento

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Load the raw student records; the path is relative to the working directory.
csv_path = 'StudentsPerformance.csv'
df = pd.read_csv(csv_path)

In [3]:
# Normalise the headers: every space in a column name becomes an underscore.
df.columns = df.columns.str.replace(' ', '_')
# Preview the first four rows with the renamed columns.
df.head(4)

Unnamed: 0,gender,race/ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44


In [4]:
# Categorical feature columns; each one is expanded into indicator (dummy) columns.
columnas_categoricas = [
    'gender',
    'race/ethnicity',
    'parental_level_of_education',
    'lunch',
    'test_preparation_course',
]
# Target columns: the three exam scores the models will predict.
columnas_respuestas = [
    'math_score',
    'reading_score',
    'writing_score',
]

# One-hot encode only the categorical columns; the score columns are kept as they are.
df_dummies = pd.get_dummies(data=df, columns=columnas_categoricas)
# Show four randomly chosen rows of the encoded frame.
df_dummies.sample(n=4)

Unnamed: 0,math_score,reading_score,writing_score,gender_female,gender_male,race/ethnicity_group A,race/ethnicity_group B,race/ethnicity_group C,race/ethnicity_group D,race/ethnicity_group E,parental_level_of_education_associate's degree,parental_level_of_education_bachelor's degree,parental_level_of_education_high school,parental_level_of_education_master's degree,parental_level_of_education_some college,parental_level_of_education_some high school,lunch_free/reduced,lunch_standard,test_preparation_course_completed,test_preparation_course_none
908,67,75,72,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,1
645,65,81,81,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0
132,87,74,70,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0,1,0
346,62,61,57,0,1,0,1,0,0,0,0,0,0,0,1,0,0,1,0,1


In [5]:
# Features: every encoded column except the three exam scores.
X = df_dummies.drop(columns=columnas_respuestas)

# Targets: the three exam-score columns, selected directly by name.
y = df_dummies[columnas_respuestas]

# Hold out 33% of the rows for testing. The seed is fixed so that a
# Restart & Run All reproduces the same split, and therefore the same fitted
# (and later pickled) models.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

#### Observemos las dimensiones de las variables independientes (X) y las dependientes (y)

In [6]:
# Display the shapes of the feature frame X and of the target frame y.
X.shape, y.shape

((1000, 17), (1000, 3))

## 2.- Generamos los Modelos

In [7]:
from sklearn.linear_model import ElasticNetCV as eNetCv

#### Para matemáticas

In [8]:
# Elastic net for the math score: the penalty strength is picked by 3-fold
# cross-validation from 100 evenly spaced candidates between 0.001 and 10.
alphas_math = np.linspace(0.001, 10, 100)
model_math = eNetCv(alphas=alphas_math, cv=3, n_jobs=-1, verbose=1)

# Fit on the training rows only; the fitted estimator is the cell's output.
model_math.fit(X_train, y_train['math_score'])

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
............................................................................................................................................................................................................................................................................................................[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished


ElasticNetCV(alphas=array([1.000e-03, 1.020e-01, ..., 9.899e+00, 1.000e+01]),
       copy_X=True, cv=3, eps=0.001, fit_intercept=True, l1_ratio=0.5,
       max_iter=1000, n_alphas=100, n_jobs=-1, normalize=False,
       positive=False, precompute='auto', random_state=None,
       selection='cyclic', tol=0.0001, verbose=1)

In [9]:
# Ask the math model for a prediction on a single row in which every
# feature (one entry per column of X) is set to 0.
zero_row = [0] * X.shape[1]
model_math.predict([zero_row])

array([65.73447615])

In [10]:
# Reminder of what each dummy column stands for
df_dummies.columns

Index(['math_score', 'reading_score', 'writing_score', 'gender_female',
       'gender_male', 'race/ethnicity_group A', 'race/ethnicity_group B',
       'race/ethnicity_group C', 'race/ethnicity_group D',
       'race/ethnicity_group E',
       'parental_level_of_education_associate's degree',
       'parental_level_of_education_bachelor's degree',
       'parental_level_of_education_high school',
       'parental_level_of_education_master's degree',
       'parental_level_of_education_some college',
       'parental_level_of_education_some high school', 'lunch_free/reduced',
       'lunch_standard', 'test_preparation_course_completed',
       'test_preparation_course_none'],
      dtype='object')

#### Realizamos ajuste para lectura

In [11]:
# Elastic net for the reading score, configured with the same alpha grid
# (100 values from 0.001 to 10) and 3-fold cross-validation.
model_read = eNetCv(
    alphas=np.linspace(0.001, 10, 100),
    cv=3,
    n_jobs=-1,
)

# Train against the reading-score column of the training targets.
model_read.fit(X=X_train, y=y_train['reading_score'])

ElasticNetCV(alphas=array([1.000e-03, 1.020e-01, ..., 9.899e+00, 1.000e+01]),
       copy_X=True, cv=3, eps=0.001, fit_intercept=True, l1_ratio=0.5,
       max_iter=1000, n_alphas=100, n_jobs=-1, normalize=False,
       positive=False, precompute='auto', random_state=None,
       selection='cyclic', tol=0.0001, verbose=0)

#### Realizamos ajuste para escritura

In [12]:
# Elastic net for the writing score: build the estimator and fit it in one
# chained expression (fit returns the estimator itself).
alphas_write = np.linspace(0.001, 10, 100)
model_write = eNetCv(alphas=alphas_write, cv=3, n_jobs=-1).fit(
    X_train, y_train['writing_score']
)
# Echo the fitted estimator as the cell's output.
model_write

ElasticNetCV(alphas=array([1.000e-03, 1.020e-01, ..., 9.899e+00, 1.000e+01]),
       copy_X=True, cv=3, eps=0.001, fit_intercept=True, l1_ratio=0.5,
       max_iter=1000, n_alphas=100, n_jobs=-1, normalize=False,
       positive=False, precompute='auto', random_state=None,
       selection='cyclic', tol=0.0001, verbose=0)

## 3.- Persistencia

#### a) Tomamos los modelos entrenados y los guardamos en disco; esto nos permite disponer de estos objetos a futuro sin volver a entrenarlos.

In [13]:
import os
import pickle

In [14]:
# Serialise the fitted math model to math_model.pkl (binary write mode).
with open('math_model.pkl', 'wb') as math_out:
    pickle.dump(model_math, math_out)

In [15]:
# Serialise the fitted writing model to write_model.pkl (binary write mode).
with open('write_model.pkl', 'wb') as write_out:
    pickle.dump(model_write, write_out)

In [16]:
# Serialise the fitted reading model to read_model.pkl (binary write mode).
with open('read_model.pkl', 'wb') as read_out:
    pickle.dump(model_read, read_out)

In [17]:
# Check that the pickled models are present in the working directory.
# Match on the '.pkl' suffix (a plain substring test would also pick up any
# file whose name merely contains "pkl") and sort the result, because
# os.listdir() returns entries in arbitrary order.
sorted(name for name in os.listdir() if name.endswith('.pkl'))

['math_model.pkl', 'read_model.pkl', 'write_model.pkl']

#### b) Debemos hacer un seguimiento de las variables dummies: guardamos el orden de las columnas de X para poder construir las entradas futuras en el mismo orden

In [19]:
# Sanitised names of the dummy columns, kept in the same positional order as
# the columns of X: spaces and slashes become underscores, apostrophes are
# removed.
sanitize_table = str.maketrans({' ': '_', '/': '_', "'": None})
dummies_encoder = [column.translate(sanitize_table) for column in X.columns]

# Persist the ordered list next to the model pickles.
with open('dummies_order.pkl', 'wb') as order_file:
    pickle.dump(dummies_encoder, order_file)
