# Importar librerías

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Random Forest

Debido a que es una clasificación no binaria utilizaremos Random Forest para analizar su utilidad en predecir el modelo.

In [22]:
columns_names = np.array(['id', 'RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe', 'type'])
columns_names

array(['id', 'RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe', 'type'],
      dtype='<U4')

In [23]:
df = pd.read_csv("glass.data", names=columns_names)
df.head()

Unnamed: 0,id,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,type
0,1,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,1
1,2,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,1
2,3,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,1
3,4,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,1
4,5,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,1


In [24]:
y = df.type
X = df.drop(["type"], axis=1) 

In [25]:
# Verificamos valores faltantes en y
y.isnull().sum()

0

In [26]:
# Verificamos valores faltantes en X
X.isnull().sum()

id    0
RI    0
Na    0
Mg    0
Al    0
Si    0
K     0
Ca    0
Ba    0
Fe    0
dtype: int64

In [27]:
# Variables cuantitativas en X
X.dtypes

id      int64
RI    float64
Na    float64
Mg    float64
Al    float64
Si    float64
K     float64
Ca    float64
Ba    float64
Fe    float64
dtype: object

In [28]:
y.value_counts()

2    76
1    70
7    29
3    17
5    13
6     9
Name: type, dtype: int64

## Construyendo el modelo de Random Forest

In [29]:
# 1er paso: Particionamos (usando muestreo estratificado siguiendo la 
#distribución de la variable dependiente)

from sklearn.model_selection import train_test_split
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y,
                                                test_size=0.15,
                                                stratify=y)

In [31]:
# 2do paso: Instanciamos la clase RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth=2, random_state=0)

In [32]:
# 3er paso: Ajustamos el modelo con los datos de entrenamiento
clf.fit(Xtrain,ytrain)

In [33]:
# 4to paso: Calculamos el score del modelo
clf.score(Xtest, ytest)

0.7878787878787878

# Optimización de hiperparámetros

**Lista de Hiperparámetros a optimizar:**

n_estimators: The number of trees in the forest.

max_depth: The maximum depth of the tree.

criterion: The function to measure the quality of a split

In [34]:
rfc = RandomForestClassifier(random_state=42)

In [35]:
param_grid = { 
    'n_estimators': [200, 500],
    'max_depth' : [4,5,6,7,8],
    'criterion' : ['gini', 'entropy']
}

In [36]:
from sklearn.model_selection import GridSearchCV

In [37]:
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
CV_rfc.fit(Xtrain, ytrain)

## Mejor modelo

In [38]:
CV_rfc.best_params_

{'criterion': 'entropy', 'max_depth': 6, 'n_estimators': 200}

In [39]:
CV_rfc.best_estimator_

In [41]:
CV_rfc.best_score_

0.9777777777777776

# Serializar el mejor modelo obtenido
Guardarlo en disco duro

In [42]:
# Creamos el mejor modelo
ModGlassFinal = CV_rfc.best_estimator_
ModGlassFinal.fit(Xtrain, ytrain)
# En este punto el modelo esta listo para consumir

In [43]:
# Guardamos el mejor modelo ajustado (memoria ram) al disco duro

import pickle
ArchivoModelGlass = "Practica_N1_ModGlassFInal.pkl"
with open(ArchivoModelGlass, "wb") as file:
  pickle.dump(ModGlassFinal, file)