# Implementación 
### Equipo: 

In [1]:
# Parámetros generales del notebook
ruta_bases = 'bases/'
sample_size = 400000

In [24]:
#!pip install xgboost
#!pip install lightgbm

In [25]:
# Importación de librerías
import json
import glob
import pandas as pd
import numpy as np
#import missingno as msgno
import matplotlib.pyplot as plt
import funciones as fn
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from ml_classes import PrepML, MLModel
from matplotlib.pyplot import rcParams
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import Ridge
from lib.get_nhtsa_json import get_nhtsa_json

In [26]:
# Parámetros generales para plots
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = 15, 8
# Semilla pseudo-aleatoria
rd_seed = 1234

# 0. Obtener la información

Esta es una demostración del proceso realizado por `get_features.py` para poder obtener información relacionada con el `Vin` de los vehículos a través de una API. El proceso consite en los siguientes pasos:
* 1.- Extraer todos los `Vin` de la base completa `true_car_listings.csv`.
* 2.- Requerir a través de la Api 'chunks' de 50 registros por cada petición.
* 3.- Guardar en la memoria el json en formato texto, agregando 50 registros por cada iteración.
* 4.- Una vez completada las iteraciones guardar en formato json todos los registros requeridos.

In [4]:
# Requerimos todos los 'Vin'    
all_vins = pd.read_csv(f'{ruta_bases}true_car_listings.csv')['Vin'].to_list()
# Parámetros de muestra
start = 4
end = 4

json_text = '['
for i in range(start, end+1):
    # Generar requerimiento con 50 registros Vin
    vin_list = all_vins[50 * (i - 1):50 * i]
    json_text += get_nhtsa_json(vin_list, i)

# Cerrar lista de Json
json_text = json_text[:-2] + ']'
# Exportar resultados a archivo json
with open(f'api_test/data_{start}_{end}.json', 'w') as json_file:
    json_file.write(json_text)

4: 0.2s


Paralelamente, en base a una muestra de la base total, se definió el primer filtro de variables requeridas a través la API: que tengan menos del 10% de datos perdidos, las cuales se presentan a continaución:

In [5]:
cols = ['AirBagLocFront', 'BodyClass', 'BusFloorConfigType', 'BusType',
       'CustomMotorcycleType', 'DisplacementCC', 'DisplacementCI',
       'DisplacementL', 'Doors', 'EngineCylinders', 'EngineHP', 'EngineKW',
       'ErrorCode', 'ErrorText', 'FuelTypePrimary', 'Make', 'Manufacturer',
       'ManufacturerId', 'Model', 'ModelYear', 'MotorcycleChassisType',
       'MotorcycleSuspensionType', 'PlantCity', 'PlantCountry', 'TPMS',
       'TrailerBodyType', 'TrailerType', 'VIN', 'VehicleType']

Con estas columnas seleccionadas, se procede a importar los archivos json (varios en el proceso original) para luego mapearlos para retraer solo aquellas columnas, creando un DataFrame con ellas y luego exportarlas en un csv.

In [6]:
# Importación de archivo json
filenames = glob.glob('api_test/*.json')
json_list = []

for filename in filenames:
    print(filename)
    with open(filename, 'r') as file:
        # Mapeamos considerando solo las columnas seleccionadas
        data = list(map(fn.get_info, 
                        json.loads(file.read())
                       )
                   )
    json_list += data

# Creación y exortación de DataFrame con features extraídos
data_json = pd.DataFrame(data=json_list,
                         columns=cols)
data_json.to_csv('api_test/data_api.csv')

api_test/data_4_4.json
api_test/data_10_10.json


ValueError: 29 columns passed, passed data had 0 columns

# 1. Creación del Dataset 

## 1.1 Bases Originales

In [27]:
# Importación de las bases para muestras de entrenamiento y prueba
df_train = pd.read_csv(f'{ruta_bases}true_cars_train.csv',
                       delimiter=";")
df_test = pd.read_csv(f'{ruta_bases}true_cars_test.csv',
                      delimiter=";")
# Dimensiones de las bases
print(f'Base Train: {df_train.shape}\nBase Test: {df_test.shape}')

Base Train: (639145, 8)
Base Test: (212977, 8)


In [28]:
# Información general de muestra de entrenamiento
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 639145 entries, 0 to 639144
Data columns (total 8 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   Price    639145 non-null  int64 
 1   Year     639145 non-null  int64 
 2   Mileage  639145 non-null  int64 
 3   City     639145 non-null  object
 4   State    639145 non-null  object
 5   Vin      639145 non-null  object
 6   Make     639145 non-null  object
 7   Model    639145 non-null  object
dtypes: int64(3), object(5)
memory usage: 39.0+ MB


In [29]:
# Creación de atributo 'sample'
df_train['sample'] = 'train'
df_test['sample'] = 'test'

In [30]:
# Unión de ambas bases
df_data = pd.concat([df_train, df_test])
print(f'Base Data: {df_data.shape}')

Base Data: (852122, 9)


## 1.2 Base API

In [31]:
# Importación de la base extraída por el requerimiento a la api
df_api = pd.read_csv(f'{ruta_bases}api_features.csv').drop(columns='Unnamed: 0')
print(f'Base API: {df_api.shape}')

Base API: (846562, 29)


In [32]:
df_api.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 846562 entries, 0 to 846561
Data columns (total 29 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   AirBagLocFront            797265 non-null  object 
 1   BodyClass                 845535 non-null  object 
 2   BusFloorConfigType        843524 non-null  object 
 3   BusType                   843524 non-null  object 
 4   CustomMotorcycleType      846525 non-null  object 
 5   DisplacementCC            839096 non-null  float64
 6   DisplacementCI            839096 non-null  float64
 7   DisplacementL             839096 non-null  float64
 8   Doors                     739918 non-null  float64
 9   EngineCylinders           735744 non-null  object 
 10  EngineHP                  380764 non-null  object 
 11  EngineKW                  380764 non-null  object 
 12  ErrorCode                 846550 non-null  object 
 13  ErrorText                 846550 non-null  o

In [33]:
# Identificar columnas con solo valores "Not Applicable" 
notapp_series = df_api\
                    .isin(['Not Applicable'])\
                    .sum()
cols2drop = list(notapp_series[notapp_series > 1].index)

In [34]:
# Identificar columnas con más de un 15% de datos perdidos
null_series = df_api\
                .isnull()\
                .sum()\
                /df_api.shape[0] 
cols2drop += list(null_series[null_series > .15].index)

In [35]:
# Columnas repetidas en la data original
cols2drop += ['ModelYear', 'Make']

In [36]:
# Borrar aquellas columnas 
df_api = df_api.drop(columns=cols2drop)
print(f'Base API: {df_api.shape}')

Base API: (846562, 16)


In [37]:
df_api=df_api.rename(columns={"AirBagLocFront":"Airag_LocFront",
                        "BodyClass":"d_Body_Class",
                        "DisplacementCC":"d_DisplacementCC",
                        "DisplacementCI":"d_DisplacementCI",
                        "DisplacementL":"d_DisplacementL",
                        "Doors":"d_Doors",
                        "EngineCylinders ":"d_EngineCylinders",
                        "EngineHP":"d_EngineHP",
                        "EngineKW":"d_EngineKW",
                        "ErrorCode":"d_ErrorCode",
                        "ErrorText":"d_ErrorText",
                        "FuelTypePrimary":"d_FuelTypePrimary",
                        "Make":"d_Make",
                        "Manufacturer":"d_Manufacturer",
                        "ManufacturerId":"d_ManufacturerId",
                        "Model":"d_Model",
                        "ModelYear":"d_ModelYear",
                        "PlantCity":"d_PlantCity",
                        "PlantCountry":"d_PlantCountry",
                        "TPMS":"d_TPMS",
                        "VIN":"Vin",
                        "VehicleType":"d_VehicleType"})

## 1.3 Unión de Bases

In [38]:
# Unión de bases
df = pd.merge(left=df_data, 
              right=df_api, 
              how='inner',
              on='Vin')
# Dimensiones de la base
print(f'Dataset: {df.shape}')

Dataset: (846644, 24)


In [39]:
# Información general de la base
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 846644 entries, 0 to 846643
Data columns (total 24 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Price              846644 non-null  int64  
 1   Year               846644 non-null  int64  
 2   Mileage            846644 non-null  int64  
 3   City               846644 non-null  object 
 4   State              846644 non-null  object 
 5   Vin                846644 non-null  object 
 6   Make               846644 non-null  object 
 7   Model              846644 non-null  object 
 8   sample             846644 non-null  object 
 9   Airag_LocFront     797359 non-null  object 
 10  d_Body_Class       845629 non-null  object 
 11  d_DisplacementCC   839190 non-null  float64
 12  d_DisplacementCI   839190 non-null  float64
 13  d_DisplacementL    839190 non-null  float64
 14  d_Doors            740010 non-null  float64
 15  EngineCylinders    735838 non-null  object 
 16  d_

In [40]:
#df['d_ErrorText'].value_counts()

# 2. Análisis exploratorio de datos

## 2.1 Análisis de los datos perdidos

In [41]:
#msgno.matrix(df)

# 3. Preproceso

In [42]:
# Selección de variables para modelos
select_vars = ['Price', 'Model', 'Mileage', 'sample']

In [43]:
# Muestra aleatoria
df_sample = df[select_vars]\
                .sample(sample_size)\
                .reset_index(drop=True)

In [44]:
# Instanciar clase para realizar preproceso
#df_prep = PrepML(df.loc[:, select_vars])
df_prep = PrepML(df_sample)

In [45]:
# Realizamos OneHot Encoder a las columnas categóricas seleccionadas
df_prep.one_hot_encoder(['Model'])

Unnamed: 0,Price,Mileage,sample,Model_Grand,Model_Accord,Model_F_1504WD,Model_Altima2_5,Model_Sierra,Model_Civic,Model_3,...,Model_Regal4DR,Model_S602_5T,Model_WranglerHard,Model_LWLW1,Model_Camaro2SS,Model_QuattroporteSport,Model_PriusHATCACK,Model_XJ,Model_SL_ClassSL320,Model_LFA2dr
0,18991,19787,train,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,18000,42183,train,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,54900,17186,train,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,29250,22063,train,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,16700,28106,train,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399995,16250,47136,train,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
399996,21246,44820,test,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
399997,14900,39507,train,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
399998,20990,48611,train,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
# Removemos outliers (opcional)
#df_prep.remove_outliers(['Price', 'Mileage'], multiplier=1.5)

In [47]:
# Estandarizamos variables continuas seleccionadas
df_prep.standard_scaler(['Price', 'Mileage'])

Unnamed: 0,sample,Model_Grand,Model_Accord,Model_F_1504WD,Model_Altima2_5,Model_Sierra,Model_Civic,Model_3,Model_Super,Model_Wrangler,...,Model_WranglerHard,Model_LWLW1,Model_Camaro2SS,Model_QuattroporteSport,Model_PriusHATCACK,Model_XJ,Model_SL_ClassSL320,Model_LFA2dr,Price,Mileage
0,train,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.181361,-0.778738
1,train,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.254231,-0.244226
2,train,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.459078,-0.840814
3,train,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.572997,-0.724418
4,train,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.349822,-0.580193
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399995,train,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.382911,-0.126015
399996,test,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.015548,-0.181290
399997,train,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.482179,-0.308092
399998,train,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.034372,-0.090812


In [48]:
# Separar muestras según
X_train, y_train, X_test, y_test = df_prep.to_train_test_samples('sample', 'Price')

Realizado en 21.0s


# 4. Modelamiento 

In [49]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge

## 4.1 Ridge Regression

In [50]:
# Establecemos parámetros a evaluar en el modelo
ridge_grid = {'alpha': [0, .1, .2, .5],
              'solver': ['sag', 'sparse_cg']}
# Instanciamos Clase auxiliar para entrenar, ajustar y evaluar modelos de ML
ridge_reg = MLModel(model=Ridge(fit_intercept=True))
# Implementación del grid search
ridge_reg.grid_search(X_train,
                      y_train,
                      param_grid=ridge_grid,
                      cv=5)

MemoryError: Unable to allocate 5.58 GiB for an array with shape (2496, 300129) and data type object

In [None]:
# Métricas
ridge_reg.metrics(X_test, y_test)

In [None]:
# Instancibest_modelos Clase auxiliar para entrenar, ajustar y evaluar modelos de ML
linear2_reg = MLModel(model=LinearRegression(fit_intercept=True))
# Implementación del grid search
linear2_reg.fit(X_train, y_train)

In [None]:
linear2_reg.metrics(X_test, y_test)

## 4.2 LightGBM

In [None]:
# Establecemos parámetros a evaluar en el modelo
lgb_grid = {'max_depth': [3, 4, 5], 
            'n_estimators': [50, 60, 70],
            'num_leaves': [30, 50, 70]}
# Instanciamos Clase auxiliar para entrenar, ajustar y evaluar modelos de ML
lgb_reg = MLModel(model=LGBMRegressor(n_jobs=-1,
                                      random_state=rd_seed))
# Implementación del grid search
lgb_reg.grid_search(X_train,
                    y_train,
                    param_grid=lgb_grid,
                    cv=5)

In [None]:
lgb_reg.metrics(X_test, y_test)

## 4.3 XGBoost

In [None]:
# Establecemos parámetros a evaluar en el modelo
xgb_grid = {'max_depth': [3, 4, 5, 6], 
            'n_estimators': [50, 60, 70]}
# Instanciamos Clase auxiliar para entrenar, ajustar y evaluar modelos de ML
xgb_reg = MLModel(model=XGBRegressor(n_jobs=-1,
                                     seed=rd_seed))
# Implementación del grid search
xgb_reg.grid_search(X_train,
                    y_train,
                    param_grid=xgb_grid,
                    cv=5)

In [None]:
# Métricas
xgb_reg.metrics(X_test, y_test)

## 4.4 RandomForest

In [None]:
# Establecemos parámetros a evaluar en el modelo
rf_grid = {'n_estimators': [400, 500], 
           'max_depth': [3, 5, 7], 
           'oob_score': [True]}
# Instanciamos Clase auxiliar para entrenar, ajustar y evaluar modelos de ML
rf_reg = MLModel(model=RandomForestRegressor(n_jobs=-1,
                                             random_state=rd_seed))
# Implementación del grid search
rf_reg.grid_search(X_train,
                    y_train,
                    param_grid=rf_grid,
                    cv=5)

In [None]:
rf_reg.metrics(X_test, y_test)