# Training model

By: Javier Martínez

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import (classification_report, confusion_matrix, ConfusionMatrixDisplay,\
                             accuracy_score,precision_score,recall_score, roc_curve, auc)
from xgboost import XGBClassifier

Data Base

In [None]:
# Load the pickled base dataset (a local, trusted file), index it by item
# id, and make sure 'initial_quantity' is stored as integers.
pd_MeLi_ = pd.read_pickle('./data/data_base.pkl')
pd_MeLi = (
    pd_MeLi_
    .set_index('id')
    .astype({'initial_quantity': int})
)


In [None]:
# Columns kept for modelling, in the order they appear in the model frame.
# The last two entries are the target column and the train/test flag.
#
# Columns of the base dataset that are currently left out:
#   date_created_format, sub_status_new, deal_ids_new,
#   seller_address_country, seller_address_city, seller_id, last_updated,
#   start_time, parent_item_id, price, original_price, official_store_id,
#   catalog_product_id, deal_ids
all_features = [
    'title_new',
    'local_pick_up',
    'free_shipping',
    'mode',
    'variations_boolean',
    'accepts_mercadopago_boolean',
    'currency_id_boolean',
    'attributes_boolean',
    'automatic_relist_boolean',
    'video_id_boolean',
    'seller_address_state',
    'base_price',
    'category_id',
    'listing_type_id',
    'buying_mode',
    'initial_quantity',
    'status',
    'sold_quantity',
    'available_quantity',
    'condition_new',
    'training_data',
]

# Independent copy restricted to the selected columns.
pd_model = pd_MeLi.loc[:, all_features].copy()

In [None]:
# Categorical columns to one-hot encode ('category_id' is currently excluded).
dummies = [
    'mode',
    'listing_type_id',
    'buying_mode',
    'status',
    'seller_address_state',
]

# Expand each listed column into indicator columns; other columns pass through.
pd_x_data = pd.get_dummies(data=pd_model, columns=dummies)

In [None]:
# Numeric columns that are rescaled with min-max scaling.
numbers = ['initial_quantity',
            #'price',
            'base_price',
            'available_quantity',
            'sold_quantity'
            ]

# Fit the scaler on the training rows only (training_data == 1, the same
# filter used for the train/test split below). Fitting on every row would let
# the min/max of the test rows leak into the preprocessing. Test values may
# therefore fall slightly outside [0, 1] after transforming, which is expected.
scaler = MinMaxScaler()
scaler.fit(pd_x_data.query('training_data==1')[numbers])

In [None]:
# Overwrite the numeric columns with their scaled values, using the
# previously fitted `scaler`; all rows (train and test) are transformed.
pd_x_data[numbers] = scaler.transform(pd_x_data[numbers])

In [None]:
# Name of the target column: it is split off as `y` and dropped from the
# feature frames in the cells below.
out = 'condition_new'

Data for Model

In [None]:
# Training split: rows flagged with training_data == 1.
pandas_x_training = pd_x_data.query('training_data==1').copy()

# Target as a Series, taken before the column is removed from the features.
pandas_y_training = pandas_x_training[out]

# Features: everything except the split flag and the target.
pandas_x_training = pandas_x_training.drop(columns=['training_data', out])
pandas_x_training.head(3)

In [None]:
# Test data: rows flagged with training_data == 0.
pandas_x_test = pd_x_data.query('training_data==0').copy()

# Select the target as a 1-D Series (single brackets), matching how the
# training target is built; double brackets would yield a one-column
# DataFrame whose `.values` is a column vector.
pandas_y_test = pandas_x_test[out]

# Features: everything except the split flag and the target.
pandas_x_test.drop(labels=['training_data',out],axis=1,inplace=True)
pandas_x_test.head(3)

Training Models

In [None]:

class XGB_MODEL():
    """
    Train an XGBoost classifier and evaluate it on a held-out test set.

    The instance keeps the train/test data it was given; `training` fits a
    model and stores the fitted model, its predictions and the evaluation
    metrics as attributes.
    """

    def __init__(self,pandas_x_training,
                    pandas_y_training,
                    pandas_x_test,
                    pandas_y_test):
        """
        Store the data used for fitting and evaluation.

        Parameters
        ----------
        pandas_x_training : pandas object exposing `.values`
            Training features.
        pandas_y_training : pandas object exposing `.values`
            Training target.
        pandas_x_test : pandas object exposing `.values`
            Test features.
        pandas_y_test : pandas object exposing `.values`
            Test target (a Series or a one-column DataFrame).
        """

        self.pandas_x_training = pandas_x_training
        self.pandas_y_training = pandas_y_training
        self.pandas_x_test = pandas_x_test
        self.pandas_y_test = pandas_y_test

    def training(self,n_estimators):
        """
        Fit an XGBClassifier with `n_estimators` trees and compute metrics.

        Sets the attributes `model`, `prediction`, `probability`, `accuracy`,
        `precision`, `recall`, `auc`, `confusion_matrix`, `plot_matrix` and
        `summary` (a one-row DataFrame with the metrics and `n_estimators`).

        Parameters
        ----------
        n_estimators : int
            Number of boosting rounds passed to XGBClassifier.
        """

        np.random.seed(0)
        # Model
        self.model = XGBClassifier(n_estimators=n_estimators,
                                   verbosity=0)

        # Fit
        self.model.fit(self.pandas_x_training.values,
                       self.pandas_y_training.values)

        # Always evaluate on the data held by this instance, never on
        # module-level variables that merely share the same name.
        x_test = self.pandas_x_test.values
        # Flatten so a one-column DataFrame target and a Series behave alike.
        y_test = np.ravel(self.pandas_y_test.values)

        # Metrics
        self.prediction = self.model.predict(x_test)
        self.accuracy = accuracy_score(y_test, self.prediction)
        self.precision = precision_score(y_test, self.prediction)
        self.recall = recall_score(y_test, self.prediction)

        # The ROC curve needs continuous scores; hard 0/1 predictions collapse
        # it to a single operating point. Column 1 of predict_proba is used as
        # the score for pos_label=1 (assumes the labels are 0/1 - confirm).
        self.probability = self.model.predict_proba(x_test)[:, 1]
        fpr, tpr, _ = roc_curve(y_test, self.probability, pos_label=1)
        self.auc = auc(fpr, tpr)

        self.confusion_matrix = confusion_matrix(y_test, self.prediction)

        # Plot
        self.plot_matrix = ConfusionMatrixDisplay(confusion_matrix=self.confusion_matrix)

        # Summary
        self.summary = pd.DataFrame({'accuracy':self.accuracy,
                                    'precision':self.precision,
                                    'recall':self.recall,
                                    'auc':self.auc,
                                    'n_estimators':n_estimators},index=[0])

In [None]:
#==================
def select_model(n_estimators):
    """
    Build an XGB_MODEL on the notebook's train/test frames and train it.

    Parameters
    ----------
    n_estimators : int
        Number of estimators forwarded to `XGB_MODEL.training`.

    Returns
    -------
    XGB_MODEL
        The trained wrapper, with metrics available as attributes.
    """
    candidate = XGB_MODEL(pandas_x_training=pandas_x_training,
                          pandas_y_training=pandas_y_training,
                          pandas_x_test=pandas_x_test,
                          pandas_y_test=pandas_y_test)
    candidate.training(n_estimators)
    return candidate
#==================


# Train one model per ensemble size: 50, 100, ..., 350 estimators.
models = [select_model(n_trees) for n_trees in range(50, 351, 50)]

In [None]:
# Stack the one-row metric summaries of all trained models into one table.
summary_models = pd.concat([fitted.summary for fitted in models])
summary_models

In [None]:
# Keep every model whose accuracy equals the best accuracy in the summary
# table (a list, since several models can tie).
best_model = [fitted for fitted in models
              if fitted.accuracy == summary_models.accuracy.max()]

In [None]:
# Confusion Matrix Best Model
# Show the confusion matrix stored on the first of the top-accuracy models.
best_model[0].confusion_matrix

In [None]:
# Confusion Matrix Plot Best Model
# Draw the ConfusionMatrixDisplay stored on the first of the top-accuracy models.
best_model[0].plot_matrix.plot()