<h1>Training</h1>

In [1]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

import pandas as pd
from data.new_or_used import build_dataset

from src.preprocessing import preprocess_training

<h2>1. Loading Dataset</h2>

In [2]:
# build_dataset() returns four values; only the first one is kept in this notebook
# (presumably the training records -- confirm against data/new_or_used).
X_train, _, _, _ = build_dataset()

In [3]:
# Turn the loaded records into a DataFrame, the input type passed to preprocess_training below.
df = pd.DataFrame.from_records(X_train)

<h2>2. Preprocessing</h2>

In [4]:
# Apply the project's training-time preprocessing to the raw frame.
preprocessed_df = preprocess_training(df)
# Remove the names bound to the raw inputs (presumably to free memory).
# NOTE: after this, the loading cells must be re-run before this cell can run again.
del df, X_train

In [5]:
# Preview the first five rows of the preprocessed frame.
preprocessed_df.head(n=5)

Unnamed: 0,accepts_mercadopago,automatic_relist,available_quantity,buy_mode._auction,buy_mode._buy_it_now,buy_mode._classified,cat._MLA1227,cat._MLA1383,cat._MLA15171,cat._MLA15328,...,shim._custom,shim._me1,shim._me2,shim._not_specified,sold_quantity,tag.dragged_bids_and_visits,tag.dragged_visits,tag.free_relist,tag.good_quality_thumbnail,tag.poor_quality_thumbnail
0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0


In [6]:
# Missing-value count per column; every entry is expected to be 0 after preprocessing.
preprocessed_df.isna().sum(axis=0)

accepts_mercadopago             0
automatic_relist                0
available_quantity              0
buy_mode._auction               0
buy_mode._buy_it_now            0
buy_mode._classified            0
cat._MLA1227                    0
cat._MLA1383                    0
cat._MLA15171                   0
cat._MLA15328                   0
cat._MLA2038                    0
cat._MLA2044                    0
cat._MLA3530                    0
cat._MLA41287                   0
cat._others                     0
condition                       0
initial_quantity                0
listing_type_id                 0
nmp.Acordar con el comprador    0
nmp.American Express            0
nmp.Cheque certificado          0
nmp.Contra reembolso            0
nmp.Diners                      0
nmp.Efectivo                    0
nmp.Giro postal                 0
nmp.MasterCard                  0
nmp.Mastercard Maestro          0
nmp.MercadoPago                 0
nmp.N/a                         0
nmp.Tarjeta de

In [7]:
# Show rows that repeat an earlier row exactly (the first occurrence is not flagged).
preprocessed_df.loc[preprocessed_df.duplicated(keep='first')]

Unnamed: 0,accepts_mercadopago,automatic_relist,available_quantity,buy_mode._auction,buy_mode._buy_it_now,buy_mode._classified,cat._MLA1227,cat._MLA1383,cat._MLA15171,cat._MLA15328,...,shim._custom,shim._me1,shim._me2,shim._not_specified,sold_quantity,tag.dragged_bids_and_visits,tag.dragged_visits,tag.free_relist,tag.good_quality_thumbnail,tag.poor_quality_thumbnail


In [8]:
# Report (rows, columns) of the preprocessed frame.
print(f"DataFrame Shape: {preprocessed_df.shape}")

DataFrame Shape: (68197, 47)


In [9]:
# Separate the target column ('condition') from the feature columns and
# hand both to scikit-learn as plain NumPy arrays.
y = preprocessed_df['condition'].values
X = preprocessed_df.drop(columns='condition').values

In [10]:
# Names of the feature columns, i.e. every column except the 'condition' target.
preprocessed_df.columns[preprocessed_df.columns != 'condition']

Index(['accepts_mercadopago', 'automatic_relist', 'available_quantity',
       'buy_mode._auction', 'buy_mode._buy_it_now', 'buy_mode._classified',
       'cat._MLA1227', 'cat._MLA1383', 'cat._MLA15171', 'cat._MLA15328',
       'cat._MLA2038', 'cat._MLA2044', 'cat._MLA3530', 'cat._MLA41287',
       'cat._others', 'initial_quantity', 'listing_type_id',
       'nmp.Acordar con el comprador', 'nmp.American Express',
       'nmp.Cheque certificado', 'nmp.Contra reembolso', 'nmp.Diners',
       'nmp.Efectivo', 'nmp.Giro postal', 'nmp.MasterCard',
       'nmp.Mastercard Maestro', 'nmp.MercadoPago', 'nmp.N/a',
       'nmp.Tarjeta de crédito', 'nmp.Transferencia bancaria', 'nmp.Visa',
       'nmp.Visa Electron', 'num_att', 'num_pic', 'num_var', 'price',
       'shim._custom', 'shim._me1', 'shim._me2', 'shim._not_specified',
       'sold_quantity', 'tag.dragged_bids_and_visits', 'tag.dragged_visits',
       'tag.free_relist', 'tag.good_quality_thumbnail',
       'tag.poor_quality_thumbnail'],
 

<h2>3. Hyperparameter Tuning</h2>

In [11]:
# Seed shared by every estimator that has a stochastic component, so that the
# cross-validated scores are reproducible across kernel restarts.
RANDOM_STATE = 42

# Candidate classifiers for the grid search. Each entry maps a display name to
# an unfitted estimator ('model') and the hyperparameter grid to explore ('params').
# An empty grid means the estimator is evaluated once with its constructor settings.
model_params = {
    'svm': {
        'model': svm.SVC(gamma='auto'),
        'params': {
            'C': [1, 10],
            'kernel': ['rbf'],
        },
    },
    'random_forest': {
        # Bootstrap and feature sub-sampling are random: pin the seed.
        'model': RandomForestClassifier(random_state=RANDOM_STATE),
        'params': {
            'n_estimators': [1, 5, 10],
        },
    },
    'logistic_regression': {
        # multi_class is left at its default ('auto', the value that used to be
        # passed explicitly); passing it is deprecated in recent scikit-learn.
        # liblinear uses random_state when shuffling the data.
        'model': LogisticRegression(solver='liblinear', random_state=RANDOM_STATE),
        'params': {
            'C': [1, 5, 10],
        },
    },
    'naive_bayes_gaussian': {
        'model': GaussianNB(),
        'params': {},
    },
    'naive_bayes_multinomial': {
        'model': MultinomialNB(),
        'params': {},
    },
    'decision_tree': {
        # Features are randomly permuted at each split: pin the seed.
        'model': DecisionTreeClassifier(random_state=RANDOM_STATE),
        'params': {
            'criterion': ['gini', 'entropy'],
        },
    },
}

In [12]:
# One summary record per candidate model, in the order of model_params.
scores = []

for model_name, mp in model_params.items():
    # Search this model's grid with 5-fold cross-validation on the full training set.
    clf = GridSearchCV(
        estimator=mp['model'],
        param_grid=mp['params'],
        cv=5,
        n_jobs=5,
        return_train_score=False,
    )
    clf.fit(X, y)

    # Build the record once; it is both stored and echoed as progress output.
    summary = {
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_,
    }
    scores.append(summary)
    print(summary)

results_df = pd.DataFrame(scores, columns=['model','best_score','best_params'])
results_df

{'model': 'svm', 'best_score': 0.8100504407857493, 'best_params': {'C': 10, 'kernel': 'rbf'}}
{'model': 'random_forest', 'best_score': 0.795636401693522, 'best_params': {'n_estimators': 10}}
{'model': 'logistic_regression', 'best_score': 0.8077189270289464, 'best_params': {'C': 10}}
{'model': 'naive_bayes_gaussian', 'best_score': 0.48458137770783677, 'best_params': {}}
{'model': 'naive_bayes_multinomial', 'best_score': 0.6971861010097188, 'best_params': {}}
{'model': 'decision_tree', 'best_score': 0.7657084307786517, 'best_params': {'criterion': 'entropy'}}


Unnamed: 0,model,best_score,best_params
0,svm,0.81005,"{'C': 10, 'kernel': 'rbf'}"
1,random_forest,0.795636,{'n_estimators': 10}
2,logistic_regression,0.807719,{'C': 10}
3,naive_bayes_gaussian,0.484581,{}
4,naive_bayes_multinomial,0.697186,{}
5,decision_tree,0.765708,{'criterion': 'entropy'}


<h2>4. Training</h2>

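In the grid search above, the SVM (C=10, RBF kernel) achieved the highest mean cross-validated score (about 0.81), so the final model is trained with that configuration: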

In [17]:
# Train the selected SVM configuration. The parameter grid is empty, so
# GridSearchCV evaluates this single configuration with 5-fold CV and then
# refits it on all of X, y; the fitted wrapper is what gets persisted later.
# probability=True makes the fitted SVC expose probability estimates.
clf = GridSearchCV(
    estimator=svm.SVC(C=10, kernel="rbf", gamma='auto', probability=True),
    param_grid={},
    cv=5,
    n_jobs=5,
    return_train_score=False,
)
clf.fit(X, y)

GridSearchCV(cv=5, estimator=SVC(C=10, gamma='auto', probability=True),
             n_jobs=5, param_grid={})

In [18]:
import os
from joblib import dump, load

# joblib.dump opens the target path for writing and does not create missing
# parent directories, so make sure 'models/' exists before persisting.
os.makedirs('models', exist_ok=True)

# Persist the fitted GridSearchCV wrapper; dump returns the list of written files.
dump(clf, 'models/best_model.joblib')

['models/best_model.joblib']