<h1>Training</h1>

<h1>Training</h1>

In [1]:
import pandas as pd
from sklearn import svm
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier

from data.new_or_used import build_dataset
from src.preprocessing import preprocess_training

<h2>1. Loading Dataset</h2>

In [2]:
# Only the first element returned by build_dataset() is used in this notebook.
# The remaining elements go into a named throwaway (not `_`, which IPython uses
# for the previous cell output) and are dropped right away so they do not stay
# referenced in the kernel.
X_train, *_unused_splits = build_dataset()
del _unused_splits

In [3]:
# Turn the loaded training records into a tabular DataFrame.
df = pd.DataFrame.from_records(data=X_train)

<h2>2. Preprocessing</h2>

In [4]:
# Run the project's training-time preprocessing on the raw frame.
preprocessed_df = preprocess_training(df)

# The raw inputs are not used below; unbind them so their memory can be reclaimed.
# NOTE: because of these deletions this cell cannot be re-run on its own.
del df
del X_train

In [5]:
# Preview the first five preprocessed rows.
preprocessed_df.head(n=5)

Unnamed: 0,condition,base_price,listing_type_id,price,accepts_mercadopago,automatic_relist,initial_quantity,sold_quantity,available_quantity,num_pic,...,nmp.Visa,nmp.Diners,nmp.American Express,nmp.Giro postal,nmp.MercadoPago,nmp.Cheque certificado,shim._custom,shim._me1,shim._me2,shim._not_specified
0,0.0,7e-06,0.166667,7e-06,1.0,0.0,0.0,0.0,0.0,0.055556,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1.0,0.000238,0.333333,0.000238,1.0,0.0,0.0,0.0,0.0,0.166667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1.0,5e-06,0.166667,5e-06,1.0,0.0,0.0,0.0,0.0,0.027778,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,5.2e-05,0.333333,5.2e-05,1.0,0.0,0.0,0.0,0.0,0.055556,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1.0,3e-06,0.166667,3e-06,1.0,0.0,0.0,0.0,0.0,0.055556,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [6]:
# Number of missing values in each column (isna is the canonical name of isnull).
preprocessed_df.isna().sum()

condition                       0
base_price                      0
listing_type_id                 0
price                           0
accepts_mercadopago             0
automatic_relist                0
initial_quantity                0
sold_quantity                   0
available_quantity              0
num_pic                         0
num_var                         0
num_att                         0
buy_mode._auction               0
buy_mode._buy_it_now            0
buy_mode._classified            0
tag.dragged_bids_and_visits     0
tag.good_quality_thumbnail      0
tag.dragged_visits              0
tag.free_relist                 0
tag.poor_quality_thumbnail      0
cat._MLA1227                    0
cat._MLA1383                    0
cat._MLA15171                   0
cat._MLA15328                   0
cat._MLA2038                    0
cat._MLA2044                    0
cat._MLA3530                    0
cat._MLA41287                   0
cat._others                     0
nmp.Transferen

In [7]:
# Show rows that exactly repeat an earlier row (the first occurrence is not flagged).
preprocessed_df.loc[preprocessed_df.duplicated(keep='first')]

Unnamed: 0,condition,base_price,listing_type_id,price,accepts_mercadopago,automatic_relist,initial_quantity,sold_quantity,available_quantity,num_pic,...,nmp.Visa,nmp.Diners,nmp.American Express,nmp.Giro postal,nmp.MercadoPago,nmp.Cheque certificado,shim._custom,shim._me1,shim._me2,shim._not_specified


In [8]:
# Report the (rows, columns) shape of the preprocessed frame.
print(f"DataFrame Shape: {preprocessed_df.shape}")

DataFrame Shape: (68198, 48)


In [9]:
# Feature matrix: every column except the 'condition' target.
X = preprocessed_df.drop(columns='condition').values
# Target vector: the 'condition' column on its own.
y = preprocessed_df.loc[:, 'condition'].values

<h2>3. Hyperparameter Tuning</h2>

In [10]:
# Fixed seed so the estimators that use randomness give the same results on every run.
SEED = 42

# Candidate estimators and the hyperparameter grid searched for each of them.
# Each entry maps a display name to:
#   'model'  - an unfitted estimator used as the template for the search
#   'params' - the grid of values to try; an empty grid evaluates the
#              estimator once with the settings given in its constructor
model_params = {
    'svm': {
        'model': svm.SVC(gamma='auto'),
        'params': {
            'C': [1, 10, 20],
            'kernel': ['rbf', 'linear'],
        },
    },
    'random_forest': {
        'model': RandomForestClassifier(random_state=SEED),
        'params': {
            'n_estimators': [1, 5, 10],
        },
    },
    'logistic_regression': {
        # multi_class is left at its default: the argument is deprecated in
        # recent scikit-learn releases, and with the liblinear solver the
        # default resolves to the same one-vs-rest scheme that 'auto' selected.
        'model': LogisticRegression(solver='liblinear', random_state=SEED),
        'params': {
            'C': [1, 5, 10],
        },
    },
    'naive_bayes_gaussian': {
        'model': GaussianNB(),
        'params': {},
    },
    'naive_bayes_multinomial': {
        'model': MultinomialNB(),
        'params': {},
    },
    'decision_tree': {
        'model': DecisionTreeClassifier(random_state=SEED),
        'params': {
            'criterion': ['gini', 'entropy'],
        },
    },
}

In [None]:
# Cross-validated grid search over every candidate listed in `model_params`.
# N_JOBS = -1 lets joblib use all available cores instead of a worker count
# tied to one particular machine.
N_JOBS = -1
CV_FOLDS = 5

scores = []

for model_name, mp in model_params.items():
    # GridSearchCV works on clones, so the template estimator in
    # `model_params` is left unfitted.
    clf = GridSearchCV(
        mp['model'],
        mp['params'],
        n_jobs=N_JOBS,
        cv=CV_FOLDS,
        return_train_score=False,
    )
    clf.fit(X, y)
    # Keep only the winning configuration of each model family:
    # its mean cross-validated score and the parameters that produced it.
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })

# One row per model family, in the order they were searched.
results_df = pd.DataFrame(scores, columns=['model','best_score','best_params'])
results_df

<h2>4. Training</h2>

In [None]:
# Select the model family with the highest mean cross-validated score from the
# grid-search summary and rebuild it with its best hyperparameters.
best_run = results_df.loc[results_df['best_score'].idxmax()]

# clone() returns an unfitted copy, so the template estimator stored in
# `model_params` is not modified by set_params() or fit().
clf = clone(model_params[best_run['model']]['model']).set_params(**best_run['best_params'])

# Fit the chosen configuration on the full training set.
clf.fit(X,y)

In [None]:
import os
from joblib import dump, load

# dump() opens the target file directly and does not create missing parent
# directories, so make sure the output folder exists first.
os.makedirs('models', exist_ok=True)

# Serialize the fitted estimator to disk.
dump(clf, 'models/best_model.joblib')