# <center>Training

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
import warnings
from sklearn import metrics
warnings.simplefilter(action='ignore', category=FutureWarning)

In [32]:
def run_model(fold):
    df = pd.read_csv('TRAIN_CLEANED_FOLDS.csv')
    #extract training set and validation set
    train_df = df[df.kfold != fold]
    valid_df = df[df.kfold == fold]
    #extract labels for each set
    y_train = train_df.label.values
    y_valid = valid_df.label.values
    #drop unnecessary columns from training set and validation set
    train_df = train_df.drop(['label','kfold'], axis=1)
    valid_df = valid_df.drop(['label','kfold'], axis=1)
    #maintain the order of the variables, maybe not needed
    valid_df = valid_df[train_df.columns]
    #now we are ready to train
    #---------------------------------------------------------------

    #training, you can use any model you want, here we use a random forest classifier because it's fast
    rfc = RandomForestClassifier(n_estimators = 100, n_jobs=-1, verbose=0, random_state=128) 
    rfc.fit(train_df, y_train)
    preds = rfc.predict(valid_df)
    print('the score you had for this fold is: ', metrics.accuracy_score(preds, y_valid))

In [33]:
#a quick demo of how the above function works
run_model(fold=0)

the score you had for this fold is:  0.8836159233321047


# <center> Dispatcher + updated Training

In [1]:
import sklearn.ensemble as ensemble
import joblib
MODELS = {
    'randomforest': ensemble.RandomForestClassifier(n_estimators=100, n_jobs=-1, verbose=0),
    'extratrees': ensemble.ExtraTreesClassifier(n_estimators=100, n_jobs=-1, verbose=0),
}

In [2]:
def run_model(fold, model):
    df = pd.read_csv('TRAIN_CLEANED_FOLDS.csv')
    #extract training set and validation set
    train_df = df[df.kfold != 0]
    valid_df = df[df.kfold == 0]
    #extract labels for each set
    y_train = train_df.label.values
    y_valid = valid_df.label.values
    #drop unnecessary columns from training set and validation set
    train_df = train_df.drop(['label','kfold'], axis=1)
    valid_df = valid_df.drop(['label','kfold'], axis=1)
    #maintain the order of the variables, maybe not needed
    valid_df = valid_df[train_df.columns]
    #now we are ready to train
    #---------------------------------------------------------------

    #training, you can use any model you want, here we use a random forest classifier because it's fast
    rfc = model
    rfc.fit(train_df, y_train)
    preds = rfc.predict(valid_df)
    print('the score you had for this fold is: ', metrics.accuracy_score(preds, y_valid))
    
    model_name = str(model)[:10]
    joblib.dump(rfc,f'models/{model_name}_{fold}.pkl') #save it to a pkl file
    joblib.dump(train_df.columns, f"models/{model_name}_{fold}_columns.pkl")

In [5]:
run_model(fold=0, model=MODELS['randomforest'])

the score you had for this fold is:  0.8865646885366752


In [6]:
run_model(fold=1, model=MODELS['randomforest'])

the score you had for this fold is:  0.8862882417987468


In [7]:
run_model(fold=2, model=MODELS['randomforest'])

the score you had for this fold is:  0.8851824548470328


In [8]:
run_model(fold=3, model=MODELS['randomforest'])

the score you had for this fold is:  0.8849060081091044


In [9]:
run_model(fold=4, model=MODELS['randomforest'])

the score you had for this fold is:  0.8838923700700332


now we have 5 models in the folder, let's create an inference