### Bunch of functions to automate ML models testing process.

Necessary imports.

In [15]:
import matplotlib.pyplot as plt
import pandas as pd
import scikitplot as skplt

from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.utils import resample

Data upsampling.

In [8]:
def upsample_data(df, minority_var, n):
    """
    Input: df:pandas.DataFrame, minority_var:str, n:int.
    Output: df_upsampled:pandas.DataFrame.
    
    Returns dataset with equal number of observations in each class.
    Samples from minority class are multiplied.
    """
    df_minority = df[df[minority_var] == 1]
    df_majority = df[df[minority_var] == 0]
    df_minority_up = resample(df_minority, replace=True, n_samples=n)
    df_upsampled = pd.concat([df_majority, df_minority_up])
    return df_upsampled

Data downsampling.

In [9]:
def downsample_data(df, minority_var, n):
    """
    Input: df:pandas.DataFrame, minority_var:str, n:int.
    Output: df_upsampled:pandas.DataFrame.
    
    Returns dataset with equal number of observations in each class.
    Samples from majority class are selected to match the number of 
    samples in the minority class.
    """
    df_minority = df[df[minority_var] == 1]
    df_majority = df[df[minority_var] == 0]
    df_majority_down = resample(df_majority, replace=False, n_samples=n)
    df_downsampled = pd.concat([df_minority, df_majority_down])
    return df_downsampled

Split dataframe into format suitable for ML models.

In [10]:
def split_data_frame(df, dep_var):
    """
    Input: df:pandas.DataFrame, dep_var:str.
    Output: X:padas.DataFrame, y:pandas.Series.
    
    Splits data into dataframe with independent variables
    and data series with the dependent variable.
    """
    X = df.drop(dep_var, axis=1)
    y = df[dep_var]
    return X, y

Model testing routine.

In [14]:
def execute_model_flow(model, X, y, test_size=0.25, visualisations=False):
    """
    Input: model:sklearn model, X:pandas.DataFame, y:padas.Series, test_size:float, visualisations:Boolean.
    Output: None.
    
    For given model and data performes the training and returns multiple testing measures.
    If visualisations == True, produces plots to visualise certain measures.
    """                   
                       
    name = type(model).__name__

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)

    model.fit(X, y)
    accuracy = round(model.score(X_test, y_test), 3)

    prob_y_vis = model.predict_proba(X_test)
    prob_y = [p[1] for p in prob_y_vis]
    y_pred = model.predict(X_test)

    roc_auc = round(roc_auc_score(y_test, prob_y), 3)
    precision = round(precision_score(y_test, y_pred), 3)
    recall = round(recall_score(y_test, y_pred), 3)
    f1 = round(f1_score(y_test, y_pred), 3)
    
    if visualisations:
        skplt.metrics.plot_roc(y_test, prob_y_vis)
        plt.title('{} ROC Curves'.format(name))
        plt.show()
        
        skplt.metrics.plot_precision_recall(y_test, prob_y_vis)
        plt.title('{} Precision-Recall Curve'.format(name))
        plt.show()

    print('{} - precision: {}, recall: {}, F1: {}, ROC: {}'.format(name, precision, recall, f1, roc_auc))

In [31]:
def execute_cross_calidation_flow(model, X, y, cv=5):
    name = type(model).__name__
    measures = ['accuracy', 'precision', 'recall', 'f1']
    print(name)
    for measure in measures:
        res = cross_val_score(model, X, y, cv=cv, scoring=measure)
        print(measure, *res)

### Test methods

In [32]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict, cross_val_score

import warnings
warnings.filterwarnings('ignore')

df = pd.read_csv('../data/train.csv', '|')
dfu = upsample_data(df, 'fraud', 1775)
X, y = split_data_frame(dfu, 'fraud')

clf = RandomForestClassifier()
execute_cross_calidation_flow(clf, X, y)

RandomForestClassifier
accuracy 0.9929577464788732 0.9887323943661972 0.9915492957746479 0.995774647887324 0.9971830985915493
precision 0.9943977591036415 0.9806629834254144 0.9806629834254144 0.9943977591036415 0.9916201117318436
recall 1.0 1.0 1.0 1.0 1.0
f1 0.9957924263674615 0.9861111111111112 0.9847434119278778 0.993006993006993 0.9957924263674615


In [33]:
df = pd.read_csv('../data/train.csv', '|')
dfu = downsample_data(df, 'fraud', 104)
X, y = split_data_frame(dfu, 'fraud')

clf = RandomForestClassifier()
execute_cross_calidation_flow(clf, X, y)

RandomForestClassifier
accuracy 0.9523809523809523 0.9761904761904762 0.9761904761904762 0.9047619047619048 0.95
precision 0.95 1.0 0.9090909090909091 0.875 0.9444444444444444
recall 0.9047619047619048 0.9523809523809523 0.9047619047619048 0.9523809523809523 0.95
f1 0.9500000000000001 0.9767441860465117 0.9767441860465117 0.9333333333333333 0.9743589743589743
