### A collection of functions to automate the testing of ML models.

Necessary imports.

In [7]:
import matplotlib.pyplot as plt
import pandas as pd
import scikitplot as skplt

from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

Data upsampling.

In [8]:
def upsample_data(df, minority_var, n):
    """
    Input: df:pandas.DataFrame, minority_var:str, n:int.
    Output: df_upsampled:pandas.DataFrame.

    Oversamples the minority class of a binary-labelled dataset.
    Rows where df[minority_var] == 1 are drawn with replacement until
    there are n of them; they are appended after the untouched rows
    where df[minority_var] == 0. The classes come out balanced when n
    equals the number of majority rows.
    """
    # Boolean masks for the two classes; rows with any other label are dropped.
    is_minority = df[minority_var] == 1
    is_majority = df[minority_var] == 0

    # Sampling with replacement lets n exceed the number of minority rows.
    minority_resampled = resample(df[is_minority], replace=True, n_samples=n)

    return pd.concat([df[is_majority], minority_resampled])

Data downsampling.

In [9]:
def downsample_data(df, minority_var, n):
    """
    Input: df:pandas.DataFrame, minority_var:str, n:int.
    Output: df_downsampled:pandas.DataFrame.

    Undersamples the majority class of a binary-labelled dataset.
    n rows are drawn without replacement from the rows where
    df[minority_var] == 0 and appended after the untouched rows where
    df[minority_var] == 1. The classes come out balanced when n equals
    the number of minority rows.
    """
    # Boolean masks for the two classes; rows with any other label are dropped.
    is_minority = df[minority_var] == 1
    is_majority = df[minority_var] == 0

    # Sampling without replacement: every kept majority row is distinct.
    majority_subset = resample(df[is_majority], replace=False, n_samples=n)

    return pd.concat([df[is_minority], majority_subset])

Split a dataframe into a format suitable for ML models.

In [10]:
def split_data_frame(df, dep_var):
    """
    Input: df:pandas.DataFrame, dep_var:str.
    Output: X:pandas.DataFrame, y:pandas.Series.

    Separates the dependent variable from the rest of the dataframe.
    X holds every column except dep_var, y holds the dep_var column.
    The input dataframe is left unmodified.
    """
    features = df.drop(columns=dep_var)
    target = df[dep_var]
    return features, target

Model testing routine.

In [14]:
def execute_model_flow(model, X, y, test_size=0.25, visualisations=False):
    """
    Input: model:sklearn model, X:pandas.DataFrame, y:pandas.Series, test_size:float, visualisations:Boolean.
    Output: None.

    Splits the data into a training and a test part, fits the model on the
    training part only and prints precision, recall, F1 and ROC AUC measured
    on the held-out test part (each rounded to 3 decimals).
    If visualisations == True, also plots the ROC and precision-recall curves.

    The model has to provide fit, predict and predict_proba. The scores are
    computed with the sklearn defaults, and the second predict_proba column
    is used as the positive-class score.
    """
    name = type(model).__name__

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)

    # Fit on the training split only. Fitting on the full X, y would let the
    # model see the test rows and make every metric below over-optimistic.
    model.fit(X_train, y_train)

    # Full probability matrix is kept for the plots; the second column
    # alone is what roc_auc_score receives.
    prob_y_vis = model.predict_proba(X_test)
    prob_y = [p[1] for p in prob_y_vis]
    y_pred = model.predict(X_test)

    roc_auc = round(roc_auc_score(y_test, prob_y), 3)
    precision = round(precision_score(y_test, y_pred), 3)
    recall = round(recall_score(y_test, y_pred), 3)
    f1 = round(f1_score(y_test, y_pred), 3)

    if visualisations:
        skplt.metrics.plot_roc(y_test, prob_y_vis)
        plt.title('{} ROC Curves'.format(name))
        plt.show()

        skplt.metrics.plot_precision_recall(y_test, prob_y_vis)
        plt.title('{} Precision-Recall Curve'.format(name))
        plt.show()

    print('{} - precision: {}, recall: {}, F1: {}, ROC: {}'.format(name, precision, recall, f1, roc_auc))