# Random Subsampling & Hold Out

In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, precision_score, recall_score

In [2]:
df = pd.read_csv('data/naive-db.csv')

In [3]:
df.head()

Unnamed: 0,label,Amplitude_bin,Amplitude,Std_bin,Std,PeriodLS_bin,PeriodLS,Mean_bin,Mean,MaxSlope_bin,MaxSlope,Meanvariance_bin,Meanvariance,LinearTrend,LinearTrend_bin
0,0,7,0.533,6,0.283701,5,1.089524,17,19.338182,15,7.039598,3,0.014671,3.6e-05,11
1,0,6,0.2715,3,0.127947,7,3.209689,18,19.586202,15,10.640067,2,0.006533,-1.5e-05,10
2,0,6,0.3015,3,0.138996,6,2.268446,18,19.913902,15,21.778886,2,0.00698,-7e-06,11
3,0,5,0.164,2,0.095606,5,1.561432,9,15.151209,15,0.353493,2,0.00631,1e-06,10
4,0,5,0.11125,2,0.056008,5,1.166648,15,18.361809,15,17.002882,1,0.00305,2e-06,11


In [7]:
def hold_out(data, clf, target='label'):
    
    X = data
    y = data[target]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)
    
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    f1 = f1_score(y_test, y_pred, average='micro')
    recall = recall_score(y_test, y_pred, average='micro')
    prec = precision_score(y_test, y_pred, average='micro')
    accuracy = accuracy_score(y_test, y_pred)
    matrix = confusion_matrix(y_test, y_pred)
    
    return recall, prec, f1, accuracy, matrix

In [36]:
def random_subsampling(data, clf, B, target='label'):
    
    recall_sum = 0
    prec_sum = 0
    f1_sum = 0
    accuracy_sum = 0
    matrix_sum = None
    
    for i in range(B):
        recall, prec, f1, accuracy, matrix = hold_out(data, clf)
        
        recall_sum += recall
        prec_sum += prec
        f1_sum += f1
        accuracy_sum += accuracy

        if matrix_sum is None:
            matrix_sum = matrix
        else:
            matrix_sum += matrix

    return recall_sum / B, prec_sum / B, f1_sum / B, accuracy_sum / B, matrix_sum

In [32]:
clf = RandomForestClassifier(n_estimators=100, max_depth=5)

In [22]:
recall, precision, f1, acc, confmat = hold_out(df, clf)
print("recall: {:.3f}\nprecision: {:.3f}\nf1: {:.3f}\nAccuracy: {:.3f}\nConf. Matrix:\n {}".format(recall, precision, f1, acc, confmat))

recall: 0.979
precision: 0.979
f1: 0.979
Accuracy: 0.979
Conf. Matrix:
 [[2999    1    0    0    0    0    0]
 [   1 2491    0    0    0   10    0]
 [   1    1 2997    0    0    1    0]
 [   0    0    2 2995    0    3    0]
 [   0    0    0   34 2948   17    1]
 [   0    0    6    2    1 2989    2]
 [   0    0   37  111   73   80  221]]


In [37]:
recall, precision, f1, acc, confmat = random_subsampling(df, clf, 2)
print("recall: {:.3f}\nprecision: {:.3f}\nf1: {:.3f}\nAccuracy: {:.3f}\nConf. Matrix:\n {}".format(recall, precision, f1, acc, confmat))

recall: 0.973
precision: 0.973
f1: 0.973
Accuracy: 0.973
Conf. Matrix:
 [[5972    4   24    0    0    0    0]
 [   3 4965    0    2    0   34    0]
 [   1    1 5913    0    0   85    0]
 [   0    0    8 5977   12    3    0]
 [   5    0    1   79 5895   17    3]
 [   1    0   21    5    2 5970    1]
 [  13    0  107  271  135  147  371]]
