In [60]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, precision_score, recall_score

In [9]:
df = pd.read_csv('data/naive-db.csv')

In [10]:
df.head()

Unnamed: 0,label,Amplitude_bin,Amplitude,Std_bin,Std,PeriodLS_bin,PeriodLS,Mean_bin,Mean,MaxSlope_bin,MaxSlope,Meanvariance_bin,Meanvariance,LinearTrend,LinearTrend_bin
0,0,7,0.533,6,0.283701,5,1.089524,17,19.338182,15,7.039598,3,0.014671,3.6e-05,11
1,0,6,0.2715,3,0.127947,7,3.209689,18,19.586202,15,10.640067,2,0.006533,-1.5e-05,10
2,0,6,0.3015,3,0.138996,6,2.268446,18,19.913902,15,21.778886,2,0.00698,-7e-06,11
3,0,5,0.164,2,0.095606,5,1.561432,9,15.151209,15,0.353493,2,0.00631,1e-06,10
4,0,5,0.11125,2,0.056008,5,1.166648,15,18.361809,15,17.002882,1,0.00305,2e-06,11


In [73]:
def hold_out(data, clf, target='label'):
    X = data
    y = data[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)
    
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='micro')
    recall = recall_score(y_test, y_pred, average='micro')
    prec = precision_score(y_test, y_pred, average='micro')
    accuracy = accuracy_score(y_test, y_pred)
    matriz = confusion_matrix(y_test, y_pred)
    return recall, prec, f1, accuracy, matriz

In [90]:
def random_subsampling(data, clf, B, target='label'):
    recall_sum = 0
    prec_sum = 0
    f1_sum = 0
    accuracy_sum = 0
    for i in range(B):
        recall, prec, f1, accuracy, matriz = hold_out(data, clf)
        recall_sum += recall / B
        prec_sum += prec / B
        f1_sum += f1 / B
        accuracy_sum += accuracy / B
    return recall_sum, prec_sum, f1_sum, accuracy_sum    

In [91]:
clf = RandomForestClassifier(n_estimators=100, max_depth=5)

In [92]:
hold_out(df, clf)

(0.9705947625388371,
 0.9705947625388371,
 0.9705947625388371,
 0.9705947625388371,
 array([[2998,    2,    0,    0,    0,    0,    0],
        [   0, 2490,    0,    1,    0,   11,    0],
        [   0,    1, 2900,    0,    0,   99,    0],
        [   0,    0,    4, 2992,    3,    1,    0],
        [   1,    0,    2,   22, 2965,   10,    0],
        [   1,    8,   15,    0,    1, 2975,    0],
        [   0,    0,   50,  133,   86,   79,  174]]))

In [93]:
random_subsampling(df, clf, 2)

(0.9694296493564136,
 0.9694296493564136,
 0.9694296493564136,
 0.9694296493564136)

In [85]:
recall, prec, f1, accuracy, matriz = hold_out(data, clf)