In [1]:
import random
from collections import Counter
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import (
    classification_report,
    precision_score,
    recall_score,
    f1_score,
    accuracy_score
)
from sklearn.pipeline import Pipeline
from tqdm import tqdm
from classifier_trainer import ClassifierTrainer

In [2]:
# The three feature files are parsed as whitespace-separated tables with no
# header row. `sep=r"\s+"` is the documented replacement for the
# `delim_whitespace=True` flag, which is deprecated since pandas 2.2.
df_fac = pd.read_csv('./dataset/mfeat-fac.csv', header=None, sep=r'\s+')
df_fou = pd.read_csv('./dataset/mfeat-fou.csv', header=None, sep=r'\s+')
df_zer = pd.read_csv('./dataset/mfeat-zer.csv', header=None, sep=r'\s+')

In [3]:
# Report (rows, columns) of the `fac` table, then preview its first five rows.
print((len(df_fac.index), len(df_fac.columns)))
df_fac.head(n=5)

(2000, 216)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,206,207,208,209,210,211,212,213,214,215
0,98,236,531,673,607,647,2,9,3,6,...,474,536,628,632,18,36,8,15,12,13
1,121,193,607,611,585,665,7,9,2,4,...,520,458,570,634,15,32,11,13,15,11
2,115,141,590,605,557,627,12,6,3,3,...,535,498,572,656,20,35,16,14,13,6
3,90,122,627,692,607,642,0,6,4,5,...,576,549,628,621,16,35,7,12,15,9
4,157,167,681,666,587,666,8,6,1,4,...,594,525,568,653,16,35,10,15,13,13


In [4]:
# Report (rows, columns) of the `fou` table, then preview its first five rows.
print((len(df_fou.index), len(df_fou.columns)))
df_fou.head(n=5)

(2000, 76)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,66,67,68,69,70,71,72,73,74,75
0,0.065882,0.197312,0.103826,0.270362,0.616078,0.035856,0.424572,0.089701,0.367773,0.037065,...,0.123448,0.113374,0.074343,0.281666,0.067795,0.344182,0.038963,0.394366,0.049971,0.344871
1,0.049142,0.175971,0.105515,0.227095,0.59928,0.041217,0.431078,0.096801,0.326739,0.059661,...,0.04531,0.069337,0.045386,0.254264,0.045447,0.335659,0.021719,0.445277,0.083978,0.354092
2,0.034172,0.227649,0.108766,0.127697,0.612494,0.056554,0.470639,0.041903,0.324267,0.044569,...,0.019858,0.218842,0.041087,0.360464,0.047154,0.377408,0.052099,0.445029,0.071234,0.261465
3,0.062336,0.217979,0.080243,0.289592,0.546316,0.045779,0.425545,0.022841,0.331454,0.119052,...,0.083995,0.085479,0.087658,0.15188,0.07595,0.293462,0.022675,0.408291,0.06301,0.401376
4,0.06197,0.198358,0.111239,0.25346,0.608455,0.023631,0.415246,0.091866,0.30931,0.049142,...,0.021004,0.10696,0.032283,0.248565,0.015674,0.386276,0.039481,0.434701,0.069218,0.405403


In [5]:
# Report (rows, columns) of the `zer` table, then preview its first five rows.
print((len(df_zer.index), len(df_zer.columns)))
df_zer.head(n=5)

(2000, 47)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,37,38,39,40,41,42,43,44,45,46
0,0.011033,0.831466,15.351804,75.806559,171.554214,490.156556,206.416027,0.122135,2.601646,11.472709,...,1.713529,33.81034,9.858915,1.399891,148.138058,326.239452,9.71107,20.007248,47.032578,539.208457
1,0.038271,1.166746,10.526913,42.369276,85.187116,420.360566,253.569574,0.033657,0.390566,11.70083,...,2.590208,35.400531,70.681899,6.674412,155.135985,377.832675,8.140633,44.536711,46.338954,518.496567
2,0.042698,1.225007,8.273804,31.744786,54.448177,404.103204,389.980746,0.041733,0.937399,11.629045,...,1.476285,19.47723,30.09359,7.858211,150.126419,419.565747,4.530921,26.29217,44.574822,549.912691
3,0.032418,1.638247,19.205283,51.196682,57.18176,429.052011,256.174645,0.073624,1.973268,13.057108,...,1.349613,14.179518,30.564085,7.097728,173.840759,441.350376,3.706023,13.432311,51.73993,574.887814
4,0.015866,0.611561,8.627839,37.325052,48.509025,459.909634,238.572767,0.046477,1.117292,10.012169,...,0.667971,8.705403,30.242473,9.015714,167.021185,332.479997,1.806273,23.6893,50.40777,492.227513


In [6]:
def create_labels(n_classes=10, samples_per_class=200):
    """Build a class-label list for data stored in contiguous per-class blocks.

    The result is ``[0, 0, ..., 1, 1, ..., n_classes - 1, ...]`` where each
    label is repeated ``samples_per_class`` times in a row.

    Args:
        n_classes: Number of distinct labels; labels run from 0 to
            ``n_classes - 1``. Defaults to 10.
        samples_per_class: Number of consecutive rows per label.
            Defaults to 200.

    Returns:
        list[int]: A list of length ``n_classes * samples_per_class``.
    """
    return [
        class_id
        for class_id in range(n_classes)
        for _ in range(samples_per_class)
    ]

In [7]:
# Add the same "label" column to each of the three feature tables.
# NOTE(review): this assumes all three files list their rows in the same
# class-block order that create_labels() produces — confirm against the data.
labels = create_labels()
for feature_table in (df_fac, df_fou, df_zer):
    feature_table["label"] = labels

In [8]:
# Collect the three labelled feature tables into one list for the trainer.
datasets = [df_fac, df_fou, df_zer]

# ClassifierTrainer is a project-local class (imported from classifier_trainer);
# its behaviour is not visible in this notebook.
trainer = ClassifierTrainer(datasets)
# NOTE(review): presumably trains once per random state and reports confidence
# intervals for precision/recall/F1/accuracy — confirm in classifier_trainer.
# With n_iterations=3 any interval is based on only three runs.
trainer.train_classifiers_with_random_states(n_iterations=3)

100%|█████████████████████████████████████████████| 3/3 [00:31<00:00, 10.62s/it]

Dataset - Precision Confidence Interval: [0.92340374 0.93747121]
Dataset - Recall Confidence Interval: [0.923125 0.937375]
Dataset - F1 Confidence Interval: [0.92260038 0.93664581]
Dataset - Accuracy Confidence Interval: [0.923125 0.937375]



