In [1]:
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.utils import resample

In [10]:
# Seed the global NumPy RNG so every np.random call made later in the notebook is repeatable.
np.random.seed(2)

# Registry of supported datasets, keyed by a short label. Each entry records the
# CSV base name, the name of the label column, and the columns to drop after loading.
dataset_info = {
    label: {"dataset_name": csv_name, "class_name": target, "drop_fields": dropped}
    for label, csv_name, target, dropped in [
        ("small", "wine", "Class", []),
        ("medium", "breast-cancer-wisconsin", "Class", ["Sample code number"]),
        ("large", "seismic-bumps", "class", []),
        ("beta", "Acoustic_Extinguisher_Fire_Dataset", "Class", []),
        ("alpha", "ThoracicSurgery", "Risk1Yr", []),
        ("gamma", "agaricus-lepiota", "Class", []),
    ]
}

# The active dataset is picked by position among the registry keys (insertion order);
# change the index to switch datasets. -1 selects the last entry, "gamma".
dataset_size = list(dataset_info)[-1]

# Unpack the chosen entry into the three names used by the rest of the notebook.
selected = dataset_info[dataset_size]
dataset_name, class_name, drop_fields = (
    selected[field] for field in ("dataset_name", "class_name", "drop_fields")
)

# Read the selected dataset's CSV and discard the columns listed in drop_fields.
csv_path = "".join(['../data/', dataset_name, ".csv"])
df = pd.read_csv(csv_path).drop(drop_fields, axis=1)

# Shuffle the rows using the global NumPy RNG; each row keeps its original index label.
row_order = np.random.permutation(len(df))
df = df.iloc[row_order]

# Per-dataset mapping from the raw values of the class column to 0/1 labels.
# Datasets that are not listed here keep their class column unchanged.
label_encodings = {
    "breast-cancer-wisconsin": {2: 0, 4: 1},
    "ThoracicSurgery": {'T': 1, 'F': 0},
    "agaricus-lepiota": {'p': 1, 'e': 0},
}

# Assign the result back instead of calling `.replace(..., inplace=True)` on
# `df[class_name]`: that chained in-place form mutates a temporary Series and is
# not guaranteed to reach `df` (it leaves `df` untouched under pandas Copy-on-Write).
if dataset_name in label_encodings:
    encoded = df[class_name].replace(label_encodings[dataset_name])
    # After replacing strings with ints the column may still have a generic object
    # dtype; infer_objects() converts it to a proper integer dtype when possible.
    df[class_name] = encoded.infer_objects()

# Positional train/test split of the already-shuffled frame: the first 80% of the
# rows (rounded down) become the training set, the remaining rows the test set.
train_fraction = 0.8
n_cut = int(train_fraction * len(df))
df_trn, df_tst = df.iloc[:n_cut], df.iloc[n_cut:]

def _concat_and_shuffle(parts):
    """Stack `parts` under a fresh 0..n-1 index, then reorder the rows with the global NumPy RNG."""
    stacked = pd.concat(parts, ignore_index=True)
    return stacked.iloc[np.random.permutation(stacked.index)]


# Class rebalancing is applied to the ThoracicSurgery training split only;
# the test split is left as it is.
if dataset_name == "ThoracicSurgery":
    is_class_1 = df_trn[class_name] == 1
    df_1 = df_trn[is_class_1]
    df_0 = df_trn[~is_class_1]

    # Variant A: draw class-1 rows with replacement until they match the class-0 count.
    df_1_upsampled = resample(df_1, replace=True, n_samples=len(df_0), random_state=2)
    df_upsampled = _concat_and_shuffle([df_1_upsampled, df_0])

    # Variant B: draw class-0 rows without replacement down to the class-1 count.
    df_0_downsampled = resample(df_0, replace=False, n_samples=len(df_1), random_state=20)
    df_downsampled = _concat_and_shuffle([df_0_downsampled, df_1])

    # Both variants are built; the down-sampled one is used for training.
    # To train on the up-sampled variant instead, use: df_trn = df_upsampled
    df_trn = df_downsampled

# Separate features (every column except the label) from the label column,
# for the training split and then for the test split.
X_trn, y_trn = df_trn.drop(columns=class_name), df_trn[class_name]
X_tst, y_tst = df_tst.drop(columns=class_name), df_tst[class_name]


In [13]:
# Preview the first five rows of `df` (the full frame, before the train/test split).
df.head()

Unnamed: 0,Class,cap-shape,cap-surface,cap-color,bruises?,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
606,0,x,s,g,f,n,f,w,b,n,...,f,w,w,p,w,o,e,n,s,g
3783,1,f,f,g,f,f,f,c,b,g,...,k,p,b,p,w,o,l,h,v,d
7701,0,f,s,n,f,n,a,c,b,y,...,s,o,o,p,o,o,p,b,v,l
4202,1,x,f,y,f,f,f,c,b,g,...,k,b,b,p,w,o,l,h,v,g
6956,1,k,s,e,f,s,f,c,n,b,...,k,w,w,p,w,o,e,w,v,l
