In [7]:
import sys
import os

sys.path.append(os.path.abspath('..'))

import yaml
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import VotingClassifier

from src.data.prepare_data import prepare_data
from src.data.utils import resample_data
from src.models.classification import Classification
from src.models.evaluation import Evaluation

In [8]:
# Load the project configuration from the YAML file one directory above the notebook.
with open('../config.yml', 'r') as file:
    # safe_load is the shorthand for yaml.load(..., Loader=yaml.SafeLoader)
    config = yaml.safe_load(file)
# drop the (already closed) handle so it does not linger in the kernel namespace
del file

In [9]:
# Read the CSV named in the config (path is relative to the repository root,
# hence the '../' prefix), then run it through the project's prepare_data step.
data_path = '../' + config['data_loader']['path']
df = prepare_data(df=pd.read_csv(data_path))

# preview the first rows of the prepared frame
display(df.head())

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,target
0,-0.260648,-0.469648,2.496266,-0.083724,0.129681,0.732898,0.519014,-0.130006,0.727159,0.637735,...,-0.110552,0.217606,-0.134794,0.165959,0.12628,-0.434824,-0.08123,-0.151045,17982.1,0
1,0.9851,-0.356045,0.558056,-0.429654,0.27714,0.428605,0.406466,-0.133118,0.347452,0.529808,...,-0.194936,-0.605761,0.079469,-0.577395,0.19009,0.296503,-0.248052,-0.064512,6531.37,0
2,-0.260272,-0.949385,1.728538,-0.457986,0.074062,1.419481,0.743511,-0.095576,-0.261297,0.690708,...,-0.00502,0.702906,0.945045,-1.154666,-0.605564,-0.312895,-0.300258,-0.244718,2513.54,0
3,-0.152152,-0.508959,1.74684,-1.090178,0.249486,1.143312,0.518269,-0.06513,-0.205698,0.575231,...,-0.146927,-0.038212,-0.214048,-1.893131,1.003963,-0.51595,-0.165316,0.048424,5384.44,0
4,-0.20682,-0.16528,1.527053,-0.448293,0.106125,0.530549,0.658849,-0.21266,1.049921,0.968046,...,-0.106984,0.729727,-0.161666,0.312561,-0.414116,1.071126,0.023712,0.419117,14278.97,0


In [None]:
# Split the prepared dataset into stratified train and test sets.
# The label column of the prepared frame is named 'target' (it is the last
# column in the df.head() preview), so it is referenced by that name here.
TARGET_COL = 'target'

X_train, X_test, y_train, y_test = train_test_split(
    # select features by name rather than by position so that a change in
    # column order cannot leak the label into the feature matrix
    df.drop(columns=[TARGET_COL])
    , df[TARGET_COL]
    , test_size=config['train_test_split']['test_size']
    , random_state=123  # fixed seed so the split is reproducible
    , shuffle=True
    , stratify=df[TARGET_COL]  # keep class proportions equal in both splits
    )

# Sanity check: with stratification the class proportions printed for the
# train and test labels should match.
print(
    y_train.value_counts(normalize=True)
    , y_test.value_counts(normalize=True)
    )

target
1    0.5
0    0.5
Name: proportion, dtype: float64 target
1    0.5
0    0.5
Name: proportion, dtype: float64


In [11]:
# Assemble a soft-voting ensemble from a baseline and two project-wrapped models.
# Each Classification wrapper exposes the underlying estimator via `.model`.
log_reg = Classification(algorithm='LogisticRegression').model
shallow_tree = Classification(algorithm='DecisionTreeClassifier', max_depth=5).model

ensemble_members = [
    ('DummyClassifier', DummyClassifier()),
    ('LogisticRegression', log_reg),
    ('DecisionTree', shallow_tree),
]

# voting='soft' makes the ensemble combine the members' predicted class
# probabilities instead of taking a majority vote over predicted labels
clf = VotingClassifier(estimators=ensemble_members, voting='soft')
clf.fit(X_train, y_train)

In [12]:
# Report test-set accuracy for every fitted member of the ensemble ...
for member_name, fitted_member in clf.named_estimators_.items():
    member_accuracy = fitted_member.score(X_test, y_test)
    print(f'{member_name}: {member_accuracy}')

# ... and, as the cell's displayed value, for the voting ensemble as a whole.
clf.score(X_test, y_test)

DummyClassifier: 0.5
LogisticRegression: 0.9586029825548678
DecisionTree: 0.9596581316826112


0.9625070343275183