In [None]:
import sys
import os

# Append the absolute path of the parent directory to the import search path.
# This has to run before the `src.*` imports further down — presumably `src`
# lives in that parent directory (TODO confirm the repository layout).
sys.path.append(os.path.abspath('..'))

import yaml
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import VotingClassifier

# Project-local helpers.
from src.data.prepare_data import prepare_data
from src.data.utils import resample_data
from src.models.classification import Classification
from src.models.evaluation import Evaluation

import warnings

# Install a blanket "ignore" filter: every warning category is silenced from
# here on. NOTE(review): this also hides deprecation and model-fitting
# warnings; consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [None]:
# Parse the YAML configuration that sits one directory above the notebook.
# `safe_load` uses the same SafeLoader as before, so only plain YAML tags are
# constructed.
with open('../' + 'config.yml', 'r') as config_file:
    config = yaml.safe_load(config_file)

# The handle is closed by the `with` block; remove the name so it does not
# linger in the notebook namespace.
del config_file

In [None]:
# Build the CSV location from the config entry (taken relative to the parent
# directory), read it, and hand the raw frame straight to the project's
# preparation step.
csv_path = '../' + config['data_loader']['path']
df = prepare_data(df=pd.read_csv(csv_path))

# Preview the first rows of the prepared frame.
display(df.head())

In [None]:
# split dataset into train and test sets
# The feature matrix is selected by name (every column except 'target')
# instead of by position. The label was already selected by name, so a
# positional slice would leak 'target' into X — and silently drop a real
# feature — whenever 'target' is not the last column of the prepared frame.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='target'),
    df['target'],
    test_size=config['train_test_split']['test_size'],
    random_state=123,        # fixed seed so the split is reproducible
    shuffle=True,
    stratify=df['target'],   # keep the class proportions equal in both splits
)

# check class distributions
# Relative class frequencies of the train and test labels; with a stratified
# split the two should be close to each other.
print(
    y_train.value_counts(normalize=True),
    y_test.value_counts(normalize=True),
)

In [None]:
# Collect the ensemble members: a DummyClassifier first, followed by the
# `.model` attribute of two project `Classification` wrappers, in the same
# order as they are listed here.
base_learners = [('DummyClassifier', DummyClassifier())]
base_learners.extend(
    (label, Classification(algorithm=algorithm).model)
    for label, algorithm in (
        ('LogisticRegression', 'LogisticRegression'),
        ('DecisionTree', 'DecisionTreeClassifier'),
    )
)

# Soft voting: the ensemble prediction is derived from the members'
# predicted class probabilities rather than from their hard labels.
clf = VotingClassifier(estimators=base_learners, voting='soft')

# Fit on the training split; left as the last expression so the notebook
# renders the fitted estimator.
clf.fit(X_train, y_train)

In [None]:
# Print the test-set score of every fitted member of the ensemble, one line
# per estimator, labelled with the name it was registered under.
fitted_members = clf.named_estimators_
for name in fitted_members:
    model = fitted_members[name]
    print(f'{name}: {model.score(X_test, y_test)}')

# Score of the combined voting classifier on the same test split; kept as the
# final expression so it is shown as the cell's output.
clf.score(X_test, y_test)