In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# models
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier

from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report


from utils.constants import STAGE_DIR, ANALYSIS_DIR
from utils.dataload import load_data
from utils.functions import split_train_test
from utils.transformers import SequentialFeatureSelectorTransformer

# Configuration

In [21]:
# Toggle for the sequential (wrapper) feature selection further down.
# True  -> refit the selector and overwrite 'selection/X_train_lr.parquet'.
# False -> skip the fit and read that parquet file from a previous run instead.
RUN_WRAPPER_SELECTION = False

# Load data

In [4]:
# Full feature matrix (no feature selection applied)
X_all = load_data(
    STAGE_DIR / 'preprocess' / 'X_train.parquet',
    load_func=pd.read_parquet,
)
X_all = X_all.sort_index()

# Feature subsets produced by univariate selection
# (mutual information, chi-squared and ANOVA, in that order)
X_mi = load_data(
    STAGE_DIR / 'selection' / 'X_train_mi.parquet',
    load_func=pd.read_parquet,
)
X_mi = X_mi.sort_index()

X_x2 = load_data(
    STAGE_DIR / 'selection' / 'X_train_x2.parquet',
    load_func=pd.read_parquet,
)
X_x2 = X_x2.sort_index()

X_anova = load_data(
    STAGE_DIR / 'selection' / 'X_train_anova.parquet',
    load_func=pd.read_parquet,
)
X_anova = X_anova.sort_index()

# Feature subset produced by multivariate selection (Relief)
X_relief = load_data(
    STAGE_DIR / 'selection' / 'X_train_relief.parquet',
    load_func=pd.read_parquet,
)
X_relief = X_relief.sort_index()

In [5]:
# Target frame, ordered by index like the feature matrices
y = load_data(
    STAGE_DIR / 'preprocess' / 'y.parquet',
    load_func=pd.read_parquet,
)
y = y.sort_index()

In [6]:
# Split each feature matrix against the same target frame `y`. Every call is
# unpacked as (X_train, X_test, y_train, y_test), one set of names per feature set.
# NOTE(review): the classification reports further down show a different class
# support for each feature set (577/170, 582/165, 588/159, 598/149, 569/178),
# so these calls do not appear to hold out the same rows. Confirm whether
# split_train_test is seeded; if it is not, the out-of-sample comparison
# between feature sets is not like-for-like.
X_train_all, X_test_all, y_train_all, y_test_all = split_train_test(X_all, y)
X_train_mi, X_test_mi, y_train_mi, y_test_mi = split_train_test(X_mi, y)
X_train_x2, X_test_x2, y_train_x2, y_test_x2 = split_train_test(X_x2, y)
X_train_anova, X_test_anova, y_train_anova, y_test_anova = split_train_test(X_anova, y)
X_train_relief, X_test_relief, y_train_relief, y_test_relief = split_train_test(X_relief, y)

# Logistic Regression

In [9]:
# Keyword arguments shared by every LogisticRegression built in this notebook
lr_parameters = dict(max_iter=1000)

## All features

### Training and score in the sample (81% - 0.01 std)

In [10]:
# 5-fold cross-validation on the training split, all features
lr_all = LogisticRegression(**lr_parameters)
lr_all_scores = cross_val_score(lr_all, X_train_all, y_train_all['y'], cv=5)

# Mean score across folds and its spread
print(
    "%0.2f accuracy with a standard deviation of %0.2f"
    % (np.mean(lr_all_scores), np.std(lr_all_scores))
)

0.81 accuracy with a standard deviation of 0.01


### Out-of-sample score

In [11]:
# Refit on the whole training split, then predict the held-out test split
lr_all = LogisticRegression(**lr_parameters)
lr_all.fit(X_train_all, y_train_all['y'])
y_test_predict_all = lr_all.predict(X_test_all)

print(classification_report(y_true=y_test_all['y'], y_pred=y_test_predict_all))

              precision    recall  f1-score   support

           0       0.84      0.94      0.89       577
           1       0.68      0.39      0.50       170

    accuracy                           0.82       747
   macro avg       0.76      0.67      0.69       747
weighted avg       0.80      0.82      0.80       747



## Univariate - Mutual Information

### Training and score in the sample (81% - 0.01 std)

In [12]:
# 5-fold cross-validation on the training split, mutual-information features
lr_mi = LogisticRegression(**lr_parameters)
lr_mi_scores = cross_val_score(lr_mi, X_train_mi, y_train_mi['y'], cv=5)

# Mean score across folds and its spread
print(
    "%0.2f accuracy with a standard deviation of %0.2f"
    % (np.mean(lr_mi_scores), np.std(lr_mi_scores))
)

0.81 accuracy with a standard deviation of 0.01


### Out-of-sample score

In [13]:
# Refit on the whole training split, then predict the held-out test split
lr_mi = LogisticRegression(**lr_parameters)
lr_mi.fit(X_train_mi, y_train_mi['y'])
y_test_predict_mi = lr_mi.predict(X_test_mi)

print(classification_report(y_true=y_test_mi['y'], y_pred=y_test_predict_mi))

              precision    recall  f1-score   support

           0       0.83      0.92      0.88       582
           1       0.56      0.36      0.44       165

    accuracy                           0.80       747
   macro avg       0.70      0.64      0.66       747
weighted avg       0.77      0.80      0.78       747



## Univariate - $\chi^2$

### Training and score in the sample (79% - 0.01 std)

In [14]:
# 5-fold cross-validation on the training split, chi-squared features
lr_x2 = LogisticRegression(**lr_parameters)
lr_x2_scores = cross_val_score(lr_x2, X_train_x2, y_train_x2['y'], cv=5)

# Mean score across folds and its spread
print(
    "%0.2f accuracy with a standard deviation of %0.2f"
    % (np.mean(lr_x2_scores), np.std(lr_x2_scores))
)

0.79 accuracy with a standard deviation of 0.01


### Out-of-sample score

In [15]:
# Refit on the whole training split, then predict the held-out test split
lr_x2 = LogisticRegression(**lr_parameters)
lr_x2.fit(X_train_x2, y_train_x2['y'])
y_test_predict_x2 = lr_x2.predict(X_test_x2)

print(classification_report(y_true=y_test_x2['y'], y_pred=y_test_predict_x2))

              precision    recall  f1-score   support

           0       0.83      0.93      0.88       588
           1       0.52      0.28      0.37       159

    accuracy                           0.79       747
   macro avg       0.68      0.61      0.62       747
weighted avg       0.76      0.79      0.77       747



## Univariate - ANOVA

### Training and score in the sample (81% - 0.01 std)

In [16]:
# 5-fold cross-validation on the training split, ANOVA features
lr_anova = LogisticRegression(**lr_parameters)
lr_anova_scores = cross_val_score(lr_anova, X_train_anova, y_train_anova['y'], cv=5)

# Mean score across folds and its spread
print(
    "%0.2f accuracy with a standard deviation of %0.2f"
    % (np.mean(lr_anova_scores), np.std(lr_anova_scores))
)

0.81 accuracy with a standard deviation of 0.01


### Out-of-sample score

In [17]:
# Refit on the whole training split, then predict the held-out test split
lr_anova = LogisticRegression(**lr_parameters)
lr_anova.fit(X_train_anova, y_train_anova['y'])
y_test_predict_anova = lr_anova.predict(X_test_anova)

print(classification_report(y_true=y_test_anova['y'], y_pred=y_test_predict_anova))

              precision    recall  f1-score   support

           0       0.85      0.92      0.89       598
           1       0.53      0.37      0.44       149

    accuracy                           0.81       747
   macro avg       0.69      0.64      0.66       747
weighted avg       0.79      0.81      0.80       747



## Multivariate - Relief

### Training and score in the sample (79% - 0.02 std)

In [18]:
# 5-fold cross-validation on the training split, Relief features
lr_relief = LogisticRegression(**lr_parameters)
lr_relief_scores = cross_val_score(lr_relief, X_train_relief, y_train_relief['y'], cv=5)

# Mean score across folds and its spread
print(
    "%0.2f accuracy with a standard deviation of %0.2f"
    % (np.mean(lr_relief_scores), np.std(lr_relief_scores))
)

0.79 accuracy with a standard deviation of 0.02


### Out-of-sample score

In [19]:
# Refit on the whole training split, then predict the held-out test split
lr_relief = LogisticRegression(**lr_parameters)
lr_relief.fit(X_train_relief, y_train_relief['y'])
y_test_predict_relief = lr_relief.predict(X_test_relief)

print(classification_report(y_true=y_test_relief['y'], y_pred=y_test_predict_relief))

              precision    recall  f1-score   support

           0       0.81      0.94      0.87       569
           1       0.62      0.29      0.40       178

    accuracy                           0.79       747
   macro avg       0.71      0.62      0.63       747
weighted avg       0.76      0.79      0.76       747



## Wrapper

### Select features

In [22]:
# Wrapper feature selection around a logistic regression. Runs only when
# RUN_WRAPPER_SELECTION is True; otherwise nothing in this cell is defined.
if RUN_WRAPPER_SELECTION:
    lr_wrapper = LogisticRegression(**lr_parameters)

    # presumably keeps 75 of the input columns — verify against the transformer
    lr_fs_wrapper = SequentialFeatureSelectorTransformer(
        lr_wrapper, n_features_to_select=75)

    print('Fitting model. This may take a while...')
    # NOTE(review): the selector is fitted on X_all / y, i.e. before the
    # train/test split below, so rows that later land in the test split may
    # influence which features are kept — confirm this is intended.
    X_lr_wrapper = lr_fs_wrapper.fit_transform(X_all, y['y'])
    print('We have a winner!')

    X_train_lr_wrapper, X_test_lr_wrapper, y_train_lr_wrapper, y_test_lr_wrapper = split_train_test(X_lr_wrapper, y)

    # Persist the reduced matrix so later runs can skip the fit and read it back
    X_lr_wrapper.to_parquet(STAGE_DIR / 'selection' / 'X_train_lr.parquet')

Fitting model. This may take a while...
We have a winner!


### Training and score in the sample (81% - 0.01 std)

In [None]:
if not RUN_WRAPPER_SELECTION:
    # Selection was skipped: reuse the wrapper-selected features written by a
    # previous run. Sort by index like every other frame loaded in this
    # notebook, so it is ordered the same way as the index-sorted `y` before
    # the two are handed to split_train_test together.
    X_lr_wrapper = pd.read_parquet(
        STAGE_DIR / 'selection' / 'X_train_lr.parquet'
    ).sort_index()
    (X_train_lr_wrapper, X_test_lr_wrapper,
     y_train_lr_wrapper, y_test_lr_wrapper) = split_train_test(X_lr_wrapper, y)

In [24]:
# 5-fold cross-validation on the training split, wrapper-selected features
lr_wrapper = LogisticRegression(**lr_parameters)
lr_wrapper_scores = cross_val_score(
    lr_wrapper, X_train_lr_wrapper, y_train_lr_wrapper['y'], cv=5)

# Mean score across folds and its spread
print(
    "%0.2f accuracy with a standard deviation of %0.2f"
    % (np.mean(lr_wrapper_scores), np.std(lr_wrapper_scores))
)

0.81 accuracy with a standard deviation of 0.01


### Out-of-sample score

In [25]:
# Refit on the whole training split, then predict the held-out test split.
# Build the model from the shared lr_parameters, as the cross-validation cell
# for this feature set and every other section do. A bare LogisticRegression()
# would silently fall back to the default max_iter, so the out-of-sample score
# would describe a differently configured model.
lr_wrapper = LogisticRegression(**lr_parameters)
lr_wrapper.fit(X_train_lr_wrapper, y_train_lr_wrapper['y'])
y_test_predict_lr_wrapper = lr_wrapper.predict(X_test_lr_wrapper)

print(classification_report(y_test_lr_wrapper['y'], y_test_predict_lr_wrapper))

              precision    recall  f1-score   support

           0       0.82      0.95      0.88       581
           1       0.61      0.25      0.36       166

    accuracy                           0.80       747
   macro avg       0.71      0.60      0.62       747
weighted avg       0.77      0.80      0.76       747



## Metaclassifier - Bagging

### Training and score in the sample (81% - 0.01 std)

In [31]:
# 5-fold cross-validation of a bagged logistic regression on all features.
# Bagging resamples the training data at random, so pin random_state;
# without it the score changes from run to run.
lr_bagging = BaggingClassifier(
    LogisticRegression(**lr_parameters),
    random_state=42,
)
lr_bagging_scores = cross_val_score(
    lr_bagging,
    X_train_all,
    y_train_all['y'],
    cv=5
)

# Mean score across folds and its spread
print("%0.2f accuracy with a standard deviation of %0.2f" % (
    lr_bagging_scores.mean(),
    lr_bagging_scores.std())
)

0.81 accuracy with a standard deviation of 0.01


### Out-of-sample score

In [30]:
# Refit the bagged logistic regression on the whole training split, then
# predict the held-out test split. random_state is pinned because bagging
# resamples the training data at random; without it the report below is not
# reproducible across runs.
lr_bagging = BaggingClassifier(
    LogisticRegression(**lr_parameters),
    random_state=42,
)
lr_bagging.fit(X_train_all, y_train_all['y'])
y_test_predict_lr_bagging = lr_bagging.predict(X_test_all)

print(classification_report(y_test_all['y'], y_test_predict_lr_bagging))

              precision    recall  f1-score   support

           0       0.85      0.93      0.89       577
           1       0.65      0.43      0.52       170

    accuracy                           0.82       747
   macro avg       0.75      0.68      0.70       747
weighted avg       0.80      0.82      0.80       747



# Bayes Models

# Discriminant Analysis