In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are reloaded
# before each cell executes, so edits to local code (e.g. the `utils` package
# imported below) take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA

from wittgenstein import RIPPER

from utils.constants import STAGE_DIR
from utils.dataload import load_data
from utils.display import cdisplay
from utils.functions import split_train_test

# Load data

In [3]:
# Complete training feature matrix from the preprocess stage, ordered by index.
X_all = load_data(
    STAGE_DIR / 'preprocess' / 'X_train.parquet', load_func=pd.read_parquet
).sort_index()

# Feature subsets written by the selection stage, each ordered by index.
# The first three come from univariate analysis (mutual information, chi-squared,
# ANOVA); the last one (Relief) comes from multivariate analysis.
X_mi, X_x2, X_anova, X_relief = (
    load_data(
        STAGE_DIR / 'selection' / file_name, load_func=pd.read_parquet
    ).sort_index()
    for file_name in (
        'X_train_mi.parquet',
        'X_train_x2.parquet',
        'X_train_anova.parquet',
        'X_train_relief.parquet',
    )
)

In [4]:
# Target frame from the preprocess stage, ordered by index like the feature sets.
y = load_data(
    STAGE_DIR / 'preprocess' / 'y.parquet', load_func=pd.read_parquet
).sort_index()

In [5]:
# Split every feature set into train/test partitions against the same target `y`.
(X_train_all, X_test_all,
 y_train_all, y_test_all) = split_train_test(X_all, y)

(X_train_mi, X_test_mi,
 y_train_mi, y_test_mi) = split_train_test(X_mi, y)

(X_train_x2, X_test_x2,
 y_train_x2, y_test_x2) = split_train_test(X_x2, y)

(X_train_anova, X_test_anova,
 y_train_anova, y_test_anova) = split_train_test(X_anova, y)

(X_train_relief, X_test_relief,
 y_train_relief, y_test_relief) = split_train_test(X_relief, y)

# Support Vector Machine

## All features

In [6]:
# Linear-kernel SVM scored with cv=5 cross-validation on the full feature set.
svm_all = SVC(kernel='linear')
svm_all_scores = cross_val_score(
    estimator=svm_all,
    X=X_train_all,
    y=y_train_all['y'].values,
    cv=5,
)

# Report the mean and spread of the per-fold scores.
print(
    "%0.2f accuracy with a standard deviation of %0.2f"
    % (svm_all_scores.mean(), svm_all_scores.std())
)

0.78 accuracy with a standard deviation of 0.02


In [15]:
# Train the linear SVM on the whole training partition (all features) and keep
# the value returned by fit() under its own name.
svm_all_fitted = svm_all.fit(X=X_train_all, y=y_train_all['y'].values)

In [24]:
# Project the training features onto their first two principal components so the
# decision regions can be drawn in a plane. PCA is unsupervised and ignores a
# target argument, so only the features are passed.
_X = PCA(n_components=2).fit_transform(X_train_all)

# NOTE: the boundary drawn here belongs to a separate linear SVM fitted on the
# 2-D projection, not to `svm_all`, which is trained on the full feature space.
decision_boundary = DecisionBoundaryDisplay.from_estimator(
    SVC(kernel='linear').fit(_X, y_train_all['y'].values),
    _X,
    response_method='predict',
    cmap=plt.cm.coolwarm,
    alpha=0.8
)

# Overlay the training points on the axes owned by the display. scatter() needs
# the x and y coordinates as separate arguments; passing the whole
# (n_samples, 2) array as a single argument raises a TypeError.
ax = decision_boundary.ax_
ax.scatter(
    _X[:, 0],
    _X[:, 1],
    c=y_train_all['y'].values,
    cmap=plt.cm.coolwarm,
    s=20,
    edgecolors="k"
)
ax.set_xlabel('First principal component')
ax.set_ylabel('Second principal component')
ax.set_title('Decision boundary of a linear SVM (2-D PCA projection of all features)')

plt.show()

## Univariate - Mutual Information

## Univariate - $\chi^2$

## Univariate - ANOVA

## Multivariate - Relief

## Wrapper

# Classification Tree

## All features

## Univariate - Mutual Information

## Univariate - $\chi^2$

## Univariate - ANOVA

## Multivariate - Relief

## Wrapper

# Rule Induction - RIPPER

## All features

## Univariate - Mutual Information

## Univariate - $\chi^2$

## Univariate - ANOVA

## Multivariate - Relief

## Wrapper

# K-Nearest Neighbors

## All features

## Univariate - Mutual Information

## Univariate - $\chi^2$

## Univariate - ANOVA

## Multivariate - Relief

## Wrapper