# Experiment on feature engineering via HPF on quaternion graph signals

In [None]:
# Fallback for when gspx is not installed: put the parent of the current
# working directory at the front of the module search path.
import os
import sys

gdir = os.path.split(os.getcwd())[0]  # parent folder
sys.path[:0] = [gdir]

In [None]:
import numpy as np

from gspx.utils.display import plot_graph
from gspx.datasets import WeatherGraphData, uk_weather
from gspx.signals import QuaternionSignal
from gspx.qgsp import create_quaternion_weights, QGFT, QMatrix

In [None]:
from sklearn.datasets import make_classification

# Synthetic two-class dataset: 16 features in total, of which 6 are
# informative and 10 are redundant.
n_features = 16
X, y = make_classification(
    n_samples=1000,
    n_features=n_features,
    n_informative=6,
    n_redundant=10,
    n_classes=2,
    flip_y=0.5,
    class_sep=1.0,
    shuffle=True,
    random_state=44,
)

In [None]:
import pandas as pd

def ks2(y_true, y_score):
    """Return the two-sample Kolmogorov-Smirnov statistic between classes.

    The scores are split by label (``y_true == 0`` versus ``y_true == 1``)
    and the two groups are compared with ``scipy.stats.ks_2samp``. Only the
    statistic is returned; the p-value is discarded.

    Both arguments must support boolean-mask indexing / element-wise
    comparison (e.g. NumPy arrays of equal length).
    """
    from scipy.stats import ks_2samp

    scores_class0 = y_score[y_true == 0]
    scores_class1 = y_score[y_true == 1]
    return ks_2samp(scores_class0, scores_class1).statistic

def best_features(df, k=4, thres=0.3):
    """Return the feature correlation table with a KS column, best KS first.

    The table is ``df.corr()`` with the identity matrix subtracted, which
    removes the unit self-correlation from the diagonal. A ``'KS'`` column is
    appended holding, for each column of ``df``, the ``ks2`` statistic of that
    column against the module-level labels ``y``.

    Parameters
    ----------
    df : pandas.DataFrame
        One column per feature. Its rows must line up with the module-level
        ``y``, which is used to split each column by class.
    k : int, optional
        Currently unused; kept so existing calls keep working.
    thres : float, optional
        Currently unused; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        The correlation table plus the ``'KS'`` column, with rows sorted by
        ``'KS'`` in descending order.
    """
    corr = df.corr() - np.eye(df.shape[1])

    # Iterating over the transposed value matrix yields one feature column
    # at a time, in the same order as the DataFrame columns.
    corr['KS'] = [ks2(y, column) for column in df.values.T]

    # sort_values is not in-place: its result has to be returned, otherwise
    # the ranking is silently thrown away.
    return corr.sort_values(by='KS', ascending=False)

# Wrap the feature matrix in a DataFrame whose columns are named V0, V1, ...
# and build the correlation / KS table for it.
df = pd.DataFrame(
    data=X,
    columns=[f"V{i}" for i in range(n_features)],
)
cor = best_features(df, k=4)

In [None]:
# Flag each feature (row) that has any absolute entry above 0.8. Comparing the
# DataFrame directly keeps the feature names as the index of the result, so a
# flagged row can be traced back to its feature. Note that the 'KS' column is
# part of `cor` and therefore takes part in the check.
(cor.abs() > 0.8).any(axis=1)

In [None]:
import pandas as pd

# Display the absolute values of the table, emphasising the entries that fall
# between 0.65 and 0.99.
cor.abs().style.highlight_between(
    props='font-weight:bold;color:#e83e8c',
    left=0.65,
    right=0.99,
)

In [None]:
from sklearn.decomposition import PCA

# Reduce X to two whitened principal components.
pca = PCA(
    n_components=2,
    whiten=True,
    svd_solver='full',
    tol=0.0,
    random_state=42,
)
X_pca = pca.fit_transform(X)

In [None]:
import matplotlib.pyplot as plt

# Scatter the two principal components, one colour per class.
for class_id, marker, legend_label in ((0, 'b.', 'Class 0'), (1, 'r.', 'Class 1')):
    members = y == class_id
    plt.plot(X_pca[members, 0], X_pca[members, 1], marker, label=legend_label)
plt.legend(loc="upper right")
plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with every hyper-parameter spelled out explicitly.
# The trailing "[source]" that followed the call was a leftover from copying
# the signature out of the scikit-learn documentation page; it subscripted the
# new estimator with an undefined name and raised NameError.
clf = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features='sqrt',
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    bootstrap=True,
    oob_score=False,
    n_jobs=None,
    random_state=None,
    verbose=0,
    warm_start=False,
    class_weight=None,
    ccp_alpha=0.0,
    max_samples=None,
)