# Experiment on feature engineering via high-pass filtering (HPF) of a quaternion graph signal

In [None]:
# Put the parent folder of the working directory on sys.path so that a local
# gspx checkout can be imported when the package is not installed.
import os
import sys

gdir = os.path.dirname(os.getcwd())  # parent folder
if gdir not in sys.path:
    # Guard keeps sys.path free of duplicate entries when the cell is re-run.
    sys.path.insert(0, gdir)

In [None]:
import numpy as np

from gspx.utils.display import plot_graph
from gspx.datasets import WeatherGraphData, uk_weather
from gspx.signals import QuaternionSignal
from gspx.qgsp import create_quaternion_weights, QGFT, QMatrix

In [None]:
from sklearn.datasets import make_classification

# Synthetic two-class dataset: 16 features, of which 3 are requested as
# informative and 4 as redundant; flip_y=0.4 asks for heavy label noise.
n_features = 16
X, y = make_classification(
    n_samples=1000,
    n_features=n_features,
    n_informative=3,
    n_redundant=4,
    n_classes=2,
    flip_y=0.4,
    class_sep=1.0,
    shuffle=True,
    random_state=44,  # fixed seed so the generated data is reproducible
)

In [None]:
from sklearn.decomposition import PCA

# Reduce the samples to two whitened principal components; X_pca is used
# further down as 2-D coordinates for the plots.
# NOTE(review): per the scikit-learn docs, tol and random_state only apply to
# the 'arpack'/'randomized' solvers, so they look inert with 'full' — confirm.
pca = PCA(
    n_components=2,
    svd_solver='full',
    tol=0.0,
    random_state=42,
    whiten=True,
)
X_pca = pca.fit_transform(X)

In [None]:
import matplotlib.pyplot as plt

# Scatter the two PCA components, one series per class label.
for cls, fmt, label in ((0, 'b.', 'Class 0'), (1, 'r.', 'Class 1')):
    mask = y == cls
    plt.plot(X_pca[mask, 0], X_pca[mask, 1], fmt, label=label)
plt.legend(loc="upper right")
plt.show()

### Benchmark classification model

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import pandas as pd

from gspx.utils.utils import ks2

# Baseline: logistic regression on the raw features, scored with gspx's ks2
# (presumably a two-sample KS statistic between the classes — confirm).
feature_cols = [f"V{i}" for i in range(n_features)]
df = pd.DataFrame(X, columns=feature_cols)
df['TARGET'] = y

# Seed the split: the dataset itself is seeded, so without this the printed
# scores would still change from one run to the next.
train, test = train_test_split(df, test_size=0.3, random_state=42)

clf = LogisticRegression()
clf.fit(train[feature_cols], y=train['TARGET'])

# NOTE: despite their names, y_train / y_test hold the predicted probability
# of the second class (column 1 of predict_proba), not the true labels.
y_train = clf.predict_proba(train[feature_cols])[:, 1]
y_test = clf.predict_proba(test[feature_cols])[:, 1]

print(f"TRAIN: {ks2(train['TARGET'].to_numpy().ravel(), y_train)}")
print(f"TEST: {ks2(test['TARGET'].to_numpy().ravel(), y_test)}")

In [None]:
# Pair each column name with its fitted coefficient. zip stops at the shorter
# sequence, so any trailing column without a coefficient (e.g. 'TARGET') is
# left out.
dict(zip(df.columns, clf.coef_.ravel()))

### QGFT on the nearest neighbors graph

In [None]:
import pandas as pd
from gspx.utils.utils import best_features

# Rebuild the feature table (without a TARGET column) and keep only the
# columns returned by best_features.
df = pd.DataFrame(
    X,
    columns=[f"V{i}" for i in range(n_features)],
)
best_cols = best_features(df, y, nbest=4, thres=0.4)
df_ = df[best_cols]
df_.head()

In [None]:
from gspx.utils.graph import nearest_neighbors

# Build the neighbor graph over the selected features, densify it, then add
# the transpose so the resulting adjacency matrix is symmetric.
A = nearest_neighbors(
    df_.values,
    n_neighbors=4,
    algorithm='ball_tree',
    mode='distance',
)
A = A.todense()
# NOTE(review): entries present in both A and A.T are summed here, so pairs
# that are mutual neighbors would end up with twice the weight — confirm
# this is intended.
A = A + A.T

In [None]:
import networkx as nx
from gspx.utils.graph import to_networkx

# Convert the adjacency matrix into a networkx graph and report whether it
# forms a single connected component.
G = to_networkx(A)
nx.is_connected(G)

In [None]:
# Number of non-zero entries in the adjacency matrix.
np.count_nonzero(A)

In [None]:
# Build the weights Aq from the adjacency matrix and the selected features:
# the 2nd, 3rd and 4th selected columns are passed as the i, j and k column
# groups (presumably the imaginary quaternion parts — confirm in gspx.qgsp).
cols = list(df_.columns)
Aq = create_quaternion_weights(
    A,
    df_,
    icols=[cols[1]],
    jcols=[cols[2]],
    kcols=[cols[3]],
    gauss_den=1.0,
)

In [None]:
# Draw the graph using Aq.abs() as weights and the 2-D PCA projection as the
# node coordinates.
plot_graph(
    Aq.abs(),
    coords=X_pca,
    figsize=(8, 8),
    colormap='viridis',
    node_size=10,
)

In [None]:
# Create the QGFT object and fit it on the quaternion weight matrix Aq.
# (presumably this computes the graph Fourier basis — confirm in gspx.qgsp)
qgft = QGFT()
qgft.fit(Aq)

### Feature based on high-pass quaternion filtering

In [None]:
# Build a quaternion signal from the values of the selected feature columns.
# (assumes best_cols holds four columns, one per quaternion component — TODO confirm)
s = QuaternionSignal.from_rectangular(df[best_cols].values)

In [None]:
# Apply the fitted QGFT to the signal s.
ss = qgft.transform(s)

In [None]:
# Display the transformed signal, ordered by qgft.idx_freq.
QuaternionSignal.show(ss, ordering=qgft.idx_freq)