In [2]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from ripser import ripser
import random

# Pipeline: load the labelled table, project it with PCA, compute persistence
# diagrams on a small sample, rescale the leading components, then train and
# evaluate a random forest on a held-out split.

# --- Configuration: everything a reader might want to tune -------------------
SEED = 42                              # shared by every stochastic step below
N_PCA_COMPONENTS = 10                  # dimensionality used for TDA and the classifier
TDA_SAMPLE_SIZE = 500                  # upper bound on points passed to ripser
TEST_SIZE = 0.2                        # fraction of rows held out for evaluation
TOP_COMPONENT_WEIGHTS = (2, 1.5, 1.2)  # placeholder weights for the leading components

# Load data
df = pd.read_csv("dataset.csv")
X = df.drop(columns=["Unnamed: 0", "label"])
y = df["label"]

# Decide which rows are train/test BEFORE fitting any transformer, so that the
# held-out rows cannot influence the PCA projection (avoids data leakage).
# Same seed and test fraction as the split that was previously done at the end.
# NOTE(review): the classes look heavily imbalanced in the recorded report;
# consider stratify=y here once that is confirmed to be desired.
train_idx, test_idx = train_test_split(
    np.arange(len(X)), test_size=TEST_SIZE, random_state=SEED
)

# Reduce dimensionality before TDA: fit on training rows only, then project
# every row so X_reduced still covers the whole dataset.
pca = PCA(n_components=N_PCA_COMPONENTS, random_state=SEED)
pca.fit(X.iloc[train_idx])
X_reduced = pca.transform(X)

# Sample subset to avoid memory error. A dedicated seeded generator keeps the
# sample reproducible without touching the global `random` state, and the
# min() guard prevents a ValueError on datasets with fewer rows than requested.
sampler = random.Random(SEED)
sample_indices = sampler.sample(
    range(len(X_reduced)), min(TDA_SAMPLE_SIZE, len(X_reduced))
)
X_subset = X_reduced[sample_indices]

# Apply TDA (persistent homology)
diagrams = ripser(X_subset)['dgms']

# You would normally analyze persistence diagrams here to reweight features.
# Placeholder: give more weight to the leading PCA components. The weight
# vector is built to match the actual number of components, so it stays valid
# even if N_PCA_COMPONENTS is set below len(TOP_COMPONENT_WEIGHTS).
# NOTE(review): `diagrams` is not used yet, and tree-based models split on
# per-feature thresholds, so a positive per-column rescaling is unlikely to
# change the random forest's predictions -- confirm this step is still wanted.
component_weights = np.ones(X_reduced.shape[1])
n_weighted = min(len(TOP_COMPONENT_WEIGHTS), component_weights.size)
component_weights[:n_weighted] = TOP_COMPONENT_WEIGHTS[:n_weighted]
X_reweighted = X_reduced * component_weights

# Train-test split (materialise the partition chosen above)
X_train, X_test = X_reweighted[train_idx], X_reweighted[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

# Train classifier (seeded so the reported metrics are reproducible)
rf = RandomForestClassifier(random_state=SEED)
rf.fit(X_train, y_train)

# Predictions
y_pred = rf.predict(X_test)

# Evaluation
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.99      0.99      0.99     67005
           1       0.95      0.64      0.76       461
           2       1.00      1.00      1.00     11189
           3       0.94      0.88      0.91      6870

    accuracy                           0.98     85525
   macro avg       0.97      0.88      0.91     85525
weighted avg       0.98      0.98      0.98     85525

