In [12]:
# for data manipulation
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
import gensim.downloader as api
from transformers import BertTokenizer, BertModel
import torch
import warnings
import scipy.sparse

# for data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# DL Libraries
import tensorflow as tf
from tensorflow import keras
from keras.layers import Conv2D , MaxPooling2D ,Dropout , Flatten , Dense ,BatchNormalization ,Concatenate ,Input 
from keras.models import Sequential ,Model
import json

# other libraries
import cv2
import os
import glob
from PIL import Image
import string
import re
from diptest import diptest 
from nltk.corpus import stopwords
from collections import Counter
from nltk.tokenize import WordPunctTokenizer
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.image import img_to_array, load_img
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.decomposition import PCA, TruncatedSVD  
from sklearn.naive_bayes import MultinomialNB
from sklearn.cluster import KMeans, AgglomerativeClustering, SpectralClustering
from sklearn.ensemble import RandomForestClassifier # Added RandomForestClassifier
from sklearn.metrics.cluster import adjusted_mutual_info_score, adjusted_rand_score # Changed metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score # Import classification metrics

# Model
import tensorflow as tf
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import layers, models, applications
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from tensorflow.keras.applications import MobileNetV2, ResNet50
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, LabelEncoder
from tensorflow.keras.applications.mobilenet import MobileNet, preprocess_input
from sklearn.metrics import adjusted_mutual_info_score, adjusted_rand_score
from sklearn.svm import SVC
from scipy.stats import iqr # For Silverman's rule
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MultiLabelBinarizer

# RLAC MODEL
from sklearn.random_projection import GaussianRandomProjection
from scipy.stats import gaussian_kde
from scipy.signal import find_peaks  
from sklearn.neighbors import KernelDensity  
from sklearn.model_selection import GridSearchCV, LeaveOneOut
from scipy import stats
from scipy.special import eval_hermitenorm  # For normalized Hermite polynomials H_n(x)
from scipy.stats import skew
from scipy.stats import norm
from sklearn.metrics import (adjusted_mutual_info_score, adjusted_rand_score, 
                             homogeneity_score, completeness_score, v_measure_score,
                             fowlkes_mallows_score, silhouette_score, calinski_harabasz_score,
                             davies_bouldin_score)
import warnings
warnings.simplefilter(action='ignore')

In [2]:
import arff

# Load the ARFF file. The context manager closes the handle as soon as
# parsing finishes (or raises) instead of leaving it open for the kernel's
# lifetime.
with open(r'aggregation.arff') as arff_file:
    dataset = arff.load(arff_file)

# The parsed result is indexed by key: 'data' holds the rows and
# 'attributes' holds the (name, type) description of each column.
data = np.array(dataset['data'])
attributes = dataset['attributes']

# Print some information (optional)
print("Attributes:", attributes)
print("Data shape:", data.shape)

Attributes: [('x', 'REAL'), ('y', 'REAL'), ('class', ['1', '2', '3', '4', '5', '6', '7'])]
Data shape: (788, 3)


In [3]:
# Each attribute entry starts with the column name; use those as the headers.
column_names = [spec[0] for spec in attributes]
df = pd.DataFrame(data, columns=column_names)

# Preview the first five rows.
print(df.head(5))

       x      y class
0  15.55  28.65     2
1   14.9  27.55     2
2  14.45  28.35     2
3  14.15   28.8     2
4  13.75  28.05     2


In [4]:
# Prepare features (X) and labels (y)
# NOTE(review): `data` is built from rows that mix coordinates with class
# strings, so the 'x'/'y' columns may not be stored as floats. Cast explicitly
# rather than relying on downstream estimators to coerce them.
X = df[['x', 'y']].astype(float).values  # Features (x and y coordinates)
y_train = df['class'].values             # Ground-truth labels (used only for scoring)


In [5]:
from sklearn.preprocessing import StandardScaler

# Scale the features: learn the per-column statistics on X, then apply them
# to the same matrix to obtain the clustering input.
scaler = StandardScaler()
scaler.fit(X)
X_train = scaler.transform(X)

In [6]:
# Accumulator for the benchmark rows appended by the clustering cells.
results = list()

In [7]:
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering  
from sklearn.cluster import SpectralClustering 

In [13]:
# 4. Apply clustering algorithms
warnings.simplefilter(action='ignore', category=FutureWarning)

# K-Means baseline.
# Alternative configuration kept for reference: KMeans(n_clusters=7, n_init= 10)
kmeans_settings = dict(
    n_clusters=7,
    n_init=10,
    init='random',
    algorithm='lloyd',
    max_iter=10,
    random_state=42,
)
kmeans = KMeans(**kmeans_settings)
kmeans_labels = kmeans.fit(X_train).labels_

# Agreement between the predicted partition and the ground-truth classes.
ami_km = adjusted_mutual_info_score(y_train, kmeans_labels)
ari_km = adjusted_rand_score(y_train, kmeans_labels)

kmeans_row = ["KMeans", ami_km, ari_km]
results.append(kmeans_row)


In [14]:
# Spectral clustering on a nearest-neighbours affinity graph.
# random_state is pinned because the k-means label-assignment step is randomly
# initialised; without a seed the AMI/ARI below change from run to run.
ncut = SpectralClustering(
    n_clusters=7,
    affinity='nearest_neighbors',
    assign_labels='kmeans',
    random_state=42,
    n_jobs=-1,
)
ncut_labels = ncut.fit_predict(X_train)

# Agreement between the predicted partition and the ground-truth classes.
ami_nc = adjusted_mutual_info_score(y_train, ncut_labels)
ari_nc = adjusted_rand_score(y_train, ncut_labels)

results.append(["NClust", ami_nc, ari_nc])


In [10]:
# Single-linkage agglomerative clustering with Euclidean distances.
# Alternative configuration kept for reference:
#   AgglomerativeClustering(n_clusters=7, linkage='ward')
hclust = AgglomerativeClustering(
    n_clusters=7,
    linkage='single',
    metric='euclidean',
)
hclust.fit(X_train)
hclust_labels = hclust.labels_

# Agreement between the predicted partition and the ground-truth classes.
ami_hc = adjusted_mutual_info_score(y_train, hclust_labels)
ari_hc = adjusted_rand_score(y_train, hclust_labels)

hclust_row = ["HClust(single)", ami_hc, ari_hc]
results.append(hclust_row)


In [11]:
# Tabulate the rows collected in `results`, one line per model.
result_columns = ["Model", "AMI", "ARI"]
results_df = pd.DataFrame(data=results, columns=result_columns)

print(results_df)

            Model       AMI       ARI
0          KMeans  0.837045  0.737531
1          NClust  0.781503  0.543884
2  HClust(single)  0.883347  0.805773


In [12]:
import sys
import os

# Resolve the directory one level above the current working directory.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Register it on the module search path once, so re-running this cell does
# not add duplicate entries.
if sys.path.count(parent_dir) == 0:
    sys.path.append(parent_dir)

# With the parent directory on sys.path, the custom models can be imported.
from rlac import RLAC
from mdh import MDH

#print("Successfully imported models from:", parent_dir)

In [15]:
import pandas as pd
import warnings
from itertools import product
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score

# Import your custom models
from rlac import RLAC
from mdh import MDH

# --- CONFIGURATION ---
# Target clusters for Aggregation dataset (should be 7)
n_clusters = len(set(y_train))

# RLAC Parameters: every method is run once per combination of the value
# lists in `rlac_params`.
rlac_methods = [
    'depth_ratio', 'dip', 'holes', 'min_kurt', 'max_kurt',
    'negentropy', 'skewness', 'fisher', 'hermite', 'friedman_tukey'
]
rlac_params = {
    'random_state': [44],
    'bw_adjust': [0.3],
    'r': [50]
}

# MDH Parameters (Same as Anuran configuration)
mdh_config = {
    "h_multiplier": 1.0,
    "alphamax_val": 0.9,
    "alpha_steps": 5,
    "random_state": 42
}

# Score stored for a run that raised, so failed runs still show up in the
# final table instead of silently disappearing.
FAILED_SCORE = -1

results = []

print(f"\nStarting Benchmark on Aggregation Dataset (n={len(X_train)}, k={n_clusters})...")
print("-" * 80)

# ==========================================
# 1. RLAC LOOP
# ==========================================
# product() walks the grid in the order method -> r -> bw_adjust -> random_state.
rlac_grid = product(
    rlac_methods,
    rlac_params['r'],
    rlac_params['bw_adjust'],
    rlac_params['random_state'],
)
for method, r_val, bw, seed in rlac_grid:

    # 1. Print parameters BEFORE running
    param_str = f"r={r_val}, bw={bw}, s={seed}"
    print(f"\nRunning RLAC {method:<15} | {param_str} ... ", end="")

    try:
        # 2. Instantiate & Fit
        model = RLAC(
            n_clusters=n_clusters,
            method=method,
            r=r_val,
            bw_adjust=bw,
            random_state=seed,
            plot=False
        )

        # Silence UserWarnings for the duration of the fit only.
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=UserWarning)
            model.fit(X_train)

        # 3. Evaluate against the ground-truth classes
        ami = adjusted_mutual_info_score(y_train, model.labels_)
        ari = adjusted_rand_score(y_train, model.labels_)

    except Exception as e:
        # Best effort: one failing configuration must not abort the benchmark.
        print(f"FAILED. Error: {e}")
        ami = ari = FAILED_SCORE
    else:
        # 4. Log Success
        print(f"Done (AMI: {ami:.4f})")

    results.append({
        'Model': 'RLAC',
        'Method': method,
        'Params': param_str,
        'AMI': ami,
        'ARI': ari
    })

# ==========================================
# 2. MDH RUN
# ==========================================
# Build the log text from the configuration so it cannot drift from the
# values actually passed to the model.
mdh_param_str = f"h={mdh_config['h_multiplier']}, a={mdh_config['alphamax_val']}"
print(f"\nRunning MDH {'Standard':<15} | {mdh_param_str} ... ")
try:
    mdh_model = MDH(
        n_clusters=n_clusters,
        h_multiplier=mdh_config['h_multiplier'],
        alphamax_val=mdh_config['alphamax_val'],
        alpha_steps=mdh_config['alpha_steps'],
        random_state=mdh_config['random_state'],
        verbose=False,
        plot=False
    )

    mdh_model.fit(X_train)

    ami_mdh = adjusted_mutual_info_score(y_train, mdh_model.labels_)
    ari_mdh = adjusted_rand_score(y_train, mdh_model.labels_)

except Exception as e:
    # Record the failure the same way the RLAC loop does, so the MDH row is
    # present (with the sentinel score) rather than missing from the table.
    print(f"FAILED. Error: {e}")
    ami_mdh = ari_mdh = FAILED_SCORE
else:
    print(f"Done (AMI: {ami_mdh:.4f})")

results.append({
    'Model': 'MDH',
    'Method': 'Standard',
    'Params': 'Fixed',
    'AMI': ami_mdh,
    'ARI': ari_mdh
})

# ==========================================
# 3. RESULTS TABLE
# ==========================================
print("\n" + "="*80)
print("FINAL RESULTS (ALL MODELS - SORTED BY AMI)")
print("="*80)

# 1. Create DataFrame, best AMI first (failed runs sink to the bottom)
results_df = pd.DataFrame(results).sort_values(by='AMI', ascending=False)

# 2. Print FULL table (no truncation)
# index=False hides the row numbers for a cleaner look
print(results_df.to_string(index=False))

Starting Benchmark on Aggregation Dataset (n=788, k=7)...
--------------------------------------------------------------------------------

Running RLAC depth_ratio     | r=50, bw=0.3, s=44 ... [DEPTH_RATIO] Generating 50 sparse random projections...
Starting Clustering: Target=7 clusters.
Iter 1: Split Cluster 0 (Size: 788) via Proj 0 | Score: 0.8761
Iter 2: Split Cluster 0 (Size: 232) via Proj 14 | Score: 0.7453
Iter 3: Split Cluster 0 (Size: 556) via Proj 20 | Score: 0.7132
Iter 4: Split Cluster 2 (Size: 215) via Proj 0 | Score: 0.3899
Iter 5: Split Cluster 2 (Size: 341) via Proj 31 | Score: 0.2980
Iter 6: Split Cluster 4 (Size: 307) via Proj 6 | Score: 0.2682
RLAC (depth_ratio) complete. Final clusters: 7
Done (AMI: 0.9914)

Running RLAC dip             | r=50, bw=0.3, s=44 ... [DIP] Generating 50 sparse random projections...
Starting Clustering: Target=7 clusters.
Iter 1: Split Cluster 0 (Size: 788) via Proj 3 | Score: 0.0612
Iter 2: Split Cluster 1 (Size: 232) via Proj 20 | Score