In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import RidgeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                            balanced_accuracy_score, confusion_matrix,
                            matthews_corrcoef, cohen_kappa_score)
from sklearn.impute import SimpleImputer
import time
import os
from imblearn.over_sampling import ADASYN

# Create directory for visualizations
os.makedirs("confusion_matrices", exist_ok=True)
os.makedirs("visualizations", exist_ok=True)
os.makedirs("results", exist_ok=True)

# Set the number of K folds
K_FOLDS = 2

# Helper function for confusion matrix metrics
def confusion_matrix_metrics(cm, classes):
    metrics = {}
    for idx, class_label in enumerate(classes):
        TP = cm[idx, idx]  # True Positives for this class
        FP = cm[:, idx].sum() - TP  # False Positives for this class
        FN = cm[idx, :].sum() - TP  # False Negatives for this class
        TN = cm.sum() - (TP + FP + FN)  # True Negatives for this class

        metrics[class_label] = {
            'TPR': TP / (TP + FN + 1e-10) if (TP + FN) > 0 else 0,
            'TNR': TN / (TN + FP + 1e-10) if (TN + FP) > 0 else 0,
            'FPR': FP / (FP + TN + 1e-10) if (FP + TN) > 0 else 0,
            'FNR': FN / (FN + TP + 1e-10) if (FN + TP) > 0 else 0
        }
    return metrics

# Load dataset
print("Loading dataset...")
try:
    df = pd.read_csv('C:/Users/ddihora1604/Downloads/IIT Patna/Darshan_Dihora_ID_17_Task_2/Dataset 2/part-00001_preprocessed_dataset.csv')
    print(f"Dataset loaded successfully with shape: {df.shape}")
except Exception as e:
    print(f"Error loading dataset: {e}")
    exit(1)

# Take 20% of the data for faster processing (if needed)
df = df.sample(frac=0.2, random_state=42)
print(f"Sampled dataset shape: {df.shape}")

# Rename the last column as 'label' if not already named
df.rename(columns={df.columns[-1]: 'label'}, inplace=True)

# Data preprocessing
print("Preprocessing data...")

# Handle missing values
imputer = SimpleImputer(strategy='mean')
X = df.drop(columns=['label'])
X_columns = X.columns
X = imputer.fit_transform(X)
y = df['label'].values

# Standardize features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Initialize results storage
results = []
timing_results = []

# Set up KFold cross-validation
kf = KFold(n_splits=K_FOLDS, shuffle=True, random_state=42)

# Ridge classifier with hyperparameters (removed deprecated 'normalize' parameter)
ridge_classifier = RidgeClassifier(
    alpha=1.0,  # Regularization strength
    fit_intercept=True,
    copy_X=True,
    max_iter=1000,
    tol=0.001,
    class_weight=None,
    solver='auto',
    random_state=42
)

# Cross-validation process
print(f"Starting {K_FOLDS}-fold cross-validation with Ridge Classifier...")
fold_idx = 1
for train_index, test_index in kf.split(X):
    print(f"Processing fold {fold_idx}/{K_FOLDS}...")
    
    # Split the data for this fold
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Apply ADASYN SMOTE to the training data
    print(f"Applying ADASYN SMOTE resampling for fold {fold_idx}...")
    
    # Check class distribution before resampling
    class_counts = np.bincount(y_train)
    min_samples_needed = 6  # Default n_neighbors + 1
    
    try:
        # First attempt with ADASYN - may fail if a class has too few samples
        adasyn = ADASYN(random_state=42, n_neighbors=min(5, np.min(class_counts[class_counts > 0])-1))
        X_train_resampled, y_train_resampled = adasyn.fit_resample(X_train, y_train)
        print("ADASYN resampling successful")
    except ValueError as e:
        print(f"ADASYN failed: {e}")
        print("Falling back to simple oversampling for minority classes")
        
        # Manual simple oversampling for rare classes
        X_train_resampled = X_train.copy()
        y_train_resampled = y_train.copy()
        
        # For each minority class, duplicate samples until we reach a minimum count
        classes = np.unique(y_train)
        target_count = max(np.bincount(y_train))  # Use majority class count as target
        
        for cls in classes:
            cls_indices = np.where(y_train == cls)[0]
            if len(cls_indices) < target_count:
                # Number of duplications needed
                n_duplicates = target_count - len(cls_indices)
                
                # Generate indices to duplicate (with replacement if needed)
                duplicate_indices = np.random.choice(cls_indices, size=n_duplicates, replace=True)
                
                # Add duplicated samples
                X_train_resampled = np.vstack([X_train_resampled, X_train[duplicate_indices]])
                y_train_resampled = np.append(y_train_resampled, y_train[duplicate_indices])
    
    # Display class distribution after resampling
    resampled_class_dist = pd.Series(y_train_resampled).value_counts()
    print(f"Class distribution after resampling:\n{resampled_class_dist}")
    
    # Record training time
    start_train_time = time.time()
    ridge_classifier.fit(X_train_resampled, y_train_resampled)
    train_time = time.time() - start_train_time
    
    # Record prediction time
    start_test_time = time.time()
    y_pred = ridge_classifier.predict(X_test)
    test_time = time.time() - start_test_time
    
    # Record timing info
    timing_results.append({
        'Classifier': 'RidgeClassifier',
        'Fold': fold_idx,
        'Training Time (s)': train_time,
        'Testing Time (s)': test_time,
        'Total Time (s)': train_time + test_time
    })
    
    # Compute evaluation metrics
    unique_classes = np.unique(y)
    cm = confusion_matrix(y_test, y_pred, labels=unique_classes)
    cm_metrics = confusion_matrix_metrics(cm, unique_classes)
    
    # Calculate metrics for each class
    for class_label in unique_classes:
        # Binary classification metrics for this class
        y_test_binary = (y_test == class_label).astype(int)
        y_pred_binary = (y_pred == class_label).astype(int)
        
        # Calculate metrics
        class_metrics = {
            'Classifier': 'RidgeClassifier',
            'Fold': fold_idx,
            'Class': class_label,
            'Accuracy': accuracy_score(y_test_binary, y_pred_binary),
            'Precision': precision_score(y_test_binary, y_pred_binary, zero_division=0),
            'Recall': recall_score(y_test_binary, y_pred_binary),
            'F1 Score': f1_score(y_test_binary, y_pred_binary),
            'Balanced Accuracy': balanced_accuracy_score(y_test_binary, y_pred_binary),
            'Matthews Correlation Coefficient': matthews_corrcoef(y_test_binary, y_pred_binary),
            'Cohen Kappa Score': cohen_kappa_score(y_test_binary, y_pred_binary),
            'True Positive Rate (TPR)': cm_metrics[class_label]['TPR'],
            'True Negative Rate (TNR)': cm_metrics[class_label]['TNR'],
            'False Positive Rate (FPR)': cm_metrics[class_label]['FPR'],
            'False Negative Rate (FNR)': cm_metrics[class_label]['FNR'],
            'Training Time (s)': train_time,
            'Testing Time (s)': test_time
        }
        
        results.append(class_metrics)
    
    # Plot and save confusion matrix
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=unique_classes, yticklabels=unique_classes)
    plt.title(f"Ridge Classifier - Fold {fold_idx} Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.tight_layout()
    plt.savefig(f"confusion_matrices/fold_{fold_idx}.png")
    plt.close()
    
    # Feature importance visualization (coefficients of Ridge Classifier)
    if fold_idx == 1:  # Only for the first fold to avoid redundancy
        coef = ridge_classifier.coef_
        # For multiclass, average the absolute coefficients across classes
        if coef.ndim > 1:
            importance = np.mean(np.abs(coef), axis=0)
        else:
            importance = np.abs(coef)
        
        # Plot feature importance
        plt.figure(figsize=(12, 8))
        feature_importance = pd.DataFrame({
            'Feature': X_columns,
            'Importance': importance
        })
        feature_importance = feature_importance.sort_values('Importance', ascending=False)
        
        # Plot top 20 features or all if less than 20
        top_n = min(20, len(feature_importance))
        sns.barplot(x='Importance', y='Feature', data=feature_importance.head(top_n))
        plt.title('Ridge Classifier Feature Importance')
        plt.tight_layout()
        plt.savefig(f"visualizations/feature_importance.png")
        plt.close()
    
    # Pair plot for the first 5 features and target
    if fold_idx == 1:  # Only for the first fold
        # Convert numpy array back to DataFrame for visualization
        sample_size = min(1000, X_test.shape[0])  # Limit sample size for visualization
        vis_df = pd.DataFrame(X_test[:sample_size], columns=X_columns)
        vis_df['label'] = y_test[:sample_size]
        
        # Select first 5 features (or less if fewer features exist)
        plot_features = list(X_columns[:min(5, len(X_columns))])
        plot_features.append('label')
        
        # Create pair plot
        plt.figure(figsize=(12, 10))
        pair_plot = sns.pairplot(vis_df[plot_features], hue='label', height=2.5)
        plt.suptitle('Pair Plot of Top Features by Ridge Classifier', y=1.02)
        plt.tight_layout()
        pair_plot.savefig("visualizations/pair_plot.png")
        plt.close()
    
    fold_idx += 1

# Create DataFrames for results and save to CSV
timing_df = pd.DataFrame(timing_results)
timing_df.to_csv("results/timing.csv", index=False)

results_df = pd.DataFrame(results)
print("Classification Metrics Across Folds:")
print(results_df.head())

# Save results to CSV
results_df.to_csv("results/metrics.csv", index=False)

# Calculate and display average metrics across folds
avg_metrics = results_df.groupby(['Classifier', 'Class']).mean().reset_index()
avg_metrics.to_csv("results/avg_metrics.csv", index=False)
print("\nAverage Metrics Across Folds:")
print(avg_metrics[['Classifier', 'Class', 'Accuracy', 'Precision', 'Recall', 'F1 Score']])

print("\nRidge Classifier implementation completed successfully!")


Loading dataset...
Dataset loaded successfully with shape: (218805, 47)
Sampled dataset shape: (43761, 47)
Preprocessing data...
Starting 2-fold cross-validation with Ridge Classifier...
Processing fold 1/2...
Applying ADASYN SMOTE resampling for fold 1...
ADASYN failed: The 'n_neighbors' parameter of ADASYN must be an int in the range [1, inf) or an object implementing 'kneighbors' and 'kneighbors_graph'. Got 0 instead.
Falling back to simple oversampling for minority classes
Class distribution after resampling:
14    3362
16    3362
2     3362
17    3362
32    3362
3     3362
30    3362
33    3362
18    3362
5     3362
26    3362
11    3362
22    3362
15    3362
27    3362
1     3362
29    3362
4     3362
25    3362
10    3362
31    3362
8     3362
20    3362
9     3362
7     3362
24    3362
12    3362
21    3362
19    3362
6     3362
13    3362
23    3362
0     3362
Name: count, dtype: int64


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Processing fold 2/2...
Applying ADASYN SMOTE resampling for fold 2...
ADASYN failed: The 'n_neighbors' parameter of ADASYN must be an int in the range [1, inf) or an object implementing 'kneighbors' and 'kneighbors_graph'. Got 0 instead.
Falling back to simple oversampling for minority classes
Class distribution after resampling:
14    3405
7     3405
3     3405
0     3405
30    3405
33    3405
11    3405
28    3405
17    3405
29    3405
32    3405
26    3405
16    3405
5     3405
18    3405
27    3405
22    3405
13    3405
24    3405
15    3405
23    3405
4     3405
9     3405
12    3405
1     3405
10    3405
8     3405
19    3405
21    3405
6     3405
20    3405
25    3405
2     3405
Name: count, dtype: int64


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification Metrics Across Folds:
        Classifier  Fold  Class  Accuracy  Precision    Recall  F1 Score  \
0  RidgeClassifier     1      0  0.996938   0.014706  1.000000  0.028986   
1  RidgeClassifier     1      1  0.983045   0.785455  0.409091  0.537983   
2  RidgeClassifier     1      2  0.995658   0.000000  0.000000  0.000000   
3  RidgeClassifier     1      3  0.998218   0.000000  0.000000  0.000000   
4  RidgeClassifier     1      4  0.998218   0.918919  0.772727  0.839506   

   Balanced Accuracy  Matthews Correlation Coefficient  Cohen Kappa Score  \
0           0.998469                          0.121082           0.028898   
1           0.703164                          0.559718           0.530218   
2           0.497852                         -0.000444          -0.000090   
3           0.499177                         -0.000475          -0.000253   
4           0.886157                          0.841801           0.838617   

   True Positive Rate (TPR)  True Negative 

<Figure size 1200x1000 with 0 Axes>