In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             balanced_accuracy_score, confusion_matrix,
                             matthews_corrcoef, cohen_kappa_score)
from imblearn.ensemble import RUSBoostClassifier
import time
import os
import warnings
warnings.filterwarnings('ignore')
from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler
from collections import Counter

# Seed NumPy's global random generator for reproducibility
np.random.seed(42)

# Number of cross-validation folds
K_FOLDS = 2

# Make sure every output directory exists before anything is written into it
for _output_dir in ("confusion_matrices", "visualizations", "results"):
    os.makedirs(_output_dir, exist_ok=True)

# Helper function for confusion matrix metrics
def confusion_matrix_metrics(cm, classes):
    """Compute one-vs-rest rates for every class of a confusion matrix.

    Args:
        cm: Square 2-D NumPy array of counts. It is indexed as ``cm[i, i]``,
            ``cm[i, :]`` and ``cm[:, i]``, where ``i`` is the position of a
            label in ``classes``. Row sums feed the false negatives and
            column sums feed the false positives, i.e. rows are the true
            class and columns the predicted class.
        classes: Iterable of hashable class labels, in the same order as the
            rows/columns of ``cm``. Each label becomes a key of the result.

    Returns:
        dict mapping each class label to a dict with the float entries
        ``'TPR'``, ``'TNR'``, ``'FPR'`` and ``'FNR'``. A rate whose
        denominator is zero is reported as ``0.0``.
    """
    def _rate(numerator, denominator):
        # The zero check already rules out division by zero, so no epsilon is
        # added to the denominator: a perfect score is exactly 1.0.
        if denominator > 0:
            return float(numerator) / float(denominator)
        return 0.0

    total = cm.sum()
    metrics = {}
    for idx, class_label in enumerate(classes):
        tp = cm[idx, idx]            # predicted as this class and truly this class
        fp = cm[:, idx].sum() - tp   # predicted as this class, truly another
        fn = cm[idx, :].sum() - tp   # truly this class, predicted as another
        tn = total - (tp + fp + fn)  # everything that does not involve this class

        metrics[class_label] = {
            'TPR': _rate(tp, tp + fn),
            'TNR': _rate(tn, tn + fp),
            'FPR': _rate(fp, fp + tn),
            'FNR': _rate(fn, fn + tp),
        }
    return metrics

# Load the dataset
# NOTE: absolute, machine-specific path; edit DATASET_PATH to run elsewhere.
DATASET_PATH = 'C:/Users/ddihora1604/Downloads/IIT Patna/Darshan_Dihora_ID_17_Task_2/Dataset 2/part-00001_preprocessed_dataset.csv'
print("Loading dataset...")
df = pd.read_csv(DATASET_PATH)

# Take 20% of the data for faster processing (optional, comment out if you want to use full dataset)
df = df.sample(frac=0.2, random_state=42)

# Treat the last column as the target, whatever it is called in the file
df = df.rename(columns={df.columns[-1]: 'label'})

# Check class distribution before resampling
original_class_dist = df['label'].value_counts()
print(f"Original class distribution:\n{original_class_dist}")

# Preprocessing: Handle missing values
print("Preprocessing data...")

# A row without a target cannot be used for training or evaluation, and the
# target must never be imputed (a mean-filled label would invent a class).
df = df.dropna(subset=['label'])
feature_columns = [col for col in df.columns if col != 'label']

# Numeric features: replace NaN with the column mean. The result is assigned
# back to the frame because `df[col].fillna(..., inplace=True)` operates on a
# temporary object and leaves `df` untouched under pandas Copy-on-Write.
for col in df[feature_columns].select_dtypes(include='number').columns:
    df[col] = df[col].fillna(df[col].mean())

# Categorical features: replace NaN with the most frequent value, then encode
# the values as integers.
for col in df[feature_columns].select_dtypes(include=['object']).columns:
    modes = df[col].mode()
    # mode() is empty when the whole column is NaN; fall back to a placeholder
    fill_value = modes.iloc[0] if not modes.empty else 'missing'
    df[col] = df[col].fillna(fill_value)
    df[col] = LabelEncoder().fit_transform(df[col])

# Encode the label column; class_names[i] is the original label of code i
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])
class_names = label_encoder.classes_

# Extract features and target
X = df.drop(columns=['label']).values
y = df['label'].values

# Standardize the features
# NOTE(review): the scaler is fitted on all rows before the folds are split,
# so statistics of each test fold influence the scaling of its training fold.
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Initialize to store results
results = []
timing_results = []

# Create K-fold cross-validation
kf = KFold(n_splits=K_FOLDS, shuffle=True, random_state=42)

# Fold-wise training and evaluation

# SMOTE is configured with k_neighbors=5 below, so every class is first
# brought up to k + 1 samples by plain duplication.
SMOTE_K_NEIGHBORS = 5
min_samples_needed = SMOTE_K_NEIGHBORS + 1

# The set of encoded labels is the same for every fold, so compute it (and
# the matching display names) once instead of on each iteration.
unique_classes = np.unique(y)
class_display_names = [class_names[cls] for cls in unique_classes]

for fold_idx, (train_index, test_index) in enumerate(kf.split(X), start=1):
    print(f"Training fold {fold_idx}/{K_FOLDS}...")

    # Split the data
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Apply oversampling to the training data only
    print(f"Handling class imbalance in fold {fold_idx}...")

    # Check class distribution
    class_counts = Counter(y_train)
    print(f"Original training class distribution: {class_counts}")

    # Step 1: duplicate samples of very small classes until each class has
    # at least min_samples_needed members; larger classes keep their count.
    sampling_strategy = {cls: max(count, min_samples_needed) for cls, count in class_counts.items()}
    random_oversampler = RandomOverSampler(sampling_strategy=sampling_strategy, random_state=42)
    X_temp, y_temp = random_oversampler.fit_resample(X_train, y_train)
    print(f"Class distribution after initial oversampling: {Counter(y_temp)}")

    # Step 2: apply SMOTE now that every class has enough samples.
    # Best effort: if SMOTE fails, continue with the randomly oversampled data.
    try:
        smote = SMOTE(random_state=42, k_neighbors=SMOTE_K_NEIGHBORS)
        X_train_resampled, y_train_resampled = smote.fit_resample(X_temp, y_temp)
        print("Successfully applied SMOTE after initial oversampling")
    except Exception as e:
        print(f"SMOTE failed: {e}. Using randomly oversampled data instead.")
        X_train_resampled, y_train_resampled = X_temp, y_temp

    # Print final class distribution
    train_class_dist_after = pd.Series(y_train_resampled).value_counts().sort_index()
    print(f"Final training class distribution after oversampling:\n{train_class_dist_after}")

    # Visualize the effect of oversampling. Every fold writes to the same
    # file, so the saved image shows the last fold.
    # plt.subplots creates the figure itself; an extra plt.figure() call here
    # would only open an empty figure that is never closed.
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

    # Original distribution
    pd.Series(y_train, name='Original').value_counts().sort_index().plot(
        kind='bar', ax=ax1, title='Class Distribution Before Oversampling')
    ax1.set_ylabel('Count')

    # After oversampling
    pd.Series(y_train_resampled, name='Resampled').value_counts().sort_index().plot(
        kind='bar', ax=ax2, title='Class Distribution After Hybrid Oversampling')
    ax2.set_ylabel('Count')

    fig.tight_layout()
    fig.savefig("visualizations/class_distribution.png")
    plt.close(fig)

    # Initialize RUSBoost with a shallow decision tree as base estimator
    rusboost = RUSBoostClassifier(
        estimator=DecisionTreeClassifier(max_depth=4),
        n_estimators=100,
        learning_rate=0.1,
        algorithm='SAMME.R',  # Use real-valued predictions
        sampling_strategy='auto',  # Auto determine sampling strategy
        replacement=False,  # Sample without replacement
        random_state=42
    )

    # Train on the oversampled (RandomOverSampler + SMOTE) data. The timer is
    # started immediately before fit() so that the reported training time
    # does not include the plotting above.
    start_train_time = time.time()
    rusboost.fit(X_train_resampled, y_train_resampled)
    train_time = time.time() - start_train_time

    # Make predictions on the untouched test fold
    start_test_time = time.time()
    y_pred = rusboost.predict(X_test)
    test_time = time.time() - start_test_time

    # Record timing results
    timing_results.append({
        'Classifier': 'RUSBoost',
        'Fold': fold_idx,
        'Training Time (s)': train_time,
        'Testing Time (s)': test_time,
        'Total Time (s)': train_time + test_time
    })

    # Compute the confusion matrix over the full label set so that its shape
    # is the same in every fold, even if a class is missing from this one
    cm = confusion_matrix(y_test, y_pred, labels=unique_classes)
    cm_metrics = confusion_matrix_metrics(cm, unique_classes)

    # Plot and save confusion matrix
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_display_names,
                yticklabels=class_display_names)
    plt.title(f"RUSBoost - Fold {fold_idx} Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.savefig(f"confusion_matrices/fold_{fold_idx}.png")
    plt.close()

    # Calculate one-vs-rest metrics per class
    class_metrics_list = []
    for class_label, class_name in zip(unique_classes, class_display_names):
        # Create binary labels for this class
        y_true_bin = (y_test == class_label).astype(int)
        y_pred_bin = (y_pred == class_label).astype(int)

        # zero_division=0 is passed to precision, recall and F1 alike so that
        # a class absent from this fold is scored 0 by all three
        class_specific_metrics = {
            'Classifier': 'RUSBoost',
            'Fold': fold_idx,
            'Class': class_name,
            'Accuracy': accuracy_score(y_true_bin, y_pred_bin),
            'Precision': precision_score(y_true_bin, y_pred_bin, zero_division=0),
            'Recall': recall_score(y_true_bin, y_pred_bin, zero_division=0),
            'F1 Score': f1_score(y_true_bin, y_pred_bin, zero_division=0),
            'Matthews Correlation Coefficient': matthews_corrcoef(y_true_bin, y_pred_bin),
            'Cohen Kappa': cohen_kappa_score(y_true_bin, y_pred_bin),
            'True Positive Rate (TPR)': cm_metrics[class_label]['TPR'],
            'True Negative Rate (TNR)': cm_metrics[class_label]['TNR'],
            'False Positive Rate (FPR)': cm_metrics[class_label]['FPR'],
            'False Negative Rate (FNR)': cm_metrics[class_label]['FNR'],
            'Training Time (s)': train_time,
            'Testing Time (s)': test_time
        }
        class_metrics_list.append(class_specific_metrics)

    # Append results for this fold
    results.extend(class_metrics_list)

# Turn the collected per-fold records into tables
results_df = pd.DataFrame(results)
timing_df = pd.DataFrame(timing_results)

# Write both tables to disk, without the row index
for _frame, _csv_path in ((timing_df, "results/time.csv"),
                          (results_df, "results/metrics.csv")):
    _frame.to_csv(_csv_path, index=False)

# Bar chart of feature importances, taken from the most recently fitted
# `rusboost` model
feature_names = df.columns.drop('label').tolist()

try:
    importances = rusboost.feature_importances_
except AttributeError:
    # The ensemble exposes no importances of its own: sum those of the base
    # estimators that have them and divide by the total number of estimators
    importances = np.zeros(X.shape[1])
    for base_model in rusboost.estimators_:
        try:
            importances += base_model.feature_importances_
        except AttributeError:
            continue
    importances = importances / len(rusboost.estimators_)

# Feature positions ordered from most to least important
indices = np.flip(np.argsort(importances))
bar_positions = range(len(indices))
ranked_names = [feature_names[i] for i in indices]

plt.figure(figsize=(12, 8))
plt.title('RUSBoost Feature Importances')
plt.bar(bar_positions, importances[indices], align='center')
plt.xticks(bar_positions, ranked_names, rotation=90)
plt.tight_layout()
plt.savefig("visualizations/feature_importance.png")
plt.close()

# Visualize one-vs-rest ROC curves, using the most recently fitted `rusboost`
# model and the X_test / y_test it was evaluated on.
# Best effort: a failure here is reported and the script continues.
try:
    from sklearn.metrics import roc_curve, auc
    from itertools import cycle

    # Get probability estimates; column j belongs to rusboost.classes_[j]
    y_score = rusboost.predict_proba(X_test)

    # Compute ROC curve and ROC area for each class the model knows.
    # Columns are matched to labels through rusboost.classes_ rather than by
    # position in the full label list: a class that was absent from the
    # training data has no column, which would shift every later column onto
    # the wrong label.
    fpr = dict()
    tpr = dict()
    roc_auc = dict()

    for col_idx, cls in enumerate(rusboost.classes_):
        y_true_bin = (y_test == cls).astype(int)
        # A ROC curve needs both positives and negatives in the test data
        if np.unique(y_true_bin).size < 2:
            continue
        fpr[cls], tpr[cls], _ = roc_curve(y_true_bin, y_score[:, col_idx])
        roc_auc[cls] = auc(fpr[cls], tpr[cls])

    # Plot ROC curves
    plt.figure(figsize=(10, 8))
    colors = cycle(['blue', 'red', 'green', 'cyan', 'magenta', 'yellow', 'black'])

    for cls, color in zip(roc_auc, colors):
        plt.plot(fpr[cls], tpr[cls], color=color, lw=2,
                 label=f'ROC curve of class {class_names[cls]} (area = {roc_auc[cls]:.2f})')

    # Diagonal reference line
    plt.plot([0, 1], [0, 1], 'k--', lw=2)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curves for RUSBoost')
    plt.legend(loc="lower right")
    plt.savefig("visualizations/roc_curves.png")
    plt.close()
except Exception as e:
    print(f"Error plotting ROC curves: {e}")

# Generate a pair plot for the most important features: the four highest
# ranked ones, or all features when there are four or fewer
if len(feature_names) > 4:
    top_indices = indices[:4]
    top_features = [feature_names[i] for i in top_indices]
else:
    top_features = feature_names

# Show the original label values (not the integer codes) in the legend
top_features_df = df[top_features + ['label']].copy()
top_features_df['label'] = label_encoder.inverse_transform(top_features_df['label'])

# sns.pairplot creates its own figure and makes it the current one, so no
# plt.figure() call is made beforehand (it would only open an empty figure
# that is never closed).
sns.pairplot(top_features_df, hue='label')
plt.savefig("visualizations/pairplot.png")
plt.close()

# # Visualize the class distribution
# plt.figure(figsize=(10, 6))
# class_counts = pd.Series(label_encoder.inverse_transform(y)).value_counts()
# sns.barplot(x=class_counts.index, y=class_counts.values)
# plt.title('Class Distribution')
# plt.xlabel('Class')
# plt.ylabel('Count')
# plt.xticks(rotation=45)
# plt.tight_layout()
# plt.savefig("visualizations/class_distribution.png")
# plt.close()

# Plot learning curve if possible.
# Best effort: a failure here is reported and the script continues.
try:
    from sklearn.model_selection import learning_curve

    # learning_curve runs its own 3-fold cross-validation on X and y as they
    # are, i.e. without the oversampling applied in the fold loop
    train_sizes, train_scores, test_scores = learning_curve(
        rusboost, X, y, cv=3, scoring='accuracy', n_jobs=-1,
        train_sizes=np.linspace(0.1, 1.0, 5)
    )

    # Mean and spread over the cross-validation folds for each training size
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)

    plt.figure(figsize=(10, 6))
    plt.plot(train_sizes, train_mean, color='blue', marker='o', markersize=5, label='Training accuracy')
    plt.fill_between(train_sizes, train_mean + train_std, train_mean - train_std, alpha=0.15, color='blue')
    plt.plot(train_sizes, test_mean, color='green', marker='s', markersize=5, label='Validation accuracy')
    plt.fill_between(train_sizes, test_mean + test_std, test_mean - test_std, alpha=0.15, color='green')
    plt.title('Learning Curve for RUSBoost')
    plt.xlabel('Training Set Size')
    plt.ylabel('Accuracy')
    plt.legend(loc='lower right')
    plt.grid(True)
    plt.savefig("visualizations/learning_curve.png")
    plt.close()
except Exception as e:
    print(f"Error plotting learning curve: {e}")

# Rebuild the metrics table for the summary. This statement sits at top level
# so that it always runs, not only when the learning-curve plot fails.
results_df = pd.DataFrame(results)
print("Classification Metrics Across Folds:")
print(results_df.head())



# Calculate and display average metrics across folds.
# numeric_only=True keeps the aggregation from raising should results_df ever
# contain a non-numeric column besides the two grouping keys.
avg_metrics = (
    results_df
    .groupby(['Classifier', 'Class'])
    .mean(numeric_only=True)
    .reset_index()
)
avg_metrics.to_csv("results/avg_metrics.csv", index=False)
print("\nAverage Metrics Across Folds:")
print(avg_metrics[['Classifier', 'Class', 'Accuracy', 'Precision', 'Recall', 'F1 Score']])


print("RUSBoost implementation completed successfully!")

Loading dataset...
Original class distribution:
label
6     6767
14    5031
13    4148
10    3955
9     3828
8     3820
12    3371
21    3037
20    2429
19    1900
1     1032
23     961
25     844
24     733
7      463
15     293
22     291
4      277
16     158
26     115
29      79
27      78
18      57
32      33
5       18
11      16
17       9
33       5
3        4
31       2
30       2
2        2
0        2
28       1
Name: count, dtype: int64
Preprocessing data...
Training fold 1/2...
Handling class imbalance in fold 1...
Original training class distribution: Counter({6: 3362, 14: 2549, 13: 2065, 10: 1962, 9: 1935, 8: 1878, 12: 1740, 21: 1506, 20: 1201, 19: 954, 1: 504, 23: 480, 25: 409, 24: 380, 7: 229, 4: 145, 22: 144, 15: 138, 16: 77, 26: 62, 27: 43, 29: 38, 18: 34, 32: 14, 11: 11, 5: 10, 31: 2, 33: 2, 17: 2, 30: 1, 3: 1, 2: 1, 0: 1})
Class distribution after initial oversampling: Counter({6: 3362, 14: 2549, 13: 2065, 10: 1962, 9: 1935, 8: 1878, 12: 1740, 21: 1506, 20: 1201, 

<Figure size 1200x600 with 0 Axes>

<Figure size 1200x600 with 0 Axes>

<Figure size 1200x1000 with 0 Axes>