In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             balanced_accuracy_score, confusion_matrix,
                             matthews_corrcoef, cohen_kappa_score)
import time
import os

# Set random seed for reproducibility
np.random.seed(42)

# Set the number of K folds
K_FOLDS = 2

# Create directories to save results
os.makedirs("confusion_matrices", exist_ok=True)
os.makedirs("visualizations", exist_ok=True)
os.makedirs("results", exist_ok=True)

# Helper function for confusion matrix metrics
def confusion_matrix_metrics(cm, classes):
    metrics = {}
    for idx, class_label in enumerate(classes):
        TP = cm[idx, idx]  # True Positives for this class
        FP = cm[:, idx].sum() - TP  # False Positives for this class
        FN = cm[idx, :].sum() - TP  # False Negatives for this class
        TN = cm.sum() - (TP + FP + FN)  # True Negatives for this class

        metrics[class_label] = {
            'TPR': TP / (TP + FN + 1e-10) if (TP + FN) > 0 else 0,
            'TNR': TN / (TN + FP + 1e-10) if (TN + FP) > 0 else 0,
            'FPR': FP / (FP + TN + 1e-10) if (FP + TN) > 0 else 0,
            'FNR': FN / (FN + TP + 1e-10) if (FN + TP) > 0 else 0
        }
    return metrics

# Rotation Forest implementation
class RotationForestClassifier:
    def __init__(self, n_estimators=10, feature_groups=3, max_features=0.75, random_state=42):
        self.n_estimators = n_estimators
        self.feature_groups = feature_groups
        self.max_features = max_features
        self.random_state = random_state
        self.estimators = []
        self.feature_indices = []
        self.pcas = []
        self.feature_importances_ = None
        
    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.estimators = []
        self.feature_indices = []
        self.pcas = []
        
        # Initialize feature importance
        all_importances = np.zeros((self.n_estimators, n_features))
        
        # Set random seed for reproducibility
        np.random.seed(self.random_state)
        
        for i in range(self.n_estimators):
            # Randomly determine number of features per group
            max_features = int(n_features * self.max_features) if isinstance(self.max_features, float) else self.max_features
            features_per_group = max(1, max_features // self.feature_groups)
            
            # Shuffle feature indices
            feature_indices = np.random.permutation(n_features)
            
            # Group features
            groups = [feature_indices[j:j+features_per_group] for j in range(0, len(feature_indices), features_per_group)]
            
            # Create rotation matrix (identity initially)
            R = np.zeros((n_features, n_features))
            
            # Apply PCA on each group
            for group in groups:
                if len(group) > 0:
                    # Select subset of data for this group
                    X_subset = X[:, group]
                    
                    # Apply PCA
                    pca = PCA(random_state=self.random_state+i)
                    pca.fit(X_subset)
                    
                    # Fill the rotation matrix with the PCA components
                    for j, idx in enumerate(group):
                        if j < len(pca.components_):
                            for k, comp_idx in enumerate(group):
                                if k < len(pca.components_[j]):
                                    R[idx, comp_idx] = pca.components_[j, k]
            
            # Transform the data
            X_transformed = X @ R
            
            # Train a decision tree on the transformed data
            tree = DecisionTreeClassifier(random_state=self.random_state+i)
            tree.fit(X_transformed, y)
            
            # Store the estimator, rotation matrix, and feature indices
            self.estimators.append(tree)
            self.feature_indices.append(feature_indices)
            self.pcas.append(R)
            
            # Get feature importances from this tree
            importance = np.zeros(n_features)
            for idx, imp in zip(range(n_features), tree.feature_importances_):
                importance[idx] = imp
            all_importances[i] = importance
        
        # Average feature importances across all trees
        self.feature_importances_ = np.mean(all_importances, axis=0)
        
        return self
    
    def predict(self, X):
        predictions = np.zeros((X.shape[0], len(self.estimators)))
        
        for i, (tree, R) in enumerate(zip(self.estimators, self.pcas)):
            # Transform the data using the rotation matrix
            X_transformed = X @ R
            
            # Make predictions
            predictions[:, i] = tree.predict(X_transformed)
        
        # Majority voting
        return np.array([np.bincount(predictions[i, :].astype(int)).argmax() for i in range(X.shape[0])])
    
    def predict_proba(self, X):
        probas = np.zeros((X.shape[0], len(np.unique(self.estimators[0].predict(X @ self.pcas[0])))))
        
        for i, (tree, R) in enumerate(zip(self.estimators, self.pcas)):
            # Transform the data using the rotation matrix
            X_transformed = X @ R
            
            # Make probability predictions
            probas += tree.predict_proba(X_transformed)
        
        # Average probabilities
        return probas / len(self.estimators)

# Load the dataset
print("Loading dataset...")
df = pd.read_csv('C:/Users/ddihora1604/Downloads/IIT Patna/Task/Dataset 2/part-00001_preprocessed_dataset.csv')

# Take 20% of the data for faster processing (optional, comment out if you want to use full dataset)
df = df.sample(frac=0.2, random_state=42)

# Rename the last column as 'label' if it's not already named that
df.rename(columns={df.columns[-1]: 'label'}, inplace=True)

# Preprocessing: Handle missing values
print("Preprocessing data...")
# Replace NaN values with column means for numerical columns
for col in df.select_dtypes(include=['float64', 'int64']).columns:
    df[col].fillna(df[col].mean(), inplace=True)

# For categorical columns, fill with mode
for col in df.select_dtypes(include=['object']).columns:
    if col != 'label':  # Don't replace label
        df[col].fillna(df[col].mode()[0], inplace=True)

# Encode categorical features if any
for col in df.select_dtypes(include=['object']).columns:
    if col != 'label':  # Don't encode label yet
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])

# Encode the label column
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])
class_names = label_encoder.classes_

# Extract features and target
X = df.drop(columns=['label']).values
y = df['label'].values

# Standardize the features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Initialize to store results
results = []
timing_results = []

# Create K-fold cross-validation
kf = KFold(n_splits=K_FOLDS, shuffle=True, random_state=42)

# Fold-wise training and evaluation
fold_idx = 1
for train_index, test_index in kf.split(X):
    print(f"Training fold {fold_idx}/{K_FOLDS}...")
    
    # Split the data
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Initialize Rotation Forest
    rotation_forest = RotationForestClassifier(n_estimators=10, feature_groups=3, random_state=42)
    
    # Record start time
    start_train_time = time.time()
    
    # Train the model
    rotation_forest.fit(X_train, y_train)
    
    train_time = time.time() - start_train_time
    
    # Make predictions
    start_test_time = time.time()
    y_pred = rotation_forest.predict(X_test)
    test_time = time.time() - start_test_time
    
    # Record timing results
    timing_results.append({
        'Classifier': 'RotationForest',
        'Fold': fold_idx,
        'Training Time (s)': train_time,
        'Testing Time (s)': test_time,
        'Total Time (s)': train_time + test_time
    })
    
    # Compute metrics
    unique_classes = np.unique(y)
    cm = confusion_matrix(y_test, y_pred, labels=unique_classes)
    cm_metrics = confusion_matrix_metrics(cm, unique_classes)
    
    # Plot and save confusion matrix
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                xticklabels=[class_names[i] if i < len(class_names) else i for i in unique_classes],
                yticklabels=[class_names[i] if i < len(class_names) else i for i in unique_classes])
    plt.title(f"Rotation Forest - Fold {fold_idx} Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.savefig(f"confusion_matrices/fold_{fold_idx}.png")
    plt.close()
    
    # Calculate metrics per class
    class_metrics_list = []
    for class_label in unique_classes:
        # Create binary labels for this class
        y_test_bin = (y_test == class_label).astype(int)
        y_pred_bin = (y_pred == class_label).astype(int)
        
        # Calculate metrics
        class_specific_metrics = {
            'Classifier': 'RotationForest',
            'Fold': fold_idx,
            'Class': class_names[class_label] if class_label < len(class_names) else class_label,
            'Accuracy': accuracy_score(y_test_bin, y_pred_bin),
            'Precision': precision_score(y_test_bin, y_pred_bin, zero_division=0),
            'Recall': recall_score(y_test_bin, y_pred_bin),
            'F1 Score': f1_score(y_test_bin, y_pred_bin),
            'Matthews Correlation Coefficient': matthews_corrcoef(y_test_bin, y_pred_bin),
            'Cohen Kappa': cohen_kappa_score(y_test_bin, y_pred_bin),
            'True Positive Rate (TPR)': cm_metrics[class_label]['TPR'],
            'True Negative Rate (TNR)': cm_metrics[class_label]['TNR'],
            'False Positive Rate (FPR)': cm_metrics[class_label]['FPR'],
            'False Negative Rate (FNR)': cm_metrics[class_label]['FNR'],
            'Training Time (s)': train_time,
            'Testing Time (s)': test_time
        }
        class_metrics_list.append(class_specific_metrics)
    
    # Append results for this fold
    results.extend(class_metrics_list)
    fold_idx += 1

# Create DataFrames for results
timing_df = pd.DataFrame(timing_results)
results_df = pd.DataFrame(results)

# Save results to CSV
timing_df.to_csv("results/time.csv", index=False)
results_df.to_csv("results/metrics.csv", index=False)

# Plot feature importance
if hasattr(rotation_forest, 'feature_importances_'):
    feature_names = df.drop(columns=['label']).columns.tolist()
    importances = rotation_forest.feature_importances_
    indices = np.argsort(importances)[::-1]
    
    plt.figure(figsize=(12, 8))
    plt.title('Rotation Forest Feature Importances')
    plt.bar(range(len(indices)), importances[indices], align='center')
    plt.xticks(range(len(indices)), [feature_names[i] for i in indices], rotation=90)
    plt.tight_layout()
    plt.savefig("visualizations/feature_importance.png")
    plt.close()

# Generate a pair plot for the most important features
feature_names = df.drop(columns=['label']).columns.tolist()

# Create a correlation matrix for feature selection
corr_matrix = pd.DataFrame(X, columns=feature_names).corrwith(pd.Series(y)).abs().sort_values(ascending=False)

# Select top features
if len(feature_names) > 4:
    top_features = corr_matrix.nlargest(4).index.tolist()
else:
    top_features = feature_names

top_features_df = df[top_features + ['label']].copy()
top_features_df['label'] = label_encoder.inverse_transform(top_features_df['label'])

plt.figure(figsize=(12, 10))
sns.pairplot(top_features_df, hue='label')
plt.savefig("visualizations/pairplot.png")
plt.close()

# Visualize the distribution of classes
plt.figure(figsize=(10, 6))
class_counts = pd.Series(label_encoder.inverse_transform(y)).value_counts()
sns.barplot(x=class_counts.index, y=class_counts.values)
plt.title('Class Distribution')
plt.xlabel('Class')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig("visualizations/class_distribution.png")
plt.close()



results_df = pd.DataFrame(results)
print("Classification Metrics Across Folds:")
print(results_df.head())



# Calculate and display average metrics across folds
avg_metrics = results_df.groupby(['Classifier', 'Class']).mean().reset_index()
avg_metrics.to_csv("results/avg_metrics.csv", index=False)
print("\nAverage Metrics Across Folds:")
print(avg_metrics[['Classifier', 'Class', 'Accuracy', 'Precision', 'Recall', 'F1 Score']])

print("Rotation Forest implementation completed successfully!")

Loading dataset...
Preprocessing data...
Training fold 1/2...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)


Training fold 2/2...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  c /= stddev[:, None]
  c /= stddev[None, :]


Classification Metrics Across Folds:
       Classifier  Fold  Class  Accuracy  Precision    Recall  F1 Score  \
0  RotationForest     1      0  0.999954   0.000000  0.000000  0.000000   
1  RotationForest     1      1  0.992688   0.788401  0.952652  0.862779   
2  RotationForest     1      2  0.999954   0.000000  0.000000  0.000000   
3  RotationForest     1      3  0.999863   0.000000  0.000000  0.000000   
4  RotationForest     1      4  0.999863   1.000000  0.977273  0.988506   

   Matthews Correlation Coefficient  Cohen Kappa  True Positive Rate (TPR)  \
0                          0.000000     0.000000                  0.000000   
1                          0.863118     0.859057                  0.952652   
2                          0.000000     0.000000                  0.000000   
3                          0.000000     0.000000                  0.000000   
4                          0.988503     0.988437                  0.977273   

   True Negative Rate (TNR)  False Positive

<Figure size 1200x1000 with 0 Axes>