In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             balanced_accuracy_score, confusion_matrix,
                             matthews_corrcoef, cohen_kappa_score)
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
from tensorflow.keras.utils import to_categorical
import time
import os

# Set random seeds for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

# Set the number of K folds
K_FOLDS = 2

# Create directories to save results
os.makedirs("confusion_matrices", exist_ok=True)
os.makedirs("visualizations", exist_ok=True)
os.makedirs("results", exist_ok=True)

# Helper function for confusion matrix metrics
def confusion_matrix_metrics(cm, classes):
    metrics = {}
    for idx, class_label in enumerate(classes):
        TP = cm[idx, idx]  # True Positives for this class
        FP = cm[:, idx].sum() - TP  # False Positives for this class
        FN = cm[idx, :].sum() - TP  # False Negatives for this class
        TN = cm.sum() - (TP + FP + FN)  # True Negatives for this class

        metrics[class_label] = {
            'TPR': TP / (TP + FN + 1e-10) if (TP + FN) > 0 else 0,
            'TNR': TN / (TN + FP + 1e-10) if (TN + FP) > 0 else 0,
            'FPR': FP / (FP + TN + 1e-10) if (FP + TN) > 0 else 0,
            'FNR': FN / (FN + TP + 1e-10) if (FN + TP) > 0 else 0
        }
    return metrics

# Load the dataset
print("Loading dataset...")
df = pd.read_csv('C:/Users/ddihora1604/Downloads/IIT Patna/Task/Dataset 1/Student_performance_data.csv')

# Take 20% of the data for faster processing (optional, comment out if you want to use full dataset)
df = df.sample(frac=0.2, random_state=42)

# Rename the last column as 'label' if it's not already named that
df.rename(columns={df.columns[-1]: 'label'}, inplace=True)

# Preprocessing: Handle missing values
print("Preprocessing data...")
# Replace NaN values with column means for numerical columns
for col in df.select_dtypes(include=['float64', 'int64']).columns:
    df[col].fillna(df[col].mean(), inplace=True)

# For categorical columns, fill with mode
for col in df.select_dtypes(include=['object']).columns:
    if col != 'label':  # Don't replace label
        df[col].fillna(df[col].mode()[0], inplace=True)

# Encode categorical features if any
for col in df.select_dtypes(include=['object']).columns:
    if col != 'label':  # Don't encode label yet
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])

# Encode the label column
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])
num_classes = len(label_encoder.classes_)
class_names = label_encoder.classes_

# Extract features and target
X = df.drop(columns=['label']).values
y = df['label'].values

# Standardize the features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Reshape input for RNN (samples, timesteps, features)
# Treat each feature as a time step for simplicity
X_reshaped = X.reshape(X.shape[0], 1, X.shape[1])

# Initialize to store results
results = []
timing_results = []

# Create K-fold cross-validation
kf = KFold(n_splits=K_FOLDS, shuffle=True, random_state=42)

# Function to build the RNN model
def build_rnn_model(input_shape, num_classes):
    model = Sequential([
        LSTM(64, input_shape=input_shape, return_sequences=True),
        Dropout(0.2),
        LSTM(32),
        Dropout(0.2),
        Dense(16, activation='relu'),
        Dense(num_classes, activation='softmax')
    ])
    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )
    return model

# Fold-wise training and evaluation
fold_idx = 1
for train_index, test_index in kf.split(X_reshaped):
    print(f"Training fold {fold_idx}/{K_FOLDS}...")
    
    # Split the data
    X_train, X_test = X_reshaped[train_index], X_reshaped[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Convert labels to one-hot encoding for RNN
    y_train_onehot = to_categorical(y_train, num_classes=num_classes)
    
    # Build and train the model
    rnn_model = build_rnn_model(input_shape=(X_train.shape[1], X_train.shape[2]), num_classes=num_classes)
    
    # Record start time
    start_train_time = time.time()
    
    # Train the model
    history = rnn_model.fit(
        X_train, y_train_onehot,
        epochs=10,
        batch_size=32,
        verbose=1,
        validation_split=0.2
    )
    
    train_time = time.time() - start_train_time
    
    # Plot training history
    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(history.history['accuracy'])
    plt.plot(history.history['val_accuracy'])
    plt.title(f'RNN Accuracy - Fold {fold_idx}')
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper left')
    
    plt.subplot(1, 2, 2)
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title(f'RNN Loss - Fold {fold_idx}')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper left')
    plt.tight_layout()
    plt.savefig(f"visualizations/training_history_fold_{fold_idx}.png")
    plt.close()
    
    # Make predictions
    start_test_time = time.time()
    y_pred_proba = rnn_model.predict(X_test)
    test_time = time.time() - start_test_time
    
    # Convert probabilities to classes
    y_pred = np.argmax(y_pred_proba, axis=1)
    
    # Record timing results
    timing_results.append({
        'Classifier': 'RNN',
        'Fold': fold_idx,
        'Training Time (s)': train_time,
        'Testing Time (s)': test_time,
        'Total Time (s)': train_time + test_time
    })
    
    # Compute metrics
    unique_classes = np.unique(y)
    cm = confusion_matrix(y_test, y_pred, labels=unique_classes)
    cm_metrics = confusion_matrix_metrics(cm, unique_classes)
    
    # Plot and save confusion matrix
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                xticklabels=[class_names[i] if i < len(class_names) else i for i in unique_classes],
                yticklabels=[class_names[i] if i < len(class_names) else i for i in unique_classes])
    plt.title(f"RNN - Fold {fold_idx} Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.savefig(f"confusion_matrices/fold_{fold_idx}.png")
    plt.close()
    
    # Calculate metrics per class
    class_metrics_list = []
    for class_label in unique_classes:
        # Create binary labels for this class
        y_test_bin = (y_test == class_label).astype(int)
        y_pred_bin = (y_pred == class_label).astype(int)
        
        # Calculate metrics
        class_specific_metrics = {
            'Classifier': 'RNN',
            'Fold': fold_idx,
            'Class': class_names[class_label] if class_label < len(class_names) else class_label,
            'Accuracy': accuracy_score(y_test_bin, y_pred_bin),
            'Precision': precision_score(y_test_bin, y_pred_bin, zero_division=0),
            'Recall': recall_score(y_test_bin, y_pred_bin),
            'F1 Score': f1_score(y_test_bin, y_pred_bin),
            'Matthews Correlation Coefficient': matthews_corrcoef(y_test_bin, y_pred_bin),
            'Cohen Kappa': cohen_kappa_score(y_test_bin, y_pred_bin),
            'True Positive Rate (TPR)': cm_metrics[class_label]['TPR'],
            'True Negative Rate (TNR)': cm_metrics[class_label]['TNR'],
            'False Positive Rate (FPR)': cm_metrics[class_label]['FPR'],
            'False Negative Rate (FNR)': cm_metrics[class_label]['FNR'],
            'Training Time (s)': train_time,
            'Testing Time (s)': test_time
        }
        class_metrics_list.append(class_specific_metrics)
    
    # Append results for this fold
    results.extend(class_metrics_list)
    fold_idx += 1

# Create DataFrames for results
timing_df = pd.DataFrame(timing_results)
results_df = pd.DataFrame(results)

# Save results to CSV
timing_df.to_csv("results/time.csv", index=False)
results_df.to_csv("results/metrics.csv", index=False)

# Feature importance isn't directly available for RNNs, but we can try to visualize input-output correlations
X_original = df.drop(columns=['label']).values  # Original features before reshaping
feature_names = df.drop(columns=['label']).columns.tolist()

# Calculate correlation matrix
corr_matrix = pd.DataFrame(X_original, columns=feature_names).corrwith(pd.Series(y)).sort_values(ascending=False)

# Plot feature correlation with target
plt.figure(figsize=(12, 8))
corr_matrix.plot(kind='bar')
plt.title('Feature Correlation with Target')
plt.xlabel('Features')
plt.ylabel('Correlation')
plt.tight_layout()
plt.savefig("visualizations/feature_correlation.png")
plt.close()

# Generate a pair plot for the most important features
if len(feature_names) > 4:
    top_features = corr_matrix.abs().nlargest(4).index.tolist()
else:
    top_features = feature_names

top_features_df = df[top_features + ['label']].copy()
top_features_df['label'] = label_encoder.inverse_transform(top_features_df['label'])

plt.figure(figsize=(12, 10))
sns.pairplot(top_features_df, hue='label')
plt.savefig("visualizations/pairplot.png")
plt.close()



results_df = pd.DataFrame(results)
print("Classification Metrics Across Folds:")
print(results_df.head())


# Calculate and display average metrics across folds
avg_metrics = results_df.groupby(['Classifier', 'Class']).mean().reset_index()
avg_metrics.to_csv("results/avg_metrics.csv", index=False)
print("\nAverage Metrics Across Folds:")
print(avg_metrics[['Classifier', 'Class', 'Accuracy', 'Precision', 'Recall', 'F1 Score']])

print("RNN Classifier implementation completed successfully!")

Loading dataset...
Preprocessing data...
Training fold 1/2...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)
  super().__init__(**kwargs)


Epoch 1/10
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 74ms/step - accuracy: 0.2574 - loss: 1.6057 - val_accuracy: 0.3958 - val_loss: 1.5977
Epoch 2/10
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 0.5526 - loss: 1.5823 - val_accuracy: 0.5000 - val_loss: 1.5804
Epoch 3/10
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.5165 - loss: 1.5643 - val_accuracy: 0.4792 - val_loss: 1.5622
Epoch 4/10
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.5033 - loss: 1.5406 - val_accuracy: 0.4792 - val_loss: 1.5415
Epoch 5/10
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.5150 - loss: 1.5131 - val_accuracy: 0.4792 - val_loss: 1.5164
Epoch 6/10
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.5173 - loss: 1.4786 - val_accuracy: 0.4792 - val_loss: 1.4855
Epoch 7/10
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━

  super().__init__(**kwargs)


[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 76ms/step - accuracy: 0.2907 - loss: 1.6058 - val_accuracy: 0.6042 - val_loss: 1.5953
Epoch 2/10
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.5326 - loss: 1.5925 - val_accuracy: 0.5625 - val_loss: 1.5827
Epoch 3/10
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.5785 - loss: 1.5786 - val_accuracy: 0.5000 - val_loss: 1.5676
Epoch 4/10
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.5355 - loss: 1.5616 - val_accuracy: 0.5000 - val_loss: 1.5492
Epoch 5/10
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.5223 - loss: 1.5405 - val_accuracy: 0.5000 - val_loss: 1.5261
Epoch 6/10
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.5696 - loss: 1.5169 - val_accuracy: 0.5000 - val_loss: 1.4979
Epoch 7/10
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

<Figure size 1200x1000 with 0 Axes>