In [None]:
# Import necessary libraries
import pandas as pd  # Data manipulation and analysis
import numpy as np  # Numerical computing
import math  # Mathematical functions

import matplotlib.pyplot as plt  # Plotting and visualization
import seaborn as sns  # Statistical data visualization

import warnings  # Warning management

from tqdm.notebook import tqdm  # Progress bar for notebooks

import scipy.stats as ss  # Statistical functions
from sklearn.decomposition import PCA  # Principal Component Analysis
from sklearn.preprocessing import MinMaxScaler  # Data scaling
from sklearn.model_selection import train_test_split  # Data splitting
from sklearn.metrics import confusion_matrix  # Confusion matrix for evaluation
from sklearn.metrics import mean_squared_error  # Mean squared error for evaluation

import torch  # PyTorch deep learning library
import torch.nn as nn  # Neural network modules
from torch.utils.data import DataLoader, TensorDataset  # Data loading and batching
import torch.nn.functional as F  # Neural network functions

from imblearn.over_sampling import SMOTE  # Oversampling for imbalanced data
from itertools import product  # Iteration tools

# Install a blanket 'ignore' filter so no warning is shown for the rest of the session
warnings.filterwarnings(action='ignore')

# Activate seaborn's default theme for the matplotlib figures drawn below
sns.set()

In [None]:
# Read data/responses_.csv and cast every column to float; astype returns a new
# frame, which is what ends up bound to df
df = pd.read_csv('data/responses_.csv').astype('float')

# Derive the 'FI' column: row-wise median of FI1..FI4, rounded up to a whole number
fi_median = df[['FI1', 'FI2', 'FI3', 'FI4']].median(axis=1).values
df['FI'] = np.ceil(fi_median).astype('float')
del fi_median  # temporary only; keep the notebook namespace unchanged

# Shift every value in the frame (the new 'FI' column included) down by one
df = df.sub(1)

In [None]:
# Histogram of the 'FI' column
plt.hist(df['FI'])

# Axis captions: class values along x, frequency along y
plt.xlabel('FI Classes')
plt.ylabel('Count')

# Put the x ticks on the integers 0..4
plt.xticks(np.arange(5))

# Render the figure
plt.show()

In [None]:
# Target vector: the derived 'FI' column as a NumPy array
y = df['FI'].values

# Feature matrix: every column except the four FI items and the derived 'FI' target
X = df.drop(columns=['FI1', 'FI2', 'FI3', 'FI4', 'FI']).values

In [None]:
# Configure the SMOTE oversampler with a fixed seed and k_neighbors=1
sm = SMOTE(
    k_neighbors=1,
    random_state=42,
)

In [None]:
# Fit SMOTE on (X, y) and keep the resampled feature matrix and label vector.
# NOTE(review): with no sampling_strategy passed above, which classes get
# oversampled depends on imbalanced-learn's default — confirm against its docs.
X_res, y_res = sm.fit_resample(X, y)

In [None]:
# Histogram of the labels returned by the resampling step
plt.hist(y_res)

# Axis captions: class values along x, frequency along y
plt.xlabel('FI Classes')
plt.ylabel('Count')

# Put the x ticks on the integers 0..4
plt.xticks(np.arange(5))

# Render the figure
plt.show()

In [None]:
# Display the distinct label values present in the resampled target vector
set(y_res)

In [None]:
# First cut: 80% of the resampled data for training, 20% held out
X_train, X_test, y_train, y_test = train_test_split(
    X_res,
    y_res,
    random_state=21,
    test_size=0.2,
)

# Second cut: halve the hold-out set into a validation half and a test half
# (X_test / y_test are rebound to the test half)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_test,
    y_test,
    random_state=21,
    test_size=0.5,
)

In [None]:
# Report the shape of each split: feature matrices first, then label vectors.
# A single print with a newline separator emits one line per entry.
print(
    f'Shape of X_train: {X_train.shape}',
    f'Shape of X_valid: {X_valid.shape}',
    f'Shape of X_test: {X_test.shape}',
    f'Shape of y_train: {y_train.shape}',
    f'Shape of y_valid: {y_valid.shape}',
    f'Shape of y_test: {y_test.shape}',
    sep='\n',
)

In [None]:
# Each split is wrapped as a TensorDataset of (features, labels), both cast to
# float32, and then handed to a DataLoader. No batch_size is passed, so the
# DataLoader default applies.

# Training split: reshuffled on every pass
train_data = TensorDataset(
    torch.from_numpy(X_train).float(),
    torch.from_numpy(y_train).float(),
)
train_loader = DataLoader(train_data, shuffle=True)

# Validation split: also shuffled
valid_data = TensorDataset(
    torch.from_numpy(X_valid).float(),
    torch.from_numpy(y_valid).float(),
)
valid_loader = DataLoader(valid_data, shuffle=True)

# Test split: iterated in stored order
test_data = TensorDataset(
    torch.from_numpy(X_test).float(),
    torch.from_numpy(y_test).float(),
)
test_loader = DataLoader(test_data)

In [None]:
# Pick the compute device name, preferring MPS, then CUDA, then CPU.
# torch.has_mps is a deprecated build-time flag; torch.backends.mps.is_available()
# is the supported runtime check. The getattr guard keeps this working on
# PyTorch builds that predate the torch.backends.mps module.
if getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
# NOTE(review): none of the cells visible here move the model or tensors to
# `device`; as far as this view shows, the value is only reported.
print(f"Using device: {device}")

In [None]:
# Display the number of distinct resampled labels plus one.
# NOTE(review): the purpose of the +1 is not evident from the visible cells — confirm or remove.
len(set(y_res))+1

In [None]:
class ClassifierModel(nn.Module):
    """
    A simple feedforward neural network classifier.

    Two hidden layers (each followed by ReLU and dropout) feed a linear output
    layer that returns raw class scores (logits).

    Args:
        drpt1 (float, optional): Dropout probability for the first dropout layer. Defaults to 0.2.
        drpt2 (float, optional): Dropout probability for the second dropout layer. Defaults to 0.4.
        n_features (int, optional): Size of each input vector; also the width of
            the first hidden layer. Defaults to 79.
        n_classes (int, optional): Number of output scores. When None (the
            default) it is taken as ``len(set(y_res))``, i.e. the number of
            distinct labels in the notebook-level ``y_res``.

    Attributes:
        linear1 (nn.Linear): First linear layer (n_features -> n_features).
        linear2 (nn.Linear): Second linear layer (n_features -> 50).
        linear3 (nn.Linear): Third linear layer (50 -> n_classes), the output layer.
        dropout1 (nn.Dropout): First dropout layer, with probability ``drpt1``.
        dropout2 (nn.Dropout): Second dropout layer, with probability ``drpt2``.

    Forward Pass:
        1. Applies the first linear layer to the input and then a ReLU activation.
        2. Applies the first dropout layer to the output of the previous step.
        3. Applies the second linear layer and then a ReLU activation.
        4. Applies the second dropout layer.
        5. Applies the third linear layer (output) with no activation.

    Returns:
        torch.Tensor: Raw class scores, one per class along the last dimension.
    """
    def __init__(self, drpt1=0.2, drpt2=0.4, n_features=79, n_classes=None):
        super().__init__()
        if n_classes is None:
            # Fall back to the notebook-level resampled labels
            n_classes = len(set(y_res))
        self.linear1 = nn.Linear(n_features, n_features)  # First linear layer
        self.linear2 = nn.Linear(n_features, 50)  # Second linear layer
        self.linear3 = nn.Linear(50, n_classes)  # Output layer
        # dropout1 previously received drpt2, so drpt1 had no effect
        self.dropout1 = nn.Dropout(p=drpt1)  # Dropout layer 1
        self.dropout2 = nn.Dropout(p=drpt2)  # Dropout layer 2

    def forward(self, x):
        x = self.dropout1(F.relu(self.linear1(x)))  # Linear layer 1 + ReLU + Dropout
        x = self.dropout2(F.relu(self.linear2(x)))  # Linear layer 2 + ReLU + Dropout
        # No activation on the output: nn.CrossEntropyLoss expects unnormalised
        # logits, and a ReLU here would clamp every negative score to zero.
        return self.linear3(x)

In [None]:
def accuracy(out, labels):
    """Count how many rows of ``out`` have their largest score at the labelled index.

    Args:
      out (torch.Tensor): Model scores; the maximum is taken along dimension 1.
      labels (torch.Tensor): The true labels, compared element-wise with the
        predicted indices.

    Returns:
      int: The number of correct predictions (a count, not a ratio).
    """
    predicted = out.max(dim=1).indices  # Index of the top score in each row
    matches = predicted == labels  # Element-wise hit mask
    return matches.sum().item()  # Number of hits as a plain Python number

In [None]:
# Build the network with its default constructor arguments
model = ClassifierModel()

# Weighted cross-entropy loss; the weight tensor carries five per-class entries
criterion = nn.CrossEntropyLoss(
    weight=torch.tensor([0.1, 0.1, 0.1, 0.5, 0.2]),
)

# Adam over all model parameters with a learning rate of 1e-4
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-4)

# Leave the model as the cell's last expression so its layer summary is displayed
model

In [None]:
# Train the model for n_epochs, running a validation pass after every epoch and
# saving the weights whenever the summed validation loss reaches a new minimum.
n_epochs = 1  # Number of training epochs
print_every = 10  # Kept for compatibility; not consulted anywhere in this cell
valid_loss_min = np.inf  # Best summed validation loss so far (np.Inf was removed in NumPy 2.0)
val_loss = []  # Per-epoch mean validation loss
val_acc = []  # Per-epoch validation accuracy (percent)
train_loss = []  # Per-epoch mean training loss
train_acc = []  # Per-epoch training accuracy (percent)
total_step = len(train_loader)  # Total number of batches in the training loader

# Training loop
for epoch in tqdm(range(1, n_epochs+1)):
    # Enter training mode at the start of every epoch so dropout is active even
    # if an earlier cell left the model in evaluation mode.
    model.train()
    running_loss = 0.0  # Summed batch losses for this epoch
    correct = 0  # Correct predictions for this epoch
    total = 0  # Samples seen this epoch
    print(f'Epoch {epoch}\n')

    # Iterate over batches in the training loader
    for batch_idx, (data_, target_) in enumerate(train_loader):
        outputs = model(data_)  # Forward pass
        loss = criterion(outputs, target_.long())  # Labels are cast to long for the loss
        optimizer.zero_grad()  # Clear gradients
        loss.backward()  # Backpropagate the loss
        optimizer.step()  # Update model weights
        running_loss += loss.item()  # Accumulate running loss
        _, pred = torch.max(outputs, dim=1)  # Get predicted class indices
        correct += torch.sum(pred == target_).item()  # Count correct predictions
        total += target_.size(0)  # Count total samples

    train_acc.append(100 * correct / total)  # Training accuracy in percent
    train_loss.append(running_loss / total_step)  # Mean training loss per batch

    batch_loss = 0  # Summed validation batch losses
    total_t = 0  # Validation samples seen
    correct_t = 0  # Correct validation predictions

    # Validation loop (no gradients needed)
    with torch.no_grad():
        model.eval()  # Set model to evaluation mode
        for data_t, target_t in valid_loader:
            outputs_t = model(data_t)  # Forward pass
            loss_t = criterion(outputs_t, target_t.long())  # Calculate loss
            batch_loss += loss_t.item()  # Accumulate batch loss
            _, pred_t = torch.max(outputs_t, dim=1)  # Get predicted class indices
            correct_t += torch.sum(pred_t == target_t).item()  # Count correct predictions
            total_t += target_t.size(0)  # Count total samples

        val_acc.append(100 * correct_t / total_t)  # Validation accuracy in percent
        val_loss.append(batch_loss / len(valid_loader))  # Mean validation loss per batch
        network_learned = batch_loss < valid_loss_min  # Check if validation loss improved

        # Report this epoch's validation loss and accuracy (the latest history
        # entries), not an average over all epochs so far
        print(f'validation loss: {val_loss[-1]:.4f}, validation acc: {val_acc[-1]:.4f}\n')

        # Save model if validation loss improved
        if network_learned:
            valid_loss_min = batch_loss
            torch.save(model.state_dict(), 'model_classification_tutorial.pt')
            print('Detected network improvement, saving current model')

    model.train()  # Leave the model in training mode, as before

In [None]:
# Axis captions for the loss figure
plt.xlabel('Epochs', fontsize=12)
plt.ylabel('Loss', fontsize=12)

# One curve per history: training first, then validation
plt.plot(train_loss, label='Training')
plt.plot(val_loss, label='Validation')

# Legend in the automatically chosen position, then render
plt.legend(loc='best')
plt.show()

In [None]:
# Axis captions for the accuracy figure
plt.xlabel('Epochs', fontsize=12)
plt.ylabel('Accuracy', fontsize=12)

# One curve per history: training first, then validation
plt.plot(train_acc, label='Training')
plt.plot(val_acc, label='Validation')

# Legend in the automatically chosen position, then render
plt.legend(loc='best')
plt.show()

In [None]:
# Run one standalone validation pass over valid_loader with the model in its
# current state, report the result, and save the weights if the summed loss
# beats the best value recorded so far.
with torch.no_grad():
    model.eval()  # Set the model to evaluation mode
    batch_loss = 0  # Summed validation batch losses
    total_t = 0  # Validation samples seen
    correct_t = 0  # Correct validation predictions

    # Iterate over the validation data loader
    for data_t, target_t in valid_loader:
        outputs_t = model(data_t)  # Forward pass
        loss_t = criterion(outputs_t, target_t.long())  # Calculate the loss
        batch_loss += loss_t.item()  # Accumulate batch loss
        _, pred_t = torch.max(outputs_t, dim=1)  # Get the predicted class indices
        correct_t += torch.sum(pred_t == target_t).item()  # Count correct predictions
        total_t += target_t.size(0)  # Count total samples

    # NOTE(review): these appends extend the same history lists the training
    # cell fills, so every re-run of this cell adds one more entry.
    val_acc.append(100 * correct_t / total_t)  # Validation accuracy in percent
    val_loss.append(batch_loss / len(valid_loader))  # Mean validation loss per batch
    network_learned = batch_loss < valid_loss_min  # Check if validation loss improved

    # Report the loss and accuracy of this pass (the entries just appended),
    # not an average over the whole history
    print(f'validation loss: {val_loss[-1]:.4f}, validation acc: {val_acc[-1]:.4f}\n')

    # Save the best weight
    if network_learned:
        valid_loss_min = batch_loss
        torch.save(model.state_dict(), 'model_classification_tutorial.pt')
        print('Detected network improvement, saving current model')

In [None]:
# Collect the model's predicted class index and the true label for every test sample.
preds_vals = []  # Predicted class indices
label_vals = []  # True labels, cast to integers

# Switch to evaluation mode here rather than relying on an earlier cell having
# done it; otherwise dropout would stay active and randomise the predictions.
model.eval()

with torch.no_grad():  # Disable gradient calculation
    # Iterate over the test data loader
    for data_t, target_t in test_loader:
        outputs_t = model(data_t)  # Forward pass
        _, pred_t = torch.max(outputs_t, dim=1)  # Get predicted class indices

        preds_vals.extend(pred_t.tolist())  # Append this batch's predictions
        label_vals.extend(target_t.long().tolist())  # Append this batch's true labels

In [None]:
# Two-column frame of predictions and true labels, shifted up by one
# (presumably undoing the -1 applied to the data during preprocessing)
df_res = pd.DataFrame({'pred': preds_vals, 'true': label_vals}) + 1

# 1 where the prediction equals the true label, 0 otherwise
df_res['correct'] = (df_res['pred'] == df_res['true']).map({True: 1, False: 0})

# Show the frame as the cell output
df_res

In [None]:
# Share of correct test predictions, as a percentage rounded to two decimals
print(f"Test Accuracy: {round(df_res['correct'].sum() / len(df_res) * 100, 2)}")

In [None]:
# Row-normalised confusion matrix over labels 1..5.
# scikit-learn's signature is confusion_matrix(y_true, y_pred): true labels go
# first so that rows are true classes, columns are predicted classes, and
# normalize='true' divides each row by the number of samples of that true class.
c_mat = confusion_matrix(df_res['true'], df_res['pred'], labels=[1, 2, 3, 4, 5], normalize='true')
vals = ['1', '2', '3', '4', '5']
# Index = true class, columns = predicted class
cmat = pd.DataFrame(c_mat, columns=vals, index=vals)

In [None]:
# Annotated heatmap of cmat using the 'crest' palette; the colour bar is captioned 'Score'
ax = sns.heatmap(
    cmat,
    cmap='crest',
    annot=True,
    cbar_kws={'label': 'Score'},
)

# Enlarge the y-axis caption of the figure's last axes (presumably the colour bar)
ax.figure.axes[-1].yaxis.label.set_size(14)

# Caption the axes: columns along x, rows along y
plt.xlabel('Predicted')
plt.ylabel('True Value')