<a href="https://colab.research.google.com/github/gideonoludeyi/cosc5p70/blob/main/notebooks/model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Data: Predict Students' Dropout and Academic Success

@misc{predict_students'_dropout_and_academic_success_697,
  author       = {Realinho, Valentim, Vieira Martins, Mónica, Machado, Jorge, and Baptista, Luís},
  title        = {{Predict Students' Dropout and Academic Success}},
  year         = {2021},
  howpublished = {UCI Machine Learning Repository},
  note         = {{DOI}: [https://doi.org/10.24432/C5MC89](https://doi.org/10.24432/C5MC89)}
}

In [1]:
!pip install "ucimlrepo" "pandas" "numpy" "matplotlib" "torch" "scikit-learn"



In [2]:
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

In [3]:
from ucimlrepo import fetch_ucirepo

# fetch dataset - https://archive.ics.uci.edu/dataset/697
repo = fetch_ucirepo(id=697)

# data (as pandas dataframes)
X = repo.data.features
y = repo.data.targets['Target']

# metadata
# print(repo.metadata)

# variable information
# print(repo.variables)

In [4]:
# Fixing the random seed to guarantee deterministic results
def set_seed(seed):
    import os
    import random
    import numpy as np
    import torch
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

seed = 123456789
set_seed(seed)
rng = torch.Generator().manual_seed(seed)

In [5]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4424 entries, 0 to 4423
Data columns (total 36 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Marital Status                                  4424 non-null   int64  
 1   Application mode                                4424 non-null   int64  
 2   Application order                               4424 non-null   int64  
 3   Course                                          4424 non-null   int64  
 4   Daytime/evening attendance                      4424 non-null   int64  
 5   Previous qualification                          4424 non-null   int64  
 6   Previous qualification (grade)                  4424 non-null   float64
 7   Nacionality                                     4424 non-null   int64  
 8   Mother's qualification                          4424 non-null   int64  
 9   Father's qualification                   

In [6]:
X.describe()

Unnamed: 0,Marital Status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP
count,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,...,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0
mean,1.178571,18.669078,1.727848,8856.642631,0.890823,4.577758,132.613314,1.873192,19.561935,22.275316,...,0.137658,0.541817,6.232143,8.063291,4.435805,10.230206,0.150316,11.566139,1.228029,0.001969
std,0.605747,17.484682,1.313793,2063.566416,0.311897,10.216592,13.188332,6.914514,15.603186,15.343108,...,0.69088,1.918546,2.195951,3.947951,3.014764,5.210808,0.753774,2.66385,1.382711,2.269935
min,1.0,1.0,0.0,33.0,0.0,1.0,95.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.6,-0.8,-4.06
25%,1.0,1.0,1.0,9085.0,1.0,1.0,125.0,1.0,2.0,3.0,...,0.0,0.0,5.0,6.0,2.0,10.75,0.0,9.4,0.3,-1.7
50%,1.0,17.0,1.0,9238.0,1.0,1.0,133.1,1.0,19.0,19.0,...,0.0,0.0,6.0,8.0,5.0,12.2,0.0,11.1,1.4,0.32
75%,1.0,39.0,2.0,9556.0,1.0,1.0,140.0,1.0,37.0,37.0,...,0.0,0.0,7.0,10.0,6.0,13.333333,0.0,13.9,2.6,1.79
max,6.0,57.0,9.0,9991.0,1.0,43.0,190.0,109.0,44.0,44.0,...,12.0,19.0,23.0,33.0,20.0,18.571429,12.0,16.2,3.7,3.51


In [7]:
# There are three labels: 'Dropout', 'Enrolled', and 'Graduate'
# But there are way more instances of 'Graduate'.
#   could it lead to bias in the model?
y.value_counts()

Unnamed: 0_level_0,count
Target,Unnamed: 1_level_1
Graduate,2209
Dropout,1421
Enrolled,794


In [8]:
# Samples from the data
X.sample(10, random_state=seed)

Unnamed: 0,Marital Status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP
2374,1,39,1,9130,1,1,150.0,1,19,19,...,1,0,5,9,3,12.0,0,9.4,-0.8,-3.12
4189,1,42,1,9254,1,1,154.0,1,38,37,...,0,3,12,20,8,12.0,0,9.4,-0.8,-3.12
831,1,39,1,9238,1,9,133.1,1,34,34,...,0,0,5,10,0,0.0,0,7.6,2.6,0.32
410,1,1,2,9500,1,1,137.0,1,19,19,...,0,0,8,8,7,14.654286,0,10.8,1.4,1.74
3983,1,17,1,9238,1,1,145.0,1,38,38,...,0,0,6,6,6,13.166667,0,12.4,0.5,1.79
3000,1,39,1,9500,1,1,148.0,1,3,19,...,0,0,8,12,7,12.714286,0,12.7,3.7,-1.7
2760,1,39,1,9500,1,4,150.0,1,3,38,...,0,2,8,10,7,13.30625,0,11.1,0.6,2.02
2660,2,39,1,9991,0,1,120.0,1,19,37,...,0,0,5,11,4,11.333333,0,13.9,-0.3,0.79
1661,1,1,2,9085,1,1,157.0,1,3,3,...,0,0,6,6,6,15.666667,0,13.9,-0.3,0.79
839,1,1,1,9147,1,1,125.0,1,38,1,...,0,0,5,7,5,12.4,0,9.4,-0.8,-3.12


In [9]:
# Data splitting - training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, shuffle=True, random_state=seed)

print(f"Train set size: {len(y_train)} ({len(y_train) / len(y):.2f})")
print(f"Test  set size: {len(y_test)}  ({len(y_test) / len(y):.2f})")

Train set size: 3539 (0.80)
Test  set size: 885  (0.20)


In [10]:
class SupervisedDataset(Dataset):
    def __init__(self, X, y, transform=None, target_transform=None):
        self.X = np.asarray(X)
        self.y = np.asarray(y)
        self.transform = transform # function to process the features
        self.target_transform = target_transform # function to process the label

    def __getitem__(self, index):
        features = self.X[index]
        label = self.y[index]
        if self.transform is not None:
            features = self.transform(features)
        if self.target_transform is not None:
            label = self.target_transform(label)
        return features, label

    def __len__(self):
        return len(self.y)

In [11]:
def onehot(label):
    """
    encode class labels as one-hot vectors of size 3

    Dropout  -> [1, 0, 0]
    Enrolled -> [0, 1, 0]
    Graduate -> [0, 0, 1]
    """
    encoding = {
        'Dropout': [1, 0, 0], # 0
        'Enrolled': [0, 1, 0], # 1
        'Graduate': [0, 0, 1], # 2
    }
    return torch.tensor(encoding[label]).type(torch.DoubleTensor)

print(f"'Dropout'  -> {onehot('Dropout')}")
print(f"'Enrolled' -> {onehot('Enrolled')}")
print(f"'Graduate' -> {onehot('Graduate')}")

'Dropout'  -> tensor([1., 0., 0.], dtype=torch.float64)
'Enrolled' -> tensor([0., 1., 0.], dtype=torch.float64)
'Graduate' -> tensor([0., 0., 1.], dtype=torch.float64)


In [12]:
def train_step(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    batch_size = dataloader.batch_size
    # Set the model to training mode - important for batch normalization and dropout layers
    # Unnecessary in this situation but added for best practices
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        loss.backward()
        optimizer.step()

        if batch % (batch_size // 4) == 0:
            loss = loss.item()
            current = batch * dataloader.batch_size + len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")
            optimizer.zero_grad() # reset gradient tensor


def test_step(dataloader, model, loss_fn):
    # Set the model to evaluation mode - important for batch normalization and dropout layers
    # Unnecessary in this situation but added for best practices
    model.eval()
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    # Evaluating the model with torch.no_grad() ensures that no gradients are computed during test mode
    # also serves to reduce unnecessary gradient computations and memory usage for tensors with requires_grad=True
    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y.argmax(1)).type(torch.float).sum().item()

    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n accuracy: {(100*correct):>0.1f}%, avg.loss: {test_loss:>8f} \n")

In [13]:
# Neural network architecture (36 x 16 x 3)

input_dimensions = np.shape(X_train)[1]

print(f"Input dimensions: {input_dimensions}")

model = nn.Sequential(
    # Input Layer is implicit
    nn.Linear(input_dimensions, 16, dtype=float), # Hidden Layer
    nn.ReLU(), # Hidden Layer: activation
    nn.Linear(16, 3, dtype=float), # Output Layer
    nn.Softmax(dim=1), # Output Layer: activation
)

print(model)

Input dimensions: 36
Sequential(
  (0): Linear(in_features=36, out_features=16, bias=True)
  (1): ReLU()
  (2): Linear(in_features=16, out_features=3, bias=True)
  (3): Softmax(dim=1)
)


In [14]:
learning_rate = 0.001
epochs = 10
batch_size = 64

In [15]:
classes_train = np.unique(y_train)
loss_weights = compute_class_weight('balanced', classes=classes_train, y=y_train)
print(f"Classes:      {classes_train}")
print(f"Loss Weights: {loss_weights}")

# Loss function
#  assign weights labels due to imbalance in labels
#  e.g., there are more 'Graduate' instances than 'Enrolled' and 'Dropout'
#        so 'Graduate' label is assigned a lower weight in term of loss
loss_fn = nn.CrossEntropyLoss(weight=torch.from_numpy(loss_weights))

Classes:      ['Dropout' 'Enrolled' 'Graduate']
Loss Weights: [1.03752565 1.85774278 0.66760988]


In [16]:
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate) # Stochastic Gradient Descent

In [17]:
train_data = SupervisedDataset(X_train, y_train, target_transform=onehot)
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True, generator=rng)

test_data = SupervisedDataset(X_test, y_test, target_transform=onehot)
test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=True, generator=rng)

for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_step(train_dataloader, model, loss_fn, optimizer)
    test_step(test_dataloader, model, loss_fn)

Epoch 1
-------------------------------
loss: 1.319063  [   64/ 3539]
loss: 1.317952  [ 1088/ 3539]
loss: 1.192007  [ 2112/ 3539]
loss: 1.245810  [ 3136/ 3539]
Test Error: 
 accuracy: 18.8%, avg.loss: 1.219499 

Epoch 2
-------------------------------
loss: 1.233525  [   64/ 3539]
loss: 1.204900  [ 1088/ 3539]
loss: 1.170047  [ 2112/ 3539]
loss: 1.197125  [ 3136/ 3539]
Test Error: 
 accuracy: 18.8%, avg.loss: 1.218862 

Epoch 3
-------------------------------
loss: 1.226152  [   64/ 3539]
loss: 1.204369  [ 1088/ 3539]
loss: 1.253231  [ 2112/ 3539]
loss: 1.204546  [ 3136/ 3539]
Test Error: 
 accuracy: 18.8%, avg.loss: 1.218833 

Epoch 4
-------------------------------
loss: 1.252877  [   64/ 3539]
loss: 1.177644  [ 1088/ 3539]
loss: 1.189351  [ 2112/ 3539]
loss: 1.220808  [ 3136/ 3539]
Test Error: 
 accuracy: 18.8%, avg.loss: 1.219232 

Epoch 5
-------------------------------
loss: 1.171947  [   64/ 3539]
loss: 1.264275  [ 1088/ 3539]
loss: 1.179014  [ 2112/ 3539]
loss: 1.261137  [ 3136