<a href="https://colab.research.google.com/github/gideonoludeyi/cosc5p70/blob/main/notebooks/model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Data: Predict Students' Dropout and Academic Success

@misc{predict_students'_dropout_and_academic_success_697,
  author       = {Realinho, Valentim, Vieira Martins, Mónica, Machado, Jorge, and Baptista, Luís},
  title        = {{Predict Students' Dropout and Academic Success}},
  year         = {2021},
  howpublished = {UCI Machine Learning Repository},
  note         = {{DOI}: [https://doi.org/10.24432/C5MC89](https://doi.org/10.24432/C5MC89)}
}

In [1]:
!pip install "ucimlrepo" "pandas" "numpy" "matplotlib" "torch" "scikit-learn"

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [2]:
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim
import random
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

In [3]:
from ucimlrepo import fetch_ucirepo

# fetch dataset - https://archive.ics.uci.edu/dataset/697
repo = fetch_ucirepo(id=697)

# data (as pandas dataframes)
X = repo.data.features

#X_norm = nn.functional.normalize(torch.from_numpy(X.values), p=2, dim=1)
mean = X.mean(axis=0)
std = X.std(axis=0)
X = (X - mean) / std
y = repo.data.targets['Target']

#X = pd.DataFrame(X_norm.numpy(), columns=X.columns)

# metadata
# print(repo.metadata)

# variable information
# print(repo.variables)

In [4]:
df = pd.concat([repo.data.features, repo.data.targets['Target']], axis=1)
df.to_csv('data.csv', index=True)

In [5]:
# Fixing the random seed to guarantee deterministic results
def set_seed(seed):
    import os
    import random
    import numpy as np
    import torch
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

seed = 123456789
set_seed(seed)
rng = torch.Generator().manual_seed(seed)

In [6]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4424 entries, 0 to 4423
Data columns (total 36 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Marital Status                                  4424 non-null   float64
 1   Application mode                                4424 non-null   float64
 2   Application order                               4424 non-null   float64
 3   Course                                          4424 non-null   float64
 4   Daytime/evening attendance                      4424 non-null   float64
 5   Previous qualification                          4424 non-null   float64
 6   Previous qualification (grade)                  4424 non-null   float64
 7   Nacionality                                     4424 non-null   float64
 8   Mother's qualification                          4424 non-null   float64
 9   Father's qualification                   

In [7]:
X = X.drop(columns=['Gender', 'Nacionality'])

In [8]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4424 entries, 0 to 4423
Data columns (total 34 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Marital Status                                  4424 non-null   float64
 1   Application mode                                4424 non-null   float64
 2   Application order                               4424 non-null   float64
 3   Course                                          4424 non-null   float64
 4   Daytime/evening attendance                      4424 non-null   float64
 5   Previous qualification                          4424 non-null   float64
 6   Previous qualification (grade)                  4424 non-null   float64
 7   Mother's qualification                          4424 non-null   float64
 8   Father's qualification                          4424 non-null   float64
 9   Mother's occupation                      

In [9]:
X.describe()

Unnamed: 0,Marital Status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Mother's qualification,Father's qualification,Mother's occupation,...,Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP
count,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,...,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0
mean,-8.030546000000001e-17,-1.220643e-16,-1.397315e-16,2.577805e-16,5.781993000000001e-17,-3.2122190000000005e-17,-3.517379e-16,-1.188521e-16,1.2848870000000002e-17,-1.3651930000000001e-17,...,-2.007637e-17,-3.212219e-18,1.702476e-16,-5.460771000000001e-17,-1.156399e-16,4.818328e-17,8.030545999999999e-19,-7.227492e-17,1.349132e-16,3.212219e-18
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-0.2947954,-1.010546,-1.31516,-4.275919,-2.856147,-0.3501909,-2.852015,-1.189625,-1.386637,-0.4148986,...,-0.1992505,-0.2824104,-2.838016,-2.042399,-1.471361,-1.963267,-0.1994184,-1.488875,-1.466705,-1.789464
25%,-0.2947954,-1.010546,-0.5540051,0.1106615,0.3500429,-0.3501909,-0.5772765,-1.125535,-1.256285,-0.2634881,...,-0.1992505,-0.2824104,-0.5610977,-0.5226233,-0.8079587,0.09975311,-0.1994184,-0.813161,-0.6711664,-0.7497873
50%,-0.2947954,-0.09545943,-0.5540051,0.184805,0.3500429,-0.3501909,0.03690279,-0.03601411,-0.2134715,-0.2256355,...,-0.1992505,-0.2824104,-0.1057141,-0.01603139,0.1871441,0.3780209,-0.1994184,-0.174987,0.1243724,0.1401058
75%,-0.2947954,1.162785,0.2071497,0.3389071,0.3500429,-0.3501909,0.5600925,1.117596,0.9596937,-0.07422501,...,-0.1992505,-0.2824104,0.3496696,0.4905605,0.518845,0.5955176,-0.1994184,0.876123,0.9922329,0.7877013
max,7.959476,2.192257,5.535234,0.5497072,0.3500429,3.760769,4.351323,1.566223,1.415925,6.928509,...,17.1699,9.620922,7.635807,6.316367,5.162658,1.600754,15.72047,1.739535,1.787772,1.545432


In [10]:
# There are three labels: 'Dropout', 'Enrolled', and 'Graduate'
# But there are way more instances of 'Graduate'.
#   could it lead to bias in the model?
y.value_counts()

Unnamed: 0_level_0,count
Target,Unnamed: 1_level_1
Graduate,2209
Dropout,1421
Enrolled,794


In [11]:
# Samples from the data
X.sample(10, random_state=seed)

Unnamed: 0,Marital Status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Mother's qualification,Father's qualification,Mother's occupation,...,Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP
2374,-0.294795,1.162785,-0.554005,0.132468,0.350043,-0.350191,1.318339,-0.036014,-0.213472,-0.074225,...,1.248178,-0.28241,-0.561098,0.237265,-0.476258,0.339639,-0.199418,-0.813161,-1.466705,-1.375356
4189,-0.294795,1.334364,-0.554005,0.192559,0.350043,-0.350191,1.621637,1.181686,0.959694,-0.377046,...,-0.199251,1.281274,2.626588,3.02352,1.182247,0.339639,-0.199418,-0.813161,-1.466705,-1.375356
831,-0.294795,1.162785,-0.554005,0.184805,0.350043,0.432849,0.036903,0.925328,0.764166,-0.414899,...,-0.199251,-0.28241,-0.561098,0.490561,-1.471361,-1.963267,-0.199418,-1.488875,0.992233,0.140106
410,-0.294795,-1.010546,0.20715,0.31177,0.350043,-0.350191,0.332619,-0.036014,-0.213472,-0.14993,...,-0.199251,-0.28241,0.805053,-0.016031,0.850546,0.84902,-0.199418,-0.287606,0.124372,0.765674
3983,-0.294795,-0.095459,-0.554005,0.184805,0.350043,-0.350191,0.939216,1.181686,1.02487,-0.074225,...,-0.199251,-0.28241,-0.105714,-0.522623,0.518845,0.563533,-0.199418,0.313028,-0.526523,0.787701
3000,-0.294795,1.162785,-0.554005,0.31177,0.350043,-0.350191,1.166689,-1.061446,-0.213472,-0.339193,...,-0.199251,-0.28241,0.805053,0.997152,0.850546,0.476717,-0.199418,0.425647,1.787772,-0.749787
2760,-0.294795,1.162785,-0.554005,0.31177,0.350043,-0.056551,1.318339,-1.061446,1.02487,-0.301341,...,-0.199251,0.760046,0.805053,0.490561,0.850546,0.59032,-0.199418,-0.174987,-0.454201,0.889026
2660,1.356059,1.162785,-0.554005,0.549707,-2.856147,-0.350191,-0.956399,-0.036014,0.959694,-0.301341,...,-0.199251,-0.28241,-0.561098,0.743856,-0.144557,0.2117,-0.199418,0.876123,-1.105097,0.34716
1661,-0.294795,-1.010546,0.20715,0.110662,0.350043,-0.350191,1.849111,-1.061446,-1.256285,-0.225635,...,-0.199251,-0.28241,-0.105714,-0.522623,0.518845,1.043305,-0.199418,0.876123,-1.105097,0.34716
839,-0.294795,-1.010546,-0.554005,0.140707,0.350043,-0.350191,-0.577276,1.181686,-1.386637,-0.301341,...,-0.199251,-0.28241,-0.561098,-0.269327,0.187144,0.416403,-0.199418,-0.813161,-1.466705,-1.375356


In [12]:
# Data splitting - training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, shuffle=True, random_state=seed)

print(f"Train set size: {len(y_train)} ({len(y_train) / len(y):.2f})")
print(f"Test  set size: {len(y_test)}  ({len(y_test) / len(y):.2f})")

Train set size: 3539 (0.80)
Test  set size: 885  (0.20)


In [13]:
def label_to_index(label):
  map = {'Dropout': 0, 'Enrolled': 1, 'Graduate': 2}
  return map[label]

def index_to_label(index):
  map = {0: 'Dropout', 1: 'Enrolled', 2: 'Graduate'}
  return map[index]

In [14]:
y_train = y_train.apply(label_to_index)
y_test = y_test.apply(label_to_index)

In [15]:
class SupervisedDataset(Dataset):
    def __init__(self, X, y, transform=None):
        self.X = np.asarray(X)
        self.y = np.asarray(y)
        self.transform = transform # function to process the features
        #self.target_transform = target_transform # function to process the label

    def __getitem__(self, index):
        features = self.X[index]
        label = self.y[index]
        if self.transform is not None:
            features = self.transform(features)
        #if self.target_transform is not None:
        #    label = self.target_transform(label)
        return features, label

    def __len__(self):
        return len(self.y)

In [16]:
def train_step(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    batch_size = dataloader.batch_size
    # Set the model to training mode - important for batch normalization and dropout layers
    # Unnecessary in this situation but added for best practices
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        # Convert X and y to float tensors to match model requirements
        X, y = X.clone().detach().float(), y.clone().detach().long()

        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch % (batch_size // 4) == 0:
            loss = loss.item()
            current = batch * dataloader.batch_size + len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")


def test_step(dataloader, model, loss_fn):
    # Set the model to evaluation mode - important for batch normalization and dropout layers
    # Unnecessary in this situation but added for best practices
    model.eval()
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    # Evaluating the model with torch.no_grad() ensures that no gradients are computed during test mode
    # also serves to reduce unnecessary gradient computations and memory usage for tensors with requires_grad=True
    with torch.no_grad():
        for X, y in dataloader:
            X, y = torch.tensor(X, dtype=torch.float32), torch.tensor(y, dtype=torch.long)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n accuracy: {(100*correct):>0.1f}%, avg.loss: {test_loss:>8f} \n")

In [17]:
# Neural network architecture (36 x 16 x 3)

input_dimensions = np.shape(X_train)[1]

print(f"Input dimensions: {input_dimensions}")

model = nn.Sequential(
    # Input Layer is implicit
    nn.Linear(input_dimensions, 64, dtype=torch.float32), # Hidden Layer
    nn.ReLU(), # Hidden Layer: activation
    nn.Linear(64, 3, dtype=torch.float32), # Output Layer
    #nn.Softmax(dim=1), # Output Layer: activation
)

print(model)

Input dimensions: 34
Sequential(
  (0): Linear(in_features=34, out_features=64, bias=True)
  (1): ReLU()
  (2): Linear(in_features=64, out_features=3, bias=True)
)


In [18]:
learning_rate = 0.0001
momentum = 0.9
epochs = 100
batch_size = 64

In [19]:
classes_train = np.unique(y_train)
loss_weights = compute_class_weight('balanced', classes=classes_train, y=y_train)
loss_weights = torch.tensor(loss_weights, dtype=torch.float32)
print(f"Classes:      {classes_train}")
print(f"Loss Weights: {loss_weights}")

# Loss function
#  assign weights labels due to imbalance in labels
#  e.g., there are more 'Graduate' instances than 'Enrolled' and 'Dropout'
#        so 'Graduate' label is assigned a lower weight in term of loss
loss_fn = nn.CrossEntropyLoss(weight=loss_weights)

Classes:      [0 1 2]
Loss Weights: tensor([1.0375, 1.8577, 0.6676])


In [20]:
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum) # Stochastic Gradient Descent

In [21]:
train_data = SupervisedDataset(X_train, y_train)
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True, generator=rng)

test_data = SupervisedDataset(X_test, y_test)
test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=True, generator=rng)

for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_step(train_dataloader, model, loss_fn, optimizer)
    test_step(test_dataloader, model, loss_fn)

Epoch 1
-------------------------------
loss: 1.136433  [   64/ 3539]
loss: 1.089554  [ 1088/ 3539]
loss: 1.100806  [ 2112/ 3539]
loss: 1.099540  [ 3136/ 3539]


  X, y = torch.tensor(X, dtype=torch.float32), torch.tensor(y, dtype=torch.long)


Test Error: 
 accuracy: 36.2%, avg.loss: 1.093598 

Epoch 2
-------------------------------
loss: 1.084967  [   64/ 3539]
loss: 1.092157  [ 1088/ 3539]
loss: 1.067300  [ 2112/ 3539]
loss: 1.081509  [ 3136/ 3539]
Test Error: 
 accuracy: 39.4%, avg.loss: 1.085404 

Epoch 3
-------------------------------
loss: 1.110769  [   64/ 3539]
loss: 1.063002  [ 1088/ 3539]
loss: 1.068312  [ 2112/ 3539]
loss: 1.092758  [ 3136/ 3539]
Test Error: 
 accuracy: 42.8%, avg.loss: 1.077442 

Epoch 4
-------------------------------
loss: 1.050394  [   64/ 3539]
loss: 1.047885  [ 1088/ 3539]
loss: 1.061451  [ 2112/ 3539]
loss: 1.066537  [ 3136/ 3539]
Test Error: 
 accuracy: 44.9%, avg.loss: 1.069353 

Epoch 5
-------------------------------
loss: 1.040785  [   64/ 3539]
loss: 1.062785  [ 1088/ 3539]
loss: 1.036796  [ 2112/ 3539]
loss: 1.081972  [ 3136/ 3539]
Test Error: 
 accuracy: 47.6%, avg.loss: 1.061666 

Epoch 6
-------------------------------
loss: 1.121390  [   64/ 3539]
loss: 1.058277  [ 1088/ 3539]


In [22]:
def predict(model, X):
    y_pred = []
    with torch.no_grad():
        X = torch.tensor(X, dtype=torch.float32)
        pred = model(X)
        y_pred = pred.argmax(1)
    return np.asarray(y_pred)

def get_result(model, features_df, y_true):
    X = features_df.values
    y_pred = predict(model, X)
    result_df = features_df.copy()
    result_df['y_true'] = y_true
    result_df['y_true'] = result_df['y_true'].apply(index_to_label)
    result_df['y_pred'] = y_pred
    result_df['y_pred'] = result_df['y_pred'].apply(index_to_label)
    return result_df

In [23]:
test_result_df = get_result(model, X_test, y_test.values)
test_result_df

Unnamed: 0,Marital Status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Mother's qualification,Father's qualification,Mother's occupation,...,Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,y_true,y_pred
107,-0.294795,-1.010546,-0.554005,0.110662,0.350043,-0.350191,-1.487172,-1.189625,-1.386637,-0.263488,...,-0.105714,2.263632,-0.144557,0.579525,-0.199418,1.476757,1.136876,-1.789464,Dropout,Enrolled
1319,-0.294795,-0.095459,0.207150,0.482833,0.350043,-0.350191,-0.122329,-1.189625,1.024870,-0.263488,...,-0.105714,-0.269327,0.187144,0.186112,-0.199418,-0.287606,0.124372,0.765674,Graduate,Graduate
3729,-0.294795,-1.010546,-0.554005,0.070924,0.350043,-0.350191,0.036903,1.181686,1.024870,-0.263488,...,2.626588,1.503744,2.509051,0.812030,-0.199418,1.476757,1.136876,-1.789464,Graduate,Graduate
1572,7.959476,1.162785,-0.554005,0.549707,-2.856147,0.726489,-0.198153,1.117596,0.959694,2.991837,...,-0.561098,-0.775919,-1.471361,-1.963267,-0.199418,-1.488875,0.992233,0.140106,Dropout,Dropout
2684,-0.294795,-0.667389,-0.554005,0.070924,0.350043,-0.154431,-0.198153,-1.189625,-0.213472,-0.225635,...,-0.105714,0.490561,-1.471361,-1.963267,-0.199418,1.739535,-0.671166,-0.406165,Dropout,Dropout
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2262,-0.294795,-1.010546,-0.554005,0.192559,0.350043,-0.350191,3.441427,-1.125535,-1.386637,-0.339193,...,-0.105714,-0.522623,-1.471361,-1.963267,-0.199418,1.476757,1.136876,-1.789464,Dropout,Dropout
2689,-0.294795,-0.095459,-0.554005,0.444065,0.350043,-0.350191,0.939216,-1.189625,1.024870,2.991837,...,-0.105714,-2.042399,-1.471361,-1.963267,-0.199418,-0.287606,0.124372,0.765674,Dropout,Dropout
2213,-0.294795,-1.010546,-0.554005,0.184805,0.350043,-0.350191,0.560093,-0.036014,1.024870,-0.074225,...,-0.105714,0.490561,-0.476258,0.531548,-0.199418,1.739535,-0.671166,-0.406165,Dropout,Graduate
2084,-0.294795,-1.010546,-0.554005,0.444065,0.350043,-0.350191,-0.425627,-0.036014,0.959694,-0.074225,...,-0.105714,-0.522623,0.518845,0.563533,-0.199418,-0.174987,-0.454201,0.889026,Graduate,Graduate


In [24]:
test_result_df.to_csv('test_result.csv', index=True)