## Setup

In [1]:
# Run the project's preprocessing step before anything else in the notebook.
# It prints how many users were removed and the sizes of the
# train/validation/test split (see the cell output below).
# NOTE(review): presumably this also writes the split files that later cells
# read from data/lernnavi/qna/ -- confirm in preprocessing.py.
from preprocessing import preprocess_lernnavi_data
preprocess_lernnavi_data()

Removed 4596 users from data.
Performed train/validation/test split:
Training set size: 8963
Validation set size: 2560
Test set size: 1281


In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import math
import time
import random
from functools import partial
from itertools import chain

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# This module-level value is captured as the default `device` argument of
# train_loop / val_loop / train_model when those functions are defined.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
def train_loop(
    model,
    optimizer,
    scheduler,
    criterion,
    train_loader,
    device=device,
    metrics_fn=None,
    verbose=False
):
    """
        Run one training epoch over `train_loader`.

        Args:
            model: module to optimize (switched to train mode here)
            optimizer: optimizer stepped once per batch
            scheduler: optional LR scheduler; when truthy it is stepped once per batch
            criterion: callable `criterion(pred, y)` returning the loss tensor
            train_loader: loader yielding `(X, y)` batches
            device: device the batches are moved to
            metrics_fn: optional callable `metrics_fn(model, pred, y)` returning a
                dict of extra per-batch metrics merged into the batch record
            verbose: when True, print the batch loss every 100 batches

        Returns:
            list of dicts, one per batch, each holding at least the key 'loss'
    """
    # dataset size is only used for the progress display
    n_samples = len(train_loader.dataset)

    model.train()
    history = []

    for step, (X, y) in enumerate(train_loader):
        X = X.to(device)
        y = y.to(device)

        # forward / backward / parameter update
        pred = model(X)
        loss = criterion(pred, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # per-batch record; dict(**a, **b) raises if metrics_fn reuses a key
        record = {'loss': loss.item()}
        if metrics_fn is not None:
            record = dict(**record, **metrics_fn(model, pred, y))
        history.append(record)

        if scheduler:
            scheduler.step()

        # progress report
        if verbose and step % 100 == 0:
            seen = step * len(X)
            print(f"Training [{seen:>5d}/{n_samples:>5d}]\n    batch loss:     {record['loss']:.3e}")

    return history


@torch.no_grad()
def val_loop(
    model,
    criterion,
    val_loader,
    device=device,
    metrics_fn=None,
):
    """
        Evaluate `model` on every batch of `val_loader` with gradients disabled.

        Args:
            model: module to evaluate (switched to eval mode here)
            criterion: callable `criterion(pred, y)` returning the loss tensor
            val_loader: loader yielding `(X, y)` batches
            device: device the batches are moved to
            metrics_fn: optional callable `metrics_fn(model, pred, y)` returning a
                dict of extra per-batch metrics merged into the batch record

        Returns:
            list of dicts, one per batch, each holding at least the key 'loss'.
            The unweighted mean of the per-batch losses is printed as a side effect.
    """
    model.eval()
    history = []
    # running sample count; nothing currently reads it
    n_seen = 0

    for X, y in val_loader:
        X = X.to(device)
        y = y.to(device)
        n_seen += len(y)

        pred = model(X)
        batch_loss = criterion(pred, y)

        # per-batch record; dict(**a, **b) raises if metrics_fn reuses a key
        record = {'loss': batch_loss.item()}
        if metrics_fn is not None:
            record = dict(**record, **metrics_fn(model, pred, y))
        history.append(record)

    # summary: mean over batches (not weighted by batch size)
    mean_loss = pd.DataFrame(history)['loss'].mean()
    print(f"Validation\n    loss:     {mean_loss:.3e}")
    return history


def train_model(
    model,
    data_loader,
    epochs=10,
    optimizer_fn=optim.Adam,
    optimizer_kwargs=None,
    scheduler=None,
    scheduler_kwargs=None,
    criterion=F.cross_entropy,
    device=device,
    metrics_fn=None,
    verbose=False
):
    """
        Train `model` for `epochs` epochs, validating after every epoch.

        Args:
            model: module to train
            data_loader: zero-argument callable returning `(train_loader, val_loader)`
            epochs (int): number of epochs to run
            optimizer_fn: optimizer class/factory called as
                `optimizer_fn(model.parameters(), **optimizer_kwargs)`
            optimizer_kwargs (dict | None): keyword arguments for `optimizer_fn`;
                None means no extra arguments
            scheduler: optional scheduler class/factory called as
                `scheduler(optimizer, **scheduler_kwargs)`; the resulting scheduler
                is stepped once per training batch by `train_loop`
            scheduler_kwargs (dict | None): keyword arguments for `scheduler`;
                None means no extra arguments
            criterion: loss callable `criterion(pred, y)`
            device: device the batches are moved to
            metrics_fn: optional callable producing extra per-batch metrics
            verbose: forwarded to `train_loop` to enable progress printing

        Returns:
            dict with keys "train" and "validation", each a DataFrame holding one
            row per batch, concatenated over all epochs in order.
    """
    # None instead of `{}` in the signature: a dict default is created once and
    # shared by every call, so it must never be used as a default value.
    optimizer_kwargs = {} if optimizer_kwargs is None else optimizer_kwargs
    scheduler_kwargs = {} if scheduler_kwargs is None else scheduler_kwargs

    train_loader, val_loader = data_loader()

    optimizer = optimizer_fn(model.parameters(), **optimizer_kwargs)

    if scheduler:
        scheduler = scheduler(optimizer, **scheduler_kwargs)

    # width needed to print the largest epoch number, computed once
    epoch_width = len(str(epochs))

    train_metrics = []
    val_metrics = []
    for epoch in range(epochs):
        print(f"---------- Epoch {epoch+1:{epoch_width}d} ----------")
        epoch_train_metrics = train_loop(
            model, optimizer, scheduler, criterion, train_loader, device, metrics_fn, verbose=verbose
        )
        epoch_val_metrics = val_loop(model, criterion, val_loader, device, metrics_fn)

        train_metrics.append(epoch_train_metrics)
        val_metrics.append(epoch_val_metrics)

    return {
        "train": pd.DataFrame(chain.from_iterable(train_metrics)),
        "validation": pd.DataFrame(chain.from_iterable(val_metrics))
    }

## Architecture, dataset, and utilities definition

In [3]:
class MLPEncoder(nn.Module):
    """
        MLP encoder: a stack of (linear, ReLU, dropout 0.1) stages followed by a
        final linear projection to the embedding size. Input width is inferred
        on the first forward pass (lazy linear layers).

        Args:
            layers (list): hidden layer sizes, in order
            embedding_dim (int): size of the produced embedding
    """

    def __init__(self, layers, embedding_dim=32):
        super().__init__()

        stages = []
        for hidden_size in layers:
            stages += [nn.LazyLinear(hidden_size), nn.ReLU(), nn.Dropout(0.1)]
        # final projection has no activation / dropout
        stages.append(nn.LazyLinear(embedding_dim))

        self.seq = nn.Sequential(*stages)

    def forward(self, x):
        """Map input `x` to its embedding."""
        return self.seq(x)
    
    
class MLPDecoder(nn.Module):
    """
        MLP decoder: a stack of (linear, ReLU, dropout 0.1) stages, using the
        hidden sizes in reverse order, followed by a final linear layer that
        maps back to the (flattened) input size.

        Args:
            layers (list): hidden layer sizes in encoder order (reversed here)
            input_size (int | tuple | list): size of the original input. A shape
                such as (28, 28) is accepted and flattened to its product; the
                decoder output is always flat, shape (..., prod(input_size)).
    """

    def __init__(self, layers, input_size):
        super().__init__()
        # keep the value exactly as given by the caller
        self.input_size = input_size

        # nn.LazyLinear needs an integer number of output features, so a shape
        # like (28, 28) is reduced to its number of elements.
        if isinstance(input_size, (tuple, list)):
            out_features = int(math.prod(input_size))
        else:
            out_features = input_size

        stages = []
        for hidden_size in layers[::-1]:
            stages += [nn.LazyLinear(hidden_size), nn.ReLU(), nn.Dropout(0.1)]
        stages.append(nn.LazyLinear(out_features))

        self.seq = nn.Sequential(*stages)

    def forward(self, x):
        """Map an embedding `x` back to the flattened input space."""
        return self.seq(x)

def get_encoder_decoder_mlp(layers, embedding_dim=32, input_dim=(28, 28)):
    """
        Build a matching MLP encoder/decoder pair.

        Args:
            layers (list): hidden layer sizes shared by encoder and decoder
            embedding_dim (int): size of the encoder output
            input_dim: forwarded unchanged to MLPDecoder as its `input_size`

        Returns:
            tuple `(encoder, decoder)`
    """
    return (
        MLPEncoder(layers, embedding_dim),
        MLPDecoder(layers, input_dim),
    )

class AE(nn.Module):
    """
        Autoencoder: applies `encoder` followed by `decoder`.

        Args:
            encoder (nn.Module): maps inputs to embeddings
            decoder (nn.Module): maps embeddings back to the input space
    """

    # registry: network type name -> factory returning (encoder, decoder)
    type_to_model = {
        "mlp": get_encoder_decoder_mlp
    }

    @staticmethod
    def create(network_type, **kwargs):
        """
            Build an AE from a registered network type.

            Args:
                network_type (str): key of `AE.type_to_model`
                **kwargs: forwarded to the registered factory

            Raises:
                ValueError: if `network_type` is not registered
        """
        factory = AE.type_to_model.get(network_type)
        if factory:
            return AE(*factory(**kwargs))
        raise ValueError("Unknown type of model")

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, x):
        """Encode then decode `x`, returning the reconstruction."""
        return self.decoder(self.encoder(x))

    def encode(self, x):
        """Return only the embedding of `x`."""
        return self.encoder(x)

In [4]:
class LernnaviDataset(torch.utils.data.Dataset):
    """
        Lernnavi Dataset: question/answer records loaded from `qna.csv`.

        The file `<datapath>/qna.csv` is read into `original_data`. The feature
        table `data` is obtained by dropping the raw question columns
        ("multiple_responses", "question", "choices", "correct",
        "student_answer", "start_time"), indexing by "user_id" and replacing
        missing values with 0.

        The name of every remaining column is embedded with a multilingual
        sentence-transformer model and L2-normalized. Each item is the
        concatenation of these column embeddings, each one scaled by the row's
        value in the corresponding column.

        Args:
            datapath (str): path to the folder containing `qna.csv`
    """

    # sentence-transformer instance, loaded on first use and shared by all datasets
    _nlp_model = None

    @property
    def nlp_model(self):
        """Return the shared sentence-transformer model, loading it on first access."""
        # `is None` rather than truthiness: the cached object is a model, and its
        # truth value is not a reliable "already loaded" signal.
        if LernnaviDataset._nlp_model is None:
            # imported lazily so the dependency is only needed when the model
            # actually has to be loaded
            from sentence_transformers import SentenceTransformer
            LernnaviDataset._nlp_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
        return LernnaviDataset._nlp_model

    def __init__(self, datapath):
        import os

        self.datapath = datapath

        # raw records; kept because later cells read user_id / start_time from them
        self.original_data = pd.read_csv(os.path.join(datapath, "qna.csv"))
        self.data = (
            self.original_data
                .drop(["multiple_responses", "question", "choices", "correct", "student_answer", "start_time"], axis=1)
                .set_index("user_id")
                .fillna(0)
        )

        # embed the remaining column names
        self._create_topic_embeddings()

    def _create_topic_embeddings(self):
        """Embed each column name of `data` and normalize every embedding row to unit L2 norm."""
        raw_embeddings = self.nlp_model.encode(self.data.columns.values)
        row_norms = np.linalg.norm(raw_embeddings, axis=1)[:, np.newaxis]
        self.topic_embeddings = raw_embeddings / row_norms

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """
            Return `(features, None)` for row `idx`.

            `features` is a flat float32 tensor of length
            n_columns * embedding_dim; the second element is a placeholder
            (there is no label).
        """
        row_values = self.data.iloc[idx].values[:, np.newaxis]
        weighted = (row_values * self.topic_embeddings).ravel()
        return torch.tensor(weighted, dtype=torch.float32), None

In [5]:
from torchvision import datasets, transforms
from torch.utils.data import Dataset, DataLoader

class AutoEncoderDataset(Dataset):
    """
        Adapter that turns a dataset of `(x, target)` pairs into a dataset of
        `(x, x)` pairs, so the input doubles as the reconstruction target.
    """

    def __init__(self, dataset):
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, index):
        # the wrapped dataset's own target is discarded
        sample, _ignored_target = self.dataset[index]
        return sample, sample

def data_loader(dataset="lernnavi"):
    _datasets = (None, None)

    match dataset:
        case "mnist":
            _datasets = (
                datasets.MNIST(root="data", train=True, download=True, transform=transforms.ToTensor()),
                datasets.MNIST(root="data", train=False, download=True, transform=transforms.ToTensor()),
            )
        case "lernnavi":
            _datasets = (
                LernnaviDataset("data/lernnavi/qna/train"),
                LernnaviDataset("data/lernnavi/qna/validation"),
            )

    train_loader = DataLoader(
        AutoEncoderDataset(_datasets[0]),
        batch_size=16,
        shuffle=True
    )
    val_loader = DataLoader(
        AutoEncoderDataset(_datasets[1]),
        batch_size=16,
        shuffle=False
    )
    return train_loader, val_loader

## Training

In [6]:
# Load the training split once to inspect the shape of a single sample.
# NOTE: the name `ds` is reused as a loop variable in the analysis section.
ds = LernnaviDataset("data/lernnavi/qna/train")

In [7]:
# Feature-vector size of one sample; this is the `input_dim` the autoencoder must reconstruct.
ds[0][0].shape

torch.Size([12288])

In [10]:
# MLP autoencoder: 12288 -> 4000 -> 1000 -> 512 (embedding) -> 1000 -> 4000 -> 12288
model = AE.create(
    "mlp",
    layers=[4000, 1000],
    embedding_dim=512,
    input_dim=12288
).to(device)

# NOTE(review): `steps_per_epoch` is not used anywhere in the visible notebook,
# and this call builds both datasets just to read a length; train_model() then
# calls data_loader() again. Consider removing it if no scheduler needs it.
steps_per_epoch = len(data_loader()[0])
epochs = 5

# Reconstruction objective: mean squared error between input and output.
metrics = train_model(
    model=model,
    epochs=epochs,
    data_loader=data_loader,
    criterion=F.mse_loss,
    optimizer_fn=optim.Adam,
    optimizer_kwargs={
        "lr": 1e-5
    },
)



---------- Epoch 1 ----------
Validation
    loss:     2.580e+00
---------- Epoch 2 ----------
Validation
    loss:     6.139e-01
---------- Epoch 3 ----------
Validation
    loss:     4.052e+00
---------- Epoch 4 ----------
Validation
    loss:     2.859e+01
---------- Epoch 5 ----------
Validation
    loss:     4.742e+01


In [10]:
# Persist the trained weights together with the number of epochs that were run.
# Only the model state is stored (no optimizer state).
torch.save({
    'epoch': epochs,
    'model_state_dict': model.state_dict(),
}, "model.pt")

## Analyze results
As the validation mean squared error is so low, further investigation is needed.  
We compute the reconstruction error on a random 25% sample of each split, measured as the norm of the difference between the original input and its reconstruction, and compare it with the norm of the original input. This indicates how well the autoencoder is able to reconstruct the original data. For future steps, however, we might want to consider combinations of metrics, such as the MSE for the student's features and the distance between each pair (input topic, reconstructed topic), to better assess the quality of the compression and reconstruction.

In [11]:
model = model.to(device).eval()

losses = []
original_norm = []
difference_norm = []

all_data = [
    LernnaviDataset("data/lernnavi/qna/train"),
    LernnaviDataset("data/lernnavi/qna/validation"),
    LernnaviDataset("data/lernnavi/qna/test")
]

# Inference only: disable autograd so no graph is built for the forward passes.
with torch.no_grad():
    for ds in all_data:
        users = range(len(ds))
        # evaluate a random 25% sample (without replacement) of each split
        for i in np.random.choice(users, size=int(0.25*len(users)), replace=False):
            original = ds[i][0]
            prediction = model(original.to(device)).cpu()
            # store plain floats rather than 0-d tensors
            losses.append(F.mse_loss(original, prediction).item())
            original_norm.append(np.linalg.norm(original.numpy()))
            difference_norm.append(np.linalg.norm((original - prediction).numpy()))

# mean MSE, mean norm of the inputs, mean norm of (input - reconstruction)
np.mean(losses), np.mean(original_norm), np.mean(difference_norm)

(379.99548, 34243.2, 1687.5343)

## Generate embeddings
We can use the trained encoder to generate embeddings for the entire dataset. We can include the validation and test sets as well, since the autoencoder is no longer being trained; the embeddings are only used as input features for the next model.

In [20]:
model = model.to(device).eval()

all_data = [
    LernnaviDataset("data/lernnavi/qna/train"),
    LernnaviDataset("data/lernnavi/qna/validation"),
    LernnaviDataset("data/lernnavi/qna/test")
]

embeddings = []
user_ids = []
times = []

# Inference only: disable autograd so no graph is built for the forward passes.
with torch.no_grad():
    for dataset in all_data:
        # identifiers come from the raw records, in the same row order as the samples
        user_ids.extend(dataset.original_data["user_id"].values)
        times.extend(dataset.original_data["start_time"].values)

        for idx in range(len(dataset)):
            X, _ = dataset[idx]
            embedding = model.encode(X.to(device)).cpu().numpy()
            embeddings.append(embedding)

# one row per record: its embedding plus the (user_id, start_time) key used for merging
embeddings_df = pd.DataFrame({
    "embedding": embeddings,
    "user_id": user_ids,
    "start_time": times
})
embeddings_df["start_time"] = pd.to_datetime(embeddings_df["start_time"])
embeddings_df.to_pickle("data/lernnavi/qna/question_embeddings.pkl")
embeddings_df.head(2)

In [30]:
# NOTE(review): this cell repeats the DataFrame construction and pickle write
# that already end the previous cell; it relies on `embeddings`, `user_ids`
# and `times` still being in the kernel. One of the two copies can be removed.
embeddings_df = pd.DataFrame({
    "embedding": embeddings,
    "user_id": user_ids,
    "start_time": times
})
embeddings_df["start_time"] = pd.to_datetime(embeddings_df["start_time"])
embeddings_df.to_pickle("data/lernnavi/qna/question_embeddings.pkl")
embeddings_df.head(2)

Unnamed: 0,embedding,user_id,start_time
0,"[-0.05834822, 0.23875548, 0.12567756, -0.23862...",387604,2021-10-31 18:36:44.534
1,"[-0.055224773, 0.22952265, 0.12700102, -0.2168...",387604,2021-11-09 07:57:38.255


In [26]:
# Load the question/answer records that the embeddings will be attached to.
# Unpickling can execute arbitrary code: only load pickle files produced by this project.
qna = pd.read_pickle("data/lernnavi/qna/MULTIPLE_CHOICE_german.pkl")
qna.head(2)

Unnamed: 0,multiple_responses,question,choices,correct,student_answer,start_time,user_id
35299,True,Markiere die Sätze mit der korrekten Kommasetz...,[<table><tbody><tr><td><p>Im „hessischen Landb...,"[False, False, False, True]","[False, False, False, False]",2021-05-21 11:16:29.867,393224
35300,True,Markiere die Sätze mit der korrekten Kommasetz...,[<table><tbody><tr><td><p>Im „hessischen Landb...,"[False, False, False, True]","[False, True, False, True]",2021-05-21 11:16:54.135,393232


In [32]:
# Attach each embedding to its record via the (user_id, start_time) key.
# merge() defaults to an inner join, so records without a matching embedding
# (and embeddings without a matching record) are dropped silently.
# NOTE(review): consider comparing len(merged) with len(qna) to detect lost rows.
merged = qna.merge(embeddings_df, on=["user_id", "start_time"])
merged.to_pickle("data/lernnavi/qna/MULTIPLE_CHOICE_german_with_embeddings.pkl")
merged.head(2)

Unnamed: 0,multiple_responses,question,choices,correct,student_answer,start_time,user_id,embedding
0,True,Markiere die Sätze mit der korrekten Kommasetz...,[<table><tbody><tr><td><p>Im „hessischen Landb...,"[False, False, False, True]","[False, False, False, False]",2021-05-21 11:16:29.867,393224,"[1.219209, -15.155132, 23.729265, -5.7118816, ..."
1,True,Markiere die Sätze mit der korrekten Kommasetz...,[<table><tbody><tr><td><p>Im „hessischen Landb...,"[False, False, False, True]","[False, True, False, True]",2021-05-21 11:16:54.135,393232,"[1.2418625, -15.439329, 24.171888, -5.8181686,..."
