# Topic 2: Predict Student Performance from Game Play

**Session 2: Treating session information as sequential data**

The intuition of this notebook is to encode all the rows in a session as sequential data, and then use a Recurrent Neural Networks to predict whether the user for this particular session will answer this question correctly.

If the output is potential, this could tremendously reduce the effort of future engineering, or can become a reliable support for encoding useful features, which can combine with features from statistical analysis to produce a better classifier.

In [23]:
# Import required libraries
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import pyspark.pandas as ps



In [25]:
# Load the dataset
dtypes = {
    'elapsed_time': np.int32,
    'event_name': 'category', 
    'name': 'category',
    'level': np.int32,
    'room_coor_x': np.float32,
    'room_coor_y': np.float32,
    'screen_coor_x': np.float32,
    'screen_coor_y': np.float32,
    'hover_duration': np.float32,
    'text': 'category',
    'fqid': 'category',
    'room_fqid': 'category',
    'text_fqid': 'category',
    'fullscreen': 'category',
    'hq': 'category',
    'music': 'category',
    'level_group': 'category'
}

df = ps.read_csv('data/train.csv', dtype=dtypes)

# Print the first 5 rows
df.head()



23/04/23 18:19:50 WARN MemoryStore: Not enough space to cache rdd_461_21 in memory! (computed 92.0 MiB so far)
23/04/23 18:19:50 WARN BlockManager: Persisting block rdd_461_21 to disk instead.
23/04/23 18:19:50 WARN MemoryStore: Not enough space to cache rdd_461_25 in memory! (computed 103.5 MiB so far)
23/04/23 18:19:50 WARN BlockManager: Persisting block rdd_461_25 to disk instead.
23/04/23 18:19:50 WARN MemoryStore: Not enough space to cache rdd_461_29 in memory! (computed 103.1 MiB so far)
23/04/23 18:19:50 WARN BlockManager: Persisting block rdd_461_29 to disk instead.
23/04/23 18:19:50 WARN MemoryStore: Not enough space to cache rdd_461_28 in memory! (computed 104.6 MiB so far)
23/04/23 18:19:50 WARN BlockManager: Persisting block rdd_461_28 to disk instead.
23/04/23 18:19:51 WARN MemoryStore: Not enough space to cache rdd_461_20 in memory! (computed 102.4 MiB so far)
23/04/23 18:19:51 WARN BlockManager: Persisting block rdd_461_20 to disk instead.
23/04/23 18:19:52 WARN MemorySt



23/04/23 18:19:55 WARN MemoryStore: Not enough space to cache rdd_461_27 in memory! (computed 92.1 MiB so far)
23/04/23 18:19:55 WARN MemoryStore: Not enough space to cache rdd_461_26 in memory! (computed 92.6 MiB so far)
23/04/23 18:19:55 WARN MemoryStore: Not enough space to cache rdd_461_30 in memory! (computed 102.8 MiB so far)
23/04/23 18:19:55 WARN BlockManager: Persisting block rdd_461_30 to disk instead.
23/04/23 18:19:55 WARN MemoryStore: Not enough space to cache rdd_461_20 in memory! (computed 102.4 MiB so far)
23/04/23 18:19:55 WARN MemoryStore: Not enough space to cache rdd_461_28 in memory! (computed 104.6 MiB so far)
23/04/23 18:19:56 WARN MemoryStore: Not enough space to cache rdd_461_29 in memory! (computed 155.1 MiB so far)
23/04/23 18:19:56 WARN MemoryStore: Not enough space to cache rdd_461_24 in memory! (computed 211.5 MiB so far)
23/04/23 18:19:56 WARN MemoryStore: Not enough space to cache rdd_461_21 in memory! (computed 216.0 MiB so far)




23/04/23 18:19:57 WARN MemoryStore: Not enough space to cache rdd_461_33 in memory! (computed 94.0 MiB so far)
23/04/23 18:19:57 WARN BlockManager: Persisting block rdd_461_33 to disk instead.
23/04/23 18:19:58 WARN MemoryStore: Not enough space to cache rdd_461_32 in memory! (computed 139.5 MiB so far)
23/04/23 18:19:58 WARN BlockManager: Persisting block rdd_461_32 to disk instead.
23/04/23 18:19:58 WARN MemoryStore: Not enough space to cache rdd_461_34 in memory! (computed 141.2 MiB so far)
23/04/23 18:19:58 WARN BlockManager: Persisting block rdd_461_34 to disk instead.
23/04/23 18:19:58 WARN MemoryStore: Not enough space to cache rdd_461_30 in memory! (computed 13.0 MiB so far)




23/04/23 18:19:59 WARN MemoryStore: Not enough space to cache rdd_461_34 in memory! (computed 61.1 MiB so far)
23/04/23 18:20:00 WARN MemoryStore: Not enough space to cache rdd_461_33 in memory! (computed 94.0 MiB so far)
23/04/23 18:20:04 WARN MemoryStore: Not enough space to cache rdd_461_32 in memory! (computed 209.3 MiB so far)


                                                                                

Unnamed: 0,session_id,index,elapsed_time,event_name,name,level,page,room_coor_x,room_coor_y,screen_coor_x,screen_coor_y,hover_duration,text,fqid,room_fqid,text_fqid,fullscreen,hq,music,level_group
0,20090312431273200,0,0,cutscene_click,basic,0,,-413.991394,-159.314682,380.0,494.0,,undefined,intro,tunic.historicalsociety.closet,tunic.historicalsociety.closet.intro,0.0,0.0,1.0,0-4
1,20090312431273200,1,1323,person_click,basic,0,,-413.991394,-159.314682,380.0,494.0,,"Whatcha doing over there, Jo?",gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0.0,0.0,1.0,0-4
2,20090312431273200,2,831,person_click,basic,0,,-413.991394,-159.314682,380.0,494.0,,Just talking to Teddy.,gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0.0,0.0,1.0,0-4
3,20090312431273200,3,1147,person_click,basic,0,,-413.991394,-159.314682,380.0,494.0,,I gotta run to my meeting!,gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0.0,0.0,1.0,0-4
4,20090312431273200,4,1863,person_click,basic,0,,-412.991394,-159.314682,381.0,494.0,,"Can I come, Gramps?",gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0.0,0.0,1.0,0-4


In [26]:
df.shape

                                                                                

(26296946, 20)

In specific, taking a `session_id`...

In [None]:
session_1_df = df[df['session_id'] == 20090312431273200]
session_1_df



23/04/23 18:20:34 WARN MemoryStore: Not enough space to cache rdd_476_26 in memory! (computed 92.6 MiB so far)
23/04/23 18:20:34 WARN BlockManager: Persisting block rdd_476_26 to disk instead.
23/04/23 18:20:34 WARN MemoryStore: Not enough space to cache rdd_476_25 in memory! (computed 103.5 MiB so far)
23/04/23 18:20:34 WARN BlockManager: Persisting block rdd_476_25 to disk instead.
23/04/23 18:20:34 WARN MemoryStore: Not enough space to cache rdd_476_28 in memory! (computed 104.6 MiB so far)
23/04/23 18:20:34 WARN BlockManager: Persisting block rdd_476_28 to disk instead.
23/04/23 18:20:34 WARN MemoryStore: Not enough space to cache rdd_476_29 in memory! (computed 103.1 MiB so far)
23/04/23 18:20:34 WARN BlockManager: Persisting block rdd_476_29 to disk instead.
23/04/23 18:20:34 WARN MemoryStore: Not enough space to cache rdd_476_20 in memory! (computed 102.4 MiB so far)
23/04/23 18:20:34 WARN BlockManager: Persisting block rdd_476_20 to disk instead.
23/04/23 18:20:35 WARN MemorySt



...which contains 881 actions recorded. We consider it as a report document of 881 words to process and see whether the prediction made from this document is reliable.

**The strategy for encoding a record into numeric format:**

- Removing the session_id and index column since they are identifiers.
- Set all the null values to 0 since there exists a identification, `event_name`, that shows the reason why these values are zeros.
- Encode all categorical columns (using one-hot encoding).

In [5]:
# Observe the unique values in some categorical columns
for col in df.select_dtypes('category'):
    nunique = df[col].nunique()
    last_20_unique_list = df[col].unique().tolist()[:20]
    print(f'{col :-<50} ({nunique}) {last_20_unique_list}')

Unique counts


                                                                                

session_id---------------------------------------- 23562


                                                                                

index--------------------------------------------- 20348


                                                                                

elapsed_time-------------------------------------- 5042639


                                                                                

event_name---------------------------------------- 11


                                                                                

name---------------------------------------------- 6


                                                                                

level--------------------------------------------- 23


                                                                                

page---------------------------------------------- 8


                                                                                

room_coor_x--------------------------------------- 17441451


                                                                                

room_coor_y--------------------------------------- 13653975


                                                                                

screen_coor_x------------------------------------- 57925


                                                                                

screen_coor_y------------------------------------- 102592


                                                                                

hover_duration------------------------------------ 24102


                                                                                

text---------------------------------------------- 619


                                                                                

fqid---------------------------------------------- 129


                                                                                

room_fqid----------------------------------------- 20


                                                                                

text_fqid----------------------------------------- 126


                                                                                

fullscreen---------------------------------------- 3


                                                                                

hq------------------------------------------------ 3


                                                                                

music--------------------------------------------- 3




level_group--------------------------------------- 4


                                                                                

Since there is several columns with high number of unique values, such as `text`, `room_fqid` and `text_fqid`, and the number of records is large, we will filter out `text`, `room_fqid` and `text_fqid`. However, with higher RAM or if we use other big data processing library, using this column can be still possible.

In [6]:
df.drop(['text', 'room_fqid', 'text_fqid'] , axis=1, inplace=True)

In [7]:
df.set_index(['session_id', 'index'], inplace=True)

AttributeError: 'DataFrame' object has no attribute 'set_index'

In [8]:
# Fill null values with zeros first
df[['page', 'room_coor_x', 'room_coor_y', 'screen_coor_x', 'screen_coor_y', 'hover_duration']] = df[['page', 'room_coor_x', 'room_coor_y', 'screen_coor_x', 'screen_coor_y', 'hover_duration']].fillna(0)

We are using a custom `GetDummies` class for one-hot encoding for 2 reasons:

1. `OneHotEncoder` runs excessively slower than `pd.get_dummies` in encoding large data.
2. `pd.get_dummies` transformations may encounter inconsistent amount of columns in transformed data if 2 datasets contain different number of unique categorical values.

In [9]:
import sklearn


class GetDummies(sklearn.base.TransformerMixin):
    """Fast one-hot-encoder that makes use of pandas.get_dummies() safely
    on train/test splits.
    """
    def __init__(self, dtypes=None):
        self.input_columns = None
        self.final_columns = None
        if dtypes is None:
            dtypes = [object, 'category']
        self.dtypes = dtypes

    def fit(self, X, y=None, **kwargs):
        self.input_columns = list(X.select_dtypes(self.dtypes).columns)
        X = pd.get_dummies(X, columns=self.input_columns)
        self.final_columns = X.columns
        return self
        
    def transform(self, X, y=None, **kwargs):
        X = pd.get_dummies(X, columns=self.input_columns)
        X_columns = X.columns
        # if columns in X had values not in the data set used during
        # fit add them and set to 0
        missing = set(self.final_columns) - set(X_columns)
        for c in missing:
            X[c] = 0
        # remove any new columns that may have resulted from values in
        # X that were not in the data set when fit
        return X[self.final_columns]
    
    def get_feature_names(self):
        return tuple(self.final_columns)

In [10]:
get_dummies = GetDummies()
df = get_dummies.fit_transform(df)
df.shape

(26296946, 162)

In [11]:
grouped_data = df.groupby('session_id').apply(lambda x: np.array(x))
grouped_data

session_id
20090312431273200    [[0.0, 0.0, 0.0, -413.99139404296875, -159.314...
20090312433251036    [[0.0, 0.0, 0.0, -394.99139404296875, 84.68531...
20090312455206810    [[0.0, 0.0, 0.0, -773.4580688476562, -183.0480...
20090313091715820    [[0.0, 0.0, 0.0, -170.0851593017578, -210.1584...
20090313571836404    [[0.0, 0.0, 0.0, -349.99139404296875, -4.31468...
                                           ...                        
22100215342220508    [[0.0, 0.0, 0.0, -153.9914093017578, -238.3146...
22100215460321130    [[0.0, 0.0, 0.0, -425.1403503417969, 107.19595...
22100217104993650    [[0.0, 0.0, 0.0, -423.99139404296875, 43.68531...
22100219442786200    [[0.0, 0.0, 0.0, -411.99139404296875, -182.314...
22100221145014656    [[0.0, 0.0, 0.0, -606.9913940429688, 42.685314...
Length: 23562, dtype: object

In [16]:
from torch.utils.data import Dataset, DataLoader

class MyDataset(Dataset):
    def __init__(self, data):
        self.data = data
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        # Get the numpy array at the given index
        x = self.data[idx]
        
        # Convert the numpy array to a PyTorch tensor
        x = torch.from_numpy(x).float()
        
        return x

In [13]:
def collate_fn_padd(batch):
    """
    Padds batch of variable length

    Note: it converts things ToTensor manually here since the ToTensor transform
    assume it takes in images rather than arbitrary tensors.
    """
    ## Get sequence lengths
    lengths = [t.shape[0] for t in batch]
    try:
        n_features = batch[0].shape[1]
    except:
        n_features = 1
    max_length = max(lengths)
    if max_length == 0:
        max_length += 1
    batch_size = len(lengths)

    padded_tensor = torch.zeros(batch_size, max_length, n_features, dtype=torch.float32)
    for i, val in enumerate(batch):
        l = lengths[i]
        if n_features == 1:
            padded_tensor[i, :l] = val.reshape(-1, 1)
        else:
            padded_tensor[i, :l] = val
    
    return padded_tensor

In [14]:
label_df = pd.read_csv('data/train_labels.csv')
label_df['session'] = label_df.session_id.apply(lambda x: int(x.split('_')[0]) )
label_df['question_idx'] = label_df.session_id.apply(lambda x: int(x.split('_')[-1][1:]) )
label_df.drop("session_id", axis=1, inplace=True)
pivoted_questions = label_df.pivot(columns='question_idx', values='correct', index='session')
pivoted_questions['total_score'] = pivoted_questions.iloc[:, 0:18].sum(axis=1)
pivoted_questions.columns = [f'q_{i}' for i in range(1, 19)] + ['total_score']
pivoted_questions

Unnamed: 0_level_0,q_1,q_2,q_3,q_4,q_5,q_6,q_7,q_8,q_9,q_10,q_11,q_12,q_13,q_14,q_15,q_16,q_17,q_18,total_score
session,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
20090312431273200,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,1,1,16
20090312433251036,0,1,1,1,0,1,1,0,1,0,0,1,0,1,0,1,0,1,10
20090312455206810,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,17
20090313091715820,0,1,1,1,1,0,1,1,1,0,0,1,0,1,0,1,1,1,12
20090313571836404,1,1,1,1,1,1,1,1,1,1,1,0,1,0,1,1,1,1,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22100215342220508,1,1,1,1,1,1,1,0,1,1,1,1,0,1,1,1,1,1,16
22100215460321130,0,1,1,1,0,1,1,0,1,0,1,1,0,1,0,1,1,1,12
22100217104993650,1,1,1,1,1,1,1,1,1,0,1,1,1,1,0,0,1,1,15
22100219442786200,0,1,1,1,1,1,1,0,1,0,1,1,0,1,0,1,1,1,13


In [15]:
# Create an instance of the custom dataset
dataset = MyDataset(grouped_data.values)

# Create a PyTorch DataLoader
dataloader = DataLoader(dataset, batch_size=8, shuffle=True, collate_fn=collate_fn_padd)

In [None]:
# Define the LSTM model
class StackedLSTM(nn.Module):
    def __init__(self, n_embeddings, n_layers, n_hidden, n_features):
        super(StackedLSTM, self).__init__()
        self.embedding = nn.Linear(n_features, n_embeddings)
        self.lstm = nn.LSTM(n_embeddings, n_hidden, n_layers, batch_first=True)
        self.linear = nn.Linear(n_hidden, 18)
        
    def forward(self, x):
        # Batch size
        batch_size = x.shape[0]

        # Pass the input through the embedding layer
        embed_out = self.embedding(x)

        # Pass the input through the LSTM layers
        lstm_out, _ = self.lstm(embed_out)

        # Get only the last output of the LSTM layer
        out = lstm_out[:, -1, :]
        
        # Flatten the LSTM output and pass it through the linear layer
        out = self.linear(out)
        
        # Apply sigmoid activation function to the output
        out = torch.sigmoid(out)
        
        return out

# Create an instance of the model
n_layers = 2  # Number of LSTM layers
n_hidden = 64  # Number of LSTM units
n_embeddings = 32 # Number of dimension in embedding layer
n_features = 162  # Number of features in each sequence

device = 'mps'
model = StackedLSTM(n_embeddings, n_layers, n_hidden, n_features).to(device)

In [None]:
from tqdm import tqdm

# Define number of output labels (number of questions)
n_out = 18

# Define the batch size
batch_size = 8

# Define the number of epochs
n_epochs = 3

# Data size
n_samples = len(grouped_data)

# Define the loss function and optimizer
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters())

# Train the model
for epoch in range(n_epochs):
    for i, sample in tqdm(enumerate(dataloader)):
        # Get the batch
        batch_data = sample.to(device)
        
        # Forward pass
        outputs = model(batch_data)
        
        # Get label
        labels = torch.from_numpy(pivoted_questions.iloc[i*batch_size:(i+1)*batch_size, :18].values).float()
        labels = labels.to(device)
        
        print(labels.size())
        print(outputs.size())
        # Compute the loss
        loss = criterion(outputs, labels)
        
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        # Send back the data to cpu
        batch_data = batch_data.to('cpu')
        labels = labels.to('cpu')
        
    # Print the loss after every epoch
    print(f'Epoch {epoch+1}/{n_epochs}, Loss: {loss.item():.4f}')

0it [00:00, ?it/s]

torch.Size([8, 18])
torch.Size([8, 18])
