# Assignment-3: CSCI 6517 Recommender System



## 0. Initializing and Importing Libraries

In [None]:
import sklearn
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import pickle

## Q1. Data Processing


In [None]:
%ls

In [None]:
# read order_products__train.csv from Data folder
order_products_train = pd.read_csv('order_products__train.csv')

In [None]:
order_products_train.head(20)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,1,49302,1,1
1,1,11109,2,1
2,1,10246,3,0
3,1,49683,4,0
4,1,43633,5,1
5,1,13176,6,0
6,1,47209,7,0
7,1,22035,8,1
8,36,39612,1,0
9,36,19660,2,1


### Q1.a Split data into holdout and cold start set

In [None]:
order_products_train.shape

(1384617, 4)

In [None]:
order_groups = order_products_train.groupby('order_id')

In [None]:
len(order_groups)

131209

In [None]:
# Extracting the unique group keys (one key per order_id)
group_keys = order_groups.groups.keys()

# 70/30 split at the order level, so an order never straddles both sets.
holdout_groups, coldstart_groups = train_test_split(list(group_keys),
                                                    test_size=0.3,
                                                    random_state=42)

# Creating holdout and cold start dataframes.
# Rows are selected with a vectorised `isin` instead of testing every group
# key against a Python list (a linear scan per group). The stable sort on
# order_id reproduces the row order obtained by concatenating the groups:
# orders ascending, rows inside an order in their original file order.
order_ids = order_products_train['order_id']
holdout = (order_products_train[order_ids.isin(holdout_groups)]
           .sort_values('order_id', kind='mergesort'))
coldstart = (order_products_train[order_ids.isin(coldstart_groups)]
             .sort_values('order_id', kind='mergesort'))

In [None]:
print('Number of groups in holdout groups: ', len(holdout_groups))
print('Number of groups in cold start groups: ', len(coldstart_groups))

In [None]:
holdout.head(20)

### Q1.a.i

#### Q1.a.ii

In [None]:
# Average number of products per order in the holdout set.
# Group directly on the 'order_id' column instead of going through
# `order_groups.grouper`, a groupby internals attribute that recent pandas
# releases deprecate.
holdout_group_sizes = holdout.groupby('order_id').size()

# Mean number of rows (products) per order
average_holdout_group_size = holdout_group_sizes.mean()
average_holdout_group_size

#### Q1.a.iii

Investigate minimum number of products in each group in the holdout set

In [None]:
minimum_holdout_group_size = holdout_group_sizes.min()
minimum_holdout_group_size

In [None]:
holdout.head(15)

In [None]:
#copy coldstart to cold_start and remove coldstart from memory
cold_start = coldstart.copy()
del coldstart

In [None]:
cold_start.head()

In [None]:
holdout.shape

In [None]:
cold_start.shape

For your **HOLDOUT DATASET** obtained from previous step, for each order, put first 8 products (items) into training set and put the rest into the validation set. If the cart does not have more than 8 items, remove the order from your data pool.

In [None]:
order_counts = holdout.groupby('order_id').size().reset_index(name='count')
less_than_equal_8_count = order_counts[order_counts['count'] <= 8].shape[0]
print('Number of **ORDER_ID** with less than or equal to 8 products: ', less_than_equal_8_count)

Remove the order_id that has less than or equal to 8 products

In [None]:
filtered_holdout = holdout.groupby('order_id').filter(lambda x: len(x) > 8)
filtered_holdout.shape

For each order_id, put first 8 into training set and put the rest into the validation set

In [None]:
filtered_holdout = filtered_holdout.astype(int)

# Number of leading products of every order that go to the training set.
N_TRAIN_ITEMS = 8

# Group the filtered holdout data by 'order_id'
grouped_holdout = filtered_holdout.groupby('order_id')

# 0-based position of every row inside its order (in file order).
position_in_order = grouped_holdout.cumcount()

# The first N_TRAIN_ITEMS rows of each order form holdout_train, the rest form
# holdout_valid. A single boolean selection replaces the per-order pd.concat
# loop, which re-copied the accumulated frame on every iteration.
# Sorting by (order_id, add_to_cart_order) keeps orders ascending and each
# order's rows in cart order, as the per-group sort did.
holdout_train = (filtered_holdout[position_in_order < N_TRAIN_ITEMS]
                 .sort_values(['order_id', 'add_to_cart_order']))
holdout_valid = (filtered_holdout[position_in_order >= N_TRAIN_ITEMS]
                 .sort_values(['order_id', 'add_to_cart_order']))


For each order_id, collect its product_ids into a list so they can be saved and used for training later.

In [None]:
train_data = holdout_train.groupby('order_id')['product_id'].apply(list).reset_index()
validation_data = holdout_valid.groupby('order_id')['product_id'].apply(list).reset_index()

In [None]:
train_data[:5]

In [None]:
validation_data[-5:]

In [None]:
train_data = np.array(train_data['product_id'].to_list(), dtype=object)
validation_data = np.array(validation_data['product_id'].to_list(), dtype=object)

In [None]:
train_data[:5]

#### Q1.b.i

#### Q1.b.ii

#### Q1.b.iii

Save data to file in Data folder for future use

In [None]:
#save train_data and validation_data to file in Data folder as pickle
# with open('Data/train_data.pkl', 'wb') as f:
#     pickle.dump(train_data, f)
#
# with open('Data/validation_data.pkl', 'wb') as f:
#     pickle.dump(validation_data, f)

load train_data and validation_data from file in Data folder as pickle

For Google Colab

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# %cd '/content/drive/MyDrive/RecSys/A3'

Load processed data from file.

In [None]:
# load train_data and validation_data from file in Data folder as pickle
import pickle

with open('train_data.pkl', 'rb') as f:
    train_data = pickle.load(f)

with open('validation_data.pkl', 'rb') as f:
    validation_data = pickle.load(f)

## Q2. Train GPT as a sequential recommender system

#### Initialize the hyperparameters

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# hyperparameters
batch_size = 4 # how many independent sequences will we process in parallel?
block_size = 4 # what is the maximum context length for predictions?
max_iters = 2000
eval_interval = 100
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 32
n_head = 4
n_layer = 4
dropout = 0.2

In [None]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

#### Dataset alignment for Model

In [None]:
train_data[0:5]

[[11913, 18159, 4461, 21616, 23622, 32433, 28842, 42625],
 [45221, 12792, 43719, 15483, 21070, 44980, 277, 39979],
 [45064, 8979, 21955, 14168, 4889, 29487, 14044, 41591],
 [42244, 31663, 38689, 13176, 21137, 19057, 14901, 18027],
 [15860, 29628, 28031, 22108, 4126, 31423, 25476, 34284]]

In [None]:
# Size of the token vocabulary used by the embedding table and output layer.
# It has to cover every product that can appear in the training OR validation
# sequences, plus one extra slot: encoded ids start at 1 and id 0 is used as
# the padding token, so the largest id equals the number of distinct products.
# A set is used because the validation sequences have different lengths.
unique_products = {pid for seq in train_data for pid in seq}
unique_products.update(pid for seq in validation_data for pid in seq)
vocab_size = len(unique_products) + 1
vocab_size

25993

In [None]:
# Encoder / decoder between raw product ids and contiguous token ids.
# Both take a 2-D list-of-rows (rows may differ in length) and return a new
# list-of-rows. Token id 0 is never assigned to a product; decode() passes it
# through unchanged so it can serve as a padding marker.

num_mapping = {}  # product id -> token id
rev_mapping = {}  # token id   -> product id

def encode(arr):
    """Map every product id in `arr` to its token id, assigning new ids on demand.

    The mapping is shared across calls: a product seen in an earlier call keeps
    its id, and a new product receives the next unused id. The next id is
    derived from the size of the mapping, so a second call (e.g. for the
    validation data) continues the numbering instead of restarting at 1 and
    colliding with ids that are already in use.

    Note: the model's vocabulary must have at least len(num_mapping) + 1 entries.

    Args:
        arr: iterable of rows, each an iterable of hashable product ids.

    Returns:
        List of lists with the same layout as `arr`, holding token ids (>= 1).
    """
    encoded_arr = []

    for row in arr:
        encoded_row = []
        for num in row:
            if num not in num_mapping:
                new_id = len(num_mapping) + 1  # ids are 1-based; 0 is reserved
                num_mapping[num] = new_id
                rev_mapping[new_id] = num
            encoded_row.append(num_mapping[num])
        encoded_arr.append(encoded_row)

    return encoded_arr


def decode(encoded_arr):
    """Inverse of encode(): map token ids back to the original product ids.

    Args:
        encoded_arr: iterable of rows of token ids.

    Returns:
        List of lists of product ids; a token id of 0 is returned as 0.

    Raises:
        KeyError: if a non-zero token id was never produced by encode().
    """
    decoded_arr = []

    for row in encoded_arr:
        decoded_row = []
        for num in row:
            if num == 0:
                decoded_row.append(0)
            else:
                decoded_row.append(rev_mapping[num])
        decoded_arr.append(decoded_row)

    return decoded_arr


In [None]:
len(train_data), len(validation_data)

(46676, 46676)

In [None]:
# now encode train_data and validation_data
encoded_train_data = encode(train_data)
encoded_validation_data = encode(validation_data)

In [None]:
# Validation sequences have variable length, so they are brought to one fixed
# length before being turned into a tensor: longer sequences are cut,
# shorter ones are right-padded with the 0 token.
def masked_validation_data(val_data, block_size):
    """Return a copy of `val_data` where every sequence has exactly `block_size` items.

    Args:
        val_data: iterable of lists of token ids.
        block_size: target length of every returned sequence.

    Returns:
        List of lists; sequences longer than `block_size` are truncated, all
        others are padded at the end with 0.
    """
    def _fit(seq):
        missing = block_size - len(seq)
        if missing < 0:
            return seq[:block_size]  # too long: keep the leading items
        return seq + [0] * missing   # pad with the 0 token

    return [_fit(seq) for seq in val_data]




In [None]:
# convert the train_data and validation_data to tensors
train_data_tensor = torch.tensor(encoded_train_data)
validation_data_tensor = torch.tensor(masked_validation_data(encoded_validation_data, 8))

In [None]:
# check the shape of train_data_tensor and validation_data_tensor
train_data_tensor.shape, validation_data_tensor.shape

(torch.Size([46676, 8]), torch.Size([46676, 8]))

In [None]:
train_data_tensor[:2]

tensor([[ 1,  2,  3,  4,  5,  6,  7,  8],
        [ 9, 10, 11, 12, 13, 14, 15, 16]])

#### Batch Production

In [None]:
import random

def get_batch(split):
  """Sample a random batch of (input, target) windows from one data split.

  Args:
    split: 'train' selects train_data_tensor; any other value selects
      validation_data_tensor.

  Returns:
    Tuple (x, y) of long tensors of shape (batch_size, block_size) on `device`.
    y is x shifted one position to the right in the same row, i.e. the
    next-item target for every position of x.

  Raises:
    ValueError: if the rows are not longer than block_size (no room for the
      shifted target) or batch_size exceeds the number of rows.
  """
  data = train_data_tensor if split == 'train' else validation_data_tensor
  n_rows, seq_len = data.shape

  # Number of valid start columns: the target window ends at start + block_size.
  n_offsets = seq_len - block_size
  if n_offsets < 1:
    raise ValueError(
        f"sequence length {seq_len} must exceed block_size {block_size}")

  # Rows are drawn without replacement.
  ix = random.sample(range(n_rows), batch_size)
  # Offsets are drawn independently (with replacement): sampling them without
  # replacement fails once batch_size exceeds the number of offsets and makes
  # every batch a permutation of the same offsets.
  jx = random.choices(range(n_offsets), k=batch_size)

  x = torch.stack([data[i, j:j + block_size] for i, j in zip(ix, jx)]).long()
  y = torch.stack([data[i, j + 1:j + 1 + block_size] for i, j in zip(ix, jx)]).long()

  x, y = x.to(device), y.to(device)

  return x, y


In [None]:
get_batch("train")

(tensor([[  409, 12331,  5869,    41],
         [  808,  5877,   480, 17435],
         [ 1618,  3349,  9102,    53],
         [ 3418,    81,   575,  1420]], device='cuda:0'),
 tensor([[12331,  5869,    41,   513],
         [ 5877,   480, 17435,  7551],
         [ 3349,  9102,    53,   447],
         [   81,   575,  1420,  2078]], device='cuda:0'))

In [None]:
class Head(nn.Module):
    """ one head of causal (masked) self-attention

    Projects the input to key/query/value vectors of width `head_size` and
    lets every position attend to itself and to earlier positions only.
    """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # lower-triangular mask; a buffer so it moves with the module but is not trained
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return the attention output for x of shape (B, T, n_embd) as (B, T, head_size)."""
        B,T,C = x.shape
        k = self.key(x)   # (B,T,hs)
        q = self.query(x) # (B,T,hs)
        # compute attention scores ("affinities"); scaled dot-product attention
        # divides by sqrt of the key/query width (head_size), not the embedding width
        wei = q @ k.transpose(-2,-1) * k.shape[-1]**-0.5 # (B, T, hs) @ (B, hs, T) -> (B, T, T)
        # hide future positions so position t only sees positions <= t
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,hs)
        out = wei @ v # (B, T, T) @ (B, T, hs) -> (B, T, hs)
        return out

class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel

    The outputs of `num_heads` independent heads are concatenated along the
    feature axis and projected back to `n_embd` features.
    """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        # The concatenated heads have num_heads * head_size features. This
        # equals n_embd when n_embd is divisible by num_heads, and still gives
        # a valid projection when it is not.
        self.proj = nn.Linear(num_heads * head_size, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return the projected multi-head attention output, shape (B, T, n_embd)."""
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedFoward(nn.Module):
    """ position-wise MLP: expand to 4x the width, ReLU, project back, dropout """

    def __init__(self, n_embd):
        super().__init__()
        hidden_dim = 4 * n_embd
        # Layers are created in application order and wrapped in one Sequential.
        layers = [
            nn.Linear(n_embd, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to the last dimension of x; the shape is unchanged."""
        transformed = self.net(x)
        return transformed

class Block(nn.Module):
    """ Transformer block: self-attention, then a feed-forward MLP.

    Each sub-layer is applied to a layer-normalised input and added back to
    its input (pre-norm residual connection).
    """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of attention heads
        super().__init__()
        self.sa = MultiHeadAttention(n_head, n_embd // n_head)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Return x after the attention and feed-forward residual updates."""
        attended = self.sa(self.ln1(x))
        x = x + attended
        transformed = self.ffwd(self.ln2(x))
        return x + transformed

# GPT-style decoder-only transformer over product tokens. The class name is
# kept from the bigram starter code so existing references keep working.
class BigramLanguageModel(nn.Module):
    """Token + position embeddings, a stack of transformer blocks and a linear
    head that scores every vocabulary entry as the next item."""

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Score the next token for every position of `idx`.

        Args:
            idx: (B, T) tensor of token ids, T <= block_size.
            targets: optional (B, T) tensor of next-token ids.

        Returns:
            (logits, loss). Without targets, logits has shape (B, T, vocab_size)
            and loss is None. With targets, logits is flattened to
            (B*T, vocab_size) and loss is the mean cross-entropy.
        """
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx.long())  # Convert idx to torch.long
        # Build the positions on the same device as the input, so the model
        # also works when it lives on a different device than the global `device`.
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device))
        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()  # sampling needs no gradients; avoids building autograd graphs
    def generate(self, idx, max_new_tokens):
        """Autoregressively sample `max_new_tokens` tokens after the context `idx`.

        Args:
            idx: (B, T) tensor of token ids forming the current context.
            max_new_tokens: number of tokens to append.

        Returns:
            (B, T + max_new_tokens) tensor: the context followed by the samples.
        """
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

In [None]:
@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss on the 'train' and 'val' splits.

    Puts the model in eval mode, averages the loss of `eval_iters` random
    batches per split, then puts the model back in train mode.

    Returns:
        Dict with keys 'train' and 'val', each a 0-dim tensor holding the mean loss.
    """
    def _mean_loss(split):
        # Average the loss over eval_iters freshly sampled batches of one split.
        batch_losses = torch.zeros(eval_iters)
        for step in range(eval_iters):
            xb, yb = get_batch(split)
            _, batch_loss = model(xb, yb)
            batch_losses[step] = batch_loss.item()
        return batch_losses.mean()

    model.eval()
    out = {split: _mean_loss(split) for split in ['train', 'val']}
    model.train()
    return out

#### Training

In [None]:
model = BigramLanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

train_losses = []
print_interval = 200
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# The loop variable is called `step` so the builtin `iter` is not shadowed
# for the rest of the notebook.
for step in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if step % eval_interval == 0 or step == max_iters - 1:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        # store a plain float rather than a tensor, so the history is easy to plot or save
        train_losses.append(float(losses['train']))

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss and take one optimisation step
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()


1.740169 M parameters
step 0: train loss 10.3236, val loss 10.4750
step 19: train loss 10.2479, val loss 10.3671


#### Save the model

In [None]:
path = 'base_model.pt'

In [None]:
# Save the model
torch.save(m.state_dict(), path)

#### Load the Model

In [None]:
# Load the model
model = BigramLanguageModel()
# map_location remaps the stored tensors to the current device, so a
# checkpoint saved on a GPU can also be restored on a CPU-only machine.
model.load_state_dict(torch.load(path, map_location=device))
m = model.to(device)
m.eval()

BigramLanguageModel(
  (token_embedding_table): Embedding(25993, 32)
  (position_embedding_table): Embedding(4, 32)
  (blocks): Sequential(
    (0): Block(
      (sa): MultiHeadAttention(
        (heads): ModuleList(
          (0-3): 4 x Head(
            (key): Linear(in_features=32, out_features=8, bias=False)
            (query): Linear(in_features=32, out_features=8, bias=False)
            (value): Linear(in_features=32, out_features=8, bias=False)
            (dropout): Dropout(p=0.2, inplace=False)
          )
        )
        (proj): Linear(in_features=32, out_features=32, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
      )
      (ffwd): FeedFoward(
        (net): Sequential(
          (0): Linear(in_features=32, out_features=128, bias=True)
          (1): ReLU()
          (2): Linear(in_features=128, out_features=32, bias=True)
          (3): Dropout(p=0.2, inplace=False)
        )
      )
      (ln1): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
     

##### Evaluation

Evaluation Metrics for Sequential RecSys

In [None]:
import numpy as np

def precision_at_k(true_list, predicted_list, k):
    """Fraction of the top-k predictions that are relevant.

    An item is relevant if it occurs anywhere in `true_list`; the ground truth
    is not truncated to k, only the predictions are. Duplicate predictions
    count once.

    Args:
        true_list: ground-truth items.
        predicted_list: predicted items, best first.
        k: cut-off rank.

    Returns:
        hits / k, or 0 when k is 0.
    """
    if k == 0:
        return 0
    hits = len(set(true_list) & set(predicted_list[:k]))
    return hits / k

def recall_at_k(true_list, predicted_list, k):
    """Fraction of the relevant items that appear in the top-k predictions.

    Numerator and denominator both use the full set of items in `true_list`;
    only the predictions are cut at rank k.

    Args:
        true_list: ground-truth items.
        predicted_list: predicted items, best first.
        k: cut-off rank.

    Returns:
        hits / number of distinct relevant items, or 0 when there are none.
    """
    relevant = set(true_list)
    if not relevant:
        return 0
    hits = len(relevant & set(predicted_list[:k]))
    return hits / len(relevant)

def ndcg_at_k(true_list, predicted_list, k):
    """Normalised discounted cumulative gain of the top-k predictions.

    Gains are binary (1 for a relevant item). The discount keeps the original
    formulation: ranks 1 and 2 are undiscounted, and rank r >= 2 is divided by
    log2(r). The ideal DCG is that of a ranking whose first
    min(k, number of relevant items) positions are all hits. It is not derived
    from the hits the prediction happened to contain, which would give a score
    near 1 to any list with a hit near the top. A relevant item predicted more
    than once is credited only the first time.

    Args:
        true_list: ground-truth items.
        predicted_list: predicted items, best first.
        k: cut-off rank.

    Returns:
        DCG / IDCG in [0, 1]; 0 when no top-k prediction is relevant.
    """
    relevant = set(true_list)
    credited = set()
    gains = []
    for item in predicted_list[:k]:
        is_hit = item in relevant and item not in credited
        gains.append(1 if is_hit else 0)
        credited.add(item)

    if sum(gains) == 0:
        return 0

    def _dcg(values):
        # index 0 is rank 1 (no discount); index i >= 1 is discounted by log2(i + 1)
        total = 0.0
        for index, gain in enumerate(values):
            total += gain if index == 0 else gain / np.log2(index + 1)
        return total

    ideal_gains = [1] * min(k, len(relevant))
    return float(_dcg(gains) / _dcg(ideal_gains))

def evaluate_performance(true_lists, predicted_lists, k_values=(1, 3, 5)):
    """Average precision@k, recall@k and nDCG@k over all lists and all k values.

    Each metric is computed for every (list pair, k) combination and the
    returned figure is the mean over all of them, so it mixes the different
    cut-offs into one number per metric.

    Args:
        true_lists: sequence of ground-truth item lists.
        predicted_lists: sequence of predicted item lists, same length and order.
        k_values: cut-off ranks to evaluate; defaults to (1, 3, 5).

    Returns:
        Tuple (avg_precision, avg_recall, avg_ndcg). All three are 0.0 when
        there is nothing to evaluate.

    Raises:
        AssertionError: if the two sequences differ in length.
    """
    assert len(true_lists) == len(predicted_lists), "Number of lists must be the same"

    precision = []
    recall = []
    ndcg = []

    for true_list, predicted_list in zip(true_lists, predicted_lists):
        for k in k_values:
            precision.append(precision_at_k(true_list, predicted_list, k))
            recall.append(recall_at_k(true_list, predicted_list, k))
            ndcg.append(ndcg_at_k(true_list, predicted_list, k))

    # np.mean of an empty list is NaN (with a RuntimeWarning); report 0 instead,
    # matching the degenerate-case convention of the individual metrics.
    if not precision:
        return 0.0, 0.0, 0.0

    avg_precision = np.mean(precision)
    avg_recall = np.mean(recall)
    avg_ndcg = np.mean(ndcg)

    return avg_precision, avg_recall, avg_ndcg


In [None]:
def print_Performance_2(true_val, pred_val):
  """Evaluate the predictions and print precision, recall and nDCG, one per line."""
  scores = evaluate_performance(true_val, pred_val)
  labels = ("Precision@k:", "Recall@k:", "nDCG@k:")

  for label, score in zip(labels, scores):
    print(label, score)

### Q3.a.i

In [None]:
from tqdm import tqdm

def holdout_predictions_data():
  """Generate next-item predictions for holdout orders and pair them with the truth.

  Walks the holdout orders from the last row backwards. For every order with
  at least 5 validation items, the last `block_size` training items of the
  cart are used as context and the model generates as many items as the order
  has validation items. Stops after MAX_PREDICTIONS orders.

  Returns:
    Tuple (list_true, list_predict): two equally long lists of token-id lists,
    the true validation items (padding removed) and the generated items.
  """
  list_true = []
  list_predict = []
  MAX_PREDICTIONS = 4

  # The bar counts produced predictions, which is what `total` refers to.
  with tqdm(total=MAX_PREDICTIONS) as progress:
      for row_idx in range(len(train_data_tensor) - 1, -1, -1):
          if len(list_true) == MAX_PREDICTIONS:
              break

          train_row = train_data_tensor[row_idx]
          # drop the 0 padding added when the validation rows were aligned
          validation_row = [tok for tok in validation_data_tensor[row_idx].tolist() if tok != 0]

          # presumably because the metrics go up to k = 5 - TODO confirm
          if len(validation_row) < 5:
              continue

          # context: the most recent block_size items of the cart
          cart_seq = train_row[-block_size:]

          # now predict from the model
          context = cart_seq.unsqueeze(0).to(device)

          with torch.no_grad():  # inference only, no autograd graph needed
              predict = m.generate(context, max_new_tokens=len(validation_row))[0].tolist()

          # copy only the prediction from the sequence
          predict = predict[-len(validation_row):]

          # keep tab of true and predicted value for performance evaluation
          list_true.append(validation_row)
          list_predict.append(predict)
          progress.update(1)

  return list_true, list_predict


In [None]:
list_true, list_predict = holdout_predictions_data()

print('Performance on Holdout Dataset: ')
print_Performance_2(list_true, list_predict)

100%|█████████▉| 1999/2000 [03:01<00:00, 10.99it/s]

Performance on Holdout Dataset: 
Precision@k: 0.00012222222222222224
Recall@k: 6.25e-05
nDCG@k: 0.0012539531690476384





### Q3.b.ii Performance with **cold_start** dataset

### Q3.C.i

### Q3.C.ii

