# Non-personalised baselines

In [1]:
from utils import SavePredictionsCallback
from collections import defaultdict
import torch
import pandas as pd
from pathlib import Path
import numpy as np
import typing as t
from tqdm.auto import tqdm
# Seed handed to every torch.Generator().manual_seed(...) call in this notebook.
SEED = 481424852

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Passed as `predict_k` to the baseline datasets built below.
PREDICT_K=20
# Cut-offs used to label per-k results as "at_<k>" in the DataFrame-based predict().
ks = [5, 10]
# Batch size for most of the DataLoaders below.
BATCH_SIZE = 1024*4

In [25]:
#from utils.analysis import calculate_metrics, save_eval
from utils.t4r_analysis import get_metrics
from utils.analysis import calculate_metrics, save_eval


def _predict_from_loader(loader, seed=SEED):
    """Feed every batch of a prediction loader into the ranking metrics.

    This is the implementation behind both `predict` and `evaluate`. It lives
    under a private name because a later cell of this notebook rebinds the
    global name `predict` to a function with a different signature; calling
    this helper directly keeps `evaluate` working after that cell has run
    (for example on Restart & Run All).

    Args:
        loader: iterable yielding `(predictions, targets)` tensor pairs.
            `predictions` is 2-D (one row of ranked item ids per sample) and
            `targets` is reshaped to one target id per row.
        seed: accepted for interface compatibility; not used by this function.

    Returns:
        Tuple `(metrics, storage_container)`: the metric objects returned by
        `get_metrics()` after being updated with every batch, and the
        `SavePredictionsCallback` instance that was called once per batch.
    """
    metrics = get_metrics()
    storage_container = SavePredictionsCallback()
    for predictions, targets in tqdm(loader):
        targets = targets.view(-1, 1)
        # Score of the item at rank r is 1/r, so earlier positions score higher.
        # The rank tensor is created on the predictions' device so the division
        # also works when the loader yields non-CPU tensors.
        ranks = torch.arange(1, predictions.shape[1] + 1, device=predictions.device)
        scores = torch.ones_like(predictions, dtype=float) / ranks
        # 1 where the predicted id equals the row's target id, else 0.
        labels = (predictions == targets).int()
        for metric in metrics.values():
            metric.update(scores, labels)
        storage_container(pred_item_ids=predictions, pred_item_scores=scores, labels=targets.view(-1))
    return metrics, storage_container


def predict(loader, seed=SEED):
    """Public wrapper around `_predict_from_loader`; same arguments and return value."""
    return _predict_from_loader(loader, seed=seed)


def evaluate(loader, **kwargs):
    """Run the loader-based prediction loop and summarise its metrics.

    Args:
        loader: iterable yielding `(predictions, targets)` tensor pairs.
        **kwargs: forwarded unchanged to the prediction loop (e.g. `seed`).

    Returns:
        Tuple of `calculate_metrics(batch_metrics)` and the storage container
        filled during prediction.
    """
    # Deliberately not looked up through the global name `predict`, which a
    # later cell replaces with an incompatible function.
    batch_metrics, storage_container = _predict_from_loader(loader, **kwargs)
    return calculate_metrics(batch_metrics), storage_container


##### Loading necessary data

In [4]:
# Relative directories holding the parquet files read in the cells below.
ednet_path = Path("ednet/conventional/all_scaled/")
mooc_path = Path("mooc/conventional/all_scaled/")

In [5]:
# Read only the item_id column and drop index levels 1 and 2, keeping level 0
# (presumably user_id - confirm against the parquet schema).
# Used further down as the last-interaction input of the syllabus recommender.
mooc_val = pd.read_parquet(mooc_path / "val.parquet", columns=["item_id"]).droplevel([1,2])

In [6]:
# Test splits serve as ground-truth labels for every baseline below.
# Same loading recipe as the validation frame: item_id column only, index levels 1 and 2 dropped.
mooc_test = pd.read_parquet(mooc_path / "test.parquet", columns=["item_id"]).droplevel([1,2])
ednet_test = pd.read_parquet(ednet_path / "test.parquet", columns=["item_id"]).droplevel([1,2])

### Random recommender

In [7]:
class RandomDataset(torch.utils.data.Dataset):
    """Dataset that pairs each label row with a freshly drawn random list of item ids."""

    def __init__(self, labels:pd.DataFrame, cardinality: int, generator: torch.Generator, predict_k=20):
        """Store the configuration; nothing is sampled until an item is requested.

        labels      - pd.DataFrame, indexed by `user_id` and with column `item_id`
        cardinality - size of the id range; ids are drawn from 0..cardinality-1
        generator   - torch.Generator driving the random permutations
        predict_k   - number of ids kept from each permutation
        """
        self.predict_k = predict_k
        self.generator = generator
        self.cardinality = cardinality
        self.labels = labels

    def __len__(self):
        """One sample per row of `labels`."""
        return len(self.labels)

    def __getitem__(self, index):
        """Return `(random_ids, target)` for the label row at position `index`."""
        # A full permutation is drawn first, so the ids in one sample never repeat.
        shuffled_ids = torch.randperm(self.cardinality, generator=self.generator)
        # The target item id sits in the first column of the label frame.
        target = self.labels.iloc[index, 0]
        return shuffled_ids[:self.predict_k], target

##### Ednet

In [8]:
# The "max" row of the feature stats gives the largest item_id seen in EdNet.
# NOTE(review): RandomDataset samples via torch.randperm(cardinality), i.e. ids 0..cardinality-1,
# so the largest id itself is never drawn while 0 is - confirm this is intended.
ednet_stats = pd.read_parquet(ednet_path / "feature_stats.parquet")
EDNET_CARDINALITY = ednet_stats.loc["max", "item_id"].astype(int)

In [9]:
# One seeded generator is shared by the dataset (random permutations) and the DataLoader.
# shuffle=False keeps the samples in the row order of ednet_test.
random_ednet_gen = torch.Generator().manual_seed(SEED)
random_ednet_dataset = RandomDataset(ednet_test, EDNET_CARDINALITY, random_ednet_gen, predict_k=PREDICT_K)
random_ednet_loader = torch.utils.data.DataLoader(random_ednet_dataset, batch_size=BATCH_SIZE, shuffle=False, generator=random_ednet_gen)

In [10]:
# Score the random baseline on the EdNet test labels; the bare name displays the metrics.
random_ednet_metrics, random_ednet_storage_container = evaluate(random_ednet_loader)
random_ednet_metrics

100%|████████████████████████████████████████████████████| 5/5 [00:00<00:00,  5.49it/s]


{'map': {5: 0.003066029166802764, 10: 0.0037128934636712074},
 'recall': {5: 0.006210838910192251, 10: 0.011212487705051899},
 'ndcg': {5: 0.0038427680265158415, 10: 0.005439006723463535}}

In [11]:
# Presumably persists the metrics and stored predictions under the given run name
# (save_eval comes from utils.analysis; its behaviour is not visible here).
save_eval(random_ednet_metrics, random_ednet_storage_container, "ednet_baseline_random_eval")

##### Mooc

In [14]:
# Same recipe as for EdNet, but the MOOC stats file is named "cont_feature_stats.parquet".
# NOTE(review): as with EdNet, torch.randperm(cardinality) never draws the largest id - confirm intended.
mooc_stats = pd.read_parquet(mooc_path / "cont_feature_stats.parquet")
MOOC_CARDINALITY = mooc_stats.loc["max", "item_id"].astype(int)

In [16]:
# Fresh generator with the same seed, shared by the dataset and the DataLoader.
# shuffle=False keeps the samples in the row order of mooc_test.
random_mooc_gen = torch.Generator().manual_seed(SEED)
random_mooc_dataset = RandomDataset(mooc_test, MOOC_CARDINALITY, random_mooc_gen, predict_k=PREDICT_K)
random_mooc_loader = torch.utils.data.DataLoader(random_mooc_dataset, batch_size=BATCH_SIZE, shuffle=False, generator=random_mooc_gen)

In [17]:
# Score the random baseline on the MOOC test labels; the bare name displays the metrics.
random_mooc_metrics, random_mooc_storage_container = evaluate(random_mooc_loader)
random_mooc_metrics

100%|██████████████████████████████████████████████████| 29/29 [02:14<00:00,  4.63s/it]


{'map': {5: 1.6715097444830462e-05, 10: 2.645026324898936e-05},
 'recall': {5: 3.428737909416668e-05, 10: 0.00010286214092047885},
 'ndcg': {5: 2.0987812604289502e-05, 10: 4.374848504085094e-05}}

In [18]:
# Presumably persists the metrics and stored predictions under the given run name.
save_eval(random_mooc_metrics, random_mooc_storage_container, "mooc_baseline_random_eval")

In [19]:
def predict(last_interaction: pd.DataFrame, labels: pd.DataFrame, lookup_df:pd.DataFrame, predict_k=PREDICT_K, item_col="item_id",
            user_col="user_col", seed=SEED, most_pop_only=False, batch_size=1024*4):
    """Build a SyllabusDataset loader, score it, and return per-k metric means.

    NOTE: defining this function rebinds the notebook-global name `predict`,
    replacing the loader-based function of the same name defined in an
    earlier cell.

    Args:
        last_interaction: frame handed to SyllabusDataset as the last interactions.
        labels: ground-truth frame handed to SyllabusDataset.
        lookup_df: item lookup frame handed to SyllabusDataset.
        predict_k: forwarded to SyllabusDataset.
        item_col, user_col: accepted but not used by this function.
        seed: seeds the torch.Generator given to the DataLoader.
        most_pop_only: forwarded to SyllabusDataset.
        batch_size: DataLoader batch size.

    Returns:
        Tuple `(metric_results, storage_container)`, where `metric_results`
        maps each metric name to a dict keyed "at_<k>" for every k in the
        global `ks`.
    """
    rng = torch.Generator().manual_seed(seed)
    syllabus_data = SyllabusDataset(last_interaction, labels, lookup_df, predict_k=predict_k, most_pop_only=most_pop_only)
    batches = torch.utils.data.DataLoader(syllabus_data, batch_size=batch_size, shuffle=False, generator=rng)

    trackers = get_metrics()
    storage_container = SavePredictionsCallback()
    print("Predicting")
    for predictions, targets in tqdm(batches):
        target_column = targets.view(-1, 1)
        # Score of the item at rank r is 1/r.
        scores = torch.ones_like(predictions, dtype=float) / torch.arange(1, predictions.shape[1]+1)
        # 1 where the predicted id equals the row's target id, else 0.
        hits = (predictions == target_column).int()
        for tracker in trackers.values():
            tracker.update(scores, hits)
        storage_container(pred_item_ids=predictions, pred_item_scores=scores, labels=target_column.view(-1))

    print("Calculating metrics")
    # Concatenate the per-batch results of every metric and average over samples.
    averaged = {}
    for metric_name, tracker in trackers.items():
        averaged[metric_name] = torch.cat(tracker.metric_mean, axis=0).mean(axis=0)

    metric_results = {}
    for metric_name, per_k in averaged.items():
        for k_idx, topk in enumerate(ks):
            metric_results.setdefault(metric_name, {})[f"at_{topk}"] = per_k[k_idx].item()
    return metric_results, storage_container

### Most Pop

In [20]:
def get_most_pop(dataset_name: t.Literal["mooc","ednet"]):
    """Return the item view counts for a dataset, building the cache file on first use.

    Args:
        dataset_name: name of the dataset directory ("mooc" or "ednet").

    Returns:
        pd.DataFrame with columns `item_id` and `view_count`. It is read from
        `<dataset_name>/most_pop.parquet` when that file exists; otherwise it
        is computed from the interactions file and written there.
    """
    root = Path(dataset_name)
    cache_file = root / "most_pop.parquet"
    if cache_file.exists():
        return pd.read_parquet(cache_file)

    # Most popular items up until the test interaction.
    history = pd.read_parquet(root / "conventional/all_scaled/val_full.parquet", columns=["item_id"])
    view_counts = history.value_counts().rename("view_count").reset_index()
    view_counts.to_parquet(cache_file)
    return view_counts

In [21]:
class MostPopDataset(torch.utils.data.Dataset):
    """Dataset that pairs every label row with the same list of most-viewed item ids.

        `labels`      pd.DataFrame - Ground truth, indexed by user_id, with column `item_id`
        `most_pop`    pd.DataFrame - View count, with columns `item_id` and `view_count`
        `predict_k`   int          - Number of most-viewed item ids returned per sample
    """
    def __init__(self, labels: pd.DataFrame, most_pop: pd.DataFrame, predict_k=20):
        'Initialization'
        self.labels = labels
        self.most_pop = most_pop
        self.predict_k = predict_k
        # The top-k list is identical for every sample, so it is computed once
        # here instead of re-running `nlargest` on every __getitem__ call.
        top_rows = most_pop.nlargest(predict_k, "view_count")
        self._top_item_ids = torch.tensor(top_rows["item_id"].values)

    def __len__(self):
        'Denotes the total number of samples'
        return len(self.labels)

    def __getitem__(self, index):
        """Return `(top_k_item_ids, target_item_id)` for the label row at position `index`."""
        # Clone so each sample owns its tensor, as it did when the tensor was
        # rebuilt per call; in-place edits by a caller cannot corrupt the cache.
        X = self._top_item_ids.clone()
        # Read the target straight from the item_id column rather than
        # materialising a whole row Series for every sample.
        y = torch.tensor(self.labels["item_id"].iloc[index])

        return X, y

#### Ednet

In [22]:
# Load (or build and cache) the EdNet view counts, then preview the 20 most-viewed item ids.
ednet_most_pop = get_most_pop("ednet")
ednet_most_pop.nlargest(20, "view_count")["item_id"].values

array([ 80,  27,  37,  84,  44, 316,   6,   2,   7,   5,  69,  20,  97,
       154,  46,   1,   8,  41,  94,  19])

In [23]:
# Unlike the other loaders, this one serves the whole test set as a single batch
# (batch_size=len(ednet_test) rather than BATCH_SIZE).
# shuffle=False keeps the samples in the row order of ednet_test.
most_pop_ednet_gen = torch.Generator().manual_seed(SEED)
most_pop_ednet_dataset = MostPopDataset(ednet_test, ednet_most_pop, predict_k=PREDICT_K)
most_pop_ednet_loader = torch.utils.data.DataLoader(most_pop_ednet_dataset, batch_size=len(ednet_test), shuffle=False, generator=most_pop_ednet_gen)

In [26]:
# Score the most-popular baseline on the EdNet test labels.
# NOTE(review): the generic names `metrics` / `storage_container` break the
# `<baseline>_<dataset>_...` naming used by the other evaluation cells.
metrics, storage_container = evaluate(most_pop_ednet_loader)
metrics

100%|████████████████████████████████████████████████████| 1/1 [00:10<00:00, 10.90s/it]


{'map': {5: 0.029045291244983673, 10: 0.032812077552080154},
 'recall': {5: 0.05501813814043999, 10: 0.0826646164059639},
 'ndcg': {5: 0.03542815148830414, 10: 0.04444465786218643}}

In [27]:
# Presumably persists the EdNet most-popular results under the given run name.
save_eval(metrics, storage_container, "ednet_baseline_most_pop_eval")

#### Mooc

In [28]:
# Load (or build and cache) the MOOC view counts, then preview the 20 most-viewed item ids.
mooc_most_pop = get_most_pop("mooc")
mooc_most_pop.nlargest(20, "view_count")["item_id"].values

array([102, 103, 104, 105, 106, 109, 107, 110, 108,  94, 112, 248, 111,
        95, 113, 249, 561, 161, 123, 160])

In [29]:
# Most-popular baseline for MOOC, batched with the shared BATCH_SIZE.
# shuffle=False keeps the samples in the row order of mooc_test.
most_pop_mooc_gen = torch.Generator().manual_seed(SEED)
most_pop_mooc_dataset = MostPopDataset(mooc_test, mooc_most_pop, predict_k=PREDICT_K)
most_pop_mooc_loader = torch.utils.data.DataLoader(most_pop_mooc_dataset, batch_size=BATCH_SIZE, shuffle=False, generator=most_pop_mooc_gen)

In [30]:
# Score the most-popular baseline on the MOOC test labels; the bare name displays the metrics.
most_pop_mooc_metrics, most_pop_mooc_storage_container = evaluate(most_pop_mooc_loader)
most_pop_mooc_metrics

100%|██████████████████████████████████████████████████| 29/29 [04:22<00:00,  9.06s/it]


{'map': {5: 0.006099153310060501, 10: 0.013897158205509186},
 'recall': {5: 0.017692288383841515, 10: 0.07427503913640976},
 'ndcg': {5: 0.008882351219654083, 10: 0.027426179498434067}}

In [31]:
# Presumably persists the MOOC most-popular results under the given run name.
save_eval(most_pop_mooc_metrics, most_pop_mooc_storage_container, "mooc_baseline_most_pop_eval")

### Syllabus recommender
Get the next $x$ videos in the syllabus, excluding NaN entries, then pad with MostPop items until there are $k$ predictions.


In [32]:
class SyllabusDataset(torch.utils.data.Dataset):
    """Dataset yielding, per user, the precomputed predictions that follow their last item.

        `last_interaction`   pd.DataFrame, with user_id and item_id as columns (in that order)
        `labels`             pd.DataFrame - Ground truth, indexed by user_id
        `item_lookup`        pd.DataFrame - Indexed by item_id, where row -1 is the fallback
                                            for item ids that are not in the syllabus
        `most_pop_only`      bool - When True, every sample uses the -1 fallback row
        `predict_k`          int  - Stored on the instance; not applied by this class
    """
    def __init__(self, last_interaction: pd.DataFrame, labels: pd.DataFrame, item_lookup: pd.DataFrame, most_pop_only=False, predict_k=20):
        """Keep references to the input frames and options; nothing is computed here."""
        self.most_pop_only = most_pop_only
        self.predict_k = predict_k
        self.item_lookup = item_lookup
        self.labels = labels
        self.last_interaction = last_interaction

    def __len__(self):
        """One sample per row of `last_interaction`."""
        return self.last_interaction.shape[0]

    def __getitem__(self, index):
        """Return `(predictions, target)` for the interaction row at position `index`."""
        user_id, item_id = self.last_interaction.iloc[index].values
        # Fall back to row -1 (most popular) when forced, or when the last item
        # has no entry in the lookup table.
        use_fallback = self.most_pop_only or item_id not in self.item_lookup.index
        lookup_key = -1 if use_fallback else item_id
        # The first column of the lookup row holds the list of predicted item ids.
        predicted = torch.tensor(self.item_lookup.loc[lookup_key].iat[0])
        # The first column of the user's label row holds the target item id.
        target = torch.tensor(self.labels.loc[user_id].iat[0])
        return predicted, target

In [33]:
# Lookup table indexed by item_id with a `predictions` column of item-id lists.
# Row -1 is the fallback that SyllabusDataset uses for unknown item ids.
deduped_syllabus_mapping_val = pd.read_parquet("mooc/deduped_syllabus_mapping.parquet")
deduped_syllabus_mapping_val

Unnamed: 0_level_0,predictions
item_id,Unnamed: 1_level_1
-1,"[102, 103, 104, 105, 106, 109, 107, 110, 108, ..."
28231,"[102, 103, 28232, 104, 35700, 35701, 105, 2057..."
28232,"[102, 35700, 35701, 103, 20579, 104, 105, 5274..."
35700,"[35701, 102, 20579, 103, 104, 52747, 105, 5031..."
35701,"[102, 20579, 103, 104, 52747, 105, 50310, 1479..."
...,...
3545,"[122109, 35241, 34913, 34914, 49605, 102, 103,..."
122109,"[35241, 34913, 34914, 49605, 102, 103, 104, 10..."
35241,"[34913, 34914, 49605, 102, 103, 104, 105, 106,..."
34913,"[34914, 49605, 102, 103, 104, 105, 106, 109, 1..."


In [34]:
# Last interactions come from the validation frame (reset_index turns its index into a column);
# ground-truth labels come from the test frame and are looked up per user id.
syllabus_gen = torch.Generator().manual_seed(SEED)
syllabus_dataset = SyllabusDataset(mooc_val.reset_index(), mooc_test, deduped_syllabus_mapping_val, predict_k=PREDICT_K)
syllabus_loader = torch.utils.data.DataLoader(syllabus_dataset, batch_size=BATCH_SIZE, shuffle=False, generator=syllabus_gen)

In [35]:
# Score the syllabus baseline on the MOOC test labels; the bare name displays the metrics.
syllabus_metrics, syllabus_storage_container = evaluate(syllabus_loader)
syllabus_metrics

100%|██████████████████████████████████████████████████| 29/29 [00:23<00:00,  1.26it/s]


{'map': {5: 0.05349645018577576, 10: 0.06180594116449356},
 'recall': {5: 0.07178062945604324, 10: 0.13219499588012695},
 'ndcg': {5: 0.05795250087976456, 10: 0.07773541659116745}}

In [37]:
# Presumably persists the MOOC syllabus results under the given run name.
save_eval(syllabus_metrics, syllabus_storage_container, "mooc_baseline_syllabus_eval")