In [8]:
import pandas as pd
from scipy.sparse import csr_matrix, save_npz, load_npz
from pathlib import Path
import numpy as np
from tqdm import tqdm
import gc

In [9]:
# Per-dataset directories: the split parquet files are read from here and the
# cached .npz rating matrices are written next to them.
ednet_path, mooc_path = (
    Path("ednet/conventional/all_scaled/"),
    Path("mooc/conventional/all_scaled/"),
)

In [10]:
# Names of the dataset splits; each one is read from "<name>.parquet" and
# cached as "<name>.npz".
df_names = [
    "train",
    "val",
    "test",
    "val_full",
]

In [11]:
def read_ratings(path: Path):
    """Load the (user_id, item_id) interaction pairs stored in a parquet file.

    Only the ``item_id`` column is requested from the file; ``user_id`` is
    obtained by resetting the index, so the stored frame is presumably indexed
    by ``user_id`` (verify against the data).

    Args:
        path: Location of the parquet file.

    Returns:
        A DataFrame with exactly the columns ``["user_id", "item_id"]``.
    """
    wanted_columns = ["user_id", "item_id"]
    raw = pd.read_parquet(path, columns=["item_id"])
    flattened = raw.reset_index()
    return flattened[wanted_columns]

def get_mappers(df):
    """Build id -> contiguous-integer lookup tables for items and users.

    Ids are numbered from 0 in order of first appearance in the respective
    column of ``df``.

    Args:
        df: DataFrame with ``item_id`` and ``user_id`` columns.

    Returns:
        Tuple ``(item_id2int, user_id2int)`` of dictionaries.
    """
    def _index_by_first_appearance(column):
        distinct = df[column].unique()
        return dict(zip(distinct, range(len(distinct))))

    return _index_by_first_appearance("item_id"), _index_by_first_appearance("user_id")
    
def encode_item_ids(df, item_mapper, user_mapper):
    """Return a copy of ``df`` with raw item and user ids replaced by integers.

    The input frame is left untouched. Ids missing from a mapper come out as
    NaN, which is the behaviour of ``Series.map``.

    Args:
        df: DataFrame with ``item_id`` and ``user_id`` columns.
        item_mapper: Mapping from raw item id to integer index.
        user_mapper: Mapping from raw user id to integer index.

    Returns:
        A new DataFrame with both id columns encoded.
    """
    encoded = df.copy()
    for column, mapper in (("item_id", item_mapper), ("user_id", user_mapper)):
        encoded[column] = encoded[column].map(mapper)
    return encoded

def get_csr_matrix(sparse_matrix, df):
    """Build a binary user x item interaction matrix in CSR format.

    Every distinct ``(user_id, item_id)`` pair in ``df`` is stored as ``1.0``.
    The matrix is assembled directly from the coordinate pairs, so memory use
    is proportional to the number of interactions rather than to
    ``n_users * n_items`` (no dense intermediate array is created).

    Args:
        sparse_matrix: SciPy sparse matrix that fixes the output shape. Any
            entries it already holds are kept (cast through ``uint8``) except
            where ``df`` has an interaction, which is set to 1. It is not
            modified.
        df: DataFrame with integer-encoded ``user_id`` (row index) and
            ``item_id`` (column index) columns.

    Returns:
        ``scipy.sparse.csr_matrix`` of dtype float64 with the shape of
        ``sparse_matrix``.

    Raises:
        ValueError: If ``df`` contains missing ``user_id``/``item_id`` values.
        IndexError: If an id lies outside ``[0, n_users)`` / ``[0, n_items)``.
    """
    n_users, n_items = sparse_matrix.shape

    # Repeated interactions must count once: the result is binary.
    pairs = df[["user_id", "item_id"]].drop_duplicates()
    if pairs.isna().to_numpy().any():
        raise ValueError(
            "df contains missing user_id/item_id values; "
            "encode the ids with complete mappers first"
        )
    rows = pairs["user_id"].to_numpy(dtype=np.int64)
    cols = pairs["item_id"].to_numpy(dtype=np.int64)
    if rows.size and (
        rows.min() < 0 or rows.max() >= n_users
        or cols.min() < 0 or cols.max() >= n_items
    ):
        raise IndexError(
            f"user_id/item_id out of bounds for a matrix of shape {(n_users, n_items)}"
        )

    interactions = csr_matrix(
        (np.ones(rows.size, dtype=float), (rows, cols)),
        shape=(n_users, n_items),
    )

    # Common case: the template is empty, so the interactions are the answer.
    if sparse_matrix.nnz == 0:
        return interactions

    # Non-empty template: keep its entries (cast through uint8, as a dense
    # uint8 fill would) but let the interactions overwrite them with 1.
    base = csr_matrix(sparse_matrix).astype(np.uint8).astype(float)
    merged = csr_matrix(base - base.multiply(interactions) + interactions)
    merged.eliminate_zeros()  # drop explicit zeros left by the cast/subtraction
    return merged

### Build or load the EdNet and MOOC rating matrices

In [12]:
# Reuse the cached matrices only when every split of both datasets has an
# .npz file on disk; otherwise the cells below rebuild everything.
LOADED = all(
    (root / f"{split}.npz").exists()
    for root in (ednet_path, mooc_path)
    for split in df_names
)
if LOADED:
    print("CSR Rating matrices exists - Loading them")
    ednet_ratings = {split: load_npz(ednet_path / f"{split}.npz") for split in df_names}
    mooc_ratings = {split: load_npz(mooc_path / f"{split}.npz") for split in df_names}

In [23]:
%%time
if not LOADED:
    ednet_full = read_ratings(ednet_path / "test_full.parquet")
    ednet_item_mapper, ednet_user_mapper = get_mappers(ednet_full)
    ednet_sparse = csr_matrix((ednet_full["user_id"].nunique(), ednet_full["item_id"].nunique())).tocsr()

    del ednet_full

    mooc_full = read_ratings(mooc_path / "test_full.parquet")
    mooc_item_mapper, mooc_user_mapper = get_mappers(mooc_full)
    mooc_sparse = csr_matrix((mooc_full["user_id"].nunique(), mooc_full["item_id"].nunique())).tocsr()
    del mooc_full
    gc.collect()

CPU times: user 545 ms, sys: 94 ms, total: 639 ms
Wall time: 750 ms


In [24]:
if not LOADED:
    # One rating matrix per EdNet split: read the split, encode its ids with
    # the EdNet mappers, then fill the template.
    ednet_ratings = {
        split: get_csr_matrix(
            ednet_sparse,
            encode_item_ids(
                read_ratings(ednet_path / f"{split}.parquet"),
                ednet_item_mapper,
                ednet_user_mapper,
            ),
        )
        for split in tqdm(df_names)
    }

100%|████████████████████████████████████████████████████| 4/4 [00:01<00:00,  3.68it/s]


In [25]:
if not LOADED:
    # Cache every freshly built EdNet matrix as "<split>.npz" in the dataset
    # directory so that a later run can load it instead of rebuilding.
    for name, csr in ednet_ratings.items():
        save_npz(ednet_path / f"{name}.npz", csr)

In [26]:
if not LOADED:
    # One rating matrix per MOOC split: read the split, encode its ids with
    # the MOOC mappers, then fill the template.
    mooc_ratings = {
        split: get_csr_matrix(
            mooc_sparse,
            encode_item_ids(
                read_ratings(mooc_path / f"{split}.parquet"),
                mooc_item_mapper,
                mooc_user_mapper,
            ),
        )
        for split in tqdm(df_names)
    }

100%|████████████████████████████████████████████████████| 4/4 [03:34<00:00, 53.74s/it]


In [27]:
if not LOADED:
    # Cache every freshly built MOOC matrix as "<split>.npz" in the dataset
    # directory so that a later run can load it instead of rebuilding.
    for name, csr in mooc_ratings.items():
        save_npz(mooc_path / f"{name}.npz", csr)

In [28]:
# Show the EdNet matrices per split (shape, dtype and stored-element count) as a sanity check.
ednet_ratings

{'train': <18194x951 sparse matrix of type '<class 'numpy.float64'>'
 	with 241624 stored elements in Compressed Sparse Row format>,
 'val': <18194x951 sparse matrix of type '<class 'numpy.float64'>'
 	with 18194 stored elements in Compressed Sparse Row format>,
 'test': <18194x951 sparse matrix of type '<class 'numpy.float64'>'
 	with 18194 stored elements in Compressed Sparse Row format>,
 'val_full': <18194x951 sparse matrix of type '<class 'numpy.float64'>'
 	with 257990 stored elements in Compressed Sparse Row format>}

In [29]:
# Show the MOOC matrices per split (shape, dtype and stored-element count) as a sanity check.
mooc_ratings

{'train': <116661x158358 sparse matrix of type '<class 'numpy.float64'>'
 	with 1077872 stored elements in Compressed Sparse Row format>,
 'val': <116661x158358 sparse matrix of type '<class 'numpy.float64'>'
 	with 116661 stored elements in Compressed Sparse Row format>,
 'test': <116661x158358 sparse matrix of type '<class 'numpy.float64'>'
 	with 116661 stored elements in Compressed Sparse Row format>,
 'val_full': <116661x158358 sparse matrix of type '<class 'numpy.float64'>'
 	with 1181258 stored elements in Compressed Sparse Row format>}