# Notebook to preprocess the PostRec dataset
https://www.kaggle.com/datasets/vatsalparsaniya/post-pecommendation

In [None]:
import os
import json
import utils
import numpy as np
import pandas as pd
from scipy import sparse as sp

In [None]:
# Root folder that contains all fairness datasets.
# It can be overridden without editing the notebook by setting the environment
# variable FAIRNESS_DATASETS_DIR; if it is not set, the previous default is used.
# Locations that have been used on other machines:
#   D:\data\datasets
#   /media/data/Datasets
#   /media/data/Studium/CP_Institut/FairnessDatasets
data_dir = os.environ.get("FAIRNESS_DATASETS_DIR", r"F:\Studium\CP_Institut\FairnessDatasets")
# all files of this dataset are read from / written to the "postrec" sub-folder
data_dir = os.path.join(data_dir, "postrec")

In [None]:
# Load the raw interaction table (comma separated) and preview its first rows
interactions_file = os.path.join(data_dir, "view_data.csv")
df_interactions = pd.read_csv(interactions_file, sep=",")
df_interactions.head()

In [None]:
# Load the raw user table (comma separated) and preview its first rows
users_file = os.path.join(data_dir, "user_data.csv")
df_users = pd.read_csv(users_file, sep=",")
df_users.head()

In [None]:
# Re-enumerate the users: every original user id is mapped to the index label of its
# row in `df_users` (0, 1, 2, ... for the default index created by `read_csv`).
# Zipping the column with the index yields the same pairs as looping over `iterrows`,
# without materialising every row as a Series.
user_id_map = dict(zip(df_users["user_id"], df_users.index))

# `Series.map` is a plain vectorised dictionary lookup. Every id in this column is a
# key of the map by construction, so no value can end up as NaN, and - unlike
# `replace` - the resulting integer dtype does not depend on pandas' downcasting rules.
df_users["user_id"] = df_users["user_id"].map(user_id_map)
df_users.head()

In [None]:
# Translate the user ids of the interactions with the same map that was used for the
# user table; ids that are not a key of the map are left unchanged by `replace`.
remapped_user_ids = df_interactions["user_id"].replace(user_id_map)
df_interactions = df_interactions.assign(user_id=remapped_user_ids)
df_interactions.head()

In [None]:
# Meaning of the attribute codes, taken from the dataset's README file
attribute_descriptions = dict(
    gender=dict(m="male", f="female"),
)

In [None]:
# Load the item table and give its three columns (in file order) the names that the
# rest of the notebook works with
items_file = os.path.join(data_dir, "post_data.csv")
df_items = pd.read_csv(items_file, sep=",").set_axis(["Title", "Category", "ItemID"], axis=1)
df_items.head()

In [None]:
# From here on the interaction table uses the notebook's own column names
df_interactions.columns = ["UserID", "ItemID", "TimeStamp"]

# distinct items that occur in at least one interaction
item_ids = df_interactions["ItemID"].unique()

# basic size figures of the dataset
n_users, n_items, n_ratings = len(df_users), len(item_ids), len(df_interactions)
# share of filled cells of the (users x items) grid
density = n_ratings / (n_items * n_users)

# Show some statistics about the dataset
print("Number of users:", n_users)
print("Number of items:", n_items)

print("\nCounts of users per gender:")
gender_counts = df_users["gender"].value_counts()
print(gender_counts)

print("\nNumber of interactions:", n_ratings)
print(f"Density: {density:.4f}")

### Data preparation
For our use case, the end result should be a binary interaction matrix, where `1` denotes that a user
interacted with (viewed) an item, and `0` that they did not.

In [None]:
# Some items never occur in an interaction. Re-enumerate the ids that do occur as
# 0 .. n_items-1 (following the sorted order of the original ids) so that they can be
# used directly as column indices of the interaction matrix.
item_rename_dict = {iid: i for i, iid in enumerate(sorted(item_ids))}

# Every id in the interactions is a key of the dict by construction, so `map` is a
# complete lookup here. It is much faster than `replace` with a large dict and always
# returns integer ids instead of relying on pandas' dtype downcasting in `replace`.
df_interactions = df_interactions.assign(ItemID=df_interactions["ItemID"].map(item_rename_dict))

# keep only the items that were interacted with and move them to the new ids as well
df_items = df_items[df_items["ItemID"].isin(set(item_ids))]
df_items = df_items.assign(ItemID=df_items["ItemID"].map(item_rename_dict))

# the item ids are now simply 0 .. n_items-1
item_ids = list(range(len(item_ids)))

In [None]:
# Row / column indices of all observed interactions. Both id columns were re-enumerated
# to start at 0 in earlier cells, so they are used as matrix indices without any shift.
user_ids = df_interactions["UserID"]
item_ids = df_interactions["ItemID"]
values = np.ones(len(user_ids))

interaction_matrix = sp.csr_matrix((values, (user_ids, item_ids)), shape=(n_users, n_items))

# Duplicate (user, item) pairs are summed up by the sparse format, so a user who
# interacted with the same item several times would get an entry > 1. The matrix is
# supposed to be binary, therefore merge duplicates explicitly and reset every stored
# entry to 1.
interaction_matrix.sum_duplicates()
n_repeated = len(values) - interaction_matrix.nnz
interaction_matrix.data[:] = 1
display(interaction_matrix.shape)

# store results
storage_dir = os.path.join(data_dir, "full")
os.makedirs(storage_dir, exist_ok=True)
sp.save_npz(os.path.join(storage_dir, "interactions.npz"), interaction_matrix)

# Check whether all interactions were actually kept: the number of distinct pairs plus
# the number of merged repetitions equals the number of rows of `df_interactions`.
# The sparse matrix is summed directly, converting it to a dense array is not needed.
print("Number of interactions (again):", interaction_matrix.sum())
print("Repeated (user, item) interactions that were merged:", n_repeated)

In [None]:
# create new user file for our usage
df_user_info = df_users[["user_id", "gender"]].copy()
df_user_info.columns = ["UserID", "Gender"]
# NOTE: the user ids must not be shifted here. They were already re-enumerated to start
# at 0 when `user_id_map` was applied; subtracting 1 again would make them start at -1
# and no longer match the row indices of the interaction matrix.
df_user_info["Gender"] = df_user_info["Gender"].str.lower()

# lower-case the first letter of every column name (e.g. "UserID" -> "userID")
rn = {cn: cn[0].lower() + cn[1:] for cn in df_user_info.columns}
df_user_info = df_user_info.rename(columns=rn)

df_user_info.to_csv(os.path.join(storage_dir, "user_info.csv"), index=False)

# store the meaning of the attribute codes next to the user file
with open(os.path.join(storage_dir, "attribute_descriptions.json"), "w") as fh:
    json.dump(attribute_descriptions, fh, indent="\t")

# last expression of the cell, so that the preview is actually rendered
df_user_info.head()

In [None]:
# Create the item file for our usage: same content as `df_items`, only the first
# letter of every column name is lower-cased (e.g. "ItemID" -> "itemID").
rn = {column: column[0].lower() + column[1:] for column in df_items.columns}
df_item_info = df_items.rename(columns=rn)

item_info_file = os.path.join(storage_dir, "item_info.csv")
df_item_info.to_csv(item_info_file, index=False)
df_item_info.head()

In [None]:
# Store the translation table between the original item ids ("old") and the
# re-enumerated ones ("new")
original_ids, reindexed_ids = zip(*item_rename_dict.items())
df_item_mapping = pd.DataFrame({"old": original_ids, "new": reindexed_ids})

item_mapping_file = os.path.join(storage_dir, "item_mapping.csv")
df_item_mapping.to_csv(item_mapping_file, index=False)
df_item_mapping.head()

In [None]:
# filter users & items with too few interactions
min_interactions_user = 5
min_interactions_item = 5

filtered = utils.ensure_min_interactions(
    interaction_matrix,
    min_interactions_user, min_interactions_item,
    df_user_info, df_item_info,
)
im_all, umap_all, imap_all, uinfo_all, iinfo_all = filtered

utils.print_stats(im_all)

# The item ids were re-enumerated earlier in the notebook; translate the "old" column
# of the item mapping back through the inverse of `item_rename_dict`.
item_rename_dict_reverse = {new_id: old_id for old_id, new_id in item_rename_dict.items()}
restored_old_ids = imap_all["old"].replace(item_rename_dict_reverse)
imap_all = imap_all.assign(old=restored_old_ids)

# store results in a folder that is named after the two thresholds
storage_dir = os.path.join(data_dir, f"user_gte_{min_interactions_user}_item_gte_{min_interactions_item}")
utils.store_results(storage_dir, im_all, uinfo_all, attribute_descriptions, iinfo_all, umap_all, imap_all)

In [15]:
# NOTE(review): this cell repeats the preceding filtering cell statement for statement;
# running both performs the same filtering and writes the same files twice.

# minimum number of interactions that a user / an item needs in order to be kept
min_interactions_user, min_interactions_item = 5, 5

im_all, umap_all, imap_all, uinfo_all, iinfo_all = utils.ensure_min_interactions(
    interaction_matrix, min_interactions_user, min_interactions_item, df_user_info, df_item_info
)

utils.print_stats(im_all)

# account for the earlier re-enumeration of the item ids: invert `item_rename_dict`
# and apply it to the "old" column of the item mapping
item_rename_dict_reverse = dict(zip(item_rename_dict.values(), item_rename_dict.keys()))
imap_all = imap_all.assign(old=imap_all["old"].replace(item_rename_dict_reverse))

# store results
results_folder = f"user_gte_{min_interactions_user}_item_gte_{min_interactions_item}"
storage_dir = os.path.join(data_dir, results_folder)
utils.store_results(storage_dir, im_all, uinfo_all, attribute_descriptions, iinfo_all, umap_all, imap_all)

Final shape of interactions matrix is (482, 5964)
==> 482 users and 5964 items are remaining.

Number of interactions is 71625,
which leads to a density of 0.0249.
