# Notebook to preprocess the LFM2019 dataset
Please run the preceding notebook, "filter_lfm2019.ipynb", before running this notebook.

In [1]:
import os
import csv
import json
import glob
import utils
import numpy as np
import pandas as pd
import pickle as pkl
from tqdm import tqdm
from scipy import sparse as sp
import matplotlib.pyplot as plt
from collections import defaultdict

In [None]:
# Directory that holds the filtered LFM files read by this notebook.
# The location is machine-specific, so it can be overridden through the
# LFM_DATA_DIR environment variable without editing the notebook; the
# previously hard-coded path remains the default.
data_dir = os.environ.get("LFM_DATA_DIR", "/media/data/Studium/CP_Institut/FairnessDatasets/lfm")
# data_dir = r"F:\Temp\lfm2b data"

### User data

In [None]:
# Read the filtered user demographics table (tab-separated file)
users_path = os.path.join(data_dir, "users_demo_filtered.tsv")
df_users = pd.read_csv(users_path, sep="\t", index_col=None)
df_users

In [None]:
# Read the filtered track table (tab-separated file)
tracks_path = os.path.join(data_dir, "tracks_filtered.tsv")
df_items = pd.read_csv(tracks_path, sep="\t", index_col=None)

# Replace the file's own header with uniform names (requires exactly three columns)
df_items.columns = ["itemID", "artistName", "trackName"]
df_items

### Interaction matrix

As there are far too many items to work with, we randomly select a subset of them and store their interactions in a sparse matrix.

In [None]:
# Sampling configuration: number of items to keep, with a fixed seed for repeatability
n_sampled_items = 100_000
np.random.seed(42)

matrix_path = os.path.join(data_dir, "interaction_matrix_filtered.npz")
interaction_matrix = sp.load_npz(matrix_path)
n_users, n_items = interaction_matrix.shape

# Draw distinct column indices without replacement and keep them in ascending order.
all_item_indices = np.arange(n_items)
sampled_items = sorted(np.random.choice(all_item_indices, n_sampled_items, replace=False))

# Assumes row i of df_items describes column i of the interaction matrix - TODO confirm.
# NOTE(review): df_items is overwritten here, so re-running this cell without
# reloading df_items would index into the already-sampled frame.
df_items = df_items.iloc[sampled_items]

interaction_matrix_sampled = interaction_matrix[:, sampled_items]
interaction_matrix_sampled

### User information

In [None]:
# There does not seem to have been any input validation when the data was collected,
# so we reset a user's attribute to its default value if it
#    - doesn't make sense (e.g. consists only of stray punctuation),
#    - belongs to a group with only a few users (for nominal data)

min_n_users_per_group = 200

# Countries that have fewer users than the threshold ...
users_per_country = df_users["Country"].value_counts()
bad_location_groups = [
    country
    for country, n_country_users in users_per_country.items()
    if n_country_users < min_n_users_per_group
]
# ... plus empty, punctuation-only and missing entries
bad_location_groups.extend(["", ",", np.nan])

bad_location_users = df_users["Country"].isin(bad_location_groups)
print(f"{bad_location_users.sum()} of {len(bad_location_users)} user countries reset to default")

# The empty string serves as the default country value
df_users.loc[bad_location_users, "Country"] = ""
df_users

In [None]:
# Ages outside the plausible range [min_age, max_age] are treated as missing.
min_age = 10
max_age = 100
bad_age_users = (df_users["Age"] < min_age) | (max_age < df_users["Age"])

print(f"{bad_age_users.sum()} of {len(bad_age_users)} user ages reset to default")
# Use the canonical lowercase spelling: the np.NaN alias was removed in NumPy 2.0,
# and the rest of this notebook already uses np.nan.
df_users.loc[bad_age_users, "Age"] = np.nan
df_users

In [None]:
# We group users in age categories to make the data easier to handle.
# The entries of age_categories are the INCLUSIVE lower bounds of categories 1..5:
#   0: under 18, 1: 18-24, 2: 25-34, 3: 35-44, 4: 45-54, 5: 55+
age_categories = [18, 25, 35, 45, 55]

# NOTE(review): this cell overwrites the "Age" column with category codes, so
# re-running it without reloading df_users would categorise the codes themselves.
ages = df_users["Age"].to_numpy(dtype=float, na_value=np.nan)
age_is_missing = np.isnan(ages)

# right=False means bins[i-1] <= age < bins[i], i.e. a user whose age equals a
# boundary lands in the category that starts there (18 -> "18-24"). A strict ">"
# comparison would put such users one category too low (18 -> "Under 18").
assigned_age_cat = np.digitize(ages, age_categories, right=False)

# users without a (valid) age get their own category
assigned_age_cat[age_is_missing] = -1
df_users["Age"] = assigned_age_cat
df_users["Age"].value_counts()

In [None]:
# Human-readable labels for the attribute values (extracted from the README file)
gender_labels = {"m": "male", "f": "female"}

age_labels = {
    -1: "unknown",
    0:  "Under 18",
    1:  "18-24",
    2:  "25-34",
    3:  "35-44",
    4:  "45-54",
    5:  "55+",
}

# Identity mapping, just in case scripts require each attribute to have a description;
# the empty string (default country) gets an explicit label.
country_labels = {country: country for country in df_users["Country"].unique()}
country_labels[""] = "undefined"

attribute_descriptions = {
    "gender": gender_labels,
    "age": age_labels,
    "country": country_labels,
}

In [None]:
# Statistics for the SAMPLED matrix. The dimensions are read from the sampled
# matrix itself: n_users / n_items elsewhere in the notebook describe the full
# matrix, and mixing them with the sampled interaction count would report the
# wrong number of items and an understated density.
n_users_sampled, n_items_sampled = interaction_matrix_sampled.shape

# Sum of all stored values; equals the number of interactions only if the
# entries are 0/1 - TODO confirm against the preceding notebook.
n_interactions = interaction_matrix_sampled.sum()
density = n_interactions / (n_items_sampled * n_users_sampled)

# Show some statistics about the dataset
print("Number of users:", n_users_sampled)
print("Number of items:", n_items_sampled)

print("\nNumber of interactions:", n_interactions)
print(f"Density: {density:.6f}")

### Data preparation
For our use case, the end result should be a binary interaction matrix, where ```1``` denotes that a user
interacted with an item, and ```0``` that they did not.

In [None]:
# Some items might be missing, so the item ids are re-enumerated as 0..n-1
# in ascending order of the original id.
# NOTE(review): df_items is overwritten below; re-running this cell would build the
# mappings from the already re-enumerated ids and lose the original ones.
item_ids = df_items["itemID"].unique()
item_rename_dict = {iid: i for i, iid in enumerate(sorted(item_ids))}
item_rename_dict_reverse = {v: k for k, v in item_rename_dict.items()}

# Every id in the column is a key of item_rename_dict (the dict was built from this
# very column), so a plain per-row dictionary lookup via Series.map is sufficient and
# avoids the cost of Series.replace with a dictionary of this size.
df_items = df_items.assign(itemID=df_items["itemID"].map(item_rename_dict))

item_ids = list(range(len(item_ids)))
df_item_info = df_items

In [None]:
# From here on the sampled matrix is used as the interaction matrix
interaction_matrix = interaction_matrix_sampled

# Work on a separate copy of the user table for our own usage
df_user_info = df_users.copy()

# Column names: lower-case the first character and drop hyphens after it
rn = {cn: cn[0].lower() + cn[1:].replace("-", "") for cn in df_user_info.columns}
df_user_info = df_user_info.rename(columns=rn)

# Suffix appended to every storage directory name to record the sample size
sampled_suffix = f"_{n_sampled_items}"

In [None]:
# Persist the sampled dataset before any minimum-interaction filtering is applied
storage_dir = os.path.join(data_dir, "full" + sampled_suffix)
utils.store_results(storage_dir, interaction_matrix, df_user_info, attribute_descriptions)

In [None]:
# Filter out users and tracks that have too few interactions
min_interactions_user = 10
min_interactions_item = 10

im_all, umap_all, imap_all, uinfo_all, iinfo_all = utils.ensure_min_interactions(
    interaction_matrix,
    min_interactions_user,
    min_interactions_item,
    df_user_info,
    df_item_info,
)

utils.print_stats(im_all)

# The "old" column refers to the re-enumerated item indices; translate it back
# to the item ids that were in place before the re-enumeration.
imap_all = imap_all.assign(old=imap_all["old"].replace(item_rename_dict_reverse))

# one user-info row per remaining matrix row
assert im_all.shape[0] == len(uinfo_all)

# store results
storage_dir = os.path.join(
    data_dir,
    f"user_gte_{min_interactions_user}_item_gte_{min_interactions_item}{sampled_suffix}",
)
utils.store_results(storage_dir, im_all, uinfo_all, attribute_descriptions, iinfo_all, umap_all, imap_all)

In [None]:
# Keep only the users whose gender is given as "f" or "m"
mask_gender_given = df_user_info["gender"].isin(["f", "m"])
user_info_gen = df_user_info[mask_gender_given]
im_gen = interaction_matrix[mask_gender_given, :]

im_gen, umap_gen, imap_gen, uinfo_gen, iinfo_gen = utils.ensure_min_interactions(
    im_gen,
    min_interactions_user,
    min_interactions_item,
    user_info_gen,
    df_item_info,
)

utils.print_stats(im_gen)

# The "old" column refers to the re-enumerated item indices; translate it back
# to the item ids that were in place before the re-enumeration.
imap_gen = imap_gen.assign(old=imap_gen["old"].replace(item_rename_dict_reverse))

# one user-info row per remaining matrix row
assert im_gen.shape[0] == len(uinfo_gen)

# store results
storage_dir = os.path.join(
    data_dir,
    f"user_gte_{min_interactions_user}_item_gte_{min_interactions_item}_gender{sampled_suffix}",
)
utils.store_results(storage_dir, im_gen, uinfo_gen, attribute_descriptions, iinfo_gen, umap_gen, imap_gen)

In [None]:
# Keep only the users with a country (the empty string marks "not given")
mask_country_given = df_user_info["country"] != ""
user_info_country = df_user_info[mask_country_given]
im_country = interaction_matrix[mask_country_given, :]

im_country, umap_country, imap_country, uinfo_country, iinfo_country = utils.ensure_min_interactions(
    im_country,
    min_interactions_user,
    min_interactions_item,
    user_info_country,
    df_item_info,
)

utils.print_stats(im_country)

# Rebuild the reverse lookup (new index -> previous item id) and use it to translate
# the "old" column back to the ids that were in place before the re-enumeration.
item_rename_dict_reverse = dict(zip(item_rename_dict.values(), item_rename_dict.keys()))
imap_country = imap_country.assign(old=imap_country["old"].replace(item_rename_dict_reverse))

# one user-info row per remaining matrix row
assert im_country.shape[0] == len(uinfo_country)

# store results
storage_dir = os.path.join(
    data_dir,
    f"user_gte_{min_interactions_user}_item_gte_{min_interactions_item}_loc{sampled_suffix}",
)
utils.store_results(storage_dir, im_country, uinfo_country, attribute_descriptions, iinfo_country, umap_country, imap_country)

In [None]:
# Keep only the users with an age category (-1 marks "unknown")
mask_age_given = df_user_info["age"] != -1
user_info_age = df_user_info[mask_age_given]
im_age = interaction_matrix[mask_age_given, :]

im_age, umap_age, imap_age, uinfo_age, iinfo_age = utils.ensure_min_interactions(
    im_age,
    min_interactions_user,
    min_interactions_item,
    user_info_age,
    df_item_info,
)

utils.print_stats(im_age)

# Rebuild the reverse lookup (new index -> previous item id) and use it to translate
# the "old" column back to the ids that were in place before the re-enumeration.
item_rename_dict_reverse = dict(zip(item_rename_dict.values(), item_rename_dict.keys()))
imap_age = imap_age.assign(old=imap_age["old"].replace(item_rename_dict_reverse))

# one user-info row per remaining matrix row
assert im_age.shape[0] == len(uinfo_age)

# store results
storage_dir = os.path.join(
    data_dir,
    f"user_gte_{min_interactions_user}_item_gte_{min_interactions_item}_age{sampled_suffix}",
)
utils.store_results(storage_dir, im_age, uinfo_age, attribute_descriptions, iinfo_age, umap_age, imap_age)

In [None]:
# Keep only the users for whom gender, country AND age are all given.
# The masks are combined into a NEW Series: binding mask_given to mask_gender_given
# and then applying "&=" would modify the gender mask itself in place, because
# pandas implements the augmented operators as in-place updates.
mask_given = mask_gender_given & mask_country_given & mask_age_given

user_info = df_user_info[mask_given]
im = interaction_matrix[mask_given, :]

im, umap, imap, uinfo, iinfo = utils.ensure_min_interactions(im,
                                                      min_interactions_user, min_interactions_item,
                                                      user_info, df_item_info)

utils.print_stats(im)

# account for previous adjustment of item indices
imap = imap.assign(old=imap["old"].replace(item_rename_dict_reverse))

# one user-info row per remaining matrix row
assert im.shape[0] == len(uinfo)

# store results
storage_dir = os.path.join(data_dir, f"user_gte_{min_interactions_user}_item_gte_{min_interactions_item}_gender_age_loc" + sampled_suffix)
utils.store_results(storage_dir, im, uinfo, attribute_descriptions, iinfo, umap, imap)

In [18]:
# NOTE(review): this cell repeats the preceding cell and writes to the same
# storage directory; consider removing one of the two.
#
# Keep only the users for whom gender, country AND age are all given.
# The masks are combined into a NEW Series: binding mask_given to mask_gender_given
# and then applying "&=" would modify the gender mask itself in place, because
# pandas implements the augmented operators as in-place updates.
mask_given = mask_gender_given & mask_country_given & mask_age_given

user_info = df_user_info[mask_given]
im = interaction_matrix[mask_given, :]

im, umap, imap, uinfo, iinfo = utils.ensure_min_interactions(im,
                                                      min_interactions_user, min_interactions_item,
                                                      user_info, df_item_info)

utils.print_stats(im)

# account for previous adjustment of item indices
imap = imap.assign(old=imap["old"].replace(item_rename_dict_reverse))

# one user-info row per remaining matrix row
assert im.shape[0] == len(uinfo)

# store results
storage_dir = os.path.join(data_dir, f"user_gte_{min_interactions_user}_item_gte_{min_interactions_item}_gender_age_loc" + sampled_suffix)
utils.store_results(storage_dir, im, uinfo, attribute_descriptions, iinfo, umap, imap)

Final shape of interactions matrix is (7603, 62617)
==> 7603 users and 62617 items are remaining.

Number of interactions is 1845963,
which leads to a density of 0.0039.
