# Notebook to preprocess the LFM-100k dataset


In [1]:
import os
import csv
import json
import glob
import utils
import pickle 
import numpy as np
import pandas as pd
import pickle as pkl
from tqdm import tqdm
from scipy import sparse as sp
import matplotlib.pyplot as plt
from collections import defaultdict

In [2]:
data_dir = r"/root/dataset/dir/lfm-100k"

### User data

In [4]:
# Load user data
df_users = pd.read_csv(os.path.join(data_dir, "user_data.tsv.bz2"),sep='\t',names=["userID","Country","Age","gender","creation_time"])
df_users

Unnamed: 0,userID,Country,Age,gender,creation_time
0,0,IT,33,m,2006-06-18 21:07:33
1,1,RU,25,m,2007-10-12 18:42:00
2,2,UK,25,m,2005-06-15 22:02:11
3,3,ES,29,m,2005-09-30 22:38:33
4,4,FR,30,m,2010-11-03 02:52:17
...,...,...,...,...,...
9359,9359,US,24,f,2005-04-15 04:29:35
9360,9360,RU,20,m,2011-07-29 21:09:24
9361,9361,DE,18,m,2010-05-14 14:07:05
9362,9362,UA,25,m,2006-12-05 02:20:59


In [57]:
df_users.Country.nunique(),df_users.Age.nunique()

(128, 87)

In [5]:
df_users.gender.value_counts()

m    7580
f    1784
Name: gender, dtype: int64

In [60]:
df_items = pd.read_csv(os.path.join(data_dir, "tracks.tsv.bz2"),sep='\t',names=["artist","track","gender"] ).reset_index()
df_items.columns = ["itemID","artist","track","gender"] 
df_items

Unnamed: 0,itemID,artist,track,gender
0,0,Hans Zimmer,503,m
1,1,Bear McCreary,A Mass Awakening,m
2,2,Tom Tykwer,All Boundaries Are Conventions,m
3,3,Steve Jablonsky,Arrival To Earth,m
4,4,Rosalía,BAGDAD - Cap.7: Liturgia,f
...,...,...,...,...
99968,99968,Brädi,Lämpöö,m
99969,99969,Milton Nascimento,O Cio Da Terra - Acústico,m
99970,99970,Adriano Celentano,Prisencolinensinainciusol - Remastered,m
99971,99971,Ulrik Munther,Say Goodbye,m


### Interaction matrix

As there are by any means too much items, we will randomly select some items and put them in a sparse matrix.

In [61]:
import pickle 
from pathlib import Path
path_data_dir = Path(data_dir)
interactions = pd.read_csv(path_data_dir/"inter.tsv.bz2",sep="\t" , names =["userID","itemID","pc"])
#user_features = pickle.load(open(path_data_dir/ "","rb") )
interactions


Unnamed: 0,userID,itemID,pc
0,0,0,2
1,0,1,3
2,0,2,2
3,0,3,2
4,0,4,2
...,...,...,...
1820895,9363,3962,17
1820896,9363,30269,13
1820897,9363,30297,17
1820898,9363,4039,14


In [62]:
unique_items = interactions["itemID"].unique()
print(f"n_items: {len(unique_items)}")

item2token = pd.Series(unique_items)
token2item = pd.Series(data=item2token.index, index=item2token.values)


n_items: 99973


In [63]:


uids_iids_array = interactions.values
n_users,n_items = interactions.userID.nunique(),len(unique_items) 
data = np.ones(uids_iids_array.shape[0],dtype=np.int8)
uids,iids = uids_iids_array[:,0],uids_iids_array[:,1]
interaction_matrix = sp.csr_matrix((data, (uids, iids)), 
                                       (n_users, n_items))

In [64]:
sp.save_npz(os.path.join(data_dir, "interaction_matrix.npz"),interaction_matrix)

In [65]:
n_sampled_items = 100_000
np.random.seed(42)

interaction_matrix = sp.load_npz(os.path.join(data_dir, "interaction_matrix.npz"))
n_users, n_items = interaction_matrix.shape

sampled_items = unique_items #sorted(np.random.choice(np.arange(n_items), n_sampled_items, replace=False))

interaction_matrix_sampled = interaction_matrix[:, sampled_items]
interaction_matrix_sampled

<9364x99973 sparse matrix of type '<class 'numpy.int8'>'
	with 1820900 stored elements in Compressed Sparse Row format>

### User information

In [17]:
# Moreover, as the didn't seem to be any input validation in place when collecting the data,
# we will set the users' attributes to default if they 
#    - don't make sense (also in case of additional punctations, etc..),
#    - belong to a group with only a few users (for nominal data)

min_n_users_per_group = 200
bad_location_groups = [k for k, v in df_users["Country"].value_counts().items() if v < min_n_users_per_group]
bad_location_groups += ["", ",", np.nan]

bad_location_users = df_users["Country"].isin(bad_location_groups)
print(f"{bad_location_users.sum()} of {len(bad_location_users)} user countries reset to default")

df_users.loc[bad_location_users, "Country"] = ""
df_users

2899 of 9364 user countries reset to default


Unnamed: 0,userID,Country,Age,gender,creation_time
0,0,,33,m,2006-06-18 21:07:33
1,1,RU,25,m,2007-10-12 18:42:00
2,2,UK,25,m,2005-06-15 22:02:11
3,3,,29,m,2005-09-30 22:38:33
4,4,FR,30,m,2010-11-03 02:52:17
...,...,...,...,...,...
9359,9359,US,24,f,2005-04-15 04:29:35
9360,9360,RU,20,m,2011-07-29 21:09:24
9361,9361,DE,18,m,2010-05-14 14:07:05
9362,9362,UA,25,m,2006-12-05 02:20:59


In [18]:
min_age = 10
max_age = 100
bad_age_users = (df_users["Age"] < min_age) | (max_age < df_users["Age"])

print(f"{bad_age_users.sum()} of {len(bad_age_users)} user ages reset to default")
df_users.loc[bad_age_users, "Age"] = np.NaN
df_users

22 of 9364 user ages reset to default


Unnamed: 0,userID,Country,Age,gender,creation_time
0,0,,33.0,m,2006-06-18 21:07:33
1,1,RU,25.0,m,2007-10-12 18:42:00
2,2,UK,25.0,m,2005-06-15 22:02:11
3,3,,29.0,m,2005-09-30 22:38:33
4,4,FR,30.0,m,2010-11-03 02:52:17
...,...,...,...,...,...
9359,9359,US,24.0,f,2005-04-15 04:29:35
9360,9360,RU,20.0,m,2011-07-29 21:09:24
9361,9361,DE,18.0,m,2010-05-14 14:07:05
9362,9362,UA,25.0,m,2006-12-05 02:20:59


In [30]:
# We group users in age categories to make the data easier to handle
age_categories = [18, 25, 35, 45, 55]

assigned_age_cat = np.zeros(shape=(len(df_users),) , dtype=int)
for cat in age_categories:
    assigned_age_cat += (df_users["Age"] > cat).astype(int)

assigned_age_cat[df_users["Age"].isna()] = -1
df_users["Age"] = assigned_age_cat
df_users["Age"].value_counts()

 1    4962
 2    2623
 0     985
 3     543
 4     170
 5      59
-1      22
Name: Age, dtype: int64

In [67]:
# Extracted the README file
attribute_descriptions = {
    "gender": {
        "m": "male",
        "f": "female"
    },
    "age": {
        -1: "unknown",
        0:  "Under 18",
        1:  "18-24",
        2:  "25-34",
        3:  "35-44",
        4:  "45-54",
        5:  "55+"
    },
    # just in case scripts require for each attribute to have a description
    "country": {d:d for d in df_users["Country"].unique()}
}
attribute_descriptions["country"][""] = "undefined"

In [68]:
n_interactions = interaction_matrix_sampled.sum()
density = n_interactions / (n_items * n_users)

# Show some statistics about the dataset
print("Number of users:", interaction_matrix_sampled.shape[0])
print("Number of items:", interaction_matrix_sampled.shape[1])

print("\nNumber of interactions:", n_interactions)
print(f"Density: {density:.6f}")

Number of users: 9364
Number of items: 99973

Number of interactions: 1820900
Density: 0.001945


### Data preparation
For our use-case our end-result should be binary interaction matrix, where ```1``` denotes that a user
interacted with an item, and ```0``` that they did not.

In [69]:
# some items might be missing, let's therefore adjust / re-enumerate the indices
item_ids = df_items["itemID"].unique()
item_rename_dict = token2item.to_dict()
item_rename_dict_reverse = item2token.to_dict()

df_items = df_items[df_items["itemID"].isin(item_ids)]

#df_items["itemID"]= token2item.loc[df_items["itemID"].values ]
#print(token2item.loc[df_items["itemID"].values]              )

df_items["itemID_int"] = token2item.loc[df_items["itemID"]].values
df_items.rename(columns={"itemID":"itemID_str"},inplace=True)
df_items.rename(columns={"itemID_int":"itemID"},inplace=True)
item_ids = list(range(len(sampled_items)))
df_item_info = df_items[df_items["itemID"].isin(sampled_items)]

In [70]:
df_item_info

Unnamed: 0,itemID_str,artist,track,gender,itemID
0,0,Hans Zimmer,503,m,0
1,1,Bear McCreary,A Mass Awakening,m,1
2,2,Tom Tykwer,All Boundaries Are Conventions,m,2
3,3,Steve Jablonsky,Arrival To Earth,m,3
4,4,Rosalía,BAGDAD - Cap.7: Liturgia,f,4
...,...,...,...,...,...
99968,99968,Brädi,Lämpöö,m,99968
99969,99969,Milton Nascimento,O Cio Da Terra - Acústico,m,99969
99970,99970,Adriano Celentano,Prisencolinensinainciusol - Remastered,m,99970
99971,99971,Ulrik Munther,Say Goodbye,m,99971


In [71]:
# create new user file for our usage
df_user_info = df_users.copy()
interaction_matrix = interaction_matrix_sampled

# change column names to camel-case & drop hyphens
rn = {cn: cn[0].lower() + cn[1:].replace("-", "") for cn in df_user_info.columns}
df_user_info.rename(rn, inplace=True, axis=1)

sampled_suffix = f"_{n_sampled_items}"

In [72]:
# store results
attribute_descriptions = {}
storage_dir = os.path.join(data_dir, f"full" + sampled_suffix)
utils.store_results(storage_dir, interaction_matrix, df_user_info, attribute_descriptions)

In [81]:
# filter users & tracks with too less interaction
min_interactions_user = 10
min_interactions_item = 10

im_all, umap_all, imap_all, uinfo_all, iinfo_all = utils.ensure_min_interactions(interaction_matrix, 
                                                      min_interactions_user, min_interactions_item,
                                                      df_user_info, df_item_info)

utils.print_stats(im_all)

# account for previous adjustment of item indices
#imap_all = imap_all.assign(old=imap_all["old"].replace(item_rename_dict_reverse))
imap_all["old_new"] = item2token.loc[imap_all["old"]].values
imap_all.rename(columns={"old":"old_prev"},inplace=True)
imap_all.rename(columns={"old_new":"old"},inplace=True)
assert im_all.shape[0] == len(uinfo_all)

# store results
storage_dir = os.path.join(data_dir, f"user_gte_{min_interactions_user}_item_gte_{min_interactions_item}" + sampled_suffix)
utils.store_results(storage_dir, im_all, uinfo_all, attribute_descriptions, iinfo_all, umap_all, imap_all)

Final shape of interactions matrix is (8663, 47383)
==> 8663 users and 47383 items are remaining.

Number of interactions is 1476788,
which leads to a density of 0.0036.


In [79]:
df_user_info["age"] = (df_user_info["age"] - 0)/ 120

In [83]:
uinfo_all["age"].isna().sum()

0

In [35]:
# drop all data where no gender info is given
mask_gender_given = df_user_info["gender"].isin(["f", "m"])
user_info_gen = df_user_info[mask_gender_given]
im_gen = interaction_matrix[mask_gender_given, :]

im_gen, umap_gen, imap_gen, uinfo_gen, iinfo_gen = utils.ensure_min_interactions(im_gen, 
                                                      min_interactions_user, min_interactions_item,
                                                      user_info_gen, df_item_info)

utils.print_stats(im_gen)

# account for previous adjustment of item indices
imap_gen = imap_gen.assign(old=imap_gen["old"].replace(item_rename_dict_reverse))

assert im_gen.shape[0] == len(uinfo_gen)

# store results
storage_dir = os.path.join(data_dir, f"user_gte_{min_interactions_user}_item_gte_{min_interactions_item}_gender" + sampled_suffix)
utils.store_results(storage_dir, im_gen, uinfo_gen, attribute_descriptions, iinfo_gen, umap_gen, imap_gen)

Final shape of interactions matrix is (8663, 47383)
==> 8663 users and 47383 items are remaining.

Number of interactions is 1476788,
which leads to a density of 0.0036.


In [45]:
# drop all data where no location info is given
mask_country_given = df_user_info["country"] != ""
user_info_country = df_user_info[mask_country_given]
im_country = interaction_matrix[mask_country_given, :]

im_country, umap_country, imap_country, uinfo_country, iinfo_country = utils.ensure_min_interactions(im_country, 
                                                      min_interactions_user, min_interactions_item,
                                                      user_info_country, df_item_info)

utils.print_stats(im_country)

# account for previous adjustment of item indices
item_rename_dict_reverse = {v: k for k, v in item_rename_dict.items()}
imap_country = imap_country.assign(old=imap_country["old"].replace(item_rename_dict_reverse))

assert im_country.shape[0] == len(uinfo_country)

# store results
storage_dir = os.path.join(data_dir, f"user_gte_{min_interactions_user}_item_gte_{min_interactions_item}_loc" + sampled_suffix)
utils.store_results(storage_dir, im_country, uinfo_country, attribute_descriptions, iinfo_country, umap_country, imap_country)

Final shape of interactions matrix is (5860, 32944)
==> 5860 users and 32944 items are remaining.

Number of interactions is 925403,
which leads to a density of 0.0048.


In [84]:
# drop all data where no age info is given
mask_age_given = df_user_info["age"] > 0
user_info_age = df_user_info[mask_age_given]
im_age = interaction_matrix[mask_age_given, :]

im_age, umap_age, imap_age, uinfo_age, iinfo_age = utils.ensure_min_interactions(im_age, 
                                                      min_interactions_user, min_interactions_item,
                                                      user_info_age, df_item_info)

utils.print_stats(im_age)

# account for previous adjustment of item indices
item_rename_dict_reverse = {v: k for k, v in item_rename_dict.items()}
imap_age = imap_age.assign(old=imap_age["old"].replace(item_rename_dict_reverse))

assert im_age.shape[0] == len(uinfo_age)

# store results
storage_dir = os.path.join(data_dir, f"user_gte_{min_interactions_user}_item_gte_{min_interactions_item}_age" + sampled_suffix)
utils.store_results(storage_dir, im_age, uinfo_age, attribute_descriptions, iinfo_age, umap_age, imap_age)

Final shape of interactions matrix is (8663, 47383)
==> 8663 users and 47383 items are remaining.

Number of interactions is 1476788,
which leads to a density of 0.0036.


In [47]:
# drop all data where either age or loc is not given
mask_given = mask_gender_given
mask_given &= mask_country_given
mask_given &= mask_age_given

user_info = df_user_info[mask_given]
im = interaction_matrix[mask_given, :]

im, umap, imap, uinfo, iinfo = utils.ensure_min_interactions(im, 
                                                      min_interactions_user, min_interactions_item,
                                                      user_info, df_item_info)

utils.print_stats(im)

# account for previous adjustment of item indices
item_rename_dict_reverse = {v: k for k, v in item_rename_dict.items()}
imap = imap.assign(old=imap["old"].replace(item_rename_dict_reverse))

assert im.shape[0] == len(uinfo)

# store results
storage_dir = os.path.join(data_dir, f"user_gte_{min_interactions_user}_item_gte_{min_interactions_item}_gender_age_loc" + sampled_suffix)
utils.store_results(storage_dir, im, uinfo, attribute_descriptions, iinfo, umap, imap)

Final shape of interactions matrix is (5846, 32863)
==> 5846 users and 32863 items are remaining.

Number of interactions is 922567,
which leads to a density of 0.0048.


In [48]:
# drop all data where either age or loc is not given
mask_given = mask_gender_given
mask_given &= mask_country_given
mask_given &= mask_age_given

user_info = df_user_info[mask_given]
im = interaction_matrix[mask_given, :]

im, umap, imap, uinfo, iinfo = utils.ensure_min_interactions(im, 
                                                      min_interactions_user, min_interactions_item,
                                                      user_info, df_item_info)

utils.print_stats(im)

# account for previous adjustment of item indices
item_rename_dict_reverse = {v: k for k, v in item_rename_dict.items()}
imap = imap.assign(old=imap["old"].replace(item_rename_dict_reverse))

assert im.shape[0] == len(uinfo)

# store results
storage_dir = os.path.join(data_dir, f"user_gte_{min_interactions_user}_item_gte_{min_interactions_item}_gender_age_loc" + sampled_suffix)
utils.store_results(storage_dir, im, uinfo, attribute_descriptions, iinfo, umap, imap)

Final shape of interactions matrix is (5846, 32863)
==> 5846 users and 32863 items are remaining.

Number of interactions is 922567,
which leads to a density of 0.0048.
