# Notebook to preprocess the LFM2019 dataset
Please run the preceeding notebook, "filter_lfm2019.ipynb" before running this notebook.

In [1]:
import os
import csv
import json
import glob
import utils
import pickle 
import numpy as np
import pandas as pd
import pickle as pkl
from tqdm import tqdm
from scipy import sparse as sp
import matplotlib.pyplot as plt
from collections import defaultdict

In [2]:
data_dir = "/media/gustavo/Storage/Datasets/PersonalityBias/data/"
# data_dir = r"F:\Temp\lfm2b data"

In [3]:
!ls -l /media/gustavo/Storage/Datasets/PersonalityBias/data/

total 402804
drwxrwxrwx 1 root root       416 Sep 18 13:17 full_100000
-rwxrwxrwx 1 root root  17340125 Sep 18 11:43 interaction_matrix_personality.npz
-rwxrwxrwx 1 root root 204350131 Apr 16  2020 listening_histories.pkl
-rwxrwxrwx 1 root root   4121280 Apr 16  2020 precomputed_profiles_thr.pkl
-rwxrwxrwx 1 root root 186593099 Apr 16  2020 spotify_features.csv
-rwxrwxrwx 1 root root     57234 Apr 16  2020 users_info.csv


### User data

In [4]:
# Load user data
df_users = pd.read_csv(os.path.join(data_dir, "users_info.csv"), index_col=None)
df_users.rename(columns={'label':'userID'},inplace=True)
df_users

Unnamed: 0,userID,ope,con,ext,agr,neu,n_les,n_tracks
0,0,5.00,1.00,4.75,2.50,4.00,6.0,6
1,1,4.25,3.75,3.25,3.50,2.00,432905.0,28906
2,2,4.10,3.05,2.35,3.90,3.05,43108.0,5028
3,3,4.65,2.85,3.89,3.35,3.35,43474.0,13681
4,4,4.05,3.10,2.60,3.85,1.80,5928.0,1003
...,...,...,...,...,...,...,...,...
1465,1465,4.65,2.80,4.30,3.65,2.25,1822.0,1071
1466,1466,3.75,2.75,1.35,3.55,4.30,14748.0,7695
1467,1467,3.95,2.95,1.35,3.30,2.10,70066.0,19371
1468,1468,4.15,3.60,3.60,3.50,1.95,8736.0,1276


In [5]:
df_items = pd.read_csv(os.path.join(data_dir, "spotify_features.csv"))
df_items.columns = ["itemID"] +list(df_items.columns[1:])
df_items

Unnamed: 0,itemID,spotify_popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,7zzxnAzSZM8luJ7IASrnax,33.0,0.000803,0.621,204231.0,0.884,0.000023,6.0,0.2760,-6.292,0.0,0.0344,130.040,4.0,0.0706
1,7zzwQwN3jNiK46B2M9kL2Q,17.0,0.295000,0.480,153187.0,0.899,0.000676,10.0,0.2280,-6.698,1.0,0.3830,163.955,4.0,0.4660
2,7zzw3gHoCvn8D4jmsZS9fP,1.0,0.000005,0.402,280840.0,0.918,0.813000,11.0,0.0541,-4.908,0.0,0.0575,100.067,4.0,0.4690
3,7zzvGaj7wSkF8DbqdW2UJk,0.0,0.868000,0.488,202507.0,0.225,0.000019,0.0,0.1710,-11.019,1.0,0.0292,100.772,4.0,0.3600
4,7zzrexVQkXFgNs9DszadOo,1.0,0.830000,0.776,299413.0,0.118,0.832000,0.0,0.0804,-20.933,1.0,0.0649,91.026,4.0,0.4880
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1544641,0003gzqa5DjwfTBvxYQ9w1,4.0,0.000798,0.545,190189.0,0.720,0.000000,2.0,0.2090,-8.585,1.0,0.0441,176.048,4.0,0.7950
1544642,0002QCA9IIaRf3ifnR2LNO,3.0,0.212000,0.587,284840.0,0.735,0.000000,1.0,0.0514,-4.992,1.0,0.0440,134.948,4.0,0.4910
1544643,0001baInewEqHk38YpW7W2,5.0,0.902000,0.533,204947.0,0.400,0.910000,7.0,0.1110,-12.368,1.0,0.0253,131.729,3.0,0.5880
1544644,0001KlxKxXdIbexJkKRSDL,0.0,0.045100,0.657,258733.0,0.591,0.589000,6.0,0.1240,-8.501,0.0,0.0563,140.832,4.0,0.1540


### Interaction matrix

As there are by any means too much items, we will randomly select some items and put them in a sparse matrix.

In [6]:
import pickle 
from pathlib import Path
path_data_dir = Path(data_dir)
interaction_dict=  pickle.load(open(path_data_dir/"listening_histories.pkl","rb") )
#user_features = pickle.load(open(path_data_dir/ "","rb") )


In [7]:
unique_items = []
for val in interaction_dict.values():
    unique_items+=val
unique_items = list(set(unique_items))
print(f"n_items: {len(unique_items)}")
item2token = pd.Series(unique_items)
token2item = pd.Series(data=item2token.index, index=item2token.values)


n_items: 1544646


In [4]:


iids_dict = {k:token2item.loc[value].values  for k,value in interaction_dict.items()}
uids_iids_array = pd.Series(iids_dict).explode().reset_index().to_numpy().astype(np.int32)
n_users,n_items = len(interaction_dict.keys()),len(unique_items) 
data = np.ones(uids_iids_array.shape[0],dtype=np.int8)
uids,iids = uids_iids_array[:,0],uids_iids_array[:,1]
interaction_matrix = sp.csr_matrix((data, (uids, iids)), 
                                       (n_users, n_items))

n_items: 1544646


In [12]:
sp.save_npz(os.path.join(data_dir, "interaction_matrix_personality.npz"),interaction_matrix)

In [8]:
n_sampled_items = 100_000
np.random.seed(42)

interaction_matrix = sp.load_npz(os.path.join(data_dir, "interaction_matrix_personality.npz"))
n_users, n_items = interaction_matrix.shape

sampled_items = sorted(np.random.choice(np.arange(n_items), n_sampled_items, replace=False))

interaction_matrix_sampled = interaction_matrix[:, sampled_items]
interaction_matrix_sampled

<1470x100000 sparse matrix of type '<class 'numpy.int8'>'
	with 410280 stored elements in Compressed Sparse Row format>

### User information

In [None]:
# Moreover, as the didn't seem to be any input validation in place when collecting the data,
# we will set the users' attributes to default if they 
#    - don't make sense (also in case of additional punctations, etc..),
#    - belong to a group with only a few users (for nominal data)

min_n_users_per_group = 200
bad_location_groups = [k for k, v in df_users["Country"].value_counts().items() if v < min_n_users_per_group]
bad_location_groups += ["", ",", np.nan]

bad_location_users = df_users["Country"].isin(bad_location_groups)
print(f"{bad_location_users.sum()} of {len(bad_location_users)} user countries reset to default")

df_users.loc[bad_location_users, "Country"] = ""
df_users

In [None]:
min_age = 10
max_age = 100
bad_age_users = (df_users["Age"] < min_age) | (max_age < df_users["Age"])

print(f"{bad_age_users.sum()} of {len(bad_age_users)} user ages reset to default")
df_users.loc[bad_age_users, "Age"] = np.NaN
df_users

In [None]:
# We group users in age categories to make the data easier to handle
age_categories = [18, 25, 35, 45, 55]

assigned_age_cat = np.zeros(shape=(len(df_users),) , dtype=int)
for cat in age_categories:
    assigned_age_cat += (df_users["Age"] > cat).astype(int)

assigned_age_cat[df_users["Age"].isna()] = -1
df_users["Age"] = assigned_age_cat
df_users["Age"].value_counts()

In [None]:
# Extracted the README file
attribute_descriptions = {
    "gender": {
        "m": "male",
        "f": "female"
    },
    "age": {
        -1: "unknown",
        0:  "Under 18",
        1:  "18-24",
        2:  "25-34",
        3:  "35-44",
        4:  "45-54",
        5:  "55+"
    },
    # just in case scripts require for each attribute to have a description
    "country": {d:d for d in df_users["Country"].unique()}
}
attribute_descriptions["country"][""] = "undefined"

In [9]:
n_interactions = interaction_matrix_sampled.sum()
density = n_interactions / (n_items * n_users)

# Show some statistics about the dataset
print("Number of users:", interaction_matrix_sampled.shape[0])
print("Number of items:", interaction_matrix_sampled.shape[1])

print("\nNumber of interactions:", n_interactions)
print(f"Density: {density:.6f}")

Number of users: 1470
Number of items: 100000

Number of interactions: 2023227
Density: 0.000891


### Data preparation
For our use-case our end-result should be binary interaction matrix, where ```1``` denotes that a user
interacted with an item, and ```0``` that they did not.

In [10]:
# some items might be missing, let's therefore adjust / re-enumerate the indices
item_ids = df_items["itemID"].unique()
item_rename_dict = token2item.to_dict()
item_rename_dict_reverse = item2token.to_dict()

df_items = df_items[df_items["itemID"].isin(item_ids)]

#df_items["itemID"]= token2item.loc[df_items["itemID"].values ]
#print(token2item.loc[df_items["itemID"].values]              )

df_items["itemID_int"] = token2item.loc[df_items["itemID"]].values
df_items.rename(columns={"itemID":"itemID_str"},inplace=True)
df_items.rename(columns={"itemID_int":"itemID"},inplace=True)
item_ids = list(range(len(sampled_items)))
df_item_info = df_items[df_items["itemID"].isin(sampled_items)]

In [11]:
df_item_info

Unnamed: 0,itemID_str,spotify_popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,itemID
15,7zzbPkvxJ6iANIxcSz60rm,20.0,0.864000,0.638,161560.0,0.156,0.000000,8.0,0.1830,-17.372,1.0,0.0741,103.905,4.0,0.4430,1179763
20,7zzUVdPXpDr0o7Y2SzgWO8,25.0,0.007410,0.720,282000.0,0.494,0.836000,1.0,0.0786,-9.957,1.0,0.1590,176.955,4.0,0.9020,404914
35,7zzD8MpgOd9qd3i3fspKlf,8.0,0.037800,0.497,298878.0,0.580,0.000000,9.0,0.0943,-8.671,1.0,0.0287,147.024,4.0,0.3520,81011
37,7zzAU4EIuTPTUjiMUpmLbN,26.0,0.022500,0.572,158640.0,0.994,0.915000,5.0,0.1090,-5.855,0.0,0.0418,119.535,4.0,0.1270,350401
49,7zylvC67zTFvKLxSecOInv,20.0,0.002640,0.192,231360.0,0.936,0.054000,6.0,0.3080,-4.484,0.0,0.0749,167.153,4.0,0.4590,474268
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1544594,0012gcMTLOeEr2UfflyGgh,9.0,0.133000,0.613,210733.0,0.487,0.811000,0.0,0.0711,-9.202,0.0,0.0603,76.519,4.0,0.1080,287007
1544603,000rdOe90OF65lgFIwPGHm,0.0,0.324000,0.481,203333.0,0.552,0.000399,9.0,0.1170,-6.874,1.0,0.0338,145.629,4.0,0.2460,1278576
1544611,000cAxMxUfuDlvPYIVrrZ4,28.0,0.000147,0.474,181345.0,0.966,0.000000,8.0,0.3740,-2.957,1.0,0.0931,156.993,4.0,0.6020,75032
1544616,000V2mp9CgAS1Er4eTzG3C,5.0,0.845000,0.151,286960.0,0.238,0.829000,0.0,0.0922,-13.381,1.0,0.0389,105.631,3.0,0.0286,742785


In [12]:
# create new user file for our usage
df_user_info = df_users.copy()
interaction_matrix = interaction_matrix_sampled

# change column names to camel-case & drop hyphens
rn = {cn: cn[0].lower() + cn[1:].replace("-", "") for cn in df_user_info.columns}
df_user_info.rename(rn, inplace=True, axis=1)

sampled_suffix = f"_{n_sampled_items}"

In [13]:
# store results
attribute_descriptions = {}
storage_dir = os.path.join(data_dir, f"full" + sampled_suffix)
utils.store_results(storage_dir, interaction_matrix, df_user_info, attribute_descriptions)

In [15]:
# filter users & tracks with too less interaction
min_interactions_user = 5
min_interactions_item = 5

im_all, umap_all, imap_all, uinfo_all, iinfo_all = utils.ensure_min_interactions(interaction_matrix, 
                                                      min_interactions_user, min_interactions_item,
                                                      df_user_info, df_item_info)

utils.print_stats(im_all)

# account for previous adjustment of item indices
#imap_all = imap_all.assign(old=imap_all["old"].replace(item_rename_dict_reverse))
imap_all["old_new"] = item2token.loc[imap_all["old"]].values
imap_all.rename(columns={"old":"old_prev"},inplace=True)
imap_all.rename(columns={"old_new":"old"},inplace=True)
assert im_all.shape[0] == len(uinfo_all)

# store results
storage_dir = os.path.join(data_dir, f"user_gte_{min_interactions_user}_item_gte_{min_interactions_item}" + sampled_suffix)
utils.store_results(storage_dir, im_all, uinfo_all, attribute_descriptions, iinfo_all, umap_all, imap_all)

Final shape of interactions matrix is (1284, 37929)
==> 1284 users and 37929 items are remaining.

Number of interactions is 1936313,
which leads to a density of 0.0398.


In [None]:
# drop all data where no gender info is given
mask_gender_given = df_user_info["gender"].isin(["f", "m"])
user_info_gen = df_user_info[mask_gender_given]
im_gen = interaction_matrix[mask_gender_given, :]

im_gen, umap_gen, imap_gen, uinfo_gen, iinfo_gen = utils.ensure_min_interactions(im_gen, 
                                                      min_interactions_user, min_interactions_item,
                                                      user_info_gen, df_item_info)

utils.print_stats(im_gen)

# account for previous adjustment of item indices
imap_gen = imap_gen.assign(old=imap_gen["old"].replace(item_rename_dict_reverse))

assert im_gen.shape[0] == len(uinfo_gen)

# store results
storage_dir = os.path.join(data_dir, f"user_gte_{min_interactions_user}_item_gte_{min_interactions_item}_gender" + sampled_suffix)
utils.store_results(storage_dir, im_gen, uinfo_gen, attribute_descriptions, iinfo_gen, umap_gen, imap_gen)

In [None]:
# drop all data where no location info is given
mask_country_given = df_user_info["country"] != ""
user_info_country = df_user_info[mask_country_given]
im_country = interaction_matrix[mask_country_given, :]

im_country, umap_country, imap_country, uinfo_country, iinfo_country = utils.ensure_min_interactions(im_country, 
                                                      min_interactions_user, min_interactions_item,
                                                      user_info_country, df_item_info)

utils.print_stats(im_country)

# account for previous adjustment of item indices
item_rename_dict_reverse = {v: k for k, v in item_rename_dict.items()}
imap_country = imap_country.assign(old=imap_country["old"].replace(item_rename_dict_reverse))

assert im_country.shape[0] == len(uinfo_country)

# store results
storage_dir = os.path.join(data_dir, f"user_gte_{min_interactions_user}_item_gte_{min_interactions_item}_loc" + sampled_suffix)
utils.store_results(storage_dir, im_country, uinfo_country, attribute_descriptions, iinfo_country, umap_country, imap_country)

In [None]:
# drop all data where no age info is given
mask_age_given = df_user_info["age"] != -1
user_info_age = df_user_info[mask_age_given]
im_age = interaction_matrix[mask_age_given, :]

im_age, umap_age, imap_age, uinfo_age, iinfo_age = utils.ensure_min_interactions(im_age, 
                                                      min_interactions_user, min_interactions_item,
                                                      user_info_age, df_item_info)

utils.print_stats(im_age)

# account for previous adjustment of item indices
item_rename_dict_reverse = {v: k for k, v in item_rename_dict.items()}
imap_age = imap_age.assign(old=imap_age["old"].replace(item_rename_dict_reverse))

assert im_age.shape[0] == len(uinfo_age)

# store results
storage_dir = os.path.join(data_dir, f"user_gte_{min_interactions_user}_item_gte_{min_interactions_item}_age" + sampled_suffix)
utils.store_results(storage_dir, im_age, uinfo_age, attribute_descriptions, iinfo_age, umap_age, imap_age)

In [None]:
# drop all data where either age or loc is not given
mask_given = mask_gender_given
mask_given &= mask_country_given
mask_given &= mask_age_given

user_info = df_user_info[mask_given]
im = interaction_matrix[mask_given, :]

im, umap, imap, uinfo, iinfo = utils.ensure_min_interactions(im, 
                                                      min_interactions_user, min_interactions_item,
                                                      user_info, df_item_info)

utils.print_stats(im)

# account for previous adjustment of item indices
item_rename_dict_reverse = {v: k for k, v in item_rename_dict.items()}
imap = imap.assign(old=imap["old"].replace(item_rename_dict_reverse))

assert im.shape[0] == len(uinfo)

# store results
storage_dir = os.path.join(data_dir, f"user_gte_{min_interactions_user}_item_gte_{min_interactions_item}_gender_age_loc" + sampled_suffix)
utils.store_results(storage_dir, im, uinfo, attribute_descriptions, iinfo, umap, imap)

In [18]:
# drop all data where either age or loc is not given
mask_given = mask_gender_given
mask_given &= mask_country_given
mask_given &= mask_age_given

user_info = df_user_info[mask_given]
im = interaction_matrix[mask_given, :]

im, umap, imap, uinfo, iinfo = utils.ensure_min_interactions(im, 
                                                      min_interactions_user, min_interactions_item,
                                                      user_info, df_item_info)

utils.print_stats(im)

# account for previous adjustment of item indices
item_rename_dict_reverse = {v: k for k, v in item_rename_dict.items()}
imap = imap.assign(old=imap["old"].replace(item_rename_dict_reverse))

assert im.shape[0] == len(uinfo)

# store results
storage_dir = os.path.join(data_dir, f"user_gte_{min_interactions_user}_item_gte_{min_interactions_item}_gender_age_loc" + sampled_suffix)
utils.store_results(storage_dir, im, uinfo, attribute_descriptions, iinfo, umap, imap)

Final shape of interactions matrix is (7603, 62617)
==> 7603 users and 62617 items are remaining.

Number of interactions is 1845963,
which leads to a density of 0.0039.
