# Notebook to filter the LFM2b dataset

Prerequisites:
Download the files 
- [listening-events.tsv.bz2](http://www.cp.jku.at/datasets/lfm-2b/chiir/listening-events.tsv.bz2)
- [users.tsv.bz2](http://www.cp.jku.at/datasets/lfm-2b/chiir/users.tsv.bz2)  

from http://www.cp.jku.at/datasets/LFM-2b/ to your preferred directory and extract them.

In case you do not want to reduce the time span of the listening events, unlike us,
download [listening-counts.tsv.bz2](http://www.cp.jku.at/datasets/lfm-2b/chiir/listening-counts.tsv.bz2) instead.

As the dataset is quite big (~90GB extracted), we filter all listening events that are not relevant for our use-case and will proceed with the preprocessing in another notebook.

Our main objective is to reduce the memory consumption when loading the dataset, such that anyone can create the dataset for themselves. We are aware that this leads to longer processing times, however, as this has to be done only once, this should be an acceptible tradeoff. 

To do so, please execute the following command to split the dataset into multiple smaller files:  
```/usr/bin/split listening-events.tsv -l 20000000```  
and move all files into a separate folder.

The current configuration requieres about 3 GB of free memory and takes ~2h to run.

In [1]:
data_dir = "/media/data1/Tmp/lfm"
# data_dir = r"F:\Temp\lfm2b data"
splits_dir = "/media/data1/Tmp/lfm/splits"
filtered_splits_dir = "/media/data1/Tmp/lfm/splits_filtered_2019"
# filtered_splits_dir = r"F:\Temp\lfm2b data\splits_filtered_2019"

In [2]:
import os
import csv
import time
import glob
import numpy as np
import pandas as pd
import pickle as pkl
from tqdm import tqdm
from datetime import datetime,  timedelta
from scipy import sparse as sp
from collections import defaultdict

### Settings

In [3]:
# define time range of which to take interactions
min_time = datetime.fromisoformat('2019-01-01')
max_time = datetime.fromisoformat('2019-12-31')

def date_parser(time_str):
    return datetime.strptime(time_str, "%Y-%m-%d %H:%M:%S")

def time_ok(time_str):
    return min_time <= datetime.strptime(time_str, "%Y-%m-%d %H:%M:%S") <= max_time

# filter users & tracks with too less interaction
min_interactions_user = 10
min_interactions_item = 10
min_interaction_count = 1

### Preprocess users

In [4]:
df_users = pd.read_csv(os.path.join(data_dir, "users.tsv"), 
                        sep="\t", engine="python", encoding='latin-1',
                        header=0)
df_users.columns = ["UserID", "Country", "Age", "Gender", "RegistrationDate"]

# keep users for which at least one attribute is available
user_mask = ~df_users["Country"].isna()
user_mask |= df_users["Gender"].isin(["f", "m"])
user_mask |= df_users["Age"] > 0

df_users = df_users[user_mask]
user_ids = df_users["UserID"]

# store old user indices and adjust user ids
df_users.reset_index(drop=True, inplace=True)
user_mapping = {a: b for a, b in zip(user_ids, df_users.index)}
df_users = df_users.assign(UserID = df_users.index)

df_users.to_csv(os.path.join(data_dir, "users-demo.tsv"), sep="\t", index=False)
df_users.head()

Unnamed: 0,UserID,Country,Age,Gender,RegistrationDate
0,0,UK,31,m,2002-12-28 01:00:00
1,1,US,43,m,2003-04-15 02:00:00
2,2,UK,35,m,2002-10-29 01:00:00
3,3,BR,31,m,2003-07-20 02:00:00
4,4,,51,m,2003-07-21 02:00:00


### Filter interactions
We first select only the users for which some demographic information is available. Moreover, we drop all interactions that did happen in our previously defined time frame. The results will again be stored in separate files.

In [5]:
os.makedirs(filtered_splits_dir, exist_ok=True)

first = True
fcount = 0
header = ["user_id", "track_id", "album_id", "timestamp"]

item_indices = set()

files = list(sorted(glob.glob(os.path.join(splits_dir, "*"))))
for f in tqdm(files, desc="Parsing files"):
    if first:
        df = pd.read_csv(f, sep="\t", engine="c")
        first = False
    else:
        df = pd.read_csv(f, sep="\t", names=header, engine="c")
    
    # Filter by user id and timestamp
    uid_mask = df["user_id"].isin(user_ids)
    df["timestamp"] = pd.to_datetime(df['timestamp'], format="%Y-%m-%d %H:%M:%S", errors="coerce")
    time_mask = (min_time <= df["timestamp"]) & (df["timestamp"] <= max_time)
        
    mask = uid_mask & time_mask
    if mask.sum() > 0:
        df = df.loc[mask, ["user_id", "track_id"]]
        df["user_id"] = df["user_id"].map(user_mapping)
        
        item_indices = item_indices.union(df["track_id"].unique())
        
        path = os.path.join(filtered_splits_dir, f"filtered_part_{fcount}.tsv")
        df.to_csv(path, sep="\t", index=False)
        fcount += 1

Parsing files: 100%|██████████████████████████| 101/101 [23:20<00:00, 13.87s/it]


### Generate interaction matrix

In [5]:
# gather item indices (in case the cell above is not executed beforehand)
item_indices = set()

files = sorted(glob.glob(os.path.join(filtered_splits_dir, "*.tsv")))
for i, f in enumerate(tqdm(files, desc="Parsing files")):
    df = pd.read_csv(f, sep="\t")
    item_indices = item_indices.union(df["track_id"].unique())

Parsing files: 100%|████████████████████████████| 47/47 [00:31<00:00,  1.51it/s]


In [6]:
n_users = len(df_users)
n_items = len(item_indices)

In [7]:
item_map = {idx: i for i, idx in enumerate(sorted(item_indices))} 
item_map_rev = {v: k for k, v in item_map.items()}

In [8]:
# generate interaction matrix for the individual files
files = sorted(glob.glob(os.path.join(filtered_splits_dir, "*.tsv")))
for i, f in enumerate(tqdm(files, desc="Parsing files")):
    df = pd.read_csv(f, sep="\t")
    
    d = defaultdict(lambda: 0)
    for uid, iid in zip(df["user_id"], df["track_id"]):
        d[(uid, iid)] += 1
        
    uids, iids = zip(*d.keys())
    iids = [item_map[i] for i in iids]
    data = list(d.values())
    
    interaction_matrix = sp.csr_matrix((data, (uids, iids)), 
                                       (n_users, n_items))

    sp.save_npz(os.path.join(filtered_splits_dir, f"interaction_matrix_part_{i}.npz"), interaction_matrix)

Parsing files: 100%|████████████████████████████| 47/47 [04:26<00:00,  5.68s/it]


In [9]:
# process the items in chunks when loading the previously stored matrix parts
items_per_chunk = 500_000

files = sorted(glob.glob(os.path.join(filtered_splits_dir, "*.npz")))

im_full = []
valid_item_indices = []
chunk_steps = list(range(0, n_items, items_per_chunk))
for i in tqdm(chunk_steps, desc="Processing item chunks"):
    interaction_matrix = None
    for f in files:
        im_part = sp.load_npz(f)[:, i : i+items_per_chunk]
        if interaction_matrix is None:
            interaction_matrix = sp.csr_matrix(im_part.shape, dtype=int)
        interaction_matrix += im_part

    # Ignore "misclick" interactions
    interaction_matrix.data -= min_interaction_count
    interaction_matrix.eliminate_zeros()

    # and binarize result
    interaction_matrix.data[:] = 1
        
    # determine items with enough interactions. this should already reduce the number of items considerably
    # allowing us to store them in memory
    valid_items = np.argwhere(np.array(interaction_matrix.sum(axis=0)).flatten() >= min_interactions_item).flatten()
    valid_item_indices.append(valid_items + i)
    im_full.append(interaction_matrix[:, valid_items])
    
im_full = sp.hstack(im_full).tocsr()
valid_item_indices = np.concatenate(valid_item_indices)

Processing item chunks: 100%|███████████████████| 18/18 [00:28<00:00,  1.56s/it]


In [27]:
valid_item_map = {ind: i for i, ind in enumerate(valid_item_indices)}
filtered_item_mapping = {item_map_rev[k]: v for k, v in valid_item_map.items()}

original_item_indices = set(filtered_item_mapping.keys())
    
n_items = 0
with open(os.path.join(data_dir, "tracks.tsv"), "r") as csv_input:
    header = next(csv_input)[:-1].split("\t") # remove whitespace
    
    with open(os.path.join(data_dir, "tracks_filtered.tsv"), "w") as csv_output:
        writer = csv.writer(csv_output, delimiter="\t")
        writer.writerow(header)
        
        for row in tqdm(csv_input):
            (iid, artist, track) = row[:-1].split("\t")
            iid = int(iid)
            if iid in original_item_indices:
                writer.writerow((filtered_item_mapping[iid], artist, track))
                n_items += 1
                
print(f"Kept {n_items} items.")

50813373it [00:38, 1318153.45it/s]

Kept 297826 items.





In [32]:
# Also store track mappings, as the ids may later be used to retrieve genre information
ks, vs = zip(*filtered_item_mapping.items())
df_item_mapping = pd.DataFrame.from_dict({"old": ks, "new": vs})
df_item_mapping.to_csv(os.path.join(data_dir, "tracks_filtered_mapping.tsv"), sep="\t", index=False)

In [28]:
# moreover, determine which users are lacking interactions.
valid_users = np.argwhere(np.array(im_full.sum(axis=1)).flatten() >= min_interactions_user).flatten()
im_full = im_full[valid_users, :]
sp.save_npz(os.path.join(data_dir, "interaction_matrix_filtered.npz"), im_full)

df_users = df_users.loc[valid_users, :]
df_users.reset_index(drop=True, inplace=True)
df_users = df_users.assign(UserID=df_users.index)
df_users.to_csv(os.path.join(data_dir, "users_demo_filtered.tsv"), sep="\t", index=False)

print("Final matrix")
display(im_full)

Final matrix


<13593x297826 sparse matrix of type '<class 'numpy.int64'>'
	with 10231472 stored elements in Compressed Sparse Row format>

In [29]:
display(df_users)

Unnamed: 0,UserID,Country,Age,Gender,RegistrationDate
0,0,UK,31,m,2002-12-28 01:00:00
1,1,UK,35,m,2002-10-29 01:00:00
2,2,BR,31,m,2003-07-20 02:00:00
3,3,AT,28,n,2003-07-23 02:00:00
4,4,UK,48,m,2003-02-18 21:44:13
...,...,...,...,...,...
13588,13588,FI,-1,m,2012-05-27 19:05:13
13589,13589,NL,-1,f,2012-05-28 17:50:26
13590,13590,PL,16,f,2012-05-28 19:59:37
13591,13591,BY,19,f,2012-07-19 22:07:27
