# Notebook to preprocess the MovieLens100k dataset
https://grouplens.org/datasets/movielens/100k/

Note that this notebook serves as a good basis for preprocessing datasets where **all demographic information** for all users is available.

In [1]:
import os
import json
import utils
import numpy as np
import pandas as pd
from scipy import sparse as sp

In [2]:
data_dir = "/media/data/Datasets/ml-100k"

In [3]:
df_ratings = pd.read_csv(os.path.join(data_dir, "u.data"), 
                        sep="\t", engine="python", encoding='latin-1',
                        names=["UserID", "ItemID", "Rating", "Timestamp"])
df_ratings.head()

Unnamed: 0,UserID,ItemID,Rating,Timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [4]:
df_users = pd.read_csv(os.path.join(data_dir, "u.user"), 
                        sep="|", engine="python", encoding='latin-1',
                        names=["UserID", "Age", "Gender", "Occupation", "Zip-code"])

# change gender labels to lower case
df_users.assign(Gender=[g.lower() for g in df_users["Gender"]])
df_users.head()

Unnamed: 0,UserID,Age,Gender,Occupation,Zip-code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [5]:
# We group users in age categories to make the data easier to handle
age_categories = [18, 25, 35, 45, 55, 100]

assigned_age_cat = np.zeros(shape=(len(df_users),) , dtype=int)
for cat in age_categories:
    assigned_age_cat += (df_users["Age"] > cat).astype(int)

assigned_age_cat[df_users["Age"].isna()] = -1
df_users["Age"] = assigned_age_cat
df_users["Age"].value_counts()

2    299
1    218
3    182
4    138
0     54
5     52
Name: Age, dtype: int64

In [6]:
# Extracted the README file
attribute_descriptions = {
    "gender": {
        "m": "male",
        "f": "female"
    },
    "age": {
        0:  "Under 18",
        1:  "18-24",
        2:  "25-34",
        3:  "35-44",
        4:  "45-54",
        5:  "55+"
    },
    
    # just in case scripts require for each attribute to have a description
    "occupation": {d:d for d in df_users["Occupation"].unique()}
}

In [7]:
item_ids = df_ratings["ItemID"].unique()

n_users = len(df_users)
n_items = len(item_ids)
n_ratings = len(df_ratings)
density = n_ratings / (n_items * n_users)

# Show some statistics about the dataset
print("Number of users:", n_users)
print("Number of items:", n_items)
print("\nCounts of users per gender:")
print(df_users["Gender"].value_counts())

print("\nNumber of interactions:", n_ratings)
print(f"Density: {density:.4f}")

Number of users: 943
Number of items: 1682

Counts of users per gender:
M    670
F    273
Name: Gender, dtype: int64

Number of interactions: 100000
Density: 0.0630


### Data preparation
For our use-case our end-result should be binary interaction matrix, where ```1``` denotes that a user
rated an item, and ```0``` that they did not.

In [8]:
# some items might be missing, lets therefore adjust / re-enumerate the indices
item_rename_dict = {iid: i for i, iid in enumerate(item_ids)}
df_ratings["ItemID"] = df_ratings["ItemID"].replace(item_rename_dict)

item_ids = list(range(len(item_ids)))

In [9]:
# get user and item ids from ratings df, -1 as the first user originally received the id 1
user_ids = df_ratings["UserID"] - 1
item_ids = df_ratings["ItemID"]
values = np.ones(len(user_ids))

interaction_matrix = sp.csr_matrix((values, (user_ids, item_ids)), shape=(n_users, n_items))
display(interaction_matrix.shape)

# store results
storage_dir = os.path.join(data_dir, "full")
os.makedirs(storage_dir, exist_ok=True)
sp.save_npz(os.path.join(storage_dir, "interactions.npz"), interaction_matrix)

# check whether all interactions were actually kept
print("Number of interactions (again):", interaction_matrix.toarray().sum())

(943, 1682)

Number of interactions (again): 100000.0


In [10]:
# create new user file for our usage
df_user_info = df_users[["UserID", "Gender", "Age", "Occupation"]].copy()
df_user_info["UserID"] -= 1 # move start index from 1 to 0
df_user_info["Gender"] = df_user_info["Gender"].apply(lambda item: item.lower())

# change column names to camel-case
rn = {cn: cn[0].lower() + cn[1:] for cn in df_user_info.columns}
df_user_info.rename(rn, inplace=True, axis=1)

df_user_info.to_csv(os.path.join(storage_dir, "user_info.csv"), index=False)
df_user_info.head()

with open(os.path.join(storage_dir, "attribute_descriptions.json"), "w") as fh:
    json.dump(attribute_descriptions, fh, indent="\t")

In [11]:
# filter users & tracks with too less interaction
min_interactions_user = 5
min_interactions_item = 5

im_all, user_info_all = utils.ensure_min_interactions(interaction_matrix, df_user_info, 
                                                      min_interactions_user, min_interactions_item)

utils.print_stats(im_all)

# store results
storage_dir = os.path.join(data_dir, f"user_gte_{min_interactions_user}_item_gte_{min_interactions_item}")
utils.store_results(storage_dir, im_all, user_info_all, attribute_descriptions)

Final shape of interactions matrix is (943, 1349)
==> 943 users and 1349 items are remaining.

Number of interactions is 99287,
which leads to a density of 0.0780.
