# Notebook to preprocess the MovieLens1M dataset
https://grouplens.org/datasets/movielens/1m/

Note that this notebook serves as a good basis for preprocessing datasets where **all demographic information** for all users is available.

In [1]:
import os
import json
import utils
import numpy as np
import pandas as pd
from scipy import sparse as sp

In [2]:
# Directory that contains the extracted MovieLens-1M files
# (ratings.dat, users.dat, movies.dat); all results are written below it as well.
# The default is machine-specific: set the ML1M_DATA_DIR environment variable
# before starting the kernel to point the notebook at another location
# instead of editing this cell.
data_dir = os.environ.get("ML1M_DATA_DIR", r"/media/data/Studium/CP_Institut/FairnessDatasets/ml-1m")

In [3]:
# ratings.dat is read without a header row (column names are supplied here);
# its multi-character "::" delimiter needs pandas' python parsing engine
ratings_file = os.path.join(data_dir, "ratings.dat")
df_ratings = pd.read_csv(
    ratings_file,
    names=["UserID", "ItemID", "Rating", "Timestamp"],
    sep="::",
    engine="python",
    encoding="latin-1",
)
df_ratings.head()

Unnamed: 0,UserID,ItemID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [4]:
# how often each rating value occurs in the raw data
rating_counts = df_ratings["Rating"].value_counts()
rating_counts

4    348971
3    261197
5    226310
2    107557
1     56174
Name: Rating, dtype: int64

In [5]:
# users.dat is read without a header row (column names are supplied here);
# its multi-character "::" delimiter needs pandas' python parsing engine
users_file = os.path.join(data_dir, "users.dat")
df_users = pd.read_csv(
    users_file,
    names=["UserID", "Gender", "Age", "Occupation", "Zip-code"],
    sep="::",
    engine="python",
    encoding="latin-1",
)
df_users.head()

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [6]:
# Human-readable labels for the coded user attributes, extracted from the README file.
# The keys are the codes as they appear in the user data (gender in lower case).
attribute_descriptions = {
    "gender": {"m": "male", "f": "female"},
    # each key is the code that stands for the whole age bracket
    "age": dict([
        (1, "Under 18"),
        (18, "18-24"),
        (25, "25-34"),
        (35, "35-44"),
        (45, "45-49"),
        (50, "50-55"),
        (56, "56+"),
    ]),
    # occupation codes are consecutive, so the list position is the code (0..20)
    "occupation": dict(enumerate([
        "other / not specified",
        "academic/educator",
        "artist",
        "clerical/admin",
        "college/grad student",
        "customer service",
        "doctor/health care",
        "executive/managerial",
        "farmer",
        "homemaker",
        "K-12 student",
        "lawyer",
        "programmer",
        "retired",
        "sales/marketing",
        "scientist",
        "self-employed",
        "technician/engineer",
        "tradesman/craftsman",
        "unemployed",
        "writer",
    ])),
}

In [7]:
# movies.dat is read without a header row (column names are supplied here);
# its multi-character "::" delimiter needs pandas' python parsing engine
movies_file = os.path.join(data_dir, "movies.dat")
df_items = pd.read_csv(
    movies_file,
    names=["ItemID", "Title", "Genres"],
    sep="::",
    engine="python",
    encoding="latin-1",
)
df_items.head()

Unnamed: 0,ItemID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [8]:
# Items are collected from the ratings, so only movies that received
# at least one rating are counted
item_ids = df_ratings["ItemID"].unique()

n_users, n_items, n_ratings = len(df_users), len(item_ids), len(df_ratings)
# share of all possible (user, item) pairs for which a rating exists
density = n_ratings / (n_items * n_users)

# Show some statistics about the dataset
print("Number of users:", n_users)
print("Number of items:", n_items)

print("\nCounts of users per gender:")
print(df_users["Gender"].value_counts())

print("\nNumber of interactions:", n_ratings)
print(f"Density: {density:.4f}")

Number of users: 6040
Number of items: 3706

Counts of users per gender:
M    4331
F    1709
Name: Gender, dtype: int64

Number of interactions: 1000209
Density: 0.0447


### Data preparation
For our use case, the end result should be a binary interaction matrix, where ```1``` denotes that a user
rated an item with at least 4 stars (lower ratings are discarded in the next cell), and ```0``` that they did not.

In [9]:
# Treat only ratings of 4 or more stars as positive feedback (items the users
# really liked) and discard everything else
liked_mask = df_ratings["Rating"] >= 4
df_ratings = df_ratings.loc[liked_mask].reset_index(drop=True)
df_ratings["Rating"].value_counts()

4    348971
5    226310
Name: Rating, dtype: int64

In [10]:
# Some items might be missing, i.e., the original item ids may have gaps.
# Re-enumerate them to a dense 0..n-1 range (ordered by original id) so that
# they can be used directly as column indices of the interaction matrix.
item_rename_dict = {iid: i for i, iid in enumerate(sorted(item_ids))}

# Series.map does a single dictionary lookup per row, whereas Series.replace with a
# dictionary of several thousand entries is far slower on this many rows.
# map() yields NaN for ids that are not in the dictionary, hence the checks below.
df_ratings = df_ratings.assign(ItemID=df_ratings["ItemID"].map(item_rename_dict))
assert df_ratings["ItemID"].notna().all(), "ratings contain item ids without a new index"

# drop the movies that do not occur in item_ids before translating their ids
df_items = df_items[df_items["ItemID"].isin(set(item_ids))]
df_items = df_items.assign(ItemID=df_items["ItemID"].map(item_rename_dict))
assert df_items["ItemID"].notna().all(), "item table contains ids without a new index"

# from here on the item ids are simply 0..n-1
item_ids = list(range(len(item_ids)))

In [11]:
# get user and item ids from ratings df, -1 as the first user originally received the id 1
# NOTE(review): this assumes the user ids are consecutive (1..n_users) — confirm for other datasets
user_ids = df_ratings["UserID"] - 1
item_ids = df_ratings["ItemID"]
values = np.ones(len(user_ids))

# rows are users, columns are items; an entry of 1 marks a kept (user, item) rating
interaction_matrix = sp.csr_matrix((values, (user_ids, item_ids)), shape=(n_users, n_items))
display(interaction_matrix.shape)

# store results
storage_dir = os.path.join(data_dir, "full")
os.makedirs(storage_dir, exist_ok=True)
sp.save_npz(os.path.join(storage_dir, "interactions.npz"), interaction_matrix)

# check whether all interactions were actually kept; summing the sparse matrix
# directly avoids materialising the full dense (n_users x n_items) array first
print("Number of interactions (again):", interaction_matrix.sum())

(6040, 3706)

Number of interactions (again): 575281.0


In [12]:
# create new user file for our usage (the zip code column is not carried over)
df_user_info = df_users[["UserID", "Gender", "Age", "Occupation"]].copy()
df_user_info["UserID"] -= 1 # move start index from 1 to 0
# lower-case the gender codes so that they match the keys of attribute_descriptions["gender"]
df_user_info["Gender"] = df_user_info["Gender"].str.lower()

# change column names to camel-case (only the first character is lower-cased)
rn = {cn: cn[0].lower() + cn[1:] for cn in df_user_info.columns}
df_user_info = df_user_info.rename(columns=rn)

df_user_info.to_csv(os.path.join(storage_dir, "user_info.csv"), index=False)

# note: JSON only knows string keys, so the integer age / occupation codes
# are written as strings
with open(os.path.join(storage_dir, "attribute_descriptions.json"), "w") as fh:
    json.dump(attribute_descriptions, fh, indent="\t")

# must be the last expression of the cell, otherwise the preview is not rendered
df_user_info.head()

In [13]:
# create new item file for our usage

# change column names to camel-case (only the first character is lower-cased);
# rename() returns a new frame, so df_items itself stays untouched
rn = {col: col[0].lower() + col[1:] for col in df_items.columns}
df_item_info = df_items.rename(columns=rn)

item_info_file = os.path.join(storage_dir, "item_info.csv")
df_item_info.to_csv(item_info_file, index=False)
df_item_info.head()

Unnamed: 0,itemID,title,genres
0,0,Toy Story (1995),Animation|Children's|Comedy
1,1,Jumanji (1995),Adventure|Children's|Fantasy
2,2,Grumpier Old Men (1995),Comedy|Romance
3,3,Waiting to Exhale (1995),Comedy|Drama
4,4,Father of the Bride Part II (1995),Comedy


In [14]:
# persist the translation from the original item ids ("old")
# to the re-enumerated ones ("new")
df_item_mapping = pd.DataFrame(
    {
        "old": list(item_rename_dict.keys()),
        "new": list(item_rename_dict.values()),
    }
)
item_mapping_file = os.path.join(storage_dir, "item_mapping.csv")
df_item_mapping.to_csv(item_mapping_file, index=False)
df_item_mapping.head()

Unnamed: 0,old,new
0,1,0
1,2,1
2,3,2
3,4,3
4,5,4


In [15]:
# filter users & items with too few interactions
# (thresholds are passed on to utils.ensure_min_interactions below)
min_interactions_user = 5
min_interactions_item = 5

# The helper returns five objects: presumably the filtered interaction matrix,
# user / item index mappings and the filtered user / item info frames
# — TODO confirm against utils.ensure_min_interactions
im_all, umap_all, imap_all, uinfo_all, iinfo_all = utils.ensure_min_interactions(interaction_matrix, 
                                                      min_interactions_user, min_interactions_item,
                                                      df_user_info, df_item_info)

utils.print_stats(im_all)

# account for previous adjustment of item indices: translate the "old" column of the
# item mapping back to the original item ids by inverting item_rename_dict
# NOTE(review): Series.replace with a large dict is slow; Series.map would be faster,
# provided every value in imap_all["old"] is a key of the reversed dict — confirm
item_rename_dict_reverse = {v: k for k, v in item_rename_dict.items()}
imap_all = imap_all.assign(old=imap_all["old"].replace(item_rename_dict_reverse))

# store results in a directory named after the applied thresholds
storage_dir = os.path.join(data_dir, f"user_gte_{min_interactions_user}_item_gte_{min_interactions_item}")
utils.store_results(storage_dir, im_all, uinfo_all, attribute_descriptions, iinfo_all, umap_all, imap_all)

Final shape of interactions matrix is (6034, 3125)
==> 6034 users and 3125 items are remaining.

Number of interactions is 574376,
which leads to a density of 0.0305.
