# Notebook to preprocess the Book-Crossing dataset
http://www2.informatik.uni-freiburg.de/~cziegler/BX/

Note that this notebook serves as a good basis for preprocessing datasets where **only partial demographic information** of users is available.

In [1]:
import os
import json
import utils
import numpy as np
import pandas as pd
from scipy import sparse as sp
import matplotlib.pyplot as plt

In [2]:
#data_dir = r"D:\data\datasets\book-crossing"
data_dir = r"/media/data/Datasets/book-crossing"

In [3]:
df_ratings = pd.read_csv(os.path.join(data_dir, "BX-Book-Ratings.csv"), 
                        sep=";", engine="c", encoding='latin-1')
df_ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [4]:
# drop ISBN and replace it with a proper item ID
isbns = df_ratings["ISBN"].unique()
replacement_dict = {isbn: i for i, isbn in enumerate(isbns)}
item_ids = [replacement_dict[isbn]for isbn in df_ratings["ISBN"]]
df_ratings["Item-ID"] = item_ids
df_ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Item-ID
0,276725,034545104X,0,0
1,276726,0155061224,5,1
2,276727,0446520802,0,2
3,276729,052165615X,3,3
4,276729,0521795028,6,4


In [5]:
df_ratings["Book-Rating"].value_counts()

0     716109
8     103736
10     78610
7      76457
9      67541
5      50974
6      36924
4       8904
3       5996
2       2759
1       1770
Name: Book-Rating, dtype: int64

In [6]:
df_users = pd.read_csv(os.path.join(data_dir, "BX-Users.csv"), 
                       sep=";", engine="c", encoding='latin-1')
df_users.head()

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [7]:
# Due to the number of possible locations, we will limit ourselves to the country only
df_users["Location"] = df_users.apply(lambda x: x.Location.split(", ")[-1] if x.Location else "", axis=1)
display(df_users["Location"].value_counts())
display(df_users.head())

usa               139711
canada             21658
united kingdom     18538
germany            17043
spain              13147
                   ...  
r.o.c.                 1
neverland              1
the gambia             1
montevideo,            1
uyo                    1
Name: Location, Length: 1152, dtype: int64

Unnamed: 0,User-ID,Location,Age
0,1,usa,
1,2,usa,18.0
2,3,russia,
3,4,portugal,17.0
4,5,united kingdom,


In [8]:
# Moreover, as the didn't seem to be any input validation in place when collecting the data,
# we will set the users' attributes to default if they 
#    - don't make sense (also in case of additional punctations, etc..),
#    - belong to a group with only a few users (for nominal data)

min_n_users_per_group = 200
bad_location_groups = [k for k, v in df_users["Location"].value_counts().items() if v < min_n_users_per_group]
bad_location_groups += ["", ","]

bad_location_users = df_users["Location"].isin(bad_location_groups)
print(f"{bad_location_users.sum()} of {len(bad_location_users)} user locations reset to default")

df_users.loc[bad_location_users, "Location"] = ""
df_users

10689 of 278858 user locations reset to default


Unnamed: 0,User-ID,Location,Age
0,1,usa,
1,2,usa,18.0
2,3,,
3,4,portugal,17.0
4,5,united kingdom,
...,...,...,...
278853,278854,usa,
278854,278855,united kingdom,50.0
278855,278856,canada,
278856,278857,usa,


In [9]:
min_age = 10
max_age = 100
bad_age_users = (df_users["Age"] < min_age) | (max_age < df_users["Age"])

print(f"{bad_age_users.sum()} of {len(bad_age_users)} user ages reset to default")
df_users.loc[bad_age_users, "Age"] = np.NaN
df_users

1435 of 278858 user ages reset to default


Unnamed: 0,User-ID,Location,Age
0,1,usa,
1,2,usa,18.0
2,3,,
3,4,portugal,17.0
4,5,united kingdom,
...,...,...,...
278853,278854,usa,
278854,278855,united kingdom,50.0
278855,278856,canada,
278856,278857,usa,


In [10]:
# We group users in age categories to make the data easier to handle
age_categories = [18, 25, 35, 45, 55, 100]

assigned_age_cat = np.zeros(shape=(len(df_users),) , dtype=int)
for cat in age_categories:
    assigned_age_cat += (df_users["Age"] > cat).astype(int)

assigned_age_cat[df_users["Age"].isna()] = -1
df_users["Age"] = assigned_age_cat
df_users["Age"].value_counts()

-1    112197
 2     49343
 1     33919
 3     31014
 4     22519
 0     14944
 5     14922
Name: Age, dtype: int64

In [11]:
# Extracted the README file
attribute_descriptions = {
    "age": {
        -1: "unknown",
        0:  "Under 18",
        1:  "18-24",
        2:  "25-34",
        3:  "35-44",
        4:  "45-54",
        5:  "55+"
    },
    # just in case scripts require for each attribute to have a description
    "location": {d:d for d in df_users["Location"].unique()}
}
attribute_descriptions["location"][""] = "undefined"

In [12]:
n_users = len(df_users)
n_items = len(df_ratings["ISBN"].unique())
n_ratings = len(df_ratings)

density = n_ratings / (n_items * n_users)

# Show some statistics about the dataset
print("Number of users:", n_users)
print("Number of items:", n_items)

print("\nNumber of interactions:", n_ratings)
print(f"Density: {density:.6f}")

Number of users: 278858
Number of items: 340556

Number of interactions: 1149780
Density: 0.000012


### Data preparation
For our use-case our end-result should be binary interaction matrix, where ```1``` denotes that a user
rated an item, and ```0``` that they did not.

In [13]:
# get user and movie ids from ratings df, -1 as the first user originally received the id 1
user_ids = df_ratings["User-ID"] - 1
item_ids = df_ratings["Item-ID"]
values = np.ones(len(user_ids))

# create (sparse) interaction matrix
interaction_matrix = sp.csr_matrix((values, (user_ids, item_ids)), shape=(n_users, n_items))
display(interaction_matrix.shape)

# create new user file for our usage
df_user_info = df_users[["User-ID", "Age", "Location"]].copy()
df_user_info["User-ID"] -= 1 # move start index from 1 to 0
df_user_info["Location"] = df_user_info["Location"].apply(lambda item: item.lower())

# change column names to camel-case & drop hyphons
rn = {cn: cn[0].lower() + cn[1:].replace("-", "") for cn in df_user_info.columns}
df_user_info.rename(rn, inplace=True, axis=1)

# check whether all interactions were actually kept
print("Number of interactions (again):", len(user_ids))

(278858, 340556)

Number of interactions (again): 1149780


In [14]:
# store results
storage_dir = os.path.join(data_dir, "full")
utils.store_results(storage_dir, interaction_matrix, df_user_info, attribute_descriptions)

In [15]:
# filter users & tracks with too less interaction
min_interactions_user = 5
min_interactions_item = 5

im_all, user_info_all = utils.ensure_min_interactions(interaction_matrix, df_user_info, 
                                                      min_interactions_user, min_interactions_item)

utils.print_stats(im_all)

# store results
storage_dir = os.path.join(data_dir, f"user_gte_{min_interactions_user}_movie_gte_{min_interactions_item}")
utils.store_results(storage_dir, im_all, user_info_all, attribute_descriptions)

Final shape of interactions matrix is (15798, 38093)
==> 15798 users and 38093 items are remaining.

Number of interactions is 585579,
which leads to a density of 0.0010.


In [16]:
# drop all data where no location info is given
mask_location_given = df_user_info["location"] != ""
user_info_loc = df_user_info[mask_location_given]
im_loc = interaction_matrix[mask_location_given, :]

im_loc, user_info_loc = utils.ensure_min_interactions(im_loc, user_info_loc, 
                                                      min_interactions_user, min_interactions_item)

utils.print_stats(im_loc)

# store results
storage_dir = os.path.join(data_dir, f"user_gte_{min_interactions_user}_movie_gte_{min_interactions_item}_loc")
utils.store_results(storage_dir, im_loc, user_info_loc, attribute_descriptions)

Final shape of interactions matrix is (14894, 36327)
==> 14894 users and 36327 items are remaining.

Number of interactions is 553766,
which leads to a density of 0.0010.


In [17]:
# drop all data where no age info is given
mask_age_given = df_user_info["age"] != -1
user_info_age = df_user_info[mask_age_given]
im_age = interaction_matrix[mask_age_given, :]

im_age, user_info_age = utils.ensure_min_interactions(im_age, user_info_age, 
                                                      min_interactions_user, min_interactions_item)

utils.print_stats(im_age)

# store results
storage_dir = os.path.join(data_dir, f"user_gte_{min_interactions_user}_movie_gte_{min_interactions_item}_age")
utils.store_results(storage_dir, im_age, user_info_age, attribute_descriptions)

Final shape of interactions matrix is (10215, 27977)
==> 10215 users and 27977 items are remaining.

Number of interactions is 396417,
which leads to a density of 0.0014.


In [18]:
# drop all data where either age or loc is not given
mask_given = df_user_info["age"] != -1
mask_given &= df_user_info["location"] != ""

user_info = df_user_info[mask_given]
im = interaction_matrix[mask_given, :]

im, user_info = utils.ensure_min_interactions(im, user_info, 
                                              min_interactions_user, min_interactions_item)

utils.print_stats(im)

# store results
storage_dir = os.path.join(data_dir, f"user_gte_{min_interactions_user}_movie_gte_{min_interactions_item}_age_loc")
utils.store_results(storage_dir, im, user_info, attribute_descriptions)

Final shape of interactions matrix is (9828, 27415)
==> 9828 users and 27415 items are remaining.

Number of interactions is 386566,
which leads to a density of 0.0014.
