In [1]:
# Configuration parameters
# TARGET_RATINGS: number of ratings the reduced dataset should contain.
TARGET_RATINGS = 100000
# FILE_SUFFIX tags the train/test output file names with the dataset size.
# It is derived from TARGET_RATINGS so the two cannot drift apart when the
# target is changed: whole thousands are abbreviated (100000 -> "_100K"),
# any other value is written out in full.
FILE_SUFFIX = (
    f"_{TARGET_RATINGS // 1000}K"
    if TARGET_RATINGS % 1000 == 0
    else f"_{TARGET_RATINGS}"
)

In [2]:
# Import libraries
import os

import numpy as np
import pandas as pd

In [3]:
# Load data
# Read the raw ratings CSV with explicit per-column dtypes
# (32-bit numerics for the id and rating, pandas string dtype for the user name).
rating_dtypes = {"BGGId": "int32", "Rating": "float32", "Username": "string"}
df = pd.read_csv("../../data/raw/user_ratings.csv", dtype=rating_dtypes)


In [4]:
# Data cleaning
# Each step first reports what it found, then removes the offending rows.
user_item_key = ['Username', 'BGGId']

# 1) Duplicates. keep=False flags every row that belongs to a repeated
#    (Username, BGGId) pair, so the reported count includes all copies;
#    only the last copy of each pair is kept afterwards.
n_exact_duplicates = df.duplicated().sum()
n_pair_duplicates = df.duplicated(subset=user_item_key, keep=False).sum()
print(f"Original rows: {len(df)}")
print(f"Duplicated rows: {n_exact_duplicates}")
print(f"Duplicated user ratings: {n_pair_duplicates}")
df = df.drop_duplicates(subset=user_item_key, keep='last')

# 2) Missing values. Nulls are reported for every column, but only rows
#    without a Username are dropped.
null_counts = df.isnull().sum()
print(f"Missing values:\n{null_counts}")
df = df.dropna(subset=['Username'])

# 3) Rating range. Summarise the ratings, then discard anything below 0.5.
rating_summary = df['Rating'].describe()
n_below_minimum = (df['Rating'] < 0.5).sum()
print(f"Rating statistics:\n{rating_summary}")
print(f"Ratings < 0.5: {n_below_minimum}")
df = df[df["Rating"] >= 0.5]

Original rows: 18942215
Duplicated rows: 0
Duplicated user ratings: 64120
Missing values:
BGGId        0
Rating       0
Username    63
dtype: int64
Rating statistics:
count    1.890946e+07
mean     7.131226e+00
std      1.543284e+00
min      1.000000e-04
25%      6.000000e+00
50%      7.000000e+00
75%      8.000000e+00
max      1.000000e+01
Name: Rating, dtype: float64
Ratings < 0.5: 5


In [5]:
# User ID mapping
# Encode every distinct Username as an int32 code (UserId) and drop the
# original text column, leaving BGGId, Rating and UserId.
user_codes = pd.factorize(df["Username"])[0]
df = df.assign(UserId=user_codes.astype(np.int32)).drop(columns=["Username"])
df

Unnamed: 0,BGGId,Rating,UserId
0,213788,8.0,0
1,213788,8.0,1
2,213788,8.0,2
3,213788,8.0,3
4,213788,8.0,4
...,...,...,...
18942210,165521,3.0,863
18942211,165521,3.0,21539
18942212,165521,3.0,5112
18942213,193488,1.0,43419


In [6]:
# Calculate original metrics and target values
# Dimensions of the cleaned user x item rating matrix and the fraction of its
# cells that hold no rating (sparsity).
original_ratings = len(df)
original_users = df['UserId'].nunique()
original_items = df['BGGId'].nunique()
original_matrix_size = original_users * original_items
original_sparsity = 1 - (original_ratings / original_matrix_size)

print(
    "Original metrics:",
    f"Users: {original_users} | Items: {original_items} | Ratings: {original_ratings}",
    f"Matrix sparsity: {original_sparsity:.4f} ({original_sparsity*100:.2f}%)",
    sep="\n",
)

Original metrics:
Users: 411374 | Items: 21925 | Ratings: 18909460
Matrix sparsity: 0.9979 (99.79%)


In [7]:
# Compute target user/item counts for a dataset of TARGET_RATINGS ratings
# Nothing is filtered here. The target matrix is sized so that TARGET_RATINGS
# ratings give the same density as the original data, and both dimensions are
# scaled by the same factor (square root of the size ratio) so the
# users-to-items proportion is kept. int() truncates the scaled counts.
original_density = 1 - original_sparsity
target_matrix_size = TARGET_RATINGS / original_density
scaling_factor = (target_matrix_size / original_matrix_size) ** 0.5
target_users, target_items = (
    int(count * scaling_factor) for count in (original_users, original_items)
)

print(
    f"Scaling factor: {scaling_factor}",
    f"Target users: {target_users} | Target items: {target_items}",
    sep="\n",
)

Scaling factor: 0.07272109950884517
Target users: 29915 | Target items: 1594


In [8]:
# Filter most active users and most rated items
# Count ratings per user and per item, then keep the target_users users and
# target_items items with the highest counts.
ratings_per_user = df['UserId'].value_counts()
ratings_per_item = df['BGGId'].value_counts()
most_active_users = set(ratings_per_user.nlargest(target_users).index)
most_rated_items = set(ratings_per_item.nlargest(target_items).index)

# A rating is kept only when both its user and its item were selected.
from_active_user = df['UserId'].isin(most_active_users)
on_rated_item = df['BGGId'].isin(most_rated_items)
df_reduced = df[from_active_user & on_rated_item]

In [9]:
# Sample if we have more ratings than needed
# Draw exactly TARGET_RATINGS rows with a fixed seed when the filtered frame is
# larger than the target; otherwise leave it unchanged.
exceeds_target = len(df_reduced) > TARGET_RATINGS
df_reduced = (
    df_reduced.sample(n=TARGET_RATINGS, random_state=42)
    if exceeds_target
    else df_reduced
)

In [10]:
# Calculate final metrics
# Same measurements as for the original data, taken on the reduced dataset.
new_ratings = len(df_reduced)
new_users = df_reduced['UserId'].nunique()
new_items = df_reduced['BGGId'].nunique()
new_matrix_size = new_users * new_items
new_sparsity = 1 - (new_ratings / new_matrix_size)

print(
    "Final metrics:",
    f"Users: {new_users} | Items: {new_items} | Ratings: {new_ratings}",
    f"Matrix sparsity: {new_sparsity:.4f} ({new_sparsity*100:.2f}%)",
    sep="\n",
)

Final metrics:
Users: 27430 | Items: 1594 | Ratings: 100000
Matrix sparsity: 0.9977 (99.77%)


In [11]:
# Split data into train and test sets
# Per-user hold-out: every user's ratings are shuffled and cut individually, so
# each user with at least two ratings contributes to both sets.
# NOTE: the per-user train count is floored by int(), so users with few ratings
# hand more than 1 - train_ratio of their rows to the test set (e.g. a user
# with 2 ratings is split 1/1). The overall train share printed below can
# therefore be noticeably lower than train_ratio.
np.random.seed(42)  # For reproducibility
train_ratio = 0.8

train_indices = []
test_indices = []

for user_id, user_rows in df_reduced.groupby('UserId'):
    row_labels = user_rows.index.tolist()
    n_user_ratings = len(row_labels)

    if n_user_ratings > 1:
        # Shuffle in place, then cut: the leading part is train, the rest test.
        np.random.shuffle(row_labels)
        cut = max(1, int(train_ratio * n_user_ratings))
        train_indices += row_labels[:cut]
        test_indices += row_labels[cut:]
    else:
        # A single rating cannot be split; it always goes to training.
        train_indices += row_labels

# Materialise both sets by row label from the reduced frame.
train_df = df_reduced.loc[train_indices]
test_df = df_reduced.loc[test_indices]

n_total = len(df_reduced)
n_train = len(train_df)
n_test = len(test_df)
print(f"Training set: {n_train} samples ({n_train / n_total * 100:.1f}%)")
print(f"Test set: {n_test} samples ({n_test / n_total * 100:.1f}%)")

Training set: 71839 samples (71.8%)
Test set: 28161 samples (28.2%)


In [12]:
# Save datasets
# Three CSV files are written without the index column: the train and test
# splits of the reduced dataset (file names tagged with FILE_SUFFIX) and the
# full cleaned ratings table.
outputs = (
    (train_df, f"../../data/processed/user_ratings_train{FILE_SUFFIX}.csv"),
    (test_df, f"../../data/processed/user_ratings_test{FILE_SUFFIX}.csv"),
    (df, "../../data/processed/user_ratings_full.csv"),
)

for frame, path in outputs:
    # to_csv does not create missing folders, so make sure the target
    # directory exists before writing (no-op when it is already there).
    os.makedirs(os.path.dirname(path), exist_ok=True)
    frame.to_csv(path, index=False)

print("Datasets saved successfully")

Datasets saved successfully
