In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import os

# Base directory for all inputs and outputs: the parent of PROJECT_ROOT when
# that environment variable is set, otherwise the parent of the current
# working directory.
# NOTE(review): `.parent` is applied even when PROJECT_ROOT is set explicitly;
# confirm that stepping one level up from PROJECT_ROOT is intended.
DATA_DIR = Path(os.environ.get("PROJECT_ROOT", Path.cwd())).parent
# The raw `.dat` inputs are read from this sub-directory.
PREPROCESSING_DIR = DATA_DIR.joinpath('preprocessing')

RANDOM_SEED = 42
NON_BINARY_FRAC = 0.1  # share of all users to relabel as non-binary

# Seed NumPy's global generator too; the pandas sampling calls in this file
# pass `random_state` explicitly, so this only affects code that draws from
# `np.random` directly.
np.random.seed(RANDOM_SEED)

# ============================================================================
# PROCESS USERS DATA
# ============================================================================

# users.dat is '::'-delimited with no header row; only the first two fields
# (user id and gender) are loaded.
users_df = pd.read_csv(
    PREPROCESSING_DIR / 'users.dat',
    sep='::',
    engine='python',
    header=None,
    usecols=[0, 1],
    names=['user_id', 'gender'],
)

# Shift every id down by one (presumably 1-based ids -> 0-based; verify
# against the raw file).
users_df['user_id'] = users_df['user_id'].sub(1)


def _count_gender(label):
    """Return how many rows of `users_df` currently carry gender `label`."""
    return int((users_df['gender'] == label).sum())


banner = "=" * 60

male_count = _count_gender('M')
female_count = _count_gender('F')
total = male_count + female_count

print(banner)
print("INITIAL GENDER DISTRIBUTION")
print(banner)
print(f"Male: {male_count}")
print(f"Female: {female_count}")
print(f"Total: {total}")
print(f"Ratio (M/F): {male_count / female_count:.3f} \n")

# Number of users to relabel, rounded down.
num_users = len(users_df)
num_non_binary = int(num_users * NON_BINARY_FRAC)

# Split that quota between the two groups in proportion to the current M/F
# ratio: the female share is rounded down and the male share takes the rest.
gender_counts = users_df['gender'].value_counts()
ratio_m_f = gender_counts['M'] / gender_counts['F']

num_nb_from_female = int(num_non_binary / (1 + ratio_m_f))
num_nb_from_male = num_non_binary - num_nb_from_female

print(f"Sampling {NON_BINARY_FRAC*100:.0f}% of users to be non-binary.")
print("Sampling respects the existing M/F ratio:")
print(f"  - {num_nb_from_male} from male users")
print(f"  - {num_nb_from_female} from female users \n")

# Draw the rows to relabel from each group; only their index labels are kept.
male_pool = users_df.loc[users_df['gender'] == 'M']
female_pool = users_df.loc[users_df['gender'] == 'F']
male_indices = male_pool.sample(n=num_nb_from_male, random_state=RANDOM_SEED).index
female_indices = female_pool.sample(n=num_nb_from_female, random_state=RANDOM_SEED).index

# Relabel every sampled row in one assignment.
nb_indices = male_indices.union(female_indices)
users_df.loc[nb_indices, 'gender'] = 'NB'

# Recount after relabelling. Percentages are relative to the initial M + F
# total computed above.
male_count_after = _count_gender('M')
female_count_after = _count_gender('F')
nb_count = _count_gender('NB')

print(banner)
print("AFTER NON-BINARY SAMPLING")
print(banner)
print(f"Men: {male_count_after} ({male_count_after / total * 100:.1f}%)")
print(f"Women: {female_count_after} ({female_count_after / total * 100:.1f}%)")
print(f"Non-binary: {nb_count} ({nb_count / total * 100:.1f}%) \n")

# Encode the gender labels as integers; any label missing from the mapping
# becomes NaN.
gender_mapping = {'M': 0, 'F': 1, 'NB': 2}
users_df['gender'] = users_df['gender'].map(gender_mapping)

# Write the table in its current row order ...
users_df.to_csv(DATA_DIR / 'sensitive_attribute.csv', index=False)

# ... and again with the rows shuffled. Each row keeps its own
# (user_id, gender) pair; only the row order and the index change.
shuffled_users = users_df.sample(frac=1, random_state=RANDOM_SEED)
users_random = shuffled_users.reset_index(drop=True)
users_random.to_csv(DATA_DIR / 'sensitive_attribute_random.csv', index=False)

# ============================================================================
# PROCESS RATINGS DATA
# ============================================================================

# ratings.dat is '::'-delimited with no header row; the first three fields
# (user id, item id, rating) are loaded and the remaining field (a timestamp,
# per the original note) is skipped.
items_df = pd.read_csv(
    PREPROCESSING_DIR / 'ratings.dat',
    sep='::',
    engine='python',
    header=None,
    usecols=[0, 1, 2],
    names=['user_id', 'item_id', 'rating'],
)

# Shift both id columns down by one (presumably 1-based -> 0-based; verify
# against the raw file).
for id_column in ('user_id', 'item_id'):
    items_df[id_column] = items_df[id_column] - 1

# Binary label: 1 when the rating is strictly greater than 4, otherwise 0.
# The raw rating column is then replaced by the label.
is_positive = items_df['rating'] > 4
items_df = items_df.drop(columns=['rating']).assign(label=is_positive.astype(int))

# ============================================================================
# SPLIT DATA PER USER
# ============================================================================

def split_per_user(df, train_frac=0.8, val_frac=0.1, random_state=42):
    """
    Split each user's rows into train / validation / test subsets.

    Every user's rows are shuffled independently, with the same
    ``random_state`` passed to ``DataFrame.sample`` for each user so the
    result is reproducible. The shuffled rows are then cut into three
    consecutive slices: the first ``int(train_frac * n)`` rows go to train,
    the next ``int(val_frac * n)`` rows go to validation, and the remainder
    goes to test, where ``n`` is that user's row count. Both counts are
    rounded down, so the test slice absorbs the remainder and may be larger
    than ``1 - train_frac - val_frac`` for users with few rows.

    The input frame is never modified.

    Args:
        df: DataFrame with at least the columns ``user_id``, ``item_id`` and
            ``label``. Any other columns are dropped from the output.
        train_frac: Fraction of each user's rows assigned to training.
        val_frac: Fraction of each user's rows assigned to validation.
        random_state: Seed used for every per-user shuffle.

    Returns:
        ``(train_df, val_df, test_df)``: three DataFrames with the columns
        ``user_id``, ``item_id``, ``label`` and a fresh 0..n-1 index. Rows
        appear user by user in ascending ``user_id`` order, in shuffled order
        within each user.

    Raises:
        KeyError: If one of the required columns is missing from ``df``.
        ValueError: If a fraction is negative or the two fractions sum to
            more than 1.
    """
    columns = ['user_id', 'item_id', 'label']

    # A tiny tolerance keeps float sums such as 0.7 + 0.3 from being rejected.
    if train_frac < 0 or val_frac < 0 or train_frac + val_frac > 1 + 1e-9:
        raise ValueError(
            "train_frac and val_frac must be non-negative and sum to at most 1 "
            f"(got train_frac={train_frac}, val_frac={val_frac})"
        )

    # Selecting the columns up front fails fast on a missing column and gives
    # us a frame that is independent of the caller's object.
    data = df[columns]

    train_parts, val_parts, test_parts = [], [], []

    # Iterating the groups explicitly (instead of mutating each group inside
    # `groupby.apply`) keeps the grouping column available on every pandas
    # version and leaves `df` untouched. Groups arrive in ascending key order.
    for _, user_items in data.groupby('user_id'):
        shuffled = user_items.sample(frac=1, random_state=random_state)
        num_items = len(shuffled)

        num_train = int(train_frac * num_items)
        num_val = int(val_frac * num_items)
        val_end = num_train + num_val

        slices = (
            (train_parts, shuffled.iloc[:num_train]),
            (val_parts, shuffled.iloc[num_train:val_end]),
            (test_parts, shuffled.iloc[val_end:]),
        )
        for bucket, part in slices:
            # Skip empty slices so the later concat only sees real data.
            if len(part):
                bucket.append(part)

    def _combine(parts):
        """Concatenate the collected slices, or return an empty typed frame."""
        if not parts:
            return data.iloc[0:0].reset_index(drop=True)
        return pd.concat(parts).reset_index(drop=True)

    return _combine(train_parts), _combine(val_parts), _combine(test_parts)

# Per-user train / validation / test split of the labelled interactions.
train_df, val_df, test_df = split_per_user(items_df, random_state=RANDOM_SEED)

# Report the size of each split and their combined row count.
n_train, n_val, n_test = len(train_df), len(val_df), len(test_df)
rule = "=" * 60
print(rule)
print("DATASET SPLITS")
print(rule)
print(f"Train size: {n_train:,}")
print(f"Val size: {n_val:,}")
print(f"Test size: {n_test:,}")
print(f"Total size: {n_train + n_val + n_test:,}\n")

# Write each split to DATA_DIR without the index column.
for file_name, frame in (
    ('train.csv', train_df),
    ('valid.csv', val_df),
    ('test.csv', test_df),
):
    frame.to_csv(DATA_DIR / file_name, index=False)

print("✓ All files saved successfully!")

INITIAL GENDER DISTRIBUTION
Male: 4331
Female: 1709
Total: 6040
Ratio (M/F): 2.534 

Sampling 10% of users to be non-binary.
Sampling respects the existing M/F ratio:
  - 434 from male users
  - 170 from female users 

AFTER NON-BINARY SAMPLING
Men: 3897 (64.5%)
Women: 1539 (25.5%)
Non-binary: 604 (10.0%) 

DATASET SPLITS
Train size: 797,758
Val size: 97,383
Test size: 105,068
Total size: 1,000,209

✓ All files saved successfully!
