In [None]:
import pandas as pd
import numpy as np
import random

# Seed Python's module-level RNG; a later cell calls `random.randint` to pick
# per-user test sizes, so this keeps a fresh top-to-bottom run reproducible.
SEED = 42
random.seed(SEED)

# Load the preprocessed ratings file that every following cell works from.
RATINGS_PATH = "../data/processed/ratings_clean.parquet"
processed_df = pd.read_parquet(RATINGS_PATH)

In [None]:
# Secure the "guaranteed training data": the first row seen for each userId and
# the first row seen for each movieId, so every user and every movie has at
# least one row reserved for training before anything is split off.
is_first_for_user = ~processed_df.duplicated(subset=["userId"], keep="first")
is_first_for_movie = ~processed_df.duplicated(subset=["movieId"], keep="first")
guaranteed_by_user = processed_df[is_first_for_user]
guaranteed_by_movie = processed_df[is_first_for_movie]

# The same row can be both a user's first and a movie's first; after stacking
# the two selections, keep only the first copy of each identical row.
stacked_guaranteed = pd.concat([guaranteed_by_user, guaranteed_by_movie])
guaranteed_train_df = stacked_guaranteed[~stacked_guaranteed.duplicated()]
print(f"Guaranteed training set size: {len(guaranteed_train_df)}")

Guaranteed training set size: 6554


In [8]:
# Build the "random pool" from the remainder: every row of the full table whose
# index label is not already part of the guaranteed training set.
in_guaranteed = processed_df.index.isin(guaranteed_train_df.index)
random_pool_df = processed_df[~in_guaranteed]

print(f"Random pool size: {len(random_pool_df)}")

Random pool size: 72280


In [None]:
# Split the random pool, user by user, into extra training rows and test rows.
#
# The test size for each user is drawn from a dedicated RNG instance instead of
# the module-level `random` state. That makes this cell idempotent: re-running
# it on its own reproduces the same split rather than continuing from wherever
# the global generator was left. `random.Random(42)` produces the same sequence
# as `random.seed(42)` followed by module-level calls, so a fresh top-to-bottom
# run gives the same split as before, provided nothing else drew from the
# global generator in between (nothing in the visible cells does).
SPLIT_SEED = 42
split_rng = random.Random(SPLIT_SEED)

grouped = random_pool_df.groupby("userId")
additional_train_list = []
test_list = []

for user_id, group in grouped:
    n_rows = len(group)
    if n_rows == 0:
        continue

    # If only one row is left for this user, use it as additional training data.
    if n_rows == 1:
        additional_train_list.append(group)
        continue

    # Pick how many of this user's pooled rows go to the test set: anywhere
    # from 1 up to all of them (inclusive). Taking all of them is safe for
    # coverage because the user's guaranteed row was removed from the pool
    # beforehand. The row selection itself uses a fixed pandas seed.
    test_sample = group.sample(
        n=split_rng.randint(1, n_rows), random_state=SPLIT_SEED
    )
    test_list.append(test_sample)

    # Whatever was not sampled for testing becomes extra training data.
    train_sample = group.drop(test_sample.index)
    if not train_sample.empty:
        additional_train_list.append(train_sample)

# Combine the per-user pieces into single dataframes. When a list is empty,
# fall back to a zero-row slice of the pool so the result still carries the
# pool's columns and dtypes (a bare `pd.DataFrame()` would have no columns).
empty_like_pool = random_pool_df.iloc[0:0]
additional_train_df = (
    pd.concat(additional_train_list) if additional_train_list else empty_like_pool.copy()
)
test_df = pd.concat(test_list) if test_list else empty_like_pool.copy()

In [None]:
# Final training set = guaranteed rows + the extra per-user training rows.
final_train_df = pd.concat([guaranteed_train_df, additional_train_df])

print(f"\nFinal Train data size: {len(final_train_df)}")
print(f"Final Test data size: {len(test_df)}")
print(
    f"Verification (Train + Test == Original): {len(final_train_df) + len(test_df) == len(processed_df)}"
)

# Fail fast instead of only printing the check: a broken split should stop the
# notebook here rather than be written to disk by a later cell.
assert len(final_train_df) + len(test_df) == len(processed_df), (
    "Train and test row counts do not add up to the original row count"
)
# No index label may appear on both sides, otherwise test rows leak into training.
assert final_train_df.index.intersection(test_df.index).empty, (
    "Train and test sets share index labels"
)


Final Train data size: 42839
Final Test data size: 35995
Verification (Train + Test == Original): True


In [None]:
def _distinct_ids(frame, column):
    """Return the distinct values found in `frame[column]` as a Python set."""
    return set(frame[column].unique())


# Compare the ids present in the training set against the ids present in the
# full table; equality means no user / movie exists only outside training.
train_users, train_movies = (
    _distinct_ids(final_train_df, id_column) for id_column in ("userId", "movieId")
)
all_users, all_movies = (
    _distinct_ids(processed_df, id_column) for id_column in ("userId", "movieId")
)

print(f"\nAll users in training set: {train_users == all_users}")
print(f"All movies in training set: {train_movies == all_movies}")


All users in training set: True
All movies in training set: True


In [None]:
# Write both splits as parquet files into the processed-data directory.
# `index=False` leaves the dataframe index out of the written files.
output_path = "../data/processed/"
for split_filename, split_frame in (
    ("train_data.parquet", final_train_df),
    ("test_data.parquet", test_df),
):
    split_frame.to_parquet(output_path + split_filename, index=False)

print(f"\nProcessed data saved to {output_path}")


Processed data saved to ../data/processed/
