In [None]:
import pandas as pd
import json
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Load the MovieLens ratings and the Amazon reviews that were matched to them.
ml_df = pd.read_csv('../data/ml_merged.csv')
# JSON Lines text is UTF-8: pin the encoding instead of relying on the platform
# default, and skip blank lines (e.g. a trailing newline) that json.loads rejects.
with open('../data/matched_amazon_reviews.jsonl', encoding='utf-8') as f:
    matched_data = [json.loads(line) for line in f if line.strip()]
matched_df = pd.DataFrame(matched_data)

# Keep a single ASIN per movieId (the first one encountered in the file).
# NOTE(review): this runs before the vector-availability filter below, so a
# movie whose first ASIN has no vector is dropped even if a later ASIN of the
# same movie has one -- confirm this is intended.
matched_unique = matched_df.drop_duplicates(subset='movieId', keep='first')

# Attach the matched ASIN to the MovieLens rows; the inner join drops rows
# without a match, and dropna removes matches whose ASIN is missing.
merged = pd.merge(ml_df, matched_unique[['movieId', 'asin']], on='movieId', how='inner')
merged = merged.dropna(subset=['asin'])

# Load the encoded ASIN vectors.
# Read `asin` as text so that all-digit ASINs are not inferred as integers
# (which would lose leading zeros and never match the string ASINs in `merged`).
# NOTE(review): the saved notebook output echoes this read as the source of a
# warning -- presumably a pandas DtypeWarning about mixed types; confirm.
df_qi = pd.read_csv('../data/encoded_amazon_vectors.csv', dtype={'asin': str})
available_asins = set(df_qi['asin'])

# Keep only the ratings whose ASIN has a q_i vector.
merged = merged[merged['asin'].isin(available_asins)].copy()

# ASINs that are actually used, in order of first appearance in `merged`.
used_asins = merged['asin'].unique().tolist()

In [None]:
# Encode ASINs as integer item indices.
# LabelEncoder sorts its classes, so item_index i corresponds to
# item_encoder.classes_[i] (sorted ASIN order), NOT to position i of used_asins
# (which is in first-appearance order).
item_encoder = LabelEncoder()
item_encoder.fit(used_asins)
merged['item_index'] = item_encoder.transform(merged['asin'])

# Encode userId as integer user indices.
user_encoder = LabelEncoder()
merged['user_index'] = user_encoder.fit_transform(merged['userId'])

# Report the dataset size and save the (user, item, rating) triples.
print("Jumlah user unik:", merged['user_index'].nunique())
print("Jumlah item unik:", merged['item_index'].nunique())
merged[['user_index', 'item_index', 'rating']].to_csv('../data/pmf_training_data.csv', index=False)

# Save the item_index -> asin mapping, sorted so that row i describes item i.
mapping_df = merged[['item_index', 'asin']].drop_duplicates().sort_values('item_index')
mapping_df.to_csv('../data/item_asin_mapping.csv', index=False)

# Save the ASINs in item_index order (row i is the ASIN of item_index i) so a
# trainer that loads this file positionally lines up with pmf_training_data.csv.
# Writing used_asins directly would persist first-appearance order instead,
# which does not match the encoder's sorted indices.
pd.Series(item_encoder.classes_).to_csv('../data/used_asins.csv', index=False, header=False)

print("Selesai menyimpan PMF training data dan mapping.")

  df_qi = pd.read_csv('../data/encoded_amazon_vectors.csv')


Jumlah user unik: 6040
Jumlah item unik: 1344


In [5]:
# Sanity check: count rows that repeat an earlier (user_index, item_index) pair.
print(
    "Duplikasi user-item:",
    merged[['user_index', 'item_index']].duplicated().sum(),
)

Duplikasi user-item: 0
