In [1]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity

print("All libraries loaded successfully.")

  from pandas.core import (


All libraries loaded successfully.


In [3]:
df = pd.read_csv(r"C:\Users\User\OneDrive\Documents\DEL DEL DEL\Semester 7\SisRek\data-books\train.csv")
df.head()

Unnamed: 0,user_id,item_id
0,8,0002005018
1,8,074322678X
2,8,0887841740
3,8,1552041778
4,8,1567407781


### Step 3: Preprocessing Data

In [4]:
df['rating'] = 1  # implicit feedback, semua interaksi dianggap positif (rating=1)

user_ids = df['user_id'].unique()
item_ids = df['item_id'].unique()

user_map = {u:i for i,u in enumerate(user_ids)}
item_map = {i:j for j,i in enumerate(item_ids)}
inv_item_map = {v:k for k,v in item_map.items()}

df['user_idx'] = df['user_id'].map(user_map)
df['item_idx'] = df['item_id'].map(item_map)

n_users = len(user_map)
n_items = len(item_map)
print(f"Total users: {n_users}, items: {n_items}")

Total users: 13876, items: 123069


### Step 4: Split Data (Train-Test)

In [5]:
import numpy.random as random

train_rows, test_rows = [], []
rng = random.default_rng(42)

for u, group in df.groupby('user_idx'):
    items = group['item_idx'].tolist()
    if len(items) > 1:
        test_item = rng.choice(items)
        for it in items:
            if it == test_item:
                test_rows.append((u,it,1))
            else:
                train_rows.append((u,it,1))
    else:
        for it in items:
            train_rows.append((u,it,1))

train_df = pd.DataFrame(train_rows, columns=['user_idx','item_idx','rating'])
test_df = pd.DataFrame(test_rows, columns=['user_idx','item_idx','rating'])

print(len(train_df), "train interactions,", len(test_df), "test interactions")

255888 train interactions, 13876 test interactions


### Step 5: Bangun Matriks Sparse (untuk Model)

In [6]:
R = sp.coo_matrix((train_df['rating'].values,
                   (train_df['item_idx'].values, train_df['user_idx'].values)),
                  shape=(n_items, n_users)).tocsr()


### Step 6: Baseline Model 1 — Popularity

In [7]:
item_pop = np.array(R.sum(axis=1)).ravel()
popular_items = np.argsort(-item_pop)

def recommend_popularity(user_idx, k=10):
    seen = set(train_df[train_df['user_idx']==user_idx]['item_idx'])
    recs = [i for i in popular_items if i not in seen][:k]
    return recs


### Step 7: Baseline Model 2 — Item-based CF

In [8]:
TOP_SIM_ITEMS = 2000
top_items = popular_items[:TOP_SIM_ITEMS]

M = R[top_items,:].toarray()
item_sim = cosine_similarity(M)
pos_of_item_in_top = {item:i for i,item in enumerate(top_items)}

def recommend_itemcf(user_idx, k=10):
    user_items = train_df[train_df['user_idx']==user_idx]['item_idx'].tolist()
    scores = np.zeros(len(top_items))
    for it in user_items:
        if it in pos_of_item_in_top:
            scores += item_sim[pos_of_item_in_top[it]]
    recs = []
    for idx in np.argsort(-scores):
        item = top_items[idx]
        if item not in user_items:
            recs.append(item)
            if len(recs) == k: break
    return recs

### Step 8: Evaluasi MAP@10

In [9]:
def apk(actual, predicted, k=10):
    if len(predicted) > k:
        predicted = predicted[:k]
    score, num_hits = 0.0, 0.0
    for i, p in enumerate(predicted):
        if p in actual:
            num_hits += 1.0
            score += num_hits / (i+1.0)
    return score / min(len(actual), k)

def mapk(actuals, predicteds, k=10):
    return np.mean([apk(a,p,k) for a,p in zip(actuals, predicteds)])


In [10]:
test_users = test_df['user_idx'].tolist()
actuals = [[it] for it in test_df['item_idx'].tolist()]

pred_pop = [recommend_popularity(u,10) for u in test_users]
pred_item = [recommend_itemcf(u,10) for u in test_users]

print("MAP@10 Popularity:", mapk(actuals, pred_pop))
print("MAP@10 Item-CF:", mapk(actuals, pred_item))

MAP@10 Popularity: 0.00520483008231639
MAP@10 Item-CF: 0.015433093339190195


In [12]:
!pip install implicit==0.7.2 --prefer-binary

Collecting implicit==0.7.2
  Obtaining dependency information for implicit==0.7.2 from https://files.pythonhosted.org/packages/7c/25/48964efed207b60b2d5b2855161638e4f368f5db332b57f62b6cd16fb591/implicit-0.7.2-cp311-cp311-win_amd64.whl.metadata
  Downloading implicit-0.7.2-cp311-cp311-win_amd64.whl.metadata (6.3 kB)
Downloading implicit-0.7.2-cp311-cp311-win_amd64.whl (750 kB)
   ---------------------------------------- 0.0/750.8 kB ? eta -:--:--
   ---------------------------------------- 0.0/750.8 kB ? eta -:--:--
   ---------------------------------------- 0.0/750.8 kB ? eta -:--:--
   ---------------------------------------- 0.0/750.8 kB ? eta -:--:--
   ---------------------------------------- 0.0/750.8 kB ? eta -:--:--
   ---------------------------------------- 0.0/750.8 kB ? eta -:--:--
   ---------------------------------------- 0.0/750.8 kB ? eta -:--:--
   -- ------------------------------------- 41.0/750.8 kB ? eta -:--:--
   -- ------------------------------------- 41.0/750

### Step 9: Model-Based — Implicit ALS

In [4]:
import pandas as pd
from scipy.sparse import csr_matrix

# Baca dataset
train_df = pd.read_csv(r"C:\Users\User\OneDrive\Documents\DEL DEL DEL\Semester 7\SisRek\data-books\train.csv")  # ganti path sesuai file kamu

# Lihat beberapa data
print(train_df.head())

# Tambahkan kolom interaksi (semua = 1)
train_df['interaction'] = 1

# Buat mapping user dan item jadi index numerik
user_mapping = {u: i for i, u in enumerate(train_df['user_id'].unique())}
item_mapping = {i: j for j, i in enumerate(train_df['item_id'].unique())}

train_df['user_idx'] = train_df['user_id'].map(user_mapping)
train_df['item_idx'] = train_df['item_id'].map(item_mapping)

# Buat user-item matrix (CSR)
R = csr_matrix(
    (train_df['interaction'].astype(float),
     (train_df['user_idx'], train_df['item_idx']))
)

print("Matrix shape:", R.shape)


   user_id     item_id
0        8  0002005018
1        8  074322678X
2        8  0887841740
3        8  1552041778
4        8  1567407781
Matrix shape: (13876, 123069)


In [2]:
from tqdm import tqdm
from scipy.sparse import csr_matrix

# 1. pastikan matriks interaksi sudah dibuat
R_csr = csr_matrix(R)

# 2. ambil daftar user yang ada di data test
test_users = test_df['user_idx'].unique()

# 3. ambil ground truth (item yang sebenarnya di-like user di test set)
actuals = (
    test_df.groupby('user_idx')['item_idx']
    .apply(list)
    .reindex(test_users)
    .tolist()
)

# 4. generate rekomendasi
pred_als = []
for u in tqdm(test_users):
    recs, _ = model.recommend(
        userid=u,
        user_items=R_csr,
        N=10,
        filter_already_liked_items=True
    )
    pred_als.append(recs)

# 5. hitung MAP@10
print("MAP@10 ALS:", mapk(actuals, pred_als))

NameError: name 'R' is not defined

In [8]:
user_mapping = {u: i for i, u in enumerate(train_df['user_id'].unique())}
item_mapping = {i: j for j, i in enumerate(train_df['item_id'].unique())}

train_df['user_idx'] = train_df['user_id'].map(user_mapping)
train_df['item_idx'] = train_df['item_id'].map(item_mapping)
test_df['user_idx'] = test_df['user_id'].map(user_mapping)
test_df['item_idx'] = test_df['item_id'].map(item_mapping)

NameError: name 'test_df' is not defined