# Recommendation System Project 

Group 02:
 

### **Import library & konfigurasi**

In [60]:
# Cell 1: Import libraries & konfigurasi
import os
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

# Data directory. Defaults to the original local path, but can be overridden
# with the RECSYS_DATA_DIR environment variable so the notebook also runs on
# machines where that absolute path does not exist.
DATA_DIR = os.environ.get(
    "RECSYS_DATA_DIR",
    r"D:\Perkuliahaan\SemesterVII\System Recommendation\Proyek\data-books",
)

# File names (joined with DATA_DIR when loading)
TRAIN_FILE = "train.csv"
TARGET_USERS_FILE = "data_target_users_test.csv"
SAMPLE_SUB_FILE = "sample_submission.csv"

# Model parameters
N_RECS = 10                  # number of recommendations produced per user
ALS_ON = True                # enable ALS
ALS_FACTORS = 32             # number of latent factors
ALS_ITER = 8                 # training iterations
ALS_REG = 0.1                # regularization
ALS_ALPHA = 40               # confidence parameter


### **Load dan cek data**

In [62]:
# Cell 2: Load data
# Read the three CSV inputs from DATA_DIR, in the order train / target users /
# sample submission.
train, test_users, sample_sub = (
    pd.read_csv(os.path.join(DATA_DIR, file_name))
    for file_name in (TRAIN_FILE, TARGET_USERS_FILE, SAMPLE_SUB_FILE)
)

# Report the shape of each frame, then preview the training data.
for caption, frame in (
    ("Train shape:", train),
    ("Test users shape:", test_users),
    ("Sample submission shape:", sample_sub),
):
    print(caption, frame.shape)
train.head()


Train shape: (269764, 2)
Test users shape: (13876, 1)
Sample submission shape: (2, 2)


Unnamed: 0,user_id,item_id
0,8,0002005018
1,8,074322678X
2,8,0887841740
3,8,1552041778
4,8,1567407781


### **Pra-proses data & encoding**

In [64]:
# Cell 3: Preprocessing & encoding
def infer_columns(df):
    """Guess which columns of ``df`` hold the user ids and the item ids.

    A column is a user column if its lower-cased name contains ``"user"`` and
    an item column if it contains ``"item"``. When several columns match, the
    last one in column order wins.

    Returns:
        tuple: ``(user_col, item_col)``; either entry is ``None`` when no
        column name matched.
    """
    matches = {"user": None, "item": None}
    for name in df.columns.tolist():
        lowered = name.lower()
        for keyword in matches:
            if keyword in lowered:
                # Later columns overwrite earlier matches.
                matches[keyword] = name
    return matches["user"], matches["item"]

# Detect the id columns, attach a constant implicit-feedback rating of 1 and
# keep only the (user, item, rating) columns under canonical names.
u_col, i_col = infer_columns(train)
train = (
    train.assign(rating=1)[[u_col, i_col, 'rating']]
    .rename(columns={u_col: 'user', i_col: 'item'})
)

# Map raw ids to contiguous integer indices usable as matrix coordinates.
user_enc, item_enc = LabelEncoder(), LabelEncoder()
train['uidx'] = user_enc.fit_transform(train['user'])
train['iidx'] = item_enc.fit_transform(train['item'])

n_users, n_items = train['uidx'].nunique(), train['iidx'].nunique()
print(f"Total users: {n_users}, Total items: {n_items}")

# Sparse user-item interaction matrix (rows = users, columns = items).
interactions = (train['rating'], (train['uidx'], train['iidx']))
R = sparse.csr_matrix(interactions, shape=(n_users, n_items))
print("Shape matriks user-item:", R.shape)


Total users: 13876, Total items: 123069
Shape matriks user-item: (13876, 123069)


### **Fungsi rekomendasi (Popularity, IBCF, ALS)**

In [66]:
# Cell 4: Define functions

# 1. Popularity baseline
def popularity_recommend(R, user_idx, N=10, exclude_seen=True):
    """Recommend the globally most popular items.

    Popularity of an item is the column sum of ``R``. Ties are broken by
    ascending item index (stable sort).

    Args:
        R: sparse user-item matrix (rows = users, columns = items).
        user_idx: row index of the target user, or ``None`` for an unknown user.
        N: maximum number of items to return.
        exclude_seen: when true and ``user_idx`` is given, items with a
            non-zero entry in that user's row are left out.

    Returns:
        list[int]: up to ``N`` item indices, most popular first.
    """
    item_pop = np.asarray(R.sum(axis=0)).ravel()
    # Vectorized stable ranking; avoids a Python-level loop over every item.
    ranking = np.argsort(-item_pop, kind="stable")
    if exclude_seen and user_idx is not None:
        seen = R[user_idx].nonzero()[1]
        if len(seen):
            ranking = ranking[~np.isin(ranking, seen)]
    return ranking[:N].tolist()



In [67]:
# 2. Item-Based Collaborative Filtering
def item_based_recommend(R, user_idx, N=10):
    """Item-based collaborative filtering via cosine similarity.

    The user's profile is the sum of the item vectors (over users) of the
    items they interacted with. Every item is scored by the cosine similarity
    between its vector and that profile, and items the user already has are
    pushed to the bottom with a score of ``-inf``.

    Users that are unknown (``user_idx is None``) or have no interactions get
    the globally most popular items instead.

    Args:
        R: sparse user-item matrix (rows = users, columns = items).
        user_idx: row index of the target user, or ``None``.
        N: maximum number of items to return.

    Returns:
        numpy.ndarray: up to ``N`` item indices, best first. Ties are broken
        by ascending item index so results are deterministic.
    """
    user_items = R[user_idx].nonzero()[1] if user_idx is not None else ()
    if len(user_items) == 0:
        # Cold start: nothing to build a profile from, fall back to popularity.
        item_pop = np.asarray(R.sum(axis=0)).ravel()
        return np.argsort(-item_pop, kind="stable")[:N]

    Iu = R.T.tocsr()  # items x users
    profile = np.asarray(Iu[user_items].sum(axis=0)).ravel()
    # Small epsilon guards against division by zero for all-zero vectors.
    item_norms = np.sqrt(np.asarray(Iu.multiply(Iu).sum(axis=1)).ravel()) + 1e-9
    profile_norm = np.linalg.norm(profile) + 1e-9

    scores = np.asarray(Iu.dot(profile) / (item_norms * profile_norm), dtype=float).ravel()
    scores[user_items] = -np.inf  # never rank already-seen items above unseen ones
    return np.argsort(-scores, kind="stable")[:N]



In [68]:
# 3. Implicit ALS (Alternating Least Squares)
def _als_update_factors(indptr, indices, data, fixed, target, regularization, alpha):
    """Re-solve every row of ``target`` while ``fixed`` is held constant.

    ``indptr``/``indices``/``data`` are the compressed-sparse arrays whose
    major axis corresponds to the rows of ``target``. Rows without any stored
    interaction are left untouched. ``target`` is modified in place.
    """
    gram = fixed.T @ fixed + regularization * np.eye(fixed.shape[1])
    for row in range(target.shape[0]):
        start, end = indptr[row], indptr[row + 1]
        if start == end:
            continue
        observed = indices[start:end]
        confidence = 1.0 + alpha * data[start:end]
        preference = np.ones_like(confidence)  # every stored interaction counts as "liked"
        F = fixed[observed]
        A = gram + (F.T * (confidence - 1)).dot(F)
        b = (F.T * confidence).dot(preference)
        target[row] = np.linalg.solve(A, b)


def implicit_als(R_csr, factors=20, regularization=0.1, alpha=40, iterations=10, seed=None):
    """Factorize an implicit-feedback matrix with Alternating Least Squares.

    Confidence for a stored interaction ``r`` is ``1 + alpha * r``; the
    preference for every stored interaction is 1. User and item factors are
    updated alternately, each by solving a regularized linear system per row.

    Args:
        R_csr: sparse user-item matrix (rows = users, columns = items).
        factors: number of latent factors.
        regularization: L2 regularization added to the Gram matrix diagonal.
        alpha: confidence scaling.
        iterations: number of full user+item sweeps.
        seed: optional integer seed for the factor initialization. ``None``
            (default) draws from NumPy's global random state, as before.

    Returns:
        tuple: ``(X, Y)`` with ``X`` of shape ``(users, factors)`` and ``Y``
        of shape ``(items, factors)``.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)
    R_csr = R_csr.tocsr()
    R_csc = R_csr.tocsc()  # item-major view; loop-invariant, so built once
    users, items = R_csr.shape
    X = rng.normal(scale=0.01, size=(users, factors))
    Y = rng.normal(scale=0.01, size=(items, factors))

    for it in range(iterations):
        print(f"ALS iteration {it+1}/{iterations}")
        # User step: fix item factors, solve for each user.
        _als_update_factors(R_csr.indptr, R_csr.indices, R_csr.data, Y, X, regularization, alpha)
        # Item step: fix the freshly updated user factors, solve for each item.
        _als_update_factors(R_csc.indptr, R_csc.indices, R_csc.data, X, Y, regularization, alpha)
    return X, Y


### **Train ALS**

In [70]:
# Cell 5: Train ALS (optional)
if ALS_ON:
    # implicit_als initializes its factors from NumPy's global random state;
    # fixing the seed here makes the trained factors (and therefore the ALS
    # recommendations) reproducible across kernel restarts.
    np.random.seed(42)
    print("Training ALS model...")
    X, Y = implicit_als(R, factors=ALS_FACTORS, regularization=ALS_REG, alpha=ALS_ALPHA, iterations=ALS_ITER)
    print("Training selesai.")
else:
    # ALS disabled: downstream cells fall back to the popularity ranking.
    X, Y = None, None
    print("ALS dimatikan (gunakan baseline saja).")


Training ALS model...
ALS iteration 1/8
ALS iteration 2/8
ALS iteration 3/8
ALS iteration 4/8
ALS iteration 5/8
ALS iteration 6/8
ALS iteration 7/8
ALS iteration 8/8
Training selesai.


### **Generate rekomendasi**

In [72]:
# Cell 6: Generate recommendations
# Target users are taken from the first sample-submission column whose name
# contains "user".
user_like_cols = [c for c in sample_sub.columns if 'user' in c.lower()]
sample_user_col = user_like_cols[0]
sample_users = sample_sub[sample_user_col].tolist()

# Global popularity ranking, used as the fallback when ALS cannot score a user.
item_pop = np.ravel(np.array(R.sum(axis=0)))
global_pop_ranking = np.argsort(-item_pop)

# One list of recommendations per model, filled in the same order as sample_users.
pop_recs = []
ibcf_recs = []
als_recs = []




In [73]:
# Start from empty result lists so that re-running this cell does not append a
# second copy of every user's recommendations.
pop_recs, ibcf_recs, als_recs = [], [], []

# NOTE(review): recommendations are generated for the users in sample_sub;
# test_users is loaded earlier but not used here - confirm this is intended.
for su in tqdm(sample_users, desc="Generate rekomendasi"):
    # Users that never appeared in training have no encoded index (cold start).
    uid = user_enc.transform([su])[0] if su in user_enc.classes_ else None

    # Popularity
    pop_idx = popularity_recommend(R, uid, N_RECS)
    pop_recs.append(item_enc.inverse_transform(pop_idx))

    # IBCF
    ibcf_idx = item_based_recommend(R, uid, N_RECS)
    ibcf_recs.append(item_enc.inverse_transform(ibcf_idx))

    # ALS
    if ALS_ON and X is not None and uid is not None:
        scores = Y @ X[uid]
        # Mask items the user already interacted with.
        scores[R[uid].nonzero()[1]] = -np.inf
        top_idx = np.argsort(-scores)[:N_RECS]
        als_list = item_enc.inverse_transform(top_idx)
    else:
        # No ALS factors for this user: fall back to global popularity.
        als_list = item_enc.inverse_transform(global_pop_ranking[:N_RECS])
    als_recs.append(als_list)

Generate rekomendasi: 100%|██████████| 2/2 [00:00<00:00,  6.86it/s]


In [74]:
# Show the recommendations of each model for (at most) the first five users.
print("\n===== CONTOH HASIL REKOMENDASI (5 USER PERTAMA) =====\n")
separator = "-" * 60
for idx, shown_user in enumerate(sample_users[:5]):
    print(f"👤 User {shown_user}")
    print(f"   Popularity → {pop_recs[idx]}")
    print(f"   IBCF       → {ibcf_recs[idx]}")
    print(f"   ALS        → {als_recs[idx]}")
    print(separator)


===== CONTOH HASIL REKOMENDASI (5 USER PERTAMA) =====

👤 User 8
   Popularity → ['0316666343' '0385504209' '0312195516' '0142001740' '059035342X'
 '0060928336' '0446672211' '0345337662' '0452282152' '0316601950']
   IBCF       → ['0020811853' '0747251134' '0767904168' '0676972179' '0618335455'
 '0870211110' '0743618174' '0099535203' '0618485228' '0870443615']
   ALS        → ['0316601950' '0375707972' '0385484518' '0451191145' '067976402X'
 '0375706771' '0142000205' '0316569321' '0440220602' '0345439104']
------------------------------------------------------------
👤 User 99
   Popularity → ['0316666343' '0385504209' '0312195516' '0142001740' '059035342X'
 '0060928336' '0446672211' '0345337662' '0452282152' '0316601950']
   IBCF       → ['0446677477' '0671034421' '0446528382' '0451524462' '0515130982'
 '0743453956' '0743453484' '0671537873' '0786015160' '031230899X']
   ALS        → ['0316666343' '0156027321' '0786867647' '0060976845' '0385504209'
 '0671027360' '0446611867' '038551043

### **Evaluasi & simpan hasil**

In [90]:
# Long-format table: one row per (user, model, rank, item).
model_outputs = (("popularity", pop_recs), ("ibcf", ibcf_recs), ("als", als_recs))
rows = [
    [user, model, rank, item]
    for i, user in enumerate(sample_users)
    for model, recs in model_outputs
    for rank, item in enumerate(recs[i], 1)  # ranks start at 1
]

df_long = pd.DataFrame(rows, columns=['user_id', 'model', 'rank', 'item_id'])

# Write the table next to the input data and preview the first rows.
out_path = os.path.join(DATA_DIR, "recs_submission_long.csv")
df_long.to_csv(out_path, index=False)
print("File disimpan ke:", out_path)
df_long.head(10)


File disimpan ke: D:\Perkuliahaan\SemesterVII\System Recommendation\Proyek\data-books\recs_submission_long.csv


Unnamed: 0,user_id,model,rank,item_id
0,8,popularity,1,0316666343
1,8,popularity,2,0385504209
2,8,popularity,3,0312195516
3,8,popularity,4,0142001740
4,8,popularity,5,059035342X
5,8,popularity,6,0060928336
6,8,popularity,7,0446672211
7,8,popularity,8,0345337662
8,8,popularity,9,0452282152
9,8,popularity,10,0316601950
