In [48]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from torch.utils.data import Dataset, DataLoader

# Stall catalogue, re-indexed 0..n-1 right after loading.
stalls = pd.read_csv('hawkerstallscleaned.csv')
stalls = stalls.reset_index(drop=True)

# Interaction log. The stall key is renamed to `place_id` so it lines up with
# the catalogue column, and rows pointing at a place that is not in the
# catalogue are dropped.
interactions = (
    pd.read_csv('userinteractionscleaned.csv')
    .rename(columns={'stall_id': 'place_id'})
    .loc[lambda frame: frame['place_id'].isin(stalls['place_id'])]
    .reset_index(drop=True)
)

# Anchor date against which relative ages such as "3 months ago" are resolved.
REF = pd.Timestamp('2025-04-27')

# (substring searched for in the unit word, pd.DateOffset keyword), tried in
# this order. Substring matching lets "month" also match "months".
_RT_UNITS = (
    ('year', 'years'),
    ('month', 'months'),
    ('week', 'weeks'),
    ('day', 'days'),
    ('hour', 'hours'),
    ('minute', 'minutes'),
    ('second', 'seconds'),
)


def parse_rt(rt):
    """Turn a relative-time string into an absolute timestamp relative to REF.

    The first "<quantity> <unit>" pair found in the text is used, where the
    quantity is a decimal integer or the article "a"/"an" (meaning 1) and the
    unit contains one of: year, month, week, day, hour, minute, second.
    Matching is case-insensitive and tolerates leading words such as
    "Edited 2 months ago".

    Parameters
    ----------
    rt : str or missing value
        Relative-time text, e.g. ``"a year ago"`` or ``"2 weeks ago"``.

    Returns
    -------
    pandas.Timestamp
        ``REF`` minus the parsed offset. ``REF`` itself is returned when the
        value is missing, has no recognisable quantity, or uses an
        unrecognised unit.
    """
    if pd.isna(rt):
        return REF

    tokens = str(rt).lower().split()
    # Stop one short of the end: a quantity needs a unit word after it.
    for pos, tok in enumerate(tokens[:-1]):
        if tok in ('a', 'an'):
            n = 1
        elif tok.isdecimal():
            n = int(tok)
        else:
            continue

        unit = tokens[pos + 1]
        for stem, keyword in _RT_UNITS:
            if stem in unit:
                return REF - pd.DateOffset(**{keyword: n})
        # A quantity followed by an unknown unit: fall back to the anchor.
        return REF

    return REF

# Absolute timestamp for every interaction, derived from its relative-time text.
interactions['ts'] = interactions['relative_time'].apply(parse_rt)

# Positive feedback only: ratings of 4 and above.
interactions = (
    interactions
    .loc[interactions['rating'].ge(4)]
    .reset_index(drop=True)
)

# Authors with at least 10 remaining interactions are the evaluation cohort.
counts = interactions['author'].value_counts()
eligible = counts.loc[counts.ge(10)].index.tolist()
interactions = (
    interactions
    .loc[lambda frame: frame['author'].isin(eligible)]
    .reset_index(drop=True)
)

# The item encoder is fitted on every retained interaction, so places that only
# occur in a user's held-out half still receive an index.
ienc = LabelEncoder().fit(interactions['place_id'])
n_items = ienc.classes_.shape[0]

# Per-author chronological split: the first floor(n/2) rows (sorted by `ts`)
# go to training, the remaining rows are kept per author for evaluation.
train_parts = []
test_data = {}
for user, history in interactions.groupby('author'):
    ordered = history.sort_values('ts').reset_index(drop=True)
    cut = len(ordered) // 2
    train_parts.append(ordered.iloc[:cut])
    test_data[user] = ordered.iloc[cut:].reset_index(drop=True)
train_df = pd.concat(train_parts, ignore_index=True)

# User ids are fitted on the training rows only; item ids reuse `ienc`.
uenc = LabelEncoder().fit(train_df['author'])
train_df = train_df.assign(
    uid=uenc.transform(train_df['author']),
    iid=ienc.transform(train_df['place_id']),
)
n_users = uenc.classes_.shape[0]

# --- Item side features ------------------------------------------------------
# One-hot of `business_status` (missing -> 'unknown') concatenated with a
# TF-IDF bag of words (at most 100 terms) over normalised name + address.
stalls = stalls.fillna({'business_status': 'unknown'})
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
cat_feats = ohe.fit_transform(stalls[['business_status']])
tfidf = TfidfVectorizer(max_features=100)
text_input = stalls['name_norm'].fillna('') + ' ' + stalls['address_norm'].fillna('')
text_feats = tfidf.fit_transform(text_input).toarray()
side_matrix = np.hstack([cat_feats, text_feats])
n_side = side_matrix.shape[1]

# Row position (not index label) of the first catalogue entry per place_id.
# `side_matrix` is positional, so positions stay valid whatever index `stalls`
# carries, and one pass replaces a full catalogue scan per encoded item.
first_row = {}
for pos, pid in enumerate(stalls['place_id'].tolist()):
    first_row.setdefault(pid, pos)

# Re-order the features to follow the item encoder's class order. Items with
# no catalogue row keep an all-zero feature vector.
item_side = np.zeros((n_items, n_side))
for idx, pid in enumerate(ienc.classes_):
    pos = first_row.get(pid)
    if pos is not None:
        item_side[idx] = side_matrix[pos]

def generate_pairs(df, n_items):
    """Pair every observed (user, item) row with one sampled negative item.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain integer columns ``uid`` and ``iid``.
    n_items : int
        Size of the item vocabulary; negatives are drawn from
        ``range(n_items)`` minus the items the user has in ``df``.

    Returns
    -------
    list of tuple
        One ``([uid, pos_iid], [uid, neg_iid])`` tuple per row of ``df``, in
        row order. Sampling uses the global ``np.random`` state.

    Raises
    ------
    ValueError
        If some user has interacted with every item, leaving no negative to
        sample.
    """
    pos = df[['uid', 'iid']].values.tolist()

    # Gather each user's positives in a single pass instead of re-filtering the
    # frame for every row.
    seen_by_user = {}
    for u, i in pos:
        seen_by_user.setdefault(u, set()).add(i)

    # Candidate negatives are computed once per user. setdiff1d returns a sorted
    # array, so the candidates (and the draws for a given seed) do not depend
    # on set iteration order.
    all_items = np.arange(n_items)
    choices_by_user = {}
    for u, seen in seen_by_user.items():
        choices = np.setdiff1d(all_items, list(seen))
        if choices.size == 0:
            raise ValueError(
                f"user {u} has interacted with all {n_items} items; "
                "no negative item can be sampled"
            )
        choices_by_user[u] = choices

    # One draw per positive, in row order.
    neg = []
    for u, _ in pos:
        neg_i = np.random.choice(choices_by_user[u], size=1)[0]
        neg.append([u, neg_i])
    return list(zip(pos, neg))

# Build the (positive, negative) training pairs from the training split.
# Negatives are sampled here, once, before the DataLoader is constructed.
pairs = generate_pairs(train_df, n_items)
class BPRDataset(Dataset):
    """Map-style dataset over pre-built BPR training pairs.

    ``pairs`` is a sequence whose elements look like
    ``((user, pos_item), (user, neg_item))``. Indexing yields the flattened
    triple ``(user, pos_item, neg_item)``; the user id stored with the
    negative is ignored.
    """

    def __init__(self, pairs):
        self.pairs = pairs

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        positive, negative = self.pairs[idx]
        user, pos_item = positive
        _, neg_item = negative
        return user, pos_item, neg_item

# Mini-batches of up to 1024 (user, pos_item, neg_item) triples; shuffle=True
# asks the DataLoader to reshuffle the data on every pass.
loader = DataLoader(BPRDataset(pairs), batch_size=1024, shuffle=True)

class DeepFM(nn.Module):
    """Scoring model that sums a linear term, a user-item interaction and an MLP.

    For a (user, item, side-feature) triple the score is

        linear(user) + linear(item) + linear(side)
        + sum_d[(u_d + i_d)^2 - u_d^2 - i_d^2]      (equals 2 * <u, i>)
        + MLP([u, i, side])

    where ``u`` and ``i`` are the k-dimensional user and item embeddings.

    Parameters
    ----------
    n_users, n_items : int
        Sizes of the user and item vocabularies.
    n_side : int
        Width of the item side-feature vector.
    k : int, default 10
        Embedding dimension.
    hidden : sequence of int, default (64, 32)
        Width of each hidden MLP layer, in order. Any length is accepted; an
        empty sequence reduces the MLP to a single linear projection.
    dropout : float, default 0.2
        Dropout probability applied after every hidden activation.
    """

    def __init__(self, n_users, n_items, n_side, k=10, hidden=(64, 32), dropout=0.2):
        super().__init__()
        self.user_emb = nn.Embedding(n_users, k)
        self.item_emb = nn.Embedding(n_items, k)
        # Width-1 embeddings act as per-user / per-item linear weights.
        self.user_lin = nn.Embedding(n_users, 1)
        self.item_lin = nn.Embedding(n_items, 1)
        self.side_lin = nn.Linear(n_side, 1)

        # Linear -> ReLU -> Dropout per hidden width, then a 1-unit output
        # layer. With the default widths this is the same layer sequence (and
        # the same state_dict keys) as a hand-written two-hidden-layer stack.
        layers = []
        in_dim = 2 * k + n_side
        for width in hidden:
            layers += [nn.Linear(in_dim, width), nn.ReLU(), nn.Dropout(dropout)]
            in_dim = width
        layers.append(nn.Linear(in_dim, 1))
        self.mlp = nn.Sequential(*layers)

    def forward(self, u, i, side):
        """Score a batch of (user, item) pairs.

        Parameters
        ----------
        u, i : torch.Tensor
            Integer index tensors of shape ``(batch,)``.
        side : torch.Tensor
            Float tensor of shape ``(batch, n_side)`` holding the side features
            of the items in ``i``.

        Returns
        -------
        torch.Tensor
            Scores of shape ``(batch, 1)``.
        """
        u_vec = self.user_emb(u)
        i_vec = self.item_emb(i)
        lin = self.user_lin(u) + self.item_lin(i) + self.side_lin(side)
        # Square-of-sum minus sum-of-squares over the two embedding fields.
        fm = ((u_vec + i_vec) ** 2 - u_vec ** 2 - i_vec ** 2).sum(1, keepdim=True)
        x = torch.cat([u_vec, i_vec, side], dim=1)
        return lin + fm + self.mlp(x)

def bpr_loss(pos, neg):
    """Bayesian Personalised Ranking loss: mean of ``-log sigmoid(pos - neg)``.

    Uses ``logsigmoid`` rather than ``log(sigmoid(x))``: for a strongly
    negative margin the sigmoid underflows to 0 and the naive form returns an
    infinite loss, whereas ``logsigmoid`` stays finite.

    Parameters
    ----------
    pos, neg : torch.Tensor
        Scores of the positive and negative items, broadcastable to a common
        shape.

    Returns
    -------
    torch.Tensor
        Scalar loss.
    """
    return -nn.functional.logsigmoid(pos - neg).mean()

# --- Training ----------------------------------------------------------------
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = DeepFM(n_users, n_items, n_side).to(device)
opt = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)

# Item side features are moved to the training device once and indexed with
# the batch's item-id tensors. This avoids a device -> CPU -> NumPy round trip
# and a fresh tensor allocation for every batch.
train_side = torch.tensor(item_side, dtype=torch.float32, device=device)

for epoch in range(1, 1001):
    model.train()
    total_loss = 0
    for u_b, p_b, n_b in loader:
        u_b, p_b, n_b = u_b.to(device), p_b.to(device), n_b.to(device)
        opt.zero_grad()
        # squeeze(1) drops only the trailing score dimension, so a final batch
        # of size 1 keeps its batch axis.
        pos_s = model(u_b, p_b, train_side[p_b]).squeeze(1)
        neg_s = model(u_b, n_b, train_side[n_b]).squeeze(1)
        loss = bpr_loss(pos_s, neg_s)
        loss.backward()
        opt.step()
        total_loss += loss.item()
    # if epoch % 50 == 0:
    #     print(f"Epoch {epoch:03d} Loss: {total_loss/len(loader):.4f}")

# --- Score every item for every eligible user --------------------------------
model.eval()
side_tensor = torch.tensor(item_side, dtype=torch.float32, device=device)
all_items = np.arange(n_items)

# Loop-invariant work, hoisted out of the per-user loop: the full item-index
# tensor and each user's set of training items.
item_tensor = torch.tensor(all_items, device=device)
seen_by_uid = {uid: set(iids) for uid, iids in train_df.groupby('uid')['iid']}

user_scores = {}
for user in eligible:
    uid = uenc.transform([user])[0]
    seen = seen_by_uid.get(uid, set())

    # Encode the held-out places in one call, keeping only those known to the
    # item encoder (order preserved).
    test_pids = test_data[user]['place_id'].to_numpy()
    known = np.isin(test_pids, ienc.classes_)
    test_i = [int(i) for i in ienc.transform(test_pids[known])]
    if not test_i:
        continue

    u_rep = torch.full((n_items,), uid, dtype=torch.long, device=device)
    with torch.no_grad():
        scores = model(u_rep, item_tensor, side_tensor).squeeze().cpu().numpy()
    # Items already in the user's training half are pushed to the bottom of
    # any ranking.
    scores[list(seen)] = -np.inf
    user_scores[user] = {'scores': scores, 'test_i': test_i, 'seen': seen}

# --- Sampled-candidate ranking metrics ---------------------------------------
pool_sizes = [50, 100, 200, 500, 1000, 2000, n_items]
ks = [1, 2, 3, 5]

metrics_by_pool = {
    ps: {k: {'hit': [], 'precision': [], 'recall': [], 'f1': []}
         for k in ks}
    for ps in pool_sizes
}

# A dedicated, seeded generator makes the sampled negatives (and therefore the
# reported tables) reproducible for a given trained model.
EVAL_SEED = 0
eval_rng = np.random.default_rng(EVAL_SEED)

# Per-user inputs that do not depend on the pool size are prepared once.
eval_users = []
for user, info in user_scores.items():
    # De-duplicate the held-out items (order preserved). A place listed twice
    # would otherwise appear twice among the candidates and count as two hits.
    test_i = list(dict.fromkeys(info['test_i']))
    excluded = set(info['seen']) | set(test_i)
    # Sorted array of items that are neither training nor held-out positives.
    neg_pool = np.setdiff1d(all_items, list(excluded))
    eval_users.append((info['scores'], test_i, frozenset(test_i), neg_pool))
    # NOTE(review): a held-out item that is also in `seen` carries a -inf score
    # (set when the scores were built) and will rank last; confirm whether
    # such items should be excluded from the relevant set.

# Iterating the dict keys visits each distinct pool size once, even if n_items
# coincides with one of the fixed sizes.
for ps in metrics_by_pool:
    for scores, test_i, relevant, neg_pool in eval_users:
        # Pad the held-out items with sampled negatives up to the pool size
        # (or as many negatives as exist).
        neg_needed = min(ps - len(test_i), len(neg_pool))
        if neg_needed > 0:
            sampled_neg = eval_rng.choice(neg_pool,
                                          size=neg_needed,
                                          replace=False).tolist()
        else:
            sampled_neg = []
        candidates = test_i + sampled_neg

        ranked = sorted(candidates, key=lambda i: scores[i], reverse=True)

        for k in ks:
            rec_k = ranked[:k]
            tp = sum(1 for i in rec_k if i in relevant)
            prec = tp / k
            rec = tp / len(test_i)
            f1 = 2 * prec * rec / (prec + rec) if (prec + rec) > 0 else 0.0
            hit = tp > 0

            m = metrics_by_pool[ps][k]
            m['hit'].append(hit)
            m['precision'].append(prec)
            m['recall'].append(rec)
            m['f1'].append(f1)

# Collapse the per-user lists into their means: one row per pool size, with
# hit rate in one table and precision / recall / F1 in the other.
hitrate_rows = []
metrics_rows = []
for ps, data in metrics_by_pool.items():
    hit_row = {'pool_size': ps}
    prf_row = {'pool_size': ps}
    for k in ks:
        per_k = data[k]
        hit_row[f'HitRate@{k}'] = np.mean(per_k['hit'])
        prf_row.update({
            f'Precision@{k}': np.mean(per_k['precision']),
            f'Recall@{k}': np.mean(per_k['recall']),
            f'F1@{k}': np.mean(per_k['f1']),
        })
    hitrate_rows.append(hit_row)
    metrics_rows.append(prf_row)

deepfm_hitrate_df = pd.DataFrame(hitrate_rows)
deepfm_metrics_df = pd.DataFrame(metrics_rows)


In [49]:
# Hit-rate table: one row per candidate-pool size, one column per cutoff k.
print("=== DeepFM HitRate@k ===")
deepfm_hitrate_df

=== DeepFM HitRate@k ===


Unnamed: 0,pool_size,HitRate@1,HitRate@2,HitRate@3,HitRate@5
0,50,0.43722,0.630045,0.738789,0.85426
1,100,0.272422,0.45852,0.575112,0.705157
2,200,0.180493,0.290359,0.376682,0.5
3,500,0.085202,0.151345,0.205157,0.293722
4,1000,0.049327,0.087444,0.117713,0.174888
5,2000,0.020179,0.045964,0.065022,0.103139
6,2778,0.015695,0.039238,0.053812,0.073991


In [50]:
# Precision / recall / F1 table: one row per candidate-pool size, three
# columns per cutoff k.
print("\n=== DeepFM Precision@k, Recall@k, F1@k ===")
deepfm_metrics_df


=== DeepFM Precision@k, Recall@k, F1@k ===


Unnamed: 0,pool_size,Precision@1,Recall@1,F1@1,Precision@2,Recall@2,F1@2,Precision@3,Recall@3,F1@3,Precision@5,Recall@5,F1@5
0,50,0.43722,0.0463,0.081955,0.431054,0.091383,0.145378,0.413677,0.13037,0.189134,0.389686,0.20433,0.252064
1,100,0.272422,0.027764,0.049243,0.277466,0.057116,0.091082,0.268685,0.08269,0.120167,0.251794,0.12854,0.159421
2,200,0.180493,0.018297,0.032478,0.16648,0.033103,0.053055,0.157698,0.048224,0.070074,0.147534,0.074093,0.091955
3,500,0.085202,0.009187,0.016208,0.077915,0.016249,0.025803,0.075859,0.022942,0.03345,0.070404,0.035735,0.044244
4,1000,0.049327,0.005304,0.009329,0.044843,0.009001,0.014345,0.041106,0.012594,0.018312,0.040135,0.020036,0.024947
5,2000,0.020179,0.002391,0.004141,0.022982,0.004788,0.007554,0.022048,0.006886,0.009916,0.021749,0.01103,0.013636
6,2778,0.015695,0.001896,0.003302,0.019619,0.004094,0.006431,0.017937,0.005736,0.008194,0.015471,0.008275,0.010072
