In [68]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Stall catalogue, re-indexed so rows are numbered from 0.
stalls = pd.read_csv('hawkerstallscleaned.csv').reset_index(drop=True)

# User reviews; the stall key is renamed so it matches the catalogue's column.
interactions = pd.read_csv('userinteractionscleaned.csv')
interactions = interactions.rename(columns={'stall_id': 'place_id'})

# Keep only reviews rated 4 or higher whose stall exists in the catalogue.
is_positive = interactions['rating'] >= 4
is_known_stall = interactions['place_id'].isin(stalls['place_id'])
interactions = interactions.loc[is_positive & is_known_stall].reset_index(drop=True)

# Keep only authors with at least 10 of the remaining reviews.
counts = interactions['author'].value_counts()
eligible = counts.index[(counts >= 10).to_numpy()].tolist()
by_eligible_author = interactions['author'].isin(eligible)
interactions = interactions.loc[by_eligible_author].reset_index(drop=True)

def _split_in_half(rows):
    """Shuffle one author's rows with a fixed seed and cut them at len // 2.

    Returns a (first_half, second_half) pair. When the row count is odd the
    second half receives the extra row. The second half is re-indexed from 0.
    """
    shuffled = rows.sample(frac=1, random_state=42).reset_index(drop=True)
    cut = len(shuffled) // 2
    return shuffled.iloc[:cut], shuffled.iloc[cut:].reset_index(drop=True)


# First half of every author's reviews goes to training; the second half is
# held out per author in `test_data`.
train_parts = []
test_data = {}
for author, author_rows in interactions.groupby('author'):
    train_half, test_half = _split_in_half(author_rows)
    train_parts.append(train_half)
    test_data[author] = test_half
train_df = pd.concat(train_parts, ignore_index=True)

# Normalised review text: lower-cased, every character outside a-z, 0-9 and
# space replaced by a space, then stripped of leading/trailing whitespace.
cleaned_reviews = train_df['review_text'].fillna('').str.lower()
cleaned_reviews = cleaned_reviews.str.replace(r'[^a-z0-9 ]', ' ', regex=True)
train_df['review_clean'] = cleaned_reviews.str.strip()

# Build one text document per stall: "<name>. <address>. <all reviews>".
#
# Missing review text is blanked BEFORE the join: ' '.join() raises TypeError
# on a NaN (float) element, so filling after the groupby would be too late.
#
# NOTE(review): `interactions` still contains each author's held-out test
# reviews, so these documents include text from test interactions. Consider
# building them from `train_df` only if leakage-free numbers are needed.
stall_docs = (
    interactions
      .assign(review_text=interactions['review_text'].fillna('').astype(str))
      .groupby('place_id')['review_text']
      .agg(' '.join)
      .rename('all_reviews')
      .reset_index()
      .merge(stalls, on='place_id', how='inner')
)

# Blank out missing name/address parts individually. Without this, a single
# NaN part makes the whole concatenation NaN, which would then be embedded as
# the literal string 'nan' and the stall's reviews would be lost.
stall_docs['doc'] = (
    stall_docs['name_norm'].fillna('').astype(str) + '. ' +
    stall_docs['address_norm'].fillna('').astype(str) + '. ' +
    stall_docs['all_reviews']
)

# Encode every stall document with the 'all-MiniLM-L6-v2' sentence encoder.
model = SentenceTransformer('all-MiniLM-L6-v2')
doc_texts = stall_docs['doc'].tolist()
stall_embs = model.encode(doc_texts, convert_to_numpy=True)

# place_id -> position of that stall's document in `doc_texts`, which is the
# order the embeddings were requested in.
pid_to_idx = dict(zip(stall_docs['place_id'], range(len(stall_docs))))

# Size of the full candidate pool (2778 in the recorded run's output table).
max_pool = len(stall_embs)

# Candidate-pool sizes to evaluate, ending with the full catalogue.
fixed_pool_sizes = [50, 100, 200, 500, 1000, 2000]

# Only keep fixed sizes strictly smaller than the catalogue. A size equal to
# max_pool would appear twice in pool_sizes (its scores would be appended
# twice), and a size above max_pool cannot actually be sampled, so its row
# would be labelled with a pool size that was never evaluated.
pool_sizes = [ps for ps in fixed_pool_sizes if ps < max_pool] + [max_pool]

# Cut-offs for the top-k metrics.
ks = [1, 2, 3, 5]

# metrics_by_pool[pool_size][k][metric] -> list with one score per evaluated user.
metrics_by_pool = {
    ps: {k: {'hit': [], 'precision': [], 'recall': [], 'f1': []} for k in ks}
    for ps in pool_sizes
}

# Seeded generator so the sampled negatives, and therefore the reported
# tables, are identical on every Restart & Run All.
EVAL_SEED = 42
rng = np.random.default_rng(EVAL_SEED)


def _metrics_at_k(ranked, relevant, k):
    """Score the top-k of a ranked candidate list against the relevant set.

    Parameters
    ----------
    ranked : list of candidate indices, best first.
    relevant : non-empty set of indices counted as correct.
    k : cut-off; precision is always divided by k, even if fewer than k
        candidates are available.

    Returns
    -------
    (hit, precision, recall, f1), where hit is True when at least one of the
    top-k candidates is relevant and f1 is 0.0 when precision + recall is 0.
    """
    tp = sum(1 for idx in ranked[:k] if idx in relevant)
    prec = tp / k
    rec = tp / len(relevant)
    f1 = 2 * prec * rec / (prec + rec) if (prec + rec) > 0 else 0.0
    return tp > 0, prec, rec, f1


# Loop-invariant work hoisted out of the per-author loop.
all_idxs = set(range(max_pool))
hist_by_author = {name: rows for name, rows in train_df.groupby('author')}

for author in eligible:
    hist = hist_by_author.get(author)
    if hist is None or hist.empty:
        continue

    # User profile = mean embedding of the user's cleaned training reviews,
    # scored against every stall embedding.
    emb = model.encode(hist['review_clean'].tolist(), convert_to_numpy=True)
    profile = emb.mean(axis=0, keepdims=True)
    sims = cosine_similarity(profile, stall_embs)[0]

    # Held-out stalls for this user, de-duplicated in first-seen order. If a
    # stall reviewed twice were kept twice, it would occupy two candidate
    # slots, count as two true positives and inflate the recall denominator.
    test_pids = test_data[author]['place_id'].tolist()
    test_idxs = list(dict.fromkeys(
        pid_to_idx[p] for p in test_pids if p in pid_to_idx
    ))
    if not test_idxs:
        continue
    relevant = set(test_idxs)

    # Negatives: stalls the user has neither trained on nor held out. Sorted
    # so that sampling does not depend on set iteration order.
    seen_idxs = {pid_to_idx[p] for p in hist['place_id'] if p in pid_to_idx}
    neg_pool = np.array(sorted(all_idxs - seen_idxs - relevant), dtype=int)

    for ps in pool_sizes:
        # Fill the pool up to `ps` candidates with sampled negatives, capped
        # by how many negatives exist for this user.
        n_neg = min(ps - len(test_idxs), len(neg_pool))
        if n_neg > 0:
            sampled_neg = rng.choice(neg_pool, size=n_neg, replace=False).tolist()
        else:
            sampled_neg = []
        candidates = test_idxs + sampled_neg

        # Highest similarity first.
        ranked = sorted(candidates, key=lambda i: sims[i], reverse=True)

        for k in ks:
            hit, prec, rec, f1 = _metrics_at_k(ranked, relevant, k)

            m = metrics_by_pool[ps][k]
            m['hit'].append(hit)
            m['precision'].append(prec)
            m['recall'].append(rec)
            m['f1'].append(f1)

# Average the per-user scores into one summary row per pool size:
# one table for hit rate, one for precision / recall / F1.
hitrate_rows = []
metrics_rows = []
for pool_size, per_k in metrics_by_pool.items():
    hit_row = {'pool_size': pool_size}
    prf_row = {'pool_size': pool_size}
    for k in ks:
        scores = per_k[k]
        hit_row.update({f'HitRate@{k}': np.mean(scores['hit'])})
        prf_row.update({
            f'Precision@{k}': np.mean(scores['precision']),
            f'Recall@{k}': np.mean(scores['recall']),
            f'F1@{k}': np.mean(scores['f1']),
        })
    hitrate_rows.append(hit_row)
    metrics_rows.append(prf_row)

bert_hitrate_df = pd.DataFrame(hitrate_rows)
bert_metrics_df = pd.DataFrame(metrics_rows)

In [69]:
# Print a text banner, then leave the hit-rate summary frame as the cell's
# last expression so the notebook renders it as a table.
print("=== BERT HitRate@k ===")
bert_hitrate_df

=== BERT HitRate@k ===


Unnamed: 0,pool_size,HitRate@1,HitRate@2,HitRate@3,HitRate@5
0,50,0.390135,0.557175,0.679372,0.81278
1,100,0.220852,0.375561,0.4787,0.617713
2,200,0.132287,0.232063,0.308296,0.422646
3,500,0.070628,0.127803,0.181614,0.252242
4,1000,0.038117,0.06278,0.087444,0.151345
5,2000,0.026906,0.040359,0.050448,0.073991
6,2778,0.020179,0.034753,0.044843,0.057175


In [70]:
# Print a text banner, then leave the precision / recall / F1 summary frame as
# the cell's last expression so the notebook renders it as a table.
print("\n=== BERT Precision@k, Recall@k, F1@k ===")
bert_metrics_df


=== BERT Precision@k, Recall@k, F1@k ===


Unnamed: 0,pool_size,Precision@1,Recall@1,F1@1,Precision@2,Recall@2,F1@2,Precision@3,Recall@3,F1@3,Precision@5,Recall@5,F1@5
0,50,0.390135,0.040376,0.071566,0.365471,0.07532,0.120245,0.359865,0.111508,0.162111,0.346861,0.177027,0.219826
1,100,0.220852,0.021666,0.038508,0.230381,0.045939,0.073516,0.224963,0.067348,0.098162,0.217713,0.108715,0.135053
2,200,0.132287,0.013608,0.024111,0.131166,0.026391,0.042301,0.133034,0.039557,0.057824,0.126906,0.062223,0.077742
3,500,0.070628,0.007397,0.013098,0.068386,0.014134,0.022521,0.067265,0.020586,0.029907,0.064126,0.031902,0.039586
4,1000,0.038117,0.003901,0.00692,0.033632,0.00665,0.010656,0.031764,0.009658,0.014048,0.035202,0.017074,0.021375
5,2000,0.026906,0.002511,0.004466,0.0213,0.004201,0.006733,0.017564,0.005468,0.007911,0.016368,0.008581,0.010541
6,2778,0.020179,0.00173,0.003101,0.017377,0.003405,0.005459,0.015321,0.00479,0.006909,0.012556,0.006578,0.008079
