# 2 Content-Based Recommender

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, LongType, IntegerType, DoubleType
import pandas as pd
import numpy as np
import pickle

# ─── 1) Start Spark with console‐progress enabled ────────────────────────────────
spark = (
    SparkSession.builder
    .appName("ContentBasedSpark")
    .config("spark.ui.showConsoleProgress", "true")
    .getOrCreate()
)

sc = spark.sparkContext

# ─── 2) Config & broadcast data ─────────────────────────────────────────────────
M_CANDIDATES      = 50      # nearest articles fetched per user before filtering
K1, K2            = 5, 10   # the two cut-offs that get written out
EMB_FP            = "../datasets/articles_embeddings.pickle"
META_FP           = "../datasets/articles_metadata.csv"
TRAIN_FP          = "../datasets/train_clicks.parquet"
VALID_FP          = "../datasets/valid_clicks.parquet"
UP_FP             = "../datasets/user_profiles.parquet"

# Load embeddings & normalize
# NOTE(security): pickle.load can execute arbitrary code; only open trusted files.
with open(EMB_FP, "rb") as f:
    embs = pickle.load(f)
row_norms = np.linalg.norm(embs, axis=1, keepdims=True)
# An all-zero row would be divided by 0 and become NaN; leave such rows at zero
# instead, mirroring the zero-norm guard applied to user vectors downstream.
row_norms = np.where(row_norms > 0, row_norms, 1.0)
emb_norm = embs / row_norms

# Load metadata and build maps
meta = pd.read_csv(META_FP, usecols=["article_id", "created_at_ts"])
meta["created_at_ts"] = pd.to_datetime(meta["created_at_ts"], unit="ms")
meta = meta.sort_values("article_id").reset_index(drop=True)
# Row i of the embedding matrix is paired with the i-th smallest article_id,
# so a length mismatch would silently attach scores to the wrong articles.
if embs.shape[0] != len(meta):
    raise ValueError(
        f"Embedding count ({embs.shape[0]}) and metadata count ({len(meta)}) mismatch"
    )
article_ids = meta["article_id"].to_numpy()
pub_map     = dict(zip(meta["article_id"], meta["created_at_ts"]))

# Load clicks and build seen/val–time maps
train_pd     = pd.read_parquet(TRAIN_FP, engine="pyarrow")
val_pd       = pd.read_parquet(VALID_FP, engine="pyarrow")
val_pd["click_timestamp"] = pd.to_datetime(val_pd["click_timestamp"], unit="ms")
# user_id -> set of article ids the user clicked in the training split
seen_map     = train_pd.groupby("user_id")["click_article_id"].apply(set).to_dict()
# user_id -> validation click time; if a user has several rows the last one wins
val_time_map = val_pd.set_index("user_id")["click_timestamp"].to_dict()

# Broadcast to executors
b_article_ids  = sc.broadcast(article_ids)
b_emb_norm     = sc.broadcast(emb_norm)
b_pub_map      = sc.broadcast(pub_map)
b_seen_map     = sc.broadcast(seen_map)
b_val_time_map = sc.broadcast(val_time_map)

# Load user profiles
up_df = spark.read.parquet(UP_FP)


25/04/30 14:42:50 WARN Utils: Your hostname, p962cnts8crs64g128g resolves to a loopback address: 127.0.0.1; using 10.0.0.4 instead (on interface eth0)
25/04/30 14:42:50 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/30 14:42:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

In [3]:
from pyspark.sql.types import StructType, StructField, LongType, IntegerType, DoubleType

# Output schema for the recommendation rows; every column is non-nullable.
schema = StructType([
    StructField(col_name, col_type, False)
    for col_name, col_type in (
        ("user_id", LongType()),
        ("rank", IntegerType()),
        ("article_id", LongType()),
        ("score", DoubleType()),
    )
])

def recommend_batch(pdf_iter):
    """Yield ranked content-based recommendations for batches of user profiles.

    Written for ``DataFrame.mapInPandas``: ``pdf_iter`` is an iterator of
    pandas DataFrames, each with a ``user_id`` column; all remaining columns
    are read as the user's profile vector (assumed to match the article
    embedding dimension and column order — TODO confirm against the profile
    builder).

    For every user the ``M_CANDIDATES`` articles with the highest dot product
    against the normalised embeddings are taken, then two filters are applied:
    articles whose publication time is later than the user's validation click
    time, and articles the user already clicked in training. The best
    ``max(K1, K2)`` survivors are emitted exactly once, ranked from 1, so that
    downstream ``rank <= K`` filters select each cut-off without duplicates.

    Yields:
        One DataFrame per input batch with columns ``user_id``, ``rank``,
        ``article_id`` and ``score``.
    """
    emb_matrix = b_emb_norm.value
    article_ids = b_article_ids.value
    pub_map = b_pub_map.value
    seen_map = b_seen_map.value
    val_time_map = b_val_time_map.value

    n_articles = emb_matrix.shape[0]
    # argpartition needs kth < n_articles; fall back to "all articles" otherwise.
    n_candidates = min(M_CANDIDATES, n_articles)
    top_k = max(K1, K2)
    # Explicit dtypes keep an empty batch from yielding object-typed columns.
    out_dtypes = {
        "user_id": "int64",
        "rank": "int32",
        "article_id": "int64",
        "score": "float64",
    }

    for pdf in pdf_iter:
        recs = {"user_id": [], "rank": [], "article_id": [], "score": []}
        emb_cols = [c for c in pdf.columns if c != "user_id"]
        # Pull the vectors by column label; attribute access on itertuples rows
        # fails for labels that are not valid Python identifiers.
        profiles = pdf[emb_cols].to_numpy(dtype=float)

        for raw_uid, uemb in zip(pdf["user_id"].to_numpy(), profiles):
            uid = int(raw_uid)
            norm = np.linalg.norm(uemb)
            u_norm = uemb / norm if norm > 0 else uemb

            sims = emb_matrix.dot(u_norm)
            if n_candidates < n_articles:
                idxs = np.argpartition(-sims, n_candidates)[:n_candidates]
            else:
                idxs = np.arange(n_articles)
            # Order the candidate set by descending similarity.
            idxs = idxs[np.argsort(-sims[idxs])]

            # Users without a validation click get no cutoff; articles without
            # a publication time always pass the time filter.
            cutoff = val_time_map.get(uid, pd.Timestamp.max)
            seen = seen_map.get(uid, set())
            kept = [
                (a, s)
                for a, s in zip(article_ids[idxs], sims[idxs])
                if pub_map.get(a, pd.Timestamp.min) <= cutoff and a not in seen
            ]

            # Emit each rank once (previously ranks 1..K1 were appended twice).
            for rank, (a, s) in enumerate(kept[:top_k], start=1):
                recs["user_id"].append(uid)
                recs["rank"].append(rank)
                recs["article_id"].append(int(a))
                recs["score"].append(float(s))

        yield pd.DataFrame(recs).astype(out_dtypes)


In [None]:
# Apply the UDF in parallel
# `result` is lazy and feeds two separate writes; cache it so the per-user
# similarity search is executed once instead of once per write.
result = up_df.mapInPandas(recommend_batch, schema=schema).cache()

# Write out your top-5 and top-10
(
    result.filter("rank <= 5")
    .write.mode("overwrite")
    .csv("../datasets/content_recs_top5", header=True)
)

(
    result.filter("rank <= 10")
    .write.mode("overwrite")
    .csv("../datasets/content_recs_top10", header=True)
)

# Release the cached rows before shutting the session down.
result.unpersist()
spark.stop()


25/04/30 14:44:05 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.

In [10]:
from pathlib import Path
import pandas as pd

# 1) Read recs into pandas (they’re small: one row per user × K)

def load_spark_csvs(output_dir: str) -> pd.DataFrame:
    """Read every usable CSV shard below a Spark output directory.

    Shards are files named ``part-*.csv`` anywhere under ``output_dir``. A
    shard is skipped when any directory between ``output_dir`` and the file
    has a name starting with ``attempt_``. Only the path *below*
    ``output_dir`` is inspected, so the location of ``output_dir`` itself
    never causes shards to be dropped. Shards are read in sorted path order
    so the row order of the result is reproducible.

    Args:
        output_dir: Directory that Spark wrote the CSV output to.

    Returns:
        The concatenation of all selected shards with a fresh integer index.

    Raises:
        FileNotFoundError: If no shard is left after filtering.
    """
    base = Path(output_dir)
    shards = sorted(
        p for p in base.rglob("part-*.csv")
        if not any(
            seg.startswith("attempt_") for seg in p.relative_to(base).parts
        )
    )
    if not shards:
        raise FileNotFoundError(f"No committed CSV shards found in {output_dir}")
    return pd.concat((pd.read_csv(p) for p in shards), ignore_index=True)

# Top-5 output: 2 shards at the directory root; Top-10 output: 1 shard under task_…
rec5_df, rec10_df = (
    load_spark_csvs(recs_dir)
    for recs_dir in (
        "../datasets/content_recs_top5",
        "../datasets/content_recs_top10",
    )
)


In [12]:
# 2) Pivot to one row per user
# Order rows by rank and drop repeated (user, article) pairs first, so each
# list is in rank order regardless of how the CSV shards were laid out.
# agg(list) replaces apply(list), which raised the group_keys FutureWarning.
top5 = (
    rec5_df.sort_values(["user_id", "rank"], kind="stable")
    .drop_duplicates(["user_id", "article_id"])
    .groupby("user_id")["article_id"]
    .agg(list)
    .rename("recs5")
)
top10 = (
    rec10_df.sort_values(["user_id", "rank"], kind="stable")
    .drop_duplicates(["user_id", "article_id"])
    .groupby("user_id")["article_id"]
    .agg(list)
    .rename("recs10")
)

# 3) Load validation clicks
val = pd.read_parquet("../datasets/valid_clicks.parquet", engine="pyarrow")
val = val.rename(columns={"click_article_id": "true_click"}).set_index("user_id")

# 4) Merge
# Left joins keep every validation row; users without recommendations get NaN.
eval_df = val.join(top5, how="left").join(top10, how="left")

# 5) Compute metrics
def compute_metrics(row, K):
    """Score one evaluation row against its top-K recommendation list.

    Reads ``row["recs<K>"]`` and ``row["true_click"]``. When the
    recommendations are not a list (e.g. NaN for a user with no
    recommendations) all three metrics are zero.

    Returns:
        A Series with ``hit`` (1 if the true click is recommended, else 0),
        ``prec`` (``hit / K``) and ``rr`` (reciprocal of the 1-based position
        of the true click, or 0.0 when it is absent).
    """
    recommended = row[f"recs{K}"]
    target = row["true_click"]

    hit, rr = 0, 0.0
    if isinstance(recommended, list) and target in recommended:
        hit = 1
        # reciprocal rank: 1/(position)
        rr = 1.0 / (recommended.index(target) + 1)

    return pd.Series({"hit": hit, "prec": hit / K, "rr": rr})

# Row-wise metrics for each cut-off (K=5 and K=10)
m5, m10 = (
    eval_df.apply(compute_metrics, axis=1, args=(cutoff,))
    for cutoff in (5, 10)
)

# One-row summary: the mean of each per-row metric
results = pd.DataFrame(
    {
        "Recall@5": m5["hit"].mean(),
        "Precision@5": m5["prec"].mean(),
        "MRR@5": m5["rr"].mean(),
        "Recall@10": m10["hit"].mean(),
        "Precision@10": m10["prec"].mean(),
        "MRR@10": m10["rr"].mean(),
    },
    index=[0],
)

print(results.T)


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  top10 = rec10_df.groupby("user_id")["article_id"].apply(list).rename("recs10")


                     0
Recall@5      0.000061
Precision@5   0.000012
MRR@5         0.000043
Recall@10     0.000000
Precision@10  0.000000
MRR@10        0.000000


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, LongType, IntegerType, DoubleType
import pandas as pd
import numpy as np
import pickle

# ─── 1) Start Spark with console‐progress enabled ────────────────────────────────
spark = (
    SparkSession.builder
    .appName("ContentBasedSpark")
    .config("spark.ui.showConsoleProgress", "true")
    .getOrCreate()
)

sc = spark.sparkContext

# ─── 2) Config & broadcast data ─────────────────────────────────────────────────
M_CANDIDATES      = 50      # nearest articles fetched per user before filtering
K1, K2            = 5, 10   # the two cut-offs that get written out
EMB_FP            = "../datasets/articles_embeddings.pickle"
META_FP           = "../datasets/articles_metadata.csv"
TRAIN_FP          = "../datasets/train_clicks.parquet"
VALID_FP          = "../datasets/valid_clicks.parquet"
UP_FP             = "../datasets/user_profiles.parquet"

# Load embeddings & normalize
# NOTE(security): pickle.load can execute arbitrary code; only open trusted files.
with open(EMB_FP, "rb") as f:
    embs = pickle.load(f)
row_norms = np.linalg.norm(embs, axis=1, keepdims=True)
# An all-zero row would be divided by 0 and become NaN; leave such rows at zero
# instead, mirroring the zero-norm guard applied to user vectors downstream.
row_norms = np.where(row_norms > 0, row_norms, 1.0)
emb_norm = embs / row_norms

# Load metadata and build maps
meta = pd.read_csv(META_FP, usecols=["article_id", "created_at_ts"])
meta["created_at_ts"] = pd.to_datetime(meta["created_at_ts"], unit="ms")
meta = meta.sort_values("article_id").reset_index(drop=True)
# Row i of the embedding matrix is paired with the i-th smallest article_id,
# so a length mismatch would silently attach scores to the wrong articles.
if embs.shape[0] != len(meta):
    raise ValueError(
        f"Embedding count ({embs.shape[0]}) and metadata count ({len(meta)}) mismatch"
    )
article_ids = meta["article_id"].to_numpy()
pub_map     = dict(zip(meta["article_id"], meta["created_at_ts"]))

# Load clicks and build seen/val–time maps
train_pd     = pd.read_parquet(TRAIN_FP, engine="pyarrow")
val_pd       = pd.read_parquet(VALID_FP, engine="pyarrow")
val_pd["click_timestamp"] = pd.to_datetime(val_pd["click_timestamp"], unit="ms")
# user_id -> set of article ids the user clicked in the training split
seen_map     = train_pd.groupby("user_id")["click_article_id"].apply(set).to_dict()
# user_id -> validation click time; if a user has several rows the last one wins
val_time_map = val_pd.set_index("user_id")["click_timestamp"].to_dict()

# Broadcast to executors
b_article_ids  = sc.broadcast(article_ids)
b_emb_norm     = sc.broadcast(emb_norm)
b_pub_map      = sc.broadcast(pub_map)
b_seen_map     = sc.broadcast(seen_map)
b_val_time_map = sc.broadcast(val_time_map)

# Load user profiles
up_df = spark.read.parquet(UP_FP)




from pyspark.sql.types import StructType, StructField, LongType, IntegerType, DoubleType

# Output schema for the recommendation rows; every column is non-nullable.
schema = StructType([
    StructField(col_name, col_type, False)
    for col_name, col_type in (
        ("user_id", LongType()),
        ("rank", IntegerType()),
        ("article_id", LongType()),
        ("score", DoubleType()),
    )
])

def recommend_batch(pdf_iter):
    """Yield ranked content-based recommendations for batches of user profiles.

    Written for ``DataFrame.mapInPandas``: ``pdf_iter`` is an iterator of
    pandas DataFrames, each with a ``user_id`` column; all remaining columns
    are read as the user's profile vector (assumed to match the article
    embedding dimension and column order — TODO confirm against the profile
    builder).

    For every user the ``M_CANDIDATES`` articles with the highest dot product
    against the normalised embeddings are taken, then two filters are applied:
    articles whose publication time is later than the user's validation click
    time, and articles the user already clicked in training. The best
    ``max(K1, K2)`` survivors are emitted exactly once, ranked from 1, so that
    downstream ``rank <= K`` filters select each cut-off without duplicates.

    Yields:
        One DataFrame per input batch with columns ``user_id``, ``rank``,
        ``article_id`` and ``score``.
    """
    emb_matrix = b_emb_norm.value
    article_ids = b_article_ids.value
    pub_map = b_pub_map.value
    seen_map = b_seen_map.value
    val_time_map = b_val_time_map.value

    n_articles = emb_matrix.shape[0]
    # argpartition needs kth < n_articles; fall back to "all articles" otherwise.
    n_candidates = min(M_CANDIDATES, n_articles)
    top_k = max(K1, K2)
    # Explicit dtypes keep an empty batch from yielding object-typed columns.
    out_dtypes = {
        "user_id": "int64",
        "rank": "int32",
        "article_id": "int64",
        "score": "float64",
    }

    for pdf in pdf_iter:
        recs = {"user_id": [], "rank": [], "article_id": [], "score": []}
        emb_cols = [c for c in pdf.columns if c != "user_id"]
        # Pull the vectors by column label; attribute access on itertuples rows
        # fails for labels that are not valid Python identifiers.
        profiles = pdf[emb_cols].to_numpy(dtype=float)

        for raw_uid, uemb in zip(pdf["user_id"].to_numpy(), profiles):
            uid = int(raw_uid)
            norm = np.linalg.norm(uemb)
            u_norm = uemb / norm if norm > 0 else uemb

            sims = emb_matrix.dot(u_norm)
            if n_candidates < n_articles:
                idxs = np.argpartition(-sims, n_candidates)[:n_candidates]
            else:
                idxs = np.arange(n_articles)
            # Order the candidate set by descending similarity.
            idxs = idxs[np.argsort(-sims[idxs])]

            # Users without a validation click get no cutoff; articles without
            # a publication time always pass the time filter.
            cutoff = val_time_map.get(uid, pd.Timestamp.max)
            seen = seen_map.get(uid, set())
            kept = [
                (a, s)
                for a, s in zip(article_ids[idxs], sims[idxs])
                if pub_map.get(a, pd.Timestamp.min) <= cutoff and a not in seen
            ]

            # Emit each rank once (previously ranks 1..K1 were appended twice).
            for rank, (a, s) in enumerate(kept[:top_k], start=1):
                recs["user_id"].append(uid)
                recs["rank"].append(rank)
                recs["article_id"].append(int(a))
                recs["score"].append(float(s))

        yield pd.DataFrame(recs).astype(out_dtypes)
        
        
# Apply the UDF in parallel
# `result` is lazy and feeds two separate writes; cache it so the per-user
# similarity search is executed once instead of once per write.
result = up_df.mapInPandas(recommend_batch, schema=schema).cache()

# Write out your top-5 and top-10
(
    result.filter("rank <= 5")
    .write.mode("overwrite")
    .csv("../datasets/content_recs_top5", header=True)
)

(
    result.filter("rank <= 10")
    .write.mode("overwrite")
    .csv("../datasets/content_recs_top10", header=True)
)

# Release the cached rows before shutting the session down.
result.unpersist()
spark.stop()

In [27]:
import pandas as pd
import numpy as np
import pickle
from sklearn.neighbors import NearestNeighbors

# ─── CONFIG ─────────────────────────────────────────────────────────────────────
# Input/output locations, relative to the notebook's working directory.
USER_PROFILES_PATH = "../datasets/user_profiles.parquet"
EMB_PATH           = "../datasets/articles_embeddings.pickle"
TRAIN_PATH         = "../datasets/train_clicks.parquet"
OUTPUT_PATH        = "../datasets/user_recs_content.parquet"
M_CANDIDATES       = 50  # number of neighbors to retrieve before filtering
K_RECOMMEND        = 10  # final number of recommendations per user

In [25]:


# ─── 1. Load user profiles and article embeddings ───────────────────────────────
user_profiles = pd.read_parquet(USER_PROFILES_PATH, engine="pyarrow")

# NOTE(security): pickle.load can execute arbitrary code; only open trusted files.
with open(EMB_PATH, "rb") as f:
    embs = pickle.load(f)  # numpy array (num_articles, emb_dim)
# Load metadata to align article IDs (same ../datasets/ location as the other inputs)
meta = (
    pd.read_csv("../datasets/articles_metadata.csv", usecols=["article_id"])
    .sort_values("article_id")
    .reset_index(drop=True)
)
# Explicit check instead of `assert`, which is stripped under `python -O`.
if embs.shape[0] != len(meta):
    raise ValueError("Embedding count and metadata count mismatch")

# Build embeddings DataFrame: index=article_id, columns emb_0...emb_dim
emb_dim = embs.shape[1]
emb_cols = [f"emb_{i}" for i in range(emb_dim)]
emb_df = pd.DataFrame(embs, index=meta["article_id"], columns=emb_cols)
emb_df.index.name = "article_id"

# ─── 2. Fit NearestNeighbors on article embeddings ───────────────────────────────
nn_model = NearestNeighbors(n_neighbors=M_CANDIDATES, metric="cosine")
nn_model.fit(emb_df.values)
article_ids = emb_df.index.to_numpy()

# ─── 3. Load train clicks to get seen articles per user ─────────────────────────
train_df = pd.read_parquet(TRAIN_PATH, engine="pyarrow")
seen_articles = train_df.groupby("user_id")["click_article_id"].apply(set).to_dict()

# ─── 4. Generate and store top-K recs for each user ─────────────────────────────
# Read ids and vectors column-wise: iterrows() builds one Series per row, which
# upcasts an integer user_id to float when the other columns are floats.
# NOTE(review): assumes user_profiles carries columns emb_0..emb_{dim-1} — confirm.
user_ids = user_profiles["user_id"].to_numpy()
profile_matrix = user_profiles[emb_cols].to_numpy(dtype=float)

records = []
for raw_uid, profile_vec in zip(user_ids, profile_matrix):
    uid = int(raw_uid)
    dists, idxs = nn_model.kneighbors(profile_vec.reshape(1, -1))
    sims = 1 - dists.flatten()  # cosine distance -> cosine similarity
    candidates = article_ids[idxs.flatten()]

    # Filter out seen articles
    seen = seen_articles.get(uid, set())
    filtered = [(aid, sim) for aid, sim in zip(candidates, sims) if aid not in seen]

    # Take top-K after filtering
    for rank, (aid, sim) in enumerate(filtered[:K_RECOMMEND], start=1):
        records.append({
            "user_id": uid,
            "recommendation_rank": rank,
            "article_id": aid,
            "score": sim,
        })

# Build DataFrame and save
# Naming the columns keeps the frame well-formed even when `records` is empty.
recs_df = pd.DataFrame(
    records,
    columns=["user_id", "recommendation_rank", "article_id", "score"],
)
recs_df.to_parquet(OUTPUT_PATH, engine="pyarrow", index=False)
print(f"Saved content-based recommendations for {recs_df['user_id'].nunique()} users → '{OUTPUT_PATH}'")


In [1]:
2+2

4