# 8) Personalization via Cohorts (PyTorch Embeddings)

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:

%%capture
!pip -q install --upgrade pip
!pip -q install datasets transformers sentence-transformers faiss-cpu rank-bm25 torchmetrics scikit-learn lightgbm langdetect unidecode pandas matplotlib tqdm nltk

In [3]:

import numpy as np, pandas as pd, torch, torch.nn as nn, torch.optim as optim
from datasets import load_dataset
from tqdm.auto import tqdm
from sklearn.metrics import roc_auc_score
device = "cuda" if torch.cuda.is_available() else "cpu"
np.random.seed(42); torch.manual_seed(42)

<torch._C.Generator at 0x7eb33db33c10>

In [4]:

# ds = load_dataset("amazon_reviews_multi","en", split="train[:20%]")
# df = ds.to_pandas()[["review_id","product_id","review_title","review_body","reviewer_id","stars"]].dropna()

df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Datasets/amazon_reviews_multi/en/train.csv")
df = df[df["language"] == "en"]
df = df[["review_id", "product_id", "review_title", "review_body", "reviewer_id", "stars"]].dropna()

if "reviewer_id" not in df.columns:
    df["reviewer_id"] = df["review_id"].apply(lambda x: abs(hash(str(x)))%1000000)
def cohort_from_title(t):
    toks = [w for w in str(t).lower().split() if len(w)>3]
    return toks[0] if toks else "misc"
df["cohort"] = df["review_title"].apply(cohort_from_title)
u_ids = {u:i for i,u in enumerate(df["reviewer_id"].unique().tolist())}
i_ids = {p:i for i,p in enumerate(df["product_id"].unique().tolist())}
c_ids = {c:i for i,c in enumerate(df["cohort"].unique().tolist())}
df = df.sample(frac=1.0, random_state=42).head(200000)
pairs = np.stack([
    df["reviewer_id"].map(u_ids).values,
    df["product_id"].map(i_ids).values,
    df["cohort"].map(c_ids).values,
    (df["stars"]>=4).astype(int).values
], axis=1)
U = len(u_ids); I = len(i_ids); C = len(c_ids)

In [6]:

class CohortCTR(nn.Module):
    def __init__(self, U,I,C,d=32):
        super().__init__()
        self.u = nn.Embedding(U,d); self.i = nn.Embedding(I,d); self.c = nn.Embedding(C,d)
        self.mlp = nn.Sequential(nn.Linear(3*d,128), nn.ReLU(), nn.Linear(128,1))
    def forward(self, uid, iid, cid):
        x = torch.cat([self.u(uid), self.i(iid), self.c(cid)], dim=1)
        return self.mlp(x)

model = CohortCTR(U,I,C).to(device); opt=optim.AdamW(model.parameters(), lr=3e-3); loss=nn.BCEWithLogitsLoss()
TR = int(0.8*len(pairs))
Xtr, ytr = torch.tensor(pairs[:TR,:3]), torch.tensor(pairs[:TR,3], dtype=torch.float32).unsqueeze(1)
Xdv, ydv = torch.tensor(pairs[TR:,:3]), torch.tensor(pairs[TR:,3], dtype=torch.float32).unsqueeze(1)
for ep in range(10):
    model.train(); tot=0
    for i in range(0, TR, 1024):
        b = Xtr[i:i+1024].to(device); yt=ytr[i:i+1024].to(device)
        opt.zero_grad(); p = model(b[:,0],b[:,1],b[:,2]); l=loss(p,yt); l.backward(); opt.step(); tot+=l.item()*len(b)
    with torch.no_grad():
        pr = torch.sigmoid(model(Xdv[:,0].to(device), Xdv[:,1].to(device), Xdv[:,2].to(device))).cpu().numpy().ravel()
    print("ep", ep, "loss", tot/TR, "AUC", roc_auc_score(ydv.numpy().ravel(), pr))

ep 0 loss 0.550225232366266 AUC 0.812557281099819
ep 1 loss 0.475867165551909 AUC 0.8173092053419091
ep 2 loss 0.42544919774173334 AUC 0.7974368809612256
ep 3 loss 0.3553376573262763 AUC 0.7650677480928103
ep 4 loss 0.274685123682154 AUC 0.7360052716863951
ep 5 loss 0.20758097401084888 AUC 0.724006703633194
ep 6 loss 0.15455463291380564 AUC 0.7107860392307985
ep 7 loss 0.10332670480581656 AUC 0.6968989175535465
ep 8 loss 0.06657832432505553 AUC 0.6902879234766879
ep 9 loss 0.04700838260209955 AUC 0.6842622426749699
