In [None]:
import ast
import json
import re

import pandas as pd

# Column layout of news.tsv; the file is read without a header row.
NEWS_COLS = [
    "news_id",
    "category",
    "subcategory",
    "title",
    "abstract",
    "url",
    "title_entities",
    "abstract_entities",
]

# Load the catalogue, keep the first row seen for each news_id, and index by it.
news = (
    pd.read_csv(
        "../mind_data/MINDsmall_train/news.tsv",
        sep="\t",
        header=None,
        names=NEWS_COLS,
    )
    .drop_duplicates("news_id")
    .set_index("news_id")
)

def _clean(s):
    if not isinstance(s, str): return ""
    s = re.sub(r"[\r\n]+", " ", s).strip()
    return s[:2000]  # safety cap

# Combined text field: cleaned title and cleaned abstract joined by one space.
# The trailing strip removes the dangling separator when either part is empty.
news["text"] = (
    news["title"].map(_clean)
    .str.cat(news["abstract"].map(_clean), sep=" ")
    .str.strip()
)

def _parse_entities(s):
    if not isinstance(s, str) or not s.strip(): return []
    try:
        return json.loads(s)
    except Exception:
        return json.loads(s.replace("'", '"'))  # fallback for rare dumps

# Replace the raw entity strings with their decoded form; both columns keep
# their original position in the frame.
news = news.assign(
    title_entities=news["title_entities"].map(_parse_entities),
    abstract_entities=news["abstract_entities"].map(_parse_entities),
)


In [17]:
# Sanity check: print the first news row as a plain dict.
print(news.iloc[0].to_dict())


{'category': 'lifestyle', 'subcategory': 'lifestyleroyals', 'title': 'The Brands Queen Elizabeth, Prince Charles, and Prince Philip Swear By', 'abstract': "Shop the notebooks, jackets, and more that the royals can't live without.", 'url': 'https://assets.msn.com/labs/mind/AAGH0ET.html', 'title_entities': [{'Label': 'Prince Philip, Duke of Edinburgh', 'Type': 'P', 'WikidataId': 'Q80976', 'Confidence': 1.0, 'OccurrenceOffsets': [48], 'SurfaceForms': ['Prince Philip']}, {'Label': 'Charles, Prince of Wales', 'Type': 'P', 'WikidataId': 'Q43274', 'Confidence': 1.0, 'OccurrenceOffsets': [28], 'SurfaceForms': ['Prince Charles']}, {'Label': 'Elizabeth II', 'Type': 'P', 'WikidataId': 'Q9682', 'Confidence': 0.97, 'OccurrenceOffsets': [11], 'SurfaceForms': ['Queen Elizabeth']}], 'abstract_entities': [], 'text': "The Brands Queen Elizabeth, Prince Charles, and Prince Philip Swear By Shop the notebooks, jackets, and more that the royals can't live without."}


In [None]:
from collections import defaultdict

def read_behaviors(path):
    """Load a behaviors TSV and add parsed history / impression columns.

    Args:
        path: Location of a header-less, tab-separated file whose columns are
            imp_id, user_id, time, history, impressions.

    Returns:
        The loaded DataFrame with two extra columns:

        * ``hist_list`` - list of the whitespace-separated tokens in
          ``history`` (``[]`` when the field is missing or empty).
        * ``impr_list`` - list of ``(news_id, label)`` tuples parsed from
          ``impressions``; ``label`` is an ``int``, or ``None`` for tokens
          without a ``-label`` suffix (``[]`` when the field is missing or
          empty).

    Raises:
        ValueError: If a ``-label`` suffix is not an integer.
    """
    BEH_COLS = ["imp_id", "user_id", "time", "history", "impressions"]
    beh = pd.read_csv(path, sep="\t", header=None, names=BEH_COLS)

    def is_missing(cell):
        # read_csv turns an empty field into NaN, which has no .split().
        return pd.isna(cell) or not cell

    def parse_history(cell):
        return [] if is_missing(cell) else cell.split()

    def parse_impressions(cell):
        if is_missing(cell):
            return []
        pairs = []
        for tok in cell.split():
            # Split on the LAST hyphen so an id that itself contains "-" does
            # not break the (id, label) unpacking.
            nid, sep, lab = tok.rpartition("-")
            if sep:
                pairs.append((nid, int(lab)))
            else:  # test split may have no labels
                pairs.append((tok, None))
        return pairs

    beh["hist_list"] = beh["history"].map(parse_history)
    beh["impr_list"] = beh["impressions"].map(parse_impressions)
    return beh

# Only the training split is loaded; the dev split is left disabled.
train_beh = read_behaviors("../mind_data/MINDsmall_train/behaviors.tsv")
# dev_beh = read_behaviors("../mind_data/MINDsmall_dev/behaviors.tsv")

# Every news id present in the catalogue (the index of `news`).
all_news_ids = set(news.index)
def collect_ids(beh):
    """Gather the distinct user ids and news ids referenced by a behaviors frame.

    Args:
        beh: DataFrame with ``user_id``, ``hist_list`` (iterables of news ids)
            and ``impr_list`` (iterables of ``(news_id, label)`` pairs).

    Returns:
        ``(user_ids, news_ids)`` as two sets; news ids are the union of those
        found in histories and in impressions.
    """
    news_ids = {nid for history in beh["hist_list"] for nid in history}
    news_ids |= {nid for impressions in beh["impr_list"] for nid, _ in impressions}
    user_ids = set(beh["user_id"].tolist())
    return user_ids, news_ids

uids_tr, nids_tr = collect_ids(train_beh)
# uids_dev, nids_dev = collect_ids(dev_beh)

# Keep only news ids that also appear in news.tsv (behaviors may reference ids
# the catalogue lacks). Train-only while the dev split is not loaded; once it
# is, use `(nids_tr | nids_dev) & all_news_ids` instead.
# Previously this assignment was commented out, so building news2idx below
# raised NameError on a fresh kernel.
used_news = nids_tr & all_news_ids

# Sorting makes the id -> index mapping deterministic across runs.
news2idx = {nid: i for i, nid in enumerate(sorted(used_news))}
# user2idx = {uid: i for i, uid in enumerate(sorted(uids_tr | uids_dev))}
