In [None]:
import ast
import csv
import json
import re

import pandas as pd

# Column names for the headerless news.tsv, in file order.
NEWS_COLS = [
    "news_id", "category", "subcategory", "title", "abstract", "url",
    "title_entities", "abstract_entities",
]

# news.tsv is plain tab-separated text, not CSV-quoted. With pandas' default
# quoting, a field that begins with `"` is parsed as a quoted field: its quote
# characters are dropped, and an unbalanced quote can swallow tabs/newlines and
# merge rows. QUOTE_NONE keeps every `"` as literal text.
news = pd.read_csv(
    "../mind_data/MINDsmall_train/news.tsv",
    sep="\t",
    header=None,
    names=NEWS_COLS,
    quoting=csv.QUOTE_NONE,
)
# Keep the first row for each news_id, then index the table by news_id.
news = news.drop_duplicates("news_id").set_index("news_id")

def _clean(s):
    if not isinstance(s, str): return ""
    s = re.sub(r"[\r\n]+", " ", s).strip()
    return s[:2000]  # safety cap

# Model input text: cleaned title, one space, cleaned abstract; the outer strip
# removes the stray separator when either side is empty.
news["text"] = (
    news["title"].map(_clean)
    .str.cat(news["abstract"].map(_clean), sep=" ")
    .str.strip()
)

def _parse_entities(s):
    if not isinstance(s, str) or not s.strip(): return []
    try:
        return json.loads(s)
    except Exception:
        return json.loads(s.replace("'", '"'))  # fallback for rare dumps

# Decode both entity columns from their raw string form, replacing each column
# with the parsed result.
for _entity_col in ("title_entities", "abstract_entities"):
    news[_entity_col] = news[_entity_col].map(_parse_entities)


In [17]:
# Print the first row of `news` as a plain dict (column name -> value).
print(news.iloc[0].to_dict())


{'category': 'lifestyle', 'subcategory': 'lifestyleroyals', 'title': 'The Brands Queen Elizabeth, Prince Charles, and Prince Philip Swear By', 'abstract': "Shop the notebooks, jackets, and more that the royals can't live without.", 'url': 'https://assets.msn.com/labs/mind/AAGH0ET.html', 'title_entities': [{'Label': 'Prince Philip, Duke of Edinburgh', 'Type': 'P', 'WikidataId': 'Q80976', 'Confidence': 1.0, 'OccurrenceOffsets': [48], 'SurfaceForms': ['Prince Philip']}, {'Label': 'Charles, Prince of Wales', 'Type': 'P', 'WikidataId': 'Q43274', 'Confidence': 1.0, 'OccurrenceOffsets': [28], 'SurfaceForms': ['Prince Charles']}, {'Label': 'Elizabeth II', 'Type': 'P', 'WikidataId': 'Q9682', 'Confidence': 0.97, 'OccurrenceOffsets': [11], 'SurfaceForms': ['Queen Elizabeth']}], 'abstract_entities': [], 'text': "The Brands Queen Elizabeth, Prince Charles, and Prince Philip Swear By Shop the notebooks, jackets, and more that the royals can't live without."}


In [None]:
from collections import defaultdict

def read_behaviors(path):
    """Load a headerless, tab-separated behaviors file and add parsed columns.

    Parameters
    ----------
    path : str or path-like
        File with five columns: imp_id, user_id, time, history, impressions.

    Returns
    -------
    pandas.DataFrame
        The raw columns plus:

        * ``hist_list`` - the whitespace-separated history tokens as a list
          (``[]`` when the field is empty).
        * ``impr_list`` - a list of ``(news_id, label)`` tuples, one per
          impression token. ``label`` is the integer after the last ``-``,
          or ``None`` when the token carries no label. ``[]`` when the field
          is empty.

    Raises
    ------
    ValueError
        If a label suffix is not an integer.
    """
    BEH_COLS = ["imp_id", "user_id", "time", "history", "impressions"]
    beh = pd.read_csv(path, sep="\t", header=None, names=BEH_COLS)

    def _is_blank(cell):
        # pandas reads an empty field as NaN; also treat "" as blank.
        return pd.isna(cell) or not cell

    def _parse_history(cell):
        return [] if _is_blank(cell) else cell.split()

    def _parse_impressions(cell):
        # Guard exactly like the history parser: an empty impressions field
        # arrives as NaN (a float) and has no .split().
        if _is_blank(cell):
            return []
        pairs = []
        for tok in cell.split():
            # Split on the LAST hyphen so an id that itself contains "-" does
            # not break the (news_id, label) unpacking.
            nid, sep, lab = tok.rpartition("-")
            if sep:
                pairs.append((nid, int(lab)))
            else:  # test split may have no labels
                pairs.append((tok, None))
        return pairs

    beh["hist_list"] = beh["history"].map(_parse_history)
    beh["impr_list"] = beh["impressions"].map(_parse_impressions)
    return beh

# Parse the training-split behaviors log.
train_beh = read_behaviors("../mind_data/MINDsmall_train/behaviors.tsv")
# The dev split loads the same way; currently disabled.
#dev_beh   = read_behaviors("../mind_data/MINDsmall_dev/behaviors.tsv")

# Every news id present in the news table (its index).
all_news_ids = set(news.index)
def collect_ids(beh):
    """Gather the distinct user ids and news ids referenced by a behaviors frame.

    Reads three columns of ``beh``: ``user_id``, ``hist_list`` (lists of news
    ids) and ``impr_list`` (lists of ``(news_id, label)`` pairs).

    Returns a ``(user_ids, news_ids)`` tuple of sets; ``news_ids`` is the union
    of ids seen in histories and in impressions.
    """
    user_ids = set(beh["user_id"].tolist())
    news_ids = set()
    for history in beh["hist_list"]:
        news_ids |= set(history)
    for impressions in beh["impr_list"]:
        news_ids |= {news_id for news_id, _ in impressions}
    return user_ids, news_ids

# Distinct user ids and news ids (from histories and impressions) in the training behaviors.
uids_tr, nids_tr = collect_ids(train_beh)



In [20]:
# Display the parsed training behaviors frame (raw columns plus hist_list / impr_list).
train_beh

Unnamed: 0,imp_id,user_id,time,history,impressions,hist_list,impr_list
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0,"[N55189, N42782, N34694, N45794, N18445, N6330...","[(N55689, 1), (N35729, 0)]"
1,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...,"[N31739, N6072, N63045, N23979, N35656, N43353...","[(N20678, 0), (N39317, 0), (N58114, 0), (N2049..."
2,3,U73700,11/14/2019 7:01:48 AM,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...,"[N10732, N25792, N7563, N21087, N41087, N5445,...","[(N50014, 0), (N23877, 0), (N35389, 0), (N4971..."
3,4,U34670,11/11/2019 5:28:05 AM,N45729 N2203 N871 N53880 N41375 N43142 N33013 ...,N35729-0 N33632-0 N49685-1 N27581-0,"[N45729, N2203, N871, N53880, N41375, N43142, ...","[(N35729, 0), (N33632, 0), (N49685, 1), (N2758..."
4,5,U8125,11/12/2019 4:11:21 PM,N10078 N56514 N14904 N33740,N39985-0 N36050-0 N16096-0 N8400-1 N22407-0 N6...,"[N10078, N56514, N14904, N33740]","[(N39985, 0), (N36050, 0), (N16096, 0), (N8400..."
...,...,...,...,...,...,...,...
156960,156961,U21593,11/14/2019 10:24:05 PM,N7432 N58559 N1954 N43353 N14343 N13008 N28833...,N2235-0 N22975-0 N64037-0 N47652-0 N11378-0 N4...,"[N7432, N58559, N1954, N43353, N14343, N13008,...","[(N2235, 0), (N22975, 0), (N64037, 0), (N47652..."
156961,156962,U10123,11/13/2019 6:57:04 AM,N9803 N104 N24462 N57318 N55743 N40526 N31726 ...,N3841-0 N61571-0 N58813-0 N28213-0 N4428-0 N25...,"[N9803, N104, N24462, N57318, N55743, N40526, ...","[(N3841, 0), (N61571, 0), (N58813, 0), (N28213..."
156962,156963,U75630,11/14/2019 10:58:13 AM,N29898 N59704 N4408 N9803 N53644 N26103 N812 N...,N55913-0 N62318-0 N53515-0 N10960-0 N9135-0 N5...,"[N29898, N59704, N4408, N9803, N53644, N26103,...","[(N55913, 0), (N62318, 0), (N53515, 0), (N1096..."
156963,156964,U44625,11/13/2019 2:57:02 PM,N4118 N47297 N3164 N43295 N6056 N38747 N42973 ...,N6219-0 N3663-0 N31147-0 N58363-0 N4107-0 N457...,"[N4118, N47297, N3164, N43295, N6056, N38747, ...","[(N6219, 0), (N3663, 0), (N31147, 0), (N58363,..."
