In [None]:
# MIND small: map every NewsID to the UserIDs that clicked it.

from collections import defaultdict

import pandas as pd

# Column names assigned to the MIND TSV files when they are read below.
NEWS_COLUMNS = ['NewsID', 'Category', 'SubCategory', 'Title', 'Abstract', 'URL', 'TitleEntities', 'AbstractEntities']
BEHAVIOR_COLUMNS = ['ImpressionID', 'UserID', 'Time', 'History', 'Impressions']


def collect_news_clicks(user_ids, impressions):
    """Aggregate clicked impressions into a NewsID -> {UserID} mapping.

    Args:
        user_ids: Iterable of user identifiers, one per behavior row.
        impressions: Iterable (parallel to ``user_ids``) of impression logs,
            each a whitespace-separated string of ``<NewsID>-<label>`` items.

    Returns:
        A ``(news_clicks, total_impressions)`` tuple. ``news_clicks`` is a
        ``defaultdict(set)`` holding, for each news ID, the users that have
        at least one item labelled ``1``. ``total_impressions`` is the number
        of impression items seen across all rows.
    """
    news_clicks = defaultdict(set)
    total_impressions = 0
    for user_id, impression_log in zip(user_ids, impressions):
        # A missing log is read by pandas as NaN (a float); nothing to parse.
        if not isinstance(impression_log, str):
            continue
        items = impression_log.split()
        total_impressions += len(items)
        for item in items:
            # Split on the LAST dash so an ID containing '-' stays intact.
            news_id, sep, clicked = item.rpartition('-')
            # Items without a label carry no click information; skip them.
            if sep and clicked == '1':
                news_clicks[news_id].add(user_id)
    return news_clicks, total_impressions


def build_click_records(all_news_ids, news_clicks):
    """Build one output row per news ID, sorted by news ID.

    Args:
        all_news_ids: Iterable of every news ID that must appear in the output.
        news_clicks: Mapping of news ID -> collection of user IDs that clicked.

    Returns:
        A list of ``{"NewsID": ..., "UserIDs": ...}`` dicts, where ``UserIDs``
        is the comma-joined sorted user IDs (empty string when nobody clicked).
    """
    return [
        # .get() avoids inserting empty sets into a defaultdict on lookup.
        {"NewsID": news_id, "UserIDs": ','.join(sorted(news_clicks.get(news_id, ())))}
        for news_id in sorted(all_news_ids)
    ]


# In a notebook or a direct script run __name__ is "__main__", so the pipeline
# executes as before; the guard only lets the helpers above be imported
# without touching the dataset files.
if __name__ == "__main__":
    # 1. Collect every news ID (train + dev)
    train_news = pd.read_csv('download/MINDsmall_train/news.tsv', sep='\t', names=NEWS_COLUMNS)
    dev_news = pd.read_csv('download/MINDsmall_dev/news.tsv', sep='\t', names=NEWS_COLUMNS)
    all_news_ids = set(train_news['NewsID']) | set(dev_news['NewsID'])

    # 2. Clicked news -> user mapping (train + dev behaviors)
    train_behaviors = pd.read_csv('download/MINDsmall_train/behaviors.tsv', sep='\t', names=BEHAVIOR_COLUMNS)
    dev_behaviors = pd.read_csv('download/MINDsmall_dev/behaviors.tsv', sep='\t', names=BEHAVIOR_COLUMNS)
    combined_behaviors = pd.concat([train_behaviors, dev_behaviors], ignore_index=True)

    all_user_ids = set(combined_behaviors['UserID'])
    # Single pass over the two needed columns: avoids the per-row Series that
    # iterrows() builds and the fully exploded token Series used only to count.
    news_clicks, total_impressions = collect_news_clicks(
        combined_behaviors['UserID'], combined_behaviors['Impressions']
    )

    # Print statistics
    print(f"총 뉴스 ID 수 (train+dev): {len(all_news_ids)}")
    print(f"총 사용자 수: {len(all_user_ids)}")
    print(f"총 Impression 수: {total_impressions}")

    # 3. Arrange the results over all news IDs
    records = build_click_records(all_news_ids, news_clicks)

    # 4. Save as CSV (explicit columns keep the header even with no records)
    output_df = pd.DataFrame(records, columns=["NewsID", "UserIDs"])
    output_df.to_csv("MINDsmall.csv", index=False)


총 뉴스 ID 수 (train+dev): 65238
총 사용자 수: 94057
총 Impression 수: 8584442


In [None]:
# MIND large: map every NewsID to the UserIDs that clicked it.

from collections import defaultdict

import pandas as pd

# Column names assigned to the MIND TSV files when they are read below.
NEWS_COLUMNS = ['NewsID', 'Category', 'SubCategory', 'Title', 'Abstract', 'URL', 'TitleEntities', 'AbstractEntities']
BEHAVIOR_COLUMNS = ['ImpressionID', 'UserID', 'Time', 'History', 'Impressions']


def collect_news_clicks(user_ids, impressions):
    """Aggregate clicked impressions into a NewsID -> {UserID} mapping.

    Args:
        user_ids: Iterable of user identifiers, one per behavior row.
        impressions: Iterable (parallel to ``user_ids``) of impression logs,
            each a whitespace-separated string of ``<NewsID>-<label>`` items.

    Returns:
        A ``(news_clicks, total_impressions)`` tuple. ``news_clicks`` is a
        ``defaultdict(set)`` holding, for each news ID, the users that have
        at least one item labelled ``1``. ``total_impressions`` is the number
        of impression items seen across all rows.
    """
    news_clicks = defaultdict(set)
    total_impressions = 0
    for user_id, impression_log in zip(user_ids, impressions):
        # A missing log is read by pandas as NaN (a float); nothing to parse.
        if not isinstance(impression_log, str):
            continue
        items = impression_log.split()
        total_impressions += len(items)
        for item in items:
            # Split on the LAST dash so an ID containing '-' stays intact.
            news_id, sep, clicked = item.rpartition('-')
            # Items without a label carry no click information; skip them.
            if sep and clicked == '1':
                news_clicks[news_id].add(user_id)
    return news_clicks, total_impressions


def build_click_records(all_news_ids, news_clicks):
    """Build one output row per news ID, sorted by news ID.

    Args:
        all_news_ids: Iterable of every news ID that must appear in the output.
        news_clicks: Mapping of news ID -> collection of user IDs that clicked.

    Returns:
        A list of ``{"NewsID": ..., "UserIDs": ...}`` dicts, where ``UserIDs``
        is the comma-joined sorted user IDs (empty string when nobody clicked).
    """
    return [
        # .get() avoids inserting empty sets into a defaultdict on lookup.
        {"NewsID": news_id, "UserIDs": ','.join(sorted(news_clicks.get(news_id, ())))}
        for news_id in sorted(all_news_ids)
    ]


# In a notebook or a direct script run __name__ is "__main__", so the pipeline
# executes as before; the guard only lets the helpers above be imported
# without touching the dataset files.
if __name__ == "__main__":
    # 1. Collect every news ID (train + dev)
    train_news = pd.read_csv('download/MINDlarge_train/news.tsv', sep='\t', names=NEWS_COLUMNS)
    dev_news = pd.read_csv('download/MINDlarge_dev/news.tsv', sep='\t', names=NEWS_COLUMNS)
    all_news_ids = set(train_news['NewsID']) | set(dev_news['NewsID'])

    # 2. Clicked news -> user mapping (train + dev behaviors)
    train_behaviors = pd.read_csv('download/MINDlarge_train/behaviors.tsv', sep='\t', names=BEHAVIOR_COLUMNS)
    dev_behaviors = pd.read_csv('download/MINDlarge_dev/behaviors.tsv', sep='\t', names=BEHAVIOR_COLUMNS)
    combined_behaviors = pd.concat([train_behaviors, dev_behaviors], ignore_index=True)

    all_user_ids = set(combined_behaviors['UserID'])
    # Single pass over the two needed columns: avoids the per-row Series that
    # iterrows() builds and the fully exploded token Series used only to count.
    news_clicks, total_impressions = collect_news_clicks(
        combined_behaviors['UserID'], combined_behaviors['Impressions']
    )

    # Print statistics
    print(f"총 뉴스 ID 수 (train+dev): {len(all_news_ids)}")
    print(f"총 사용자 수: {len(all_user_ids)}")
    print(f"총 Impression 수: {total_impressions}")

    # 3. Arrange the results over all news IDs
    records = build_click_records(all_news_ids, news_clicks)

    # 4. Save as CSV (explicit columns keep the header even with no records)
    output_df = pd.DataFrame(records, columns=["NewsID", "UserIDs"])
    output_df.to_csv("MINDlarge.csv", index=False)



총 뉴스 ID 수 (train+dev): 104151
총 사용자 수: 750434
총 Impression 수: 97592931
