In [2]:
# train 유저 데이터

import pandas as pd
from tqdm import tqdm

# 데이터 불러오기
BEHAVIOR_COLUMNS = ['ImpressionID', 'UserID', 'Time', 'History', 'Impressions']
train_behaviors_df = pd.read_csv('download/MINDsmall_train/behaviors.tsv', sep='\t', names=BEHAVIOR_COLUMNS, header=0)

NEWS_COLUMNS = ['NewsID', 'PublishTime']
news_df = pd.read_csv("news_publish_time.tsv", sep='\t', names=NEWS_COLUMNS, header=0)


In [3]:
# user history 파일 생성

user_history = {}

for _, row in train_behaviors_df.iterrows():
    user = row['UserID']
    history = row['History']
    user_history[user] = history

history_df = pd.DataFrame(list(user_history.items()), columns=["UserID", "History"])

history_df.to_csv("train_user_history.tsv", sep="\t", index=False, header=False)


In [4]:
# train impression에서 클릭한 뉴스 개수 확인

click_news_count = 0  

for impression_row in train_behaviors_df["Impressions"]:
    impression_list = impression_row.split()
    
    for impression in impression_list:
        news_id, label = impression.split("-")                           
        label = int(label)
        
        if label == 1:
            click_news_count += 1

print('train impressions-1인 뉴스 개수: ', click_news_count)

train impressions-1인 뉴스 개수:  236343


In [4]:
# train 유저리스트 정리하기

train_user_list= []

for _, row in tqdm(train_behaviors_df.iterrows(), total=len(train_behaviors_df)):
    user = row['UserID']
    time = row['Time']
    impressions = str(row['Impressions']).split()
    imp_list = []
    
    for imp in impressions:
        news_id, label = imp.split('-')
        if int(label) == 1:
            imp_list.append(imp)

    train_user_list.append({
        'UserID':user,
        'Time': time,
        'ClickNews': imp_list
    })


100%|██████████| 156964/156964 [00:04<00:00, 35120.15it/s]


In [6]:
# train 사용자 중복 유저 데이터터

import random
from collections import defaultdict
from tqdm import tqdm

# 전체 유저 집합
all_user_ids = set(user['UserID'] for user in train_user_list)

# 뉴스 ID -> 클릭한 (UserID, Time) 리스트 저장 (중복 클릭 포함)
news_click_users = defaultdict(list)

# 사전 정보 구성
for user in train_user_list:
    uid = user['UserID']
    time = user['Time']
    for imp in user['ClickNews']:
        if '-' in imp:  # 안전하게 split
            news_id, label = imp.split('-')
            if label == '1':
                news_click_users[news_id].append((uid, time))

# 중복 포함 Impression 리스트
train_overlap_news_list = []

# 고유하게 증가하는 ImpressionID
impression_id = 1

# tqdm 진행 표시와 함께 뉴스 순회
for _, row in tqdm(news_df.iterrows(), total=len(news_df), desc="Generating Overlap Impressions"):
    news_id = row['NewsID']
    clicked_user_info = news_click_users.get(news_id, [])

    # 클릭한 유저 ID들 추출 (set으로 중복 제거하여 negative sample 추출용)
    clicked_user_ids = set(uid for uid, _ in clicked_user_info)
    other_user_ids = list(all_user_ids - clicked_user_ids)

    for uid, clicked_time in clicked_user_info:
        clicked_user = f"{uid}-1"

        sampled_user_ids = random.sample(other_user_ids, min(20, len(other_user_ids)))
        sampled_non_clicks = [f"{nid}-0" for nid in sampled_user_ids]

        impression_users = [clicked_user] + sampled_non_clicks
        random.shuffle(impression_users)

        train_overlap_news_list.append({
            'ImpressionID': impression_id,
            'NewsID': news_id,
            'Time': clicked_time,
            'ImpressionUsers': impression_users
        })

        impression_id += 1

# DataFrame으로 변환 및 TSV 저장
df = pd.DataFrame(train_overlap_news_list)
df['ImpressionUsers'] = df['ImpressionUsers'].apply(lambda x: ' '.join(x))
df.to_csv('２_train_user_dataset.tsv', sep='\t', index=False, header=False)


Generating Overlap Impressions: 100%|██████████| 22771/22771 [00:40<00:00, 560.26it/s]


In [5]:
# train 사용자 중복 유저 데이터터

import random
from collections import defaultdict
from tqdm import tqdm

# 전체 유저 집합
all_user_ids = set(user['UserID'] for user in train_user_list)

# 뉴스 ID -> 클릭한 (UserID, Time) 리스트 저장 (중복 클릭 포함)
news_click_users = defaultdict(list)

# 사전 정보 구성
for user in train_user_list:
    uid = user['UserID']
    time = user['Time']
    for imp in user['ClickNews']:
        if '-' in imp:  # 안전하게 split
            news_id, label = imp.split('-')
            if label == '1':
                news_click_users[news_id].append((uid, time))

# 중복 포함 Impression 리스트
train_overlap_news_list = []

# 고유하게 증가하는 ImpressionID
impression_id = 1

# tqdm 진행 표시와 함께 뉴스 순회
for _, row in tqdm(news_df.iterrows(), total=len(news_df), desc="Generating Overlap Impressions"):
    news_id = row['NewsID']
    clicked_user_info = news_click_users.get(news_id, [])

    # 클릭한 유저 ID들 추출 (set으로 중복 제거하여 negative sample 추출용)
    clicked_user_ids = set(uid for uid, _ in clicked_user_info)
    other_user_ids = list(all_user_ids - clicked_user_ids)

    for uid, clicked_time in clicked_user_info:
        clicked_user = f"{uid}-1"

        sampled_user_ids = random.sample(other_user_ids, min(20, len(other_user_ids)))
        sampled_non_clicks = [f"{nid}-0" for nid in sampled_user_ids]

        impression_users = [clicked_user] + sampled_non_clicks
        random.shuffle(impression_users)

        train_overlap_news_list.append({
            'ImpressionID': impression_id,
            'NewsID': news_id,
            'Time': clicked_time,
            'ImpressionUsers': impression_users
        })

        impression_id += 1

# DataFrame으로 변환 및 TSV 저장
df = pd.DataFrame(train_overlap_news_list)
df['ImpressionUsers'] = df['ImpressionUsers'].apply(lambda x: ' '.join(x))
df.to_csv('3_train_user_dataset.tsv', sep='\t', index=False, header=False)


Generating Overlap Impressions: 100%|██████████| 22771/22771 [00:42<00:00, 531.65it/s]


In [6]:
# dev 유저 데이터

import pandas as pd
from tqdm import tqdm

# 데이터 불러오기
BEHAVIOR_COLUMNS = ['ImpressionID', 'UserID', 'Time', 'History', 'Impressions']
dev_behaviors_df = pd.read_csv('download/MINDsmall_dev/behaviors.tsv', sep='\t', names=BEHAVIOR_COLUMNS, header=0)

NEWS_COLUMNS = ['NewsID', 'PublishTime']
news_df = pd.read_csv("news_publish_time.tsv", sep='\t', names=NEWS_COLUMNS, header=0)


In [8]:
# user history 파일 생성

user_history = {}

for _, row in dev_behaviors_df.iterrows():
    user = row['UserID']
    history = row['History']
    user_history[user] = history

history_df = pd.DataFrame(list(user_history.items()), columns=["UserID", "History"])

history_df.to_csv("dev_user_history.tsv", sep="\t", index=False, header=False)


In [9]:
# dev impression에서 클릭한 뉴스 개수 확인

click_news_count = 0  

for impression_row in dev_behaviors_df["Impressions"]:
    impression_list = impression_row.split()
    
    for impression in impression_list:
        news_id, label = impression.split("-")                           
        label = int(label)
        
        if label == 1:
            click_news_count += 1

print('train impressions-1인 뉴스 개수: ', click_news_count)

train impressions-1인 뉴스 개수:  111382


In [7]:
# dev 유저리스트 정리하기

dev_user_list= []

for _, row in tqdm(dev_behaviors_df.iterrows(), total=len(dev_behaviors_df)):
    user = row['UserID']
    time = row['Time']
    impressions = str(row['Impressions']).split()
    imp_list = []
    
    for imp in impressions:
        news_id, label = imp.split('-')
        if int(label) == 1:
            imp_list.append(imp)

    dev_user_list.append({
        'UserID':user,
        'Time': time,
        'ClickNews': imp_list
    })


100%|██████████| 73151/73151 [00:02<00:00, 31837.05it/s]


In [11]:
# dev 사용자 중복 유저 데이터터

import random
from collections import defaultdict
from tqdm import tqdm

# 전체 유저 집합
all_user_ids = set(user['UserID'] for user in dev_user_list)

# 뉴스 ID -> 클릭한 (UserID, Time) 리스트 저장 (중복 클릭 포함)
news_click_users = defaultdict(list)

# 사전 정보 구성
for user in dev_user_list:
    uid = user['UserID']
    time = user['Time']
    for imp in user['ClickNews']:
        if '-' in imp:  # 안전하게 split
            news_id, label = imp.split('-')
            if label == '1':
                news_click_users[news_id].append((uid, time))

# 중복 포함 Impression 리스트
dev_overlap_news_list = []

# 고유하게 증가하는 ImpressionID
impression_id = 1

# tqdm 진행 표시와 함께 뉴스 순회
for _, row in tqdm(news_df.iterrows(), total=len(news_df), desc="Generating Overlap Impressions"):
    news_id = row['NewsID']
    clicked_user_info = news_click_users.get(news_id, [])

    # 클릭한 유저 ID들 추출 (set으로 중복 제거하여 negative sample 추출용)
    clicked_user_ids = set(uid for uid, _ in clicked_user_info)
    other_user_ids = list(all_user_ids - clicked_user_ids)

    for uid, clicked_time in clicked_user_info:
        clicked_user = f"{uid}-1"

        sampled_user_ids = random.sample(other_user_ids, min(20, len(other_user_ids)))
        sampled_non_clicks = [f"{nid}-0" for nid in sampled_user_ids]

        impression_users = [clicked_user] + sampled_non_clicks
        random.shuffle(impression_users)

        dev_overlap_news_list.append({
            'ImpressionID': impression_id,
            'NewsID': news_id,
            'Time': clicked_time,
            'ImpressionUsers': impression_users
        })

        impression_id += 1

# DataFrame으로 변환 및 TSV 저장
df = pd.DataFrame(dev_overlap_news_list)
df['ImpressionUsers'] = df['ImpressionUsers'].apply(lambda x: ' '.join(x))
df.to_csv('２_dev_user_dataset.tsv', sep='\t', index=False, header=False)


Generating Overlap Impressions: 100%|██████████| 22771/22771 [00:38<00:00, 588.98it/s]


In [8]:
# dev 사용자 중복 유저 데이터 (중복 유지 버전)

import random
from collections import defaultdict
import pandas as pd
from tqdm import tqdm

NEG_PER_IMP = 20
RNG_SEED = 42
random.seed(RNG_SEED)

# 전체 유저 집합
all_user_ids = set(user['UserID'] for user in dev_user_list)

# 뉴스 ID -> 클릭한 (UserID, Time) 리스트 저장 (중복 클릭 포함)
news_click_users = defaultdict(list)

# 사전 정보 구성
for user in dev_user_list:
    uid = user['UserID']
    time = user['Time']
    for imp in user['ClickNews']:
        if '-' in imp:  # 안전 split
            news_id, label = imp.split('-')
            if label == '1':
                # ✅ 중복 유지: 그대로 append
                news_click_users[news_id].append((uid, time))

# 중복 포함 Impression 리스트
dev_overlap_news_list = []
impression_id = 1

# tqdm 진행 표시와 함께 뉴스 순회
for _, row in tqdm(news_df.iterrows(), total=len(news_df), desc="Generating Overlap Impressions (dev, keep dup)"):
    news_id = row['NewsID']
    clicked_user_info = news_click_users.get(news_id, [])

    # ✅ set으로 고유화하지 않음(중복 유지)
    for uid, clicked_time in clicked_user_info:
        clicked_user = f"{uid}-1"

        # ✅ 네거티브 풀: 현재 양성 유저 1명만 제외
        neg_pool = list(all_user_ids - {uid})
        k = min(NEG_PER_IMP, len(neg_pool))
        sampled_user_ids = random.sample(neg_pool, k)
        sampled_non_clicks = [f"{nid}-0" for nid in sampled_user_ids]

        impression_users = [clicked_user] + sampled_non_clicks
        random.shuffle(impression_users)

        dev_overlap_news_list.append({
            'ImpressionID': impression_id,
            'NewsID': news_id,
            'Time': clicked_time,
            'ImpressionUsers': impression_users
        })
        impression_id += 1

# DataFrame으로 변환 및 TSV 저장
df = pd.DataFrame(dev_overlap_news_list)
df['ImpressionUsers'] = df['ImpressionUsers'].apply(lambda x: ' '.join(x))
df.to_csv('3_dev_user_dataset.tsv', sep='\t', index=False, header=False)


Generating Overlap Impressions (dev, keep dup): 100%|██████████| 22771/22771 [02:54<00:00, 130.16it/s] 
