In [1]:
import pandas as pd
import random
from tqdm import tqdm
from collections import defaultdict

# 데이터 불러오기
BEHAVIOR_COLUMNS = ['ImpressionID', 'UserID', 'Time', 'History', 'Impressions']
train = pd.read_csv('download/MINDsmall_train/behaviors.tsv', sep='\t', names=BEHAVIOR_COLUMNS, header=0)
dev = pd.read_csv('download/MINDsmall_dev/behaviors.tsv', sep='\t', names=BEHAVIOR_COLUMNS, header=0)
behaviors_df = pd.concat([train, dev], ignore_index=True)

NEWS_COLUMNS = ['NewsID', 'PublishTime']
news_df = pd.read_csv("news_publish_time.tsv", sep='\t', names=NEWS_COLUMNS, header=0)


In [2]:
behaviors_df.to_csv('total_behaviors.tsv', sep='\t', index=False)

In [3]:
# impression에서 클릭한 뉴스 개수 확인

click_news_count = 0  

for impression_row in behaviors_df["Impressions"]:
    impression_list = impression_row.split()
    
    for impression in impression_list:
        news_id, label = impression.split("-")                           
        label = int(label)
        
        if label == 1:
            click_news_count += 1

print('impressions-1인 뉴스 개수: ', click_news_count)

impressions-1인 뉴스 개수:  347725


In [4]:
# user history 파일 생성

user_history = {}

for _, row in behaviors_df.iterrows():
    user = row['UserID']
    history = row['History']
    user_history[user] = history

history_df = pd.DataFrame(list(user_history.items()), columns=["UserID", "History"])

history_df.to_csv("user_history.tsv", sep="\t", index=False)


In [None]:
# 유저리스트 정리하기

user_list= []

for _, row in tqdm(behaviors_df.iterrows(), total=len(behaviors_df)):
    user = row['UserID']
    time = row['Time']
    impressions = str(row['Impressions']).split()
    imp_list = []
    
    for imp in impressions:
        news_id, label = imp.split('-')
        if int(label) == 1:
            imp_list.append(imp)

    user_list.append({
        'UserID':user,
        'Time': time,
        'ClickNews': imp_list
    })


100%|██████████| 230115/230115 [00:06<00:00, 35094.34it/s]


In [7]:
# 클릭한 유저와 뉴스들 수집 뉴스: 유저 아이디s

all_user_ids = set(user['UserID'] for user in user_list)

# 유저 ID -> 클릭한 뉴스 ID Set + Time 저장
user_click_info = {}
# 뉴스 ID -> 클릭한 유저 ID Set
news_click_users = defaultdict(set)

# 사전 정보 구성
for user in user_list:
    uid = user['UserID']
    click_news_ids = set(imp.split('-')[0] for imp in user['ClickNews'])
    user_click_info[uid] = {
        'click_set': click_news_ids,
        'time': user['Time']
    }
    for nid in click_news_ids:
        news_click_users[nid].add(uid)

In [8]:
# 사용자 중복없는 뉴스 리스트

news_list = []
impression_id = 1

for _, row in tqdm(news_df.iterrows(), total=len(news_df), desc="Generating Impressions"):
    news_id = row['NewsID']
    clicked_user_ids = news_click_users.get(news_id, set())
    other_user_ids = list(all_user_ids - clicked_user_ids)

    for uid in clicked_user_ids:
        clicked_user = f"{uid}-1"
        clicked_time = user_click_info[uid]['time']

        sampled_user_ids = random.sample(other_user_ids, min(20, len(other_user_ids)))
        sampled_non_clicks = [f"{nid}-0" for nid in sampled_user_ids]

        impression_users = [clicked_user] + sampled_non_clicks

        news_list.append({
            'ImpressionID': impression_id,
            'NewsID': news_id,
            'Time': clicked_time,
            'ImpressionUsers': impression_users
        })

        impression_id += 1

Generating Impressions: 100%|██████████| 22771/22771 [01:22<00:00, 277.69it/s]


In [9]:
df = pd.DataFrame(news_list)

df['ImpressionUsers'] = df['ImpressionUsers'].apply(lambda x: ' '.join(x))
df.to_csv('user_dataset.tsv', sep='\t', index=False)

In [10]:
import random
from collections import defaultdict
from tqdm import tqdm
import pandas as pd

# 전체 유저 집합
all_user_ids = set(user['UserID'] for user in user_list)

# 뉴스 ID -> 클릭한 (UserID, Time) 리스트 저장 (중복 클릭 포함)
news_click_users = defaultdict(list)

# 사전 정보 구성
for user in user_list:
    uid = user['UserID']
    time = user['Time']
    for imp in user['ClickNews']:
        if '-' in imp:  # 안전하게 split
            news_id, label = imp.split('-')
            if label == '1':
                news_click_users[news_id].append((uid, time))

# 중복 포함 Impression 리스트
overlap_news_list = []

# 고유하게 증가하는 ImpressionID
impression_id = 1

# tqdm 진행 표시와 함께 뉴스 순회
for _, row in tqdm(news_df.iterrows(), total=len(news_df), desc="Generating Overlap Impressions"):
    news_id = row['NewsID']
    clicked_user_info = news_click_users.get(news_id, [])

    # 클릭한 유저 ID들 추출 (set으로 중복 제거하여 negative sample 추출용)
    clicked_user_ids = set(uid for uid, _ in clicked_user_info)
    other_user_ids = list(all_user_ids - clicked_user_ids)

    for uid, clicked_time in clicked_user_info:
        clicked_user = f"{uid}-1"

        sampled_user_ids = random.sample(other_user_ids, min(20, len(other_user_ids)))
        sampled_non_clicks = [f"{nid}-0" for nid in sampled_user_ids]

        impression_users = [clicked_user] + sampled_non_clicks

        overlap_news_list.append({
            'ImpressionID': impression_id,
            'NewsID': news_id,
            'Time': clicked_time,
            'ImpressionUsers': impression_users
        })

        impression_id += 1

# DataFrame으로 변환 및 TSV 저장
df = pd.DataFrame(overlap_news_list)
df['ImpressionUsers'] = df['ImpressionUsers'].apply(lambda x: ' '.join(x))
df.to_csv('overlap_user_dataset.tsv', sep='\t', index=False)


Generating Overlap Impressions: 100%|██████████| 22771/22771 [01:22<00:00, 276.36it/s]


In [17]:
import pandas as pd

# TSV 파일 로딩
USER_COLUMNS = ['ImpressionID', 'NewsID', 'Time', 'ImpressionUsers']
user_data_df = pd.read_csv("user_dataset.tsv", sep='\t', names=USER_COLUMNS, header=0)

# 분석할 뉴스 ID
target_news_id = 'N55237'

# 해당 뉴스ID에 해당하는 모든 행 필터링
target_rows = user_data_df[user_data_df['NewsID'] == target_news_id]

if target_rows.empty:
    print(f"❌ 뉴스 {target_news_id} 없음")
else:
    clicked_users = set()
    non_clicked_users = set()

    # 여러 행의 ImpressionUsers 처리
    for users_str in target_rows['ImpressionUsers']:
        for u in str(users_str).split():
            if '-' not in u:
                continue
            uid, label = u.rsplit('-', 1)
            if label == '1':
                clicked_users.add(uid)
            elif label == '0':
                non_clicked_users.add(uid)

    # 교집합 확인
    overlap = clicked_users & non_clicked_users

    print(f"📰 뉴스 {target_news_id} 전체 행 수: {len(target_rows)}")
    print(f"✅ 클릭 유저 수: {len(clicked_users)}")
    print(f"🟦 미클릭 유저 수: {len(non_clicked_users)}")

    if overlap:
        print(f"⚠️ 중복 유저 있음 ({len(overlap)}명): {sorted(overlap)}")
    else:
        print("✅ 중복 없음")


📰 뉴스 N55237 전체 행 수: 1516
✅ 클릭 유저 수: 1516
🟦 미클릭 유저 수: 25836
✅ 중복 없음


In [19]:
import pandas as pd
from tqdm import tqdm  # <- 진행 바 표시용

# 파일 로딩
USER_COLUMNS = ['ImpressionID', 'NewsID', 'Time', 'ImpressionUsers']
NEWS_COLUMNS = ['NewsID', 'PublishTime']

user_data_df = pd.read_csv("user_dataset.tsv", sep='\t', names=USER_COLUMNS, header=0)
news_df = pd.read_csv("news_publish_time.tsv", sep='\t', names=NEWS_COLUMNS, header=0)

# 중복 결과 저장
overlap_results = []

# tqdm으로 뉴스ID별 진행 표시
for news_id in tqdm(news_df['NewsID'].unique(), desc="Checking NewsID Overlaps"):
    target_rows = user_data_df[user_data_df['NewsID'] == news_id]

    if target_rows.empty:
        continue

    clicked_users = set()
    non_clicked_users = set()

    for users_str in target_rows['ImpressionUsers']:
        for u in str(users_str).split():
            if '-' not in u: continue
            uid, label = u.rsplit('-', 1)
            if label == '1':
                clicked_users.add(uid)
            elif label == '0':
                non_clicked_users.add(uid)

    overlap = clicked_users & non_clicked_users
    if overlap:
        overlap_results.append({
            'NewsID': news_id,
            'ClickedUsers': len(clicked_users),
            'NonClickedUsers': len(non_clicked_users),
            'OverlapCount': len(overlap),
            'OverlapUsers': sorted(overlap)
        })

# 결과 출력
if overlap_results:
    print(f"\n⚠️ 중복 유저가 있는 뉴스 개수: {len(overlap_results)}")
    for result in overlap_results:
        print(f"News {result['NewsID']} - 중복 {result['OverlapCount']}명")
else:
    print("\n✅ 모든 뉴스에서 중복 없음")


Checking NewsID Overlaps: 100%|██████████| 22771/22771 [04:06<00:00, 92.20it/s]


✅ 모든 뉴스에서 중복 없음





In [11]:
345187
73152
156964

156964