In [28]:
# impressions의 게시된 시간 추출

import pandas as pd
from datetime import datetime

BEHAVIOR_COLUMNS = ['ImpressionID', 'UserID', 'Time', 'History', 'Impressions']
DATE_FORMAT = "%m/%d/%Y %I:%M:%S %p"

def load_behaviors(train_path: str, dev_path: str) -> pd.DataFrame:
    train = pd.read_csv(train_path, sep='\t', names=BEHAVIOR_COLUMNS, header=0)
    dev = pd.read_csv(dev_path, sep='\t', names=BEHAVIOR_COLUMNS, header=0)
    return pd.concat([train, dev], ignore_index=True)

def extract_earliest_times(df: pd.DataFrame) -> pd.DataFrame:
    df = df.dropna(subset=['Impressions', 'Time'])
    df['Time'] = pd.to_datetime(df['Time'], format=DATE_FORMAT, errors='coerce')
    df = df.dropna(subset=['Time'])

    # 뉴스 ID만 추출 (숫자 부분 제거)
    df['ImpressionList'] = df['Impressions'].str.split().apply(
        lambda imps: [imp.split('-')[0] for imp in imps if '-' in imp and imp.split('-')[0] != '']
    )

    exploded = df[['Time', 'ImpressionList']].explode('ImpressionList')
    exploded.rename(columns={'ImpressionList': 'news_id'}, inplace=True)

    result = exploded.groupby('news_id', as_index=False)['Time'].min().rename(columns={'Time': 'publish_time'})
    print("[결과 데이터] 상위 5개:")
    print(result.head())
    return result

def save_to_csv(df: pd.DataFrame, path: str):
    df.to_csv(path, sep='\t', index=False)

if __name__ == '__main__':
    behaviors = load_behaviors(
        'download/MINDsmall_train/behaviors.tsv',
        'download/MINDsmall_dev/behaviors.tsv'
    )
    result = extract_earliest_times(behaviors)
    save_to_csv(result, 'news_publish_times.tsv')


[결과 데이터] 상위 5개:
  news_id        publish_time
0  N10005 2019-11-09 09:20:40
1  N10007 2019-11-14 08:46:12
2  N10008 2019-11-15 12:28:50
3  N10010 2019-11-13 10:33:08
4  N10011 2019-11-11 04:28:37


In [49]:
import pandas as pd
from tqdm import tqdm
from collections import defaultdict

# 데이터 불러오기
BEHAVIOR_COLUMNS = ['ImpressionID', 'UserID', 'Time', 'History', 'Impressions']
train = pd.read_csv('download/MINDsmall_train/behaviors.tsv', sep='\t', names=BEHAVIOR_COLUMNS, header=0)
dev = pd.read_csv('download/MINDsmall_dev/behaviors.tsv', sep='\t', names=BEHAVIOR_COLUMNS, header=0)
behaviors_df = pd.concat([train, dev], ignore_index=True)

behaviors_df['Time'] = pd.to_datetime(behaviors_df['Time'])

news_time = defaultdict(list)

for _, row in tqdm(behaviors_df.iterrows(), total=len(behaviors_df)):
    time = row['Time']
    impressions = str(row['Impressions']).split()
    for imp in impressions:
        news_id = imp.split('-')[0] 
        news_time[news_id].append(time)

publish_time = {}

for news, times in news_time.items():
    publish_time[news] = min(times)

publish_df = pd.DataFrame([
    {'news_id': news_id, 'publish_time': time.strftime('%m/%d/%Y %I:%M:%S %p')}
    for news_id, time in publish_time.items()
])

publish_df.to_csv('news_publish_time.tsv', sep='\t', index=False)


  behaviors_df['Time'] = pd.to_datetime(behaviors_df['Time'])
100%|██████████| 230115/230115 [00:05<00:00, 39810.13it/s]


In [None]:
# filter_expand_news.py
import pandas as pd
import random
from datetime import datetime, timedelta

BEHAVIOR_COLUMNS = ['ImpressionID', 'UserID', 'Time', 'History', 'Impressions']
DATE_FORMAT = "%m/%d/%Y %I:%M:%S %p"

if __name__ == '__main__':
    # 1. 뉴스별 publish_time 로드 및 end_time 계산
    publish_df = pd.read_csv('news_publish_times.csv')
    publish_df['publish_time'] = pd.to_datetime(publish_df['publish_time'])
    publish_df['end_time'] = publish_df['publish_time'] + timedelta(hours=24)

    # 2. expanded_behaviors.csv 로드
    expanded_df = pd.read_csv('expanded_behaviors.csv', sep='\t')
    expanded_df['Time'] = pd.to_datetime(expanded_df['Time'], format=DATE_FORMAT, errors='coerce')
    expanded_df.dropna(subset=['Time'], inplace=True)

    # 3. 각 row별로 end_time > Time 인 news_id만 필터링하여 20개 랜덤 샘플링
    for i in range(len(expanded_df)):
        row_time = expanded_df.at[i, 'Time']

        valid_news_df = publish_df[publish_df['end_time'] > row_time]
        valid_news = valid_news_df['news_id'].tolist()

        if len(valid_news) < 20:
            sampled = valid_news
        else:
            sampled = random.sample(valid_news, 20)

        # 기존 Impressions + 샘플 추가
        old_val = str(expanded_df.at[i, 'Impressions'])
        new_val = f"{old_val} {' '.join([f'{nid}-0' for nid in sampled])}"
        expanded_df.at[i, 'Impressions'] = new_val

    # 4. 저장
    expanded_df.to_csv('news_dataset.csv', sep='\t', index=False)


In [26]:
import pandas as pd
from datetime import datetime, timedelta
import numpy as np
from tqdm import tqdm

# tqdm을 pandas의 apply에서 사용 가능하도록 설정
tqdm.pandas()

# 1. 학습 및 검증 behaviors 데이터 불러오기
train_behaviors = pd.read_csv(
    'download/MINDsmall_train/behaviors.tsv',
    sep='\t',
    names=['ImpressionID', 'UserID', 'Time', 'History', 'Impressions']
)

dev_behaviors = pd.read_csv(
    'download/MINDsmall_dev/behaviors.tsv',
    sep='\t',
    names=['ImpressionID', 'UserID', 'Time', 'History', 'Impressions']
)

# 2. 학습/검증 데이터를 하나의 DataFrame으로 통합
df = pd.concat([train_behaviors, dev_behaviors], ignore_index=True)

# 3. 클릭한 뉴스와 원래 행동 정보 분리 저장
user_behavior_rows = []
zero_label_news_ids = set()

for _, row in df.iterrows():
    impression_id, user_id, time_str, history, impressions = row
    time = datetime.strptime(time_str, "%m/%d/%Y %I:%M:%S %p")
    click_news = []
    for item in impressions.split():
        news_id, label = item.split('-')
        if label == '1':
            click_news.append(f"{news_id}-1")
        elif label == '0':
            zero_label_news_ids.add(news_id)

    for clicked in click_news:
        user_behavior_rows.append({
            'ImpressionID': impression_id,
            'UserID': user_id,
            'Time': time,
            'History': history,
            'clicked_news': clicked
        })

# 4. -0 뉴스는 추천된 시각과 함께 저장
zero_label_records = []
for _, row in df.iterrows():
    time = datetime.strptime(row['Time'], "%m/%d/%Y %I:%M:%S %p")
    for item in row['Impressions'].split():
        news_id, label = item.split('-')
        if label == '0':
            zero_label_records.append({'NewsID': news_id, 'Time': time})

zero_df = pd.DataFrame(zero_label_records)

# 5. 뉴스 발행 시간 불러오기 및 datetime 변환
news_df = pd.read_csv('news_publish_times.csv')
news_df['publish_time'] = pd.to_datetime(news_df['publish_time'])
news_publish_map = dict(zip(news_df['news_id'], news_df['publish_time']))

# 6. (제거됨) 각 뉴스 ID 기준 최신 행동 시간 계산 불필요 → 바로 row['Time'] 활용 가능
def within_24_hours(row):
    pub_time = news_publish_map.get(row['NewsID'])
    return pub_time is not None and row['Time'] <= pub_time + timedelta(hours=24)

# tqdm 적용하여 필터링 처리 진행률 확인
zero_df['valid'] = zero_df.progress_apply(within_24_hours, axis=1)
zero_df_filtered = zero_df[zero_df['valid']].drop(columns='valid')

# 8. 클릭 뉴스와 -0 뉴스 20개를 묶어서 하나의 impressions 필드로 생성
expanded_rows = []
for row in tqdm(user_behavior_rows, desc='Generating impressions'):
    sampled = zero_df_filtered.sample(n=min(20, len(zero_df_filtered)), random_state=None)
    zero_imps = " ".join([f"{nid}-0" for nid in sampled['NewsID']])
    full_impression = f"{row['clicked_news']} {zero_imps}"
    expanded_rows.append({
        'ImpressionID': row['ImpressionID'],
        'UserID': row['UserID'],
        'Time': row['Time'],
        'History': row['History'],
        'impressions': full_impression
    })

# 9. 최종 결과 DataFrame 생성
expanded_df = pd.DataFrame(expanded_rows)


100%|██████████| 8236715/8236715 [01:02<00:00, 131236.33it/s]
Generating impressions:   9%|▊         | 30323/347727 [1:03:15<11:02:13,  7.99it/s]


KeyboardInterrupt: 

In [27]:
import pandas as pd
from datetime import datetime, timedelta
import numpy as np
from tqdm import tqdm
import random

# tqdm을 pandas의 apply에서 사용 가능하도록 설정
tqdm.pandas()

# 1. 학습 및 검증 behaviors 데이터 불러오기
train_behaviors = pd.read_csv(
    'download/MINDsmall_train/behaviors.tsv',
    sep='\t',
    names=['ImpressionID', 'UserID', 'Time', 'History', 'Impressions']
)

dev_behaviors = pd.read_csv(
    'download/MINDsmall_dev/behaviors.tsv',
    sep='\t',
    names=['ImpressionID', 'UserID', 'Time', 'History', 'Impressions']
)

# 2. 학습/검증 데이터를 하나의 DataFrame으로 통합
df = pd.concat([train_behaviors, dev_behaviors], ignore_index=True)

# 3. 클릭한 뉴스와 원래 행동 정보 분리 저장 + -0 뉴스 ID 수집
user_behavior_rows = []
zero_label_news_ids = set()

for _, row in df.iterrows():
    impression_id, user_id, time_str, history, impressions = row
    time = datetime.strptime(time_str, "%m/%d/%Y %I:%M:%S %p")
    click_news = []
    for item in impressions.split():
        news_id, label = item.split('-')
        if label == '1':
            click_news.append(f"{news_id}-1")
        elif label == '0':
            zero_label_news_ids.add(news_id)

    for clicked in click_news:
        user_behavior_rows.append({
            'ImpressionID': impression_id,
            'UserID': user_id,
            'Time': time,
            'History': history,
            'clicked_news': clicked
        })

# 4. 뉴스 발행 시간 불러오기 및 datetime 변환
news_df = pd.read_csv('news_publish_times.csv')
news_df['publish_time'] = pd.to_datetime(news_df['publish_time'])
news_publish_map = dict(zip(news_df['news_id'], news_df['publish_time']))

# 5. 함수: 특정 시간 기준으로 24시간 내 뉴스만 반환
def get_valid_zero_news_within_24h(current_time):
    return [
        nid for nid in zero_label_news_ids
        if (
            nid in news_publish_map and
            news_publish_map[nid] <= current_time <= news_publish_map[nid] + timedelta(hours=24)
        )
    ]

# 클릭 뉴스와 -0 뉴스 20개를 묶어서 하나의 impressions 필드로 생성
expanded_rows = []
for row in tqdm(user_behavior_rows, desc='Generating impressions'):
    time = row['Time']
    candidate_news = get_valid_zero_news_within_24h(time)
    sampled = random.sample(candidate_news, k=min(20, len(candidate_news)))
    zero_imps = " ".join([f"{nid}-0" for nid in sampled])
    full_impression = f"{row['clicked_news']} {zero_imps}"

    expanded_rows.append({
        'ImpressionID': row['ImpressionID'],
        'UserID': row['UserID'],
        'Time': row['Time'],
        'History': row['History'],
        'impressions': full_impression
    })

# 6. 최종 결과 DataFrame 생성
expanded_df = pd.DataFrame(expanded_rows)


Generating impressions: 100%|██████████| 347727/347727 [6:08:44<00:00, 15.72it/s]  


In [23]:
# -1 레이블이 2개 이상인 행을 찾는다
def count_label_1(imps):
    return sum(1 for x in imps.split() if x.endswith('-1'))

multi_label1_rows = expanded_df[expanded_df['impressions'].apply(count_label_1) > 1]
print(f"❗ -1 뉴스가 2개 이상인 행 수: {len(multi_label1_rows)}")


❗ -1 뉴스가 2개 이상인 행 수: 0


In [24]:
# impressions에서 -0만 추출
def extract_label_0(imps):
    return [x for x in imps.split() if x.endswith('-0')]

# 모든 행이 정확히 20개의 -0 뉴스를 가지고 있는지 확인
expanded_df['num_label_0'] = expanded_df['impressions'].apply(lambda x: len(extract_label_0(x)))
print("🧪 -0 뉴스 개수 통계:")
print(expanded_df['num_label_0'].value_counts().sort_index())


🧪 -0 뉴스 개수 통계:
num_label_0
20    347727
Name: count, dtype: int64


In [25]:
# 시간 기준 검증
def check_all_within_24h(row):
    time = row['Time']
    for x in row['impressions'].split():
        if x.endswith('-0'):
            news_id = x.replace('-0', '')
            pub_time = news_publish_map.get(news_id)
            if not pub_time:
                return False
            if not (timedelta(0) <= (time - pub_time) <= timedelta(hours=24)):
                return False
    return True

within_check = expanded_df.progress_apply(check_all_within_24h, axis=1)
print(f"✅ 24시간 조건 만족하는 행 수: {within_check.sum()} / {len(expanded_df)}")


100%|██████████| 347727/347727 [00:03<00:00, 104715.78it/s]

✅ 24시간 조건 만족하는 행 수: 0 / 347727



