# Full Data Preparation Pipeline
This notebook consolidates all preprocessing steps used to create `review_business_5up_5aspect_3sentiment_vectorized_clean.json`.

In [9]:
import pandas as pd
import json
import re
import os
from collections import defaultdict
from tqdm import tqdm
from transformers import DebertaV2Tokenizer

In [2]:
# --- Step 1: business.json preprocessing ---
# Load the raw Yelp business dump (one JSON record per line).
df_B = pd.read_json("data/raw/yelp_academic_dataset_business.json", lines=True)

# Columns discarded before any filtering happens.
drop_cols = ['postal_code', 'latitude', 'longitude', 'attributes', 'hours']
business_df = df_B.copy().drop(columns=drop_cols)

# Collapse every city value containing "philadelphia" (case-insensitive) into one label.
is_philadelphia = business_df['city'].str.lower().str.contains("philadelphia", na=False)
business_df.loc[is_philadelphia, 'city'] = "Philadelphia"

def load_categories(fp):
    """Read a category list file into a set of lowercase names.

    Args:
        fp: Path to a UTF-8 text file with one category per line.

    Returns:
        set[str]: Stripped, lowercased entries; empty and whitespace-only
        lines are skipped.
    """
    categories = set()
    with open(fp, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            name = raw_line.strip()
            if name:
                categories.add(name.lower())
    return categories
# Whitelist of categories: every name listed in either the food or the restaurant file.
food_categories = load_categories('data/raw/food.txt')
restaurant_categories = load_categories('data/raw/restaurant.txt')
target_categories = food_categories | restaurant_categories

def category_match(row):
    """Tell whether a comma-separated category string hits the whitelist.

    Args:
        row: Value of a business's ``categories`` field. Anything that is
            not a ``str`` (e.g. ``None`` or NaN) counts as no match.

    Returns:
        bool: True when at least one stripped, lowercased category is in
        the module-level ``target_categories`` set.
    """
    if not isinstance(row, str):
        return False
    names = {part.strip().lower() for part in row.split(',')}
    return not names.isdisjoint(target_categories)
# Keep only businesses that have at least one food/restaurant category.
business_food_df = business_df[business_df['categories'].apply(category_match)]

# Restrict to the state with the most matching businesses, then to Philadelphia.
top_state = business_food_df['state'].value_counts().idxmax()
business_pa_df = business_food_df[business_food_df['state'] == top_state]
business_paph_df = business_pa_df[business_pa_df['city'] == "Philadelphia"]

# Drop every row in which any column is missing or a whitespace-only string.
mask = business_paph_df.apply(
    lambda col: col.map(lambda x: pd.isna(x) or (isinstance(x, str) and x.strip() == ""))
).any(axis=1)
business_paph_df_2 = business_paph_df[~mask].reset_index(drop=True)

# This is the first write into data/output and to_json does not create parent
# directories, so make sure the directory exists on a fresh checkout.
os.makedirs("data/output", exist_ok=True)
business_paph_df_2.to_json("data/output/business.json", orient="records", lines=True, force_ascii=False)
print(f"총 {len(business_paph_df_2)}개 항목이 'data/output/business.json'에 저장되었습니다.")

총 6855개 항목이 'data/output/business.json'에 저장되었습니다.


In [3]:
# --- Step 1: review.json preprocessing ---
chunk_size = 100000
business_ids = set(business_paph_df_2['business_id'])

# Stream the review dump in chunks and filter each chunk *before* concatenating,
# so only reviews of the retained businesses are ever held in memory.
# (Concatenating all chunks first would load the whole file and defeat chunksize.)
chunks = pd.read_json("data/raw/yelp_academic_dataset_review.json", lines=True, chunksize=chunk_size)
df_review = pd.concat(chunk[chunk['business_id'].isin(business_ids)] for chunk in chunks)
df_review = df_review.drop(columns=['funny', 'cool'])

# Record how many DeBERTa-v3 tokens each review text produces.
tqdm.pandas()
tokenizer = DebertaV2Tokenizer.from_pretrained("microsoft/deberta-v3-base")
df_review['token_length'] = df_review['text'].progress_apply(lambda x: len(tokenizer.tokenize(x)))
df_review.to_json("data/output/review.json", orient="records", lines=True, force_ascii=False)

100%|██████████| 730552/730552 [02:11<00:00, 5541.64it/s]


In [4]:
# --- Step 1: user.json preprocessing ---
# User columns that are dropped from the output.
drop_columns = ['yelping_since','funny','cool','elite','friends','fans','compliment_hot','compliment_more','compliment_profile','compliment_cute','compliment_list','compliment_note','compliment_plain','compliment_cool','compliment_funny','compliment_writer','compliment_photos']

# Users with at least 5 reviews among the filtered reviews in df_review.
review_counts = df_review['user_id'].value_counts()
user_ids_5plus = review_counts[review_counts >= 5].index

# Filter and trim each chunk before concatenating so the full user table
# (including the dropped columns) is never materialised in memory.
chunks = pd.read_json("data/raw/yelp_academic_dataset_user.json", lines=True, chunksize=100000)
df_user = pd.concat(
    chunk[chunk['user_id'].isin(user_ids_5plus)].drop(columns=drop_columns)
    for chunk in chunks
)
df_user.to_json("data/output/user.json", orient="records", lines=True, force_ascii=False)

In [10]:
# --- Step 2: merge review, user and business ---
def load_jsonl(path):
    """Load a JSON Lines file into a list of parsed objects.

    Args:
        path: Path to a UTF-8 file holding one JSON document per line.

    Returns:
        list: Parsed records in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as f:
        # Skip blank / whitespace-only lines (e.g. a trailing empty line);
        # json.loads would raise on them.
        return [json.loads(line) for line in f if line.strip()]

# Load the three Step 1 outputs as lists of dicts.
reviews = load_jsonl("data/output/review.json")
users = load_jsonl("data/output/user.json")
businesses = load_jsonl("data/output/business.json")

# Index users and businesses by their ID for direct lookup.
user_dict = {u['user_id']: u for u in users}
business_dict = {b['business_id']: b for b in businesses}

# Keep a review only when both its user and its business survived Step 1;
# copy the user/business fields onto it with "user_" / "business_" prefixes.
merged_data = []
for r in tqdm(reviews, desc='병합 중'):
    uid, bid = r['user_id'], r['business_id']
    if uid not in user_dict or bid not in business_dict:
        continue
    m = {
        **r,
        **{f'user_{k}': v for k, v in user_dict[uid].items()},
        **{f'business_{k}': v for k, v in business_dict[bid].items()},
    }
    merged_data.append(m)

# Build the merged frame, drop the prefixed ID columns (the review already
# carries user_id / business_id), and give the review-level fields a
# "review_" prefix.
df = (
    pd.DataFrame(merged_data)
    .drop(columns=["user_user_id", "business_business_id"], errors="ignore")
    .rename(columns={
        "stars": "review_stars",
        "useful": "review_useful",
        "date": "review_date",
    })
)

# Output location; create its directory if it is missing.
output_path = "data/output/merged_dataset.json"
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Save in JSONL form: one JSON object per line.
with open(output_path, "w", encoding="utf-8") as f:
    f.writelines(
        json.dumps(row, ensure_ascii=False) + "\n"
        for row in df.to_dict(orient="records")
    )

print("병합 완료: merged_dataset.json")

병합 중: 100%|██████████| 730552/730552 [00:02<00:00, 281974.51it/s]


병합 완료: merged_dataset.json


In [11]:
import json
import pandas as pd

# 1. Path of the merged JSONL file (adjust if needed)
path = "data/output/merged_dataset.json"

# 2. Read the file: one JSON object per line
with open(path, "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f]

# 3. Convert to a pandas DataFrame
df = pd.DataFrame(data)

# 4. Basic info. DataFrame.info() prints its report itself and returns None,
#    so wrapping it in print() would append a stray "None" line.
print("📊 [기본 정보]")
df.info()
print()

# 5. Column list
print("🧾 [컬럼 목록]")
print(df.columns.tolist())
print()

# 6. Sample rows
print("🔍 [샘플 데이터]")
print(df.head(3).to_string(index=False))

📊 [기본 정보]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 451185 entries, 0 to 451184
Data columns (total 20 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   review_id              451185 non-null  object 
 1   user_id                451185 non-null  object 
 2   business_id            451185 non-null  object 
 3   review_stars           451185 non-null  int64  
 4   review_useful          451185 non-null  int64  
 5   text                   451185 non-null  object 
 6   review_date            451185 non-null  int64  
 7   token_length           451185 non-null  int64  
 8   user_name              451185 non-null  object 
 9   user_review_count      451185 non-null  int64  
 10  user_useful            451185 non-null  int64  
 11  user_average_stars     451185 non-null  float64
 12  business_name          451185 non-null  object 
 13  business_address       451185 non-null  object 
 14  business_city          451

In [None]:
# --- Step 3: attach IDs to ABSA results and vectorize ---
# filter merged dataset for users with >=5 reviews

# Step 2 writes the merged file under data/output/, so read it from there
# (a bare "merged_dataset.json" does not exist after a fresh run of this notebook).
merged_path = "data/output/merged_dataset.json"

# Pass 1: count reviews per user.
user_review_counts = defaultdict(int)
with open(merged_path, "r", encoding='utf-8') as f:
    for line in f:
        obj = json.loads(line)
        user_review_counts[obj['user_id']] += 1
qualified_users = {u for u, c in user_review_counts.items() if c >= 5}

# Pass 2: keep only the reviews written by qualified users.
filtered_reviews = []
with open(merged_path, "r", encoding='utf-8') as f:
    for line in f:
        obj = json.loads(line)
        if obj['user_id'] in qualified_users:
            filtered_reviews.append(obj)

with open("merged_dataset_5up_users_only.json", "w", encoding='utf-8') as f:
    for obj in filtered_reviews:
        # Each record must end with a newline; otherwise all records land on
        # one line and the file cannot be read back line by line.
        f.write(json.dumps(obj, ensure_ascii=False) + "\n")
print("✅ 필터링 완료:",len(filtered_reviews),'개 리뷰 저장 → merged_dataset_5up_users_only.json')

# Map review_id -> the identifying / rating fields to copy onto the ABSA records.
id_map = {
    obj['review_id']: {
        'user_id': obj['user_id'],
        'business_id': obj['business_id'],
        'stars': obj['review_stars'],
        'review_useful': obj['review_useful'],
        'review_date': obj['review_date'],
    }
    for obj in filtered_reviews
}

# Keep only ABSA records whose review_id is in id_map, enriched with its fields.
updated = []
with open("review_5up_5aspect_3sentiment.json", "r", encoding='utf-8') as f:
    for line in tqdm(f, desc='ID 및 평점 추가 중'):
        obj = json.loads(line)
        rid = obj.get('review_id')
        if rid in id_map:
            obj.update(id_map[rid])
            updated.append(obj)

with open("review_5up_5aspect_3sentiment_with_ids.json", "w", encoding='utf-8') as f:
    for obj in updated:
        # Terminate each record with a newline so the next stage, which reads
        # this file line by line, gets one JSON object per line.
        f.write(json.dumps(obj, ensure_ascii=False) + "\n")
print("저장 완료:",len(updated),'건 → review_5up_5aspect_3sentiment_with_ids.json')

# Vectorization stage: reads the ID-enriched ABSA records, writes the slimmed-down vectors.
input_file, output_file = (
    "review_5up_5aspect_3sentiment_with_ids.json",
    "review_5up_5aspect_3sentiment_vectorized_clean.json",
)

def sentiment_to_vector(sentiment_dict):
    """Flatten per-aspect sentiment scores into a fixed-order list of 15 values.

    The order is aspect-major (food, service, price, ambience, location) and,
    within each aspect, Negative, Neutral, Positive.

    Args:
        sentiment_dict: Mapping of aspect name to a dict that may contain a
            ``'scores'`` dict keyed by polarity name.

    Returns:
        list: 15 scores; any missing aspect, ``'scores'`` entry or polarity
        contributes ``0.0``.
    """
    aspect_order = ('food', 'service', 'price', 'ambience', 'location')
    polarity_order = ('Negative', 'Neutral', 'Positive')
    flat = []
    for aspect in aspect_order:
        aspect_scores = sentiment_dict.get(aspect, {}).get('scores', {})
        flat.extend(aspect_scores.get(polarity, 0.0) for polarity in polarity_order)
    return flat
# Convert each record's sentiment dict to a flat vector and keep only the
# identifying fields, the star rating, the date and the vector.
with open(input_file, 'r', encoding='utf-8') as fin, open(output_file, 'w', encoding='utf-8') as fout:
    for line in fin:
        obj = json.loads(line)
        vec = sentiment_to_vector(obj.get('sentiment', {}))
        cleaned = {
            'review_id': obj.get('review_id'),
            'user_id': obj.get('user_id'),
            'business_id': obj.get('business_id'),
            'stars': obj.get('stars'),
            'review_date': obj.get('review_date'),
            'sentiment_vector': vec,
        }
        # One record per line: Step 4 reads this file line by line, which
        # fails if the records are written without a newline separator.
        fout.write(json.dumps(cleaned, ensure_ascii=False) + "\n")
print("완료: text와 sentiment 제거 후 저장 →",output_file)

In [None]:
# --- Step 4: filter users with <5 unique businesses ---
input_file = "review_5up_5aspect_3sentiment_vectorized_clean.json"
output_file = "review_business_5up_5aspect_3sentiment_vectorized_clean.json"

# Pass 1: collect the set of distinct businesses each user reviewed.
user_biz_ids = defaultdict(set)
with open(input_file, 'r', encoding='utf-8') as f:
    for line in f:
        obj = json.loads(line)
        user_biz_ids[obj['user_id']].add(obj['business_id'])

# Pass 2: keep only records of users with at least 5 distinct businesses.
with open(input_file, 'r', encoding='utf-8') as fin, open(output_file, 'w', encoding='utf-8') as fout:
    for line in fin:
        obj = json.loads(line)
        if len(user_biz_ids[obj['user_id']]) >= 5:
            # Newline-terminate each record so the final dataset is valid JSONL
            # rather than all objects concatenated on a single line.
            fout.write(json.dumps(obj, ensure_ascii=False) + "\n")
print("완료: business_id가 5개 미만인 사용자 제거 후 저장 →",output_file)