# Data Preparation
Load and clean the Yelp dataset for Philadelphia restaurants.

In [None]:
import pandas as pd
import numpy as np
import json
import os

# Create directories
# exist_ok=True means an already-existing directory is not an error,
# so this cell can be re-run safely.
for directory in ("data/raw", "data/processed", "models", "results", "figures"):
    os.makedirs(directory, exist_ok=True)

print("Directories created successfully!")

Directories created successfully!


In [None]:
# Load business data
# NOTE: Update the path to your actual Yelp dataset location
business_file = "data/raw/yelp_academic_dataset_business.json"

# Each line of the file is decoded as its own JSON document.
with open(business_file, "r", encoding="utf-8") as f:
    businesses = [json.loads(line) for line in f]

business_df = pd.DataFrame(businesses)
print(f"Total businesses loaded: {len(business_df):,}")

# Filter for Philadelphia restaurants:
#   - the city must equal "Philadelphia" exactly
#   - the categories string must contain "Restaurants"
#     (rows with missing categories are treated as non-matching)
in_philadelphia = business_df["city"].eq("Philadelphia")
is_restaurant = business_df["categories"].str.contains("Restaurants", na=False)
philly_restaurants = business_df.loc[in_philadelphia & is_restaurant].copy()

print(f"Philadelphia restaurants: {len(philly_restaurants):,}")
print("\nSample business IDs saved for filtering reviews")

# Save business IDs for review filtering (a set gives fast membership tests)
unique_ids = philly_restaurants["business_id"].unique()
philly_business_ids = set(unique_ids)
print(f"Unique business IDs: {len(philly_business_ids):,}")

Total businesses loaded: 150,346
Philadelphia restaurants: 5,852

Sample business IDs saved for filtering reviews
Unique business IDs: 5,852


In [None]:
# Load and filter reviews
# NOTE: Update the path to your actual Yelp dataset location
review_file = "data/raw/yelp_academic_dataset_review.json"

print("Loading reviews (this may take a few minutes)...")

reviews = []
chunk_size = 100000  # a progress line is printed every `chunk_size` lines read
count = 0  # stays 0 if the file turns out to be empty

with open(review_file, "r", encoding="utf-8") as f:
    # enumerate(..., start=1) keeps `count` equal to the number of lines read so far
    for count, line in enumerate(f, start=1):
        review = json.loads(line)

        # Keep only reviews whose business is one of the selected restaurants
        if review["business_id"] in philly_business_ids:
            reviews.append(review)

        if not count % chunk_size:
            print(
                f"Processed {count:,} reviews, found {len(reviews):,} Philadelphia reviews"
            )

print(f"\nTotal Philadelphia restaurant reviews: {len(reviews):,}")

# Convert to DataFrame
review_df = pd.DataFrame(reviews)
print(f"Review DataFrame shape: {review_df.shape}")

Loading reviews (this may take a few minutes)...
Processed 100,000 reviews, found 12,498 Philadelphia reviews
Processed 200,000 reviews, found 25,327 Philadelphia reviews
Processed 300,000 reviews, found 37,858 Philadelphia reviews
Processed 400,000 reviews, found 47,970 Philadelphia reviews
Processed 500,000 reviews, found 55,951 Philadelphia reviews
Processed 600,000 reviews, found 63,135 Philadelphia reviews
Processed 700,000 reviews, found 70,113 Philadelphia reviews
Processed 800,000 reviews, found 82,501 Philadelphia reviews
Processed 900,000 reviews, found 95,975 Philadelphia reviews
Processed 1,000,000 reviews, found 109,595 Philadelphia reviews
Processed 1,100,000 reviews, found 121,428 Philadelphia reviews
Processed 1,200,000 reviews, found 130,773 Philadelphia reviews
Processed 1,300,000 reviews, found 138,966 Philadelphia reviews
Processed 1,400,000 reviews, found 146,961 Philadelphia reviews
Processed 1,500,000 reviews, found 158,447 Philadelphia reviews
Processed 1,600,00

In [None]:
# Sample the reviews down to SAMPLE_SIZE rows if the dataset is larger than that.
# The size and seed are named once here so the threshold, the sample call and
# the log message cannot drift apart.
SAMPLE_SIZE = 100_000
SAMPLE_SEED = 42  # fixed seed so the same rows are drawn on every run

if len(review_df) > SAMPLE_SIZE:
    print(f"Sampling {SAMPLE_SIZE:,} reviews from {len(review_df):,} total reviews")
    review_df = review_df.sample(n=SAMPLE_SIZE, random_state=SAMPLE_SEED).copy()
else:
    print(f"Using all {len(review_df):,} reviews")

# Merge with business information.
# Both frames carry a "stars" column, so the suffixes separate the review's
# rating (stars_review) from the business's rating (stars_business).
# validate="many_to_one" makes pandas raise a MergeError if a business_id is
# repeated on the business side; without it such a repeat would silently
# duplicate review rows in the merged frame.
df = review_df.merge(
    philly_restaurants[["business_id", "name", "city", "state", "stars"]],
    on="business_id",
    how="left",
    suffixes=("_review", "_business"),
    validate="many_to_one",
)

print(f"Merged DataFrame shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")

Sampling 100,000 reviews from 687,289 total reviews
Merged DataFrame shape: (100000, 13)
Columns: ['review_id', 'user_id', 'business_id', 'stars_review', 'useful', 'funny', 'cool', 'text', 'date', 'name', 'city', 'state', 'stars_business']


In [None]:
# Data cleaning

# Parse the review timestamps into datetime values
df["date"] = pd.to_datetime(df["date"])

# Keep only the first occurrence of each review_id
is_repeat = df.duplicated(subset=["review_id"])
df = df[~is_repeat]
print(f"After removing duplicates: {len(df):,} reviews")

# Drop rows whose review text is missing
df = df.dropna(subset=["text"]).copy()
print(f"After removing missing text: {len(df):,} reviews")

# Character count of each review, kept as a feature column
df["text_length"] = df["text"].str.len()

# Discard reviews shorter than 10 characters (likely spam)
long_enough = df["text_length"].ge(10)
df = df.loc[long_enough].copy()
print(f"After removing short reviews: {len(df):,} reviews")

# Order by business, then chronologically, with a fresh 0..n-1 index
df = df.sort_values(by=["business_id", "date"], ignore_index=True)

print("\nData cleaning complete!")

Cleaning data...
After removing duplicates: 100,000 reviews
After removing missing text: 100,000 reviews
After removing short reviews: 99,997 reviews

Data cleaning complete!


In [None]:
# Basic statistics
banner = "=" * 60
print(banner)
print("DATASET SUMMARY")
print(banner)

print(f"Total reviews: {len(df):,}")
print(f"Unique businesses: {df['business_id'].nunique():,}")

# Earliest and latest review, shown as calendar dates (time of day dropped)
first_day = df["date"].min().date()
last_day = df["date"].max().date()
print(f"Date range: {first_day} to {last_day}")

ratings = df["stars_review"]
print("\nRating distribution:")
print(ratings.value_counts().sort_index())
print(f"\nAverage rating: {ratings.mean():.2f}")

# A review counts as negative when it has 2 stars or fewer
negative_count = (ratings <= 2).sum()
negative_pct = negative_count / len(df) * 100
print(f"Negative reviews (≤2 stars): {negative_count:,} ({negative_pct:.1f}%)")

DATASET SUMMARY
Total reviews: 99,997
Unique businesses: 5,393
Date range: 2005-06-24 to 2022-01-19

Rating distribution:
stars_review
1.0     9922
2.0     8317
3.0    13261
4.0    28348
5.0    40149
Name: count, dtype: int64

Average rating: 3.80
Negative reviews (≤2 stars): 18,239 (18.2%)


In [None]:
# Save cleaned data
output_file = "data/processed/reviews_clean.csv"

# index=False: the row index is not written as a column
df.to_csv(output_file, index=False)
print(f"Saved cleaned data to: {output_file}")
print(f"  Shape: {df.shape}")

# Size of the written file, converted from bytes to MB
bytes_on_disk = os.path.getsize(output_file)
print(f"  Size: {bytes_on_disk / 1024 / 1024:.1f} MB")


✓ Saved cleaned data to: data/processed/reviews_clean.csv
  Shape: (99997, 14)
  Size: 70.9 MB
