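# Generate Reviews

Generates synthetic review data (Date, Rating, Platform) for each of the years 2023–2025 and writes one CSV per year to `../data_new/Reviews/`.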

In [1]:
# imports
import random
from datetime import datetime, timedelta
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
# Initialize random seeds for reproducibility.
# Both generators are seeded because generate_reviews draws dates with the
# stdlib `random` module and ratings/platforms with NumPy's global RNG.
random.seed(1234)
np.random.seed(1234)

In [3]:
def generate_reviews(year, num_reviews):
    """
    Generate synthetic review records for one calendar year.

    Every review gets a uniformly random day within ``year``, a rating drawn
    from a fixed 6-10 distribution and a platform drawn from a fixed
    three-way distribution. Dates are drawn with the stdlib ``random`` module
    and ratings/platforms with NumPy's global RNG, so both must be seeded by
    the caller to obtain reproducible output.

    Parameters:
    -----------
    year : int
        Calendar year the review dates fall in (leap years are handled).
    num_reviews : int
        Number of reviews to generate. Zero or a negative value yields an
        empty frame that still carries the three output columns.

    Returns:
    --------
    pandas.DataFrame
        One row per review, sorted by date ascending, with columns
        ``Date`` (month/day/year string without zero padding), ``Rating``
        and ``Platform``.
    """

    output_columns = ['Date', 'Rating', 'Platform']

    # Platform distribution: Facebook > Yelp > Google
    platform_probs = {
        'Facebook': 0.35,
        'Yelp': 0.34,
        'Google': 0.31
    }

    # Rating distribution (6-10 scale, skewed toward higher ratings)
    # More weight on 8, 9, 10
    rating_probs = {
        6: 0.10,
        7: 0.15,
        8: 0.25,
        9: 0.30,
        10: 0.20
    }

    # Loop-invariant choice inputs, built once instead of once per review.
    platforms = list(platform_probs.keys())
    platform_weights = list(platform_probs.values())
    ratings = list(rating_probs.keys())
    rating_weights = list(rating_probs.values())

    # Date range for the year (365 or 366 days)
    start_date = datetime(year, 1, 1)
    end_date = datetime(year, 12, 31)
    days_in_year = (end_date - start_date).days + 1

    data = []

    # The order of RNG calls (date, rating, platform) is kept fixed so that a
    # given pair of seeds always produces the same draws.
    for _ in range(num_reviews):
        # Generate random date within the year
        random_days = random.randint(0, days_in_year - 1)
        date_reviewed = start_date + timedelta(days=random_days)
        # Built by hand rather than with strftime("%-m/%-d/%Y"): the "-" flag
        # is a platform-specific extension and is rejected on Windows.
        date_str = f"{date_reviewed.month}/{date_reviewed.day}/{date_reviewed.year}"

        # Generate rating based on distribution
        rating = np.random.choice(ratings, p=rating_weights)

        # Generate platform based on distribution
        platform = np.random.choice(platforms, p=platform_weights)

        data.append({
            'Date': date_str,
            'Rating': rating,
            'Platform': platform,
            # Real datetime kept only as a sort key; avoids re-parsing the
            # formatted string with an inferred format.
            'date_sort': date_reviewed
        })

    # Explicit columns keep the schema intact even when no rows were generated.
    df = pd.DataFrame(data, columns=output_columns + ['date_sort'])

    # A stable sort keeps same-day reviews in generation order, so the row
    # order does not depend on the sort implementation in use.
    df = (
        df.sort_values('date_sort', kind='stable')
        .drop(columns='date_sort')
        .reset_index(drop=True)
    )

    return df


In [6]:
# Number of reviews to generate for each year's CSV file.
reviews_by_year = {
    2023: 3100,
    2024: 3300,
    2025: 3500,
}

# Destination folder for the generated files. It is created on demand so the
# cell also runs on a checkout where the folder does not exist yet.
output_dir = Path('../data_new/Reviews')
output_dir.mkdir(parents=True, exist_ok=True)

for year, num_reviews in reviews_by_year.items():
    filename = f"{year}reviews.csv"
    print(f"Generating {filename} with {num_reviews:,} reviews...")

    # Generate data
    df = generate_reviews(year, num_reviews)

    # Save to CSV
    df.to_csv(output_dir / filename, index=False)

    # Print summary statistics
    print(f"  ✓ Created {filename}")
    print(f"    - Total reviews: {len(df):,}")

    # Platform distribution, reported in a fixed order
    platform_counts = df['Platform'].value_counts()
    for platform in ['Facebook', 'Yelp', 'Google']:
        count = platform_counts.get(platform, 0)
        pct = (count / len(df)) * 100
        print(f"    - {platform}: {count:,} ({pct:.1f}%)")

    # Rating distribution, ascending by rating value
    rating_counts = df['Rating'].value_counts().sort_index()
    avg_rating = df['Rating'].mean()
    print(f"    - Average rating: {avg_rating:.2f}")
    # value_counts already holds one entry per distinct rating, so iterate it
    # directly instead of taking a second pass over the column with unique().
    distribution = "".join(
        f" {rating}★:{count}" for rating, count in rating_counts.items()
    )
    print(f"    - Rating distribution:{distribution}")
    # Blank line between the per-year summaries
    print()

Generating 2023reviews.csv with 3,100 reviews...
  ✓ Created 2023reviews.csv
    - Total reviews: 3,100
    - Facebook: 1,063 (34.3%)
    - Yelp: 1,017 (32.8%)
    - Google: 1,020 (32.9%)
    - Average rating: 8.34
    - Rating distribution: 6★:313 7★:473 8★:777 9★:916 10★:621

Generating 2024reviews.csv with 3,300 reviews...
  ✓ Created 2024reviews.csv
    - Total reviews: 3,300
    - Facebook: 1,153 (34.9%)
    - Yelp: 1,148 (34.8%)
    - Google: 999 (30.3%)
    - Average rating: 8.39
    - Rating distribution: 6★:307 7★:489 8★:833 9★:968 10★:703

Generating 2025reviews.csv with 3,500 reviews...
  ✓ Created 2025reviews.csv
    - Total reviews: 3,500
    - Facebook: 1,173 (33.5%)
    - Yelp: 1,190 (34.0%)
    - Google: 1,137 (32.5%)
    - Average rating: 8.39
    - Rating distribution: 6★:344 7★:486 8★:850 9★:1085 10★:735

