# Generating Web Stats Data

In [1]:
# imports
import random
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
from faker import Faker

In [2]:
# Initialize the fake-data generator and seed every random source used below
# so that Restart & Run All reproduces the same files.
fake = Faker()
random.seed(1234)
np.random.seed(1234)
# Faker draws from its own generator, which the two seeds above do not touch;
# without this the generated visit dates differ on every run.
Faker.seed(1234)

In [3]:
def generate_webstats(year, num_sessions):
    """
    Generate synthetic WebStats session data for a given year.

    One row is produced per session. Random values are drawn from the
    module-level ``random`` and ``np.random`` generators and from the
    notebook-level ``fake`` (Faker) instance, so results are reproducible
    only if all three have been seeded beforehand.

    Parameters:
    -----------
    year : int
        Year for the data (2023, 2024, or 2025)
    num_sessions : int
        Number of sessions to generate

    Returns:
    --------
    pandas.DataFrame
        One row per session with the columns ``user_id``, ``page_views``,
        ``time_on_page``, ``device_type``, ``browser``, ``conversion_rate``,
        ``bounce_rate``, ``date_visited`` (``M/D/YYYY``, no zero padding) and
        ``time_of_day_visited`` (``h:MM AM/PM``). The columns are present
        even when ``num_sessions`` is 0.

    Raises:
    -------
    KeyError
        If ``year`` is not one of the supported years.
    """

    # Fixed output schema; also guarantees a header for an empty result
    columns = [
        'user_id',
        'page_views',
        'time_on_page',
        'device_type',
        'browser',
        'conversion_rate',
        'bounce_rate',
        'date_visited',
        'time_of_day_visited',
    ]

    # Device type distribution - increasing mobile share
    device_distributions = {
        2023: {'mobile': 0.60, 'desktop': 0.30, 'tablet': 0.10},
        2024: {'mobile': 0.63, 'desktop': 0.27, 'tablet': 0.10},
        2025: {'mobile': 0.65, 'desktop': 0.25, 'tablet': 0.10}
    }

    # Base conversion rates - slight improvement over time
    base_conversion_rates = {
        2023: 53,
        2024: 54,
        2025: 55
    }

    # Everything that depends only on the year is resolved once, up front
    device_dist = device_distributions[year]
    device_types = list(device_dist.keys())
    device_probs = list(device_dist.values())
    base_rate = base_conversion_rates[year]
    start_date = datetime(year, 1, 1)
    end_date = datetime(year, 12, 31)

    browsers = ['Chrome', 'Safari', 'Firefox', 'Edge']
    browser_probs = [0.26, 0.25, 0.25, 0.24]

    data = []

    # NOTE: the order of the random draws below is kept stable on purpose;
    # reordering them would change the output produced for a given seed.
    for user_id in range(1, num_sessions + 1):
        # Generate device type based on year-specific distribution
        device_type = np.random.choice(device_types, p=device_probs)

        # Generate browser
        browser = np.random.choice(browsers, p=browser_probs)

        # Generate page views (most sessions have fewer pages)
        if random.random() < 0.7:
            page_views = random.randint(50, 3000)
        else:
            page_views = random.randint(3000, 10000)

        # Generate time on page (seconds)
        time_on_page = random.randint(40, 3500)

        # Generate conversion rate with high variance, kept within 0-100
        conversion_rate = base_rate + np.random.uniform(-40, 40)
        conversion_rate = np.clip(conversion_rate, 0, 100)
        conversion_rate = round(conversion_rate, 2)

        # Generate bounce rate
        bounce_rate = round(np.random.uniform(0, 95), 2)

        # Generate date within the year. The string is built from the date's
        # attributes rather than strftime("%-m/%-d/%Y") because the "%-"
        # flags are a platform extension and fail on Windows.
        date_visited = fake.date_between(start_date=start_date, end_date=end_date)
        date_visited_str = f"{date_visited.month}/{date_visited.day}/{date_visited.year}"

        # Generate time of day on a 12-hour clock (0 and 12 both display as 12)
        hour = random.randint(0, 23)
        minute = random.randint(0, 59)
        period = "AM" if hour < 12 else "PM"
        display_hour = hour % 12 or 12
        time_of_day = f"{display_hour}:{minute:02d} {period}"

        # Append row
        data.append({
            'user_id': user_id,
            'page_views': page_views,
            'time_on_page': time_on_page,
            'device_type': device_type,
            'browser': browser,
            'conversion_rate': conversion_rate,
            'bounce_rate': bounce_rate,
            'date_visited': date_visited_str,
            'time_of_day_visited': time_of_day
        })

    return pd.DataFrame(data, columns=columns)


In [None]:
# Number of sessions to generate for each year's file
sessions_by_year = {
    2023: 5500,
    2024: 7500,
    2025: 10000
}

# Make sure the target directory exists; to_csv does not create missing
# parent directories and would otherwise fail on a fresh checkout.
output_dir = Path('../data_new/WebStats')
output_dir.mkdir(parents=True, exist_ok=True)

for year, num_sessions in sessions_by_year.items():
    print(f"Generating WebStats{year}.csv with {num_sessions:,} sessions...")

    # Generate data
    webstats_df = generate_webstats(year, num_sessions)

    # Save to CSV
    filename = f"WebStats{year}.csv"
    webstats_df.to_csv(output_dir / filename, index=False)

    # Confirm the file was written
    print(f"  ✓ Created {filename}")


Generating WebStats2023.csv with 5,500 sessions...
  ✓ Created WebStats2023.csv
Generating WebStats2024.csv with 7,500 sessions...
  ✓ Created WebStats2024.csv
Generating WebStats2025.csv with 10,000 sessions...
  ✓ Created WebStats2025.csv
