# In [1]:
import pandas as pd
import numpy as np
import random
from faker import Faker
from datetime import datetime, timedelta

# Seed every random source this script draws from so that repeated runs
# produce the same dataset. Faker draws from its own shared random generator,
# which np.random.seed() and random.seed() do not touch, so it is seeded
# explicitly below.
np.random.seed(0)
random.seed(0)

# Initialize Faker to generate fake data
faker = Faker()
Faker.seed(0)
# NOTE: Faker values anchored to the current date (e.g. ranges ending 'today')
# can still shift depending on the day the script is run.

# Number of rows for the dataset
num_rows = 2000

# Countries grouped by continent. Each country maps to a
# (latitude, longitude) pair in decimal degrees; negative latitude is south
# of the equator and negative longitude is west of the prime meridian.
countries_by_continent = {
    'Africa': {
        'Algeria': (28.0339, 1.6596),
        'Nigeria': (9.0820, 8.6753),
        'Egypt': (26.8206, 30.8025),
        'Kenya': (-0.0236, 37.9062),
        'South Africa': (-30.5595, 22.9375),
        'Botswana': (-22.3285, 24.6849),
        'Ghana': (7.9465, -1.0232),
        'Tanzania': (-6.3690, 34.8888),
        # Morocco lies west of the prime meridian, so its longitude is negative
        'Morocco': (31.7917, -7.0926),
        'Zimbabwe': (-19.0154, 29.1549),
    },
    'Asia': {
        'China': (35.8617, 104.1954),
        'India': (20.5937, 78.9629),
        'Japan': (36.2048, 138.2529),
        'Indonesia': (-0.7893, 113.9213),
        'Pakistan': (30.3753, 69.3451),
        'Bangladesh': (23.6850, 90.3563),
        'Philippines': (12.8797, 121.7740),
        'Vietnam': (14.0583, 108.2772),
        'Iran': (32.4279, 53.6880),
        'Turkey': (38.9637, 35.2433),
    },
    'Europe': {
        'France': (46.6034, 1.8883),
        'Germany': (51.1657, 10.4515),
        'United Kingdom': (55.3781, -3.4360),
        'Italy': (41.8719, 12.5674),
        'Spain': (40.4637, -3.7492),
        'Ukraine': (48.3794, 31.1656),
        'Poland': (51.9194, 19.1451),
        'Romania': (45.9432, 24.9668),
        'Netherlands': (52.1326, 5.2913),
        'Belgium': (50.5039, 4.4699),
    },
    'North America': {
        'United States': (37.0902, -95.7129),
        'Canada': (56.1304, -106.3468),
        'Mexico': (23.6345, -102.5528),
        'Cuba': (21.5218, -77.7812),
        'Jamaica': (18.1096, -77.2975),
        'Honduras': (15.2000, -86.2419),
        'Costa Rica': (9.7489, -83.7534),
        'Panama': (8.5380, -80.7821),
        'Dominican Republic': (18.7357, -70.1627),
        'Guatemala': (15.7835, -90.2308),
    },
    'South America': {
        'Brazil': (-14.2350, -51.9253),
        'Argentina': (-38.4161, -63.6167),
        'Colombia': (4.5709, -74.2973),
        'Peru': (-9.1900, -75.0152),
        'Venezuela': (6.4238, -66.5897),
        'Chile': (-35.6751, -71.5430),
        'Ecuador': (-1.8312, -78.1834),
        'Bolivia': (-16.2902, -63.5887),
        'Paraguay': (-23.4425, -58.4438),
        'Uruguay': (-32.5228, -55.7658),
    },
}

# URL paths that a request in the synthetic log can target
specific_pages = [
    '/home',
    '/about',
    '/events',
    '/teams',
    '/players',
    '/news',
    '/fixtures',
    '/tickets',
    '/shop',
    '/gallery',
    '/fanzone',
    '/contact',
]

# Build the synthetic log column by column; every column holds num_rows
# values. Columns are filled in a fixed order because each draw advances the
# random generators, so reordering them would change the generated values.
data = {}

# Request date (within the last six months) and time of day, as strings
data['Date'] = [
    faker.date_between(start_date='-6M', end_date='today').strftime('%Y-%m-%d')
    for _ in range(num_rows)
]
data['Time'] = [
    faker.date_time_this_year().strftime('%H:%M:%S')
    for _ in range(num_rows)
]

# Client address
data['IP Address'] = [faker.ipv4() for _ in range(num_rows)]

# Categorical columns: each value is drawn uniformly from a fixed option list
data.update({
    column: [random.choice(options) for _ in range(num_rows)]
    for column, options in (
        ('HTTP Method', ['GET', 'POST', 'PUT']),
        ('Request', specific_pages),
        ('Status Code', [200, 304, 404]),
        ('Gender', ['Male', 'Female']),
        ('Sport Viewed', ['Football', 'Basketball', 'Tennis', 'Netball', 'Swimming', 'Baseball', 'Boxing']),
        ('Ad Displayed', ['Yes', 'No']),
    )
})

# Integer rating from 1 to 5 (both ends inclusive)
data['Rating'] = [random.randint(1, 5) for _ in range(num_rows)]

data['Feedback'] = [
    random.choice(['Good', 'Excellent', 'Poor', 'Average'])
    for _ in range(num_rows)
]

# Session length between 600 and 11400 seconds (both ends inclusive)
data['Session Duration (seconds)'] = [
    random.randint(600, 11400)
    for _ in range(num_rows)
]

# Continent of origin; the country and coordinates are derived from it
data['Continent'] = [
    faker.random_element(elements=('Africa', 'Asia', 'Europe', 'North America', 'South America'))
    for _ in range(num_rows)
]

# Pick one country per row from those listed under the row's continent
data['Country'] = [
    random.choice(list(countries_by_continent[region]))
    for region in data['Continent']
]

# Look up each row's (latitude, longitude) pair once, then split the pairs
# into the two coordinate columns
_coordinates = [
    countries_by_continent[region][nation]
    for region, nation in zip(data['Continent'], data['Country'])
]
data['Latitude'] = [pair[0] for pair in _coordinates]
data['Longitude'] = [pair[1] for pair in _coordinates]

# Assemble the generated columns into a table (one dict key per column)
df = pd.DataFrame(data)

# Write the table to disk; index=False keeps the row index out of the file
df.to_csv(path_or_buf='funolympic_data.csv', index=False)
