In [None]:
import random
from datetime import datetime, timedelta

import numpy as np
from faker import Faker

In [None]:
# Seed every random source this notebook draws from (`random`, numpy's global
# RNG, and Faker) so that re-running the notebook repeats the same random draws
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
Faker.seed(SEED)

# Initialize Faker
fake = Faker()

# Lookup table: two-letter country code -> three city names for that country.
# Key order matters to callers that pick a random key, so keep it stable.
country_cities = {
    "AL": ["Tirana", "Durrës", "Vlorë"],
    "AT": ["Vienna", "Graz", "Linz"],
    "BE": ["Brussels", "Antwerp", "Ghent"],
    "BG": ["Sofia", "Plovdiv", "Varna"],
    "HR": ["Zagreb", "Split", "Rijeka"],
    "CY": ["Nicosia", "Limassol", "Larnaca"],
    "CZ": ["Prague", "Brno", "Ostrava"],
    "DK": ["Copenhagen", "Aarhus", "Odense"],
    "EE": ["Tallinn", "Tartu", "Narva"],
    "FI": ["Helsinki", "Espoo", "Tampere"],
    "FR": ["Paris", "Marseille", "Lyon"],
    "DE": ["Berlin", "Munich", "Hamburg"],
    "GR": ["Athens", "Thessaloniki", "Patras"],
    "HU": ["Budapest", "Debrecen", "Szeged"],
    "IS": ["Reykjavik", "Kopavogur", "Hafnarfjordur"],
    "IE": ["Dublin", "Cork", "Limerick"],
    "IT": ["Rome", "Milan", "Naples"],
    "LV": ["Riga", "Daugavpils", "Liepaja"],
    "LT": ["Vilnius", "Kaunas", "Klaipeda"],
    "LU": ["Luxembourg", "Esch-sur-Alzette", "Differdange"],
    "MT": ["Valletta", "Birkirkara", "Mosta"],
    "NL": ["Amsterdam", "Rotterdam", "The Hague"],
    "NO": ["Oslo", "Bergen", "Stavanger"],
    "PL": ["Warsaw", "Krakow", "Gdansk"],
    "PT": ["Lisbon", "Porto", "Amadora"],
    "RO": ["Bucharest", "Cluj-Napoca", "Timișoara"],
    "RU": ["Moscow", "Saint Petersburg", "Novosibirsk"],
    "SK": ["Bratislava", "Košice", "Prešov"],
    "SI": ["Ljubljana", "Maribor", "Celje"],
    "ES": ["Madrid", "Barcelona", "Valencia"],
    "SE": ["Stockholm", "Gothenburg", "Malmö"],
    "CH": ["Zurich", "Geneva", "Basel"],
    "UA": ["Kyiv", "Kharkiv", "Odessa"],
    "GB": ["London", "Manchester", "Birmingham"],
    # Extend with further country codes and city lists as needed
}

# Pick a random point in time between two datetimes, in whole-day steps
def random_date(start_date, end_date):
    """Return ``start_date`` shifted forward by a random whole number of days.

    The shift is drawn uniformly from ``0`` to ``(end_date - start_date).days``
    inclusive, so the result never falls before ``start_date`` and keeps its
    time-of-day component.

    Raises:
        ValueError: propagated from ``random.randint`` when ``end_date`` is at
            least one full day earlier than ``start_date`` (empty range).
    """
    span_days = (end_date - start_date).days
    offset = timedelta(days=random.randint(0, span_days))
    return start_date + offset

# Build the synthetic user table
def generate_users(num_users):
    """Return a list of ``num_users`` synthetic user records.

    Each record is a dict with the keys ``user_id`` (1..num_users), ``name``
    (from ``fake.name()``), ``birth_date`` (from ``fake.date_of_birth`` with
    ages 18-80), ``registration_date`` (a ``random_date`` between 2010-01-01
    and 2023-12-31), ``location`` (a key of ``country_cities``) and ``city``
    (one of the cities listed for that key).
    """
    country_codes = list(country_cities.keys())
    records = []
    for uid in range(1, num_users + 1):
        full_name = fake.name()
        born_on = fake.date_of_birth(minimum_age=18, maximum_age=80)
        registered_on = random_date(datetime(2010, 1, 1), datetime(2023, 12, 31))
        # Choose the country first, then a city belonging to that country
        country_code = random.choice(country_codes)
        home_city = random.choice(country_cities[country_code])
        records.append(dict(
            user_id=uid,
            name=full_name,
            birth_date=born_on,
            registration_date=registered_on,
            location=country_code,
            city=home_city,
        ))
    return records

# Generate 10,000 synthetic users
users = generate_users(10000)

# Preview the first few users only; printing all 10,000 records floods the cell output
for user in users[:5]:
    print(user)

In [None]:
def generate_subscriptions(users, end_date=datetime(2024, 12, 31)):
    """Create a non-overlapping subscription history for every user.

    Each user receives 0-3 subscriptions (weights 30/50/15/5). Subscriptions of
    one user are sequential: every subscription except the user's last one is
    always cancelled, and the next one starts on or after that cancel date.
    Only the last subscription may remain open (``subscription_cancel_date``
    is ``None``), which happens with probability 1/2.

    Args:
        users: iterable of dicts providing ``user_id`` and ``registration_date``.
        end_date: latest possible subscription / cancel date. Defaults to
            2024-12-31, the value that was previously hard-coded.

    Returns:
        list of dicts with keys ``subscription_id`` (1, 2, 3, ... across all
        users), ``user_id``, ``subscription_date`` and
        ``subscription_cancel_date``.

    Raises:
        ValueError: propagated from ``random_date`` when a registration date
            lies more than a day after ``end_date``.
    """
    subscriptions = []

    for user in users:
        # Use a weighted choice to favor fewer subscriptions
        num_subscriptions = random.choices([0, 1, 2, 3], weights=[30, 50, 15, 5])[0]
        last_date = user["registration_date"]

        for position in range(num_subscriptions):
            is_last = position == num_subscriptions - 1
            subscription_date = random_date(last_date, end_date)

            # A follow-up subscription can only begin once this one has ended,
            # so every subscription but the last must carry a cancel date.
            # Previously an open subscription could be followed by another one,
            # giving the user several active subscriptions at the same time.
            is_cancelled = (not is_last) or random.choice([True, False])
            subscription_cancel_date = (
                random_date(subscription_date, end_date) if is_cancelled else None
            )

            subscriptions.append({
                "subscription_id": len(subscriptions) + 1,  # sequential across all users
                "user_id": user["user_id"],
                "subscription_date": subscription_date,
                "subscription_cancel_date": subscription_cancel_date
            })

            if subscription_cancel_date is not None:
                # The next subscription may start no earlier than this cancellation
                last_date = subscription_cancel_date

    return subscriptions

# Generate subscriptions for these users
subscriptions = generate_subscriptions(users)

# Preview the first few subscriptions only; printing every record floods the cell output
for subscription in subscriptions[:5]:
    print(subscription)

In [None]:
import numpy as np
import random

# Share of sessions per device type: 40% Mobile, 10% Tablet, 50% Desktop.
# `devices` and `device_weights` are parallel lists derived from this mapping.
_device_share = {"Mobile": 0.4, "Tablet": 0.1, "Desktop": 0.5}
devices = list(_device_share.keys())
device_weights = list(_device_share.values())

def generate_sessions(users):
    """Create synthetic sessions for every user.

    For each user the number of sessions is drawn from a Poisson distribution
    (lambda = 5). Every session gets a ``random_date`` between the user's
    registration date and 2024-12-31, a duration drawn from a log-normal
    distribution, and a device chosen from ``devices`` using ``device_weights``.

    Returns:
        list of dicts with keys ``session_id`` (1, 2, 3, ... across all users),
        ``user_id``, ``session_date``, ``time_spent_seconds`` and ``device``.
    """
    horizon = datetime(2024, 12, 31)
    records = []

    for account in users:
        session_total = np.random.poisson(lam=5)
        for _ in range(session_total):
            started_on = random_date(account["registration_date"], horizon)
            # Log-normal draw (mean=4, sigma=1) multiplied by 60 and truncated to
            # an int -- presumably converting minutes to seconds
            duration = int(np.random.lognormal(mean=4, sigma=1) * 60)
            picked_device = random.choices(devices, weights=device_weights, k=1)[0]
            records.append(dict(
                session_id=len(records) + 1,  # ids are sequential across all users
                user_id=account["user_id"],
                session_date=started_on,
                time_spent_seconds=duration,
                device=picked_device,
            ))

    return records

# Generate sessions for these users
sessions = generate_sessions(users)

# Preview the first few sessions only; printing every record floods the cell output
for session in sessions[:5]:
    print(session)

In [None]:
import random

def generate_referrals(users):
    """Create referral links between users.

    Every user makes 0-4 referrals (weights 85/8/5/1/1). A referee must have
    registered strictly after the referrer, and each user is referred at most
    once across the whole dataset (a user cannot be brought in by two
    different referrers).

    Args:
        users: list of dicts providing ``user_id`` and ``registration_date``.

    Returns:
        list of dicts with keys ``referral_id`` (1, 2, 3, ...),
        ``referrer_id`` and ``referee_id``.
    """
    referrals = []
    referred_ids = set()  # users that already have a referrer

    for referrer in users:
        # Determine the number of referrals this user might make with weighted probabilities
        num_referrals = random.choices([0, 1, 2, 3, 4], weights=[85, 8, 5, 1, 1])[0]
        if num_referrals == 0:
            # Skip the O(len(users)) candidate scan for the 85% who refer nobody
            continue

        # Candidates registered strictly after the referrer and not yet referred by anyone
        potential_referees = [
            user for user in users
            if user["registration_date"] > referrer["registration_date"]
            and user["user_id"] not in referred_ids
        ]

        # Sampling without replacement keeps referees distinct for this referrer;
        # fewer referrals are made when not enough candidates remain
        chosen = random.sample(potential_referees, min(num_referrals, len(potential_referees)))
        for referee in chosen:
            referrals.append({
                "referral_id": len(referrals) + 1,  # Incremental referral ID
                "referrer_id": referrer["user_id"],
                "referee_id": referee["user_id"]
            })
            referred_ids.add(referee["user_id"])

    return referrals

# Generate referrals for these users
referrals = generate_referrals(users)

# Preview the first few referrals only; printing every record floods the cell output
for referral in referrals[:5]:
    print(referral)

## Data Integrity

In [None]:
# Check that no subscription starts before its user's registration date.
# If no issue is found, no output will be displayed.
# Build the user_id -> registration_date lookup once instead of scanning the
# whole user list for every subscription.
registration_by_user = {user["user_id"]: user["registration_date"] for user in users}
for subscription in subscriptions:
    user_id = subscription["user_id"]
    subscription_date = subscription["subscription_date"]
    registration_date = registration_by_user[user_id]
    if subscription_date < registration_date:
        print(f"Subscription date {subscription_date} is before registration date {registration_date} for user {user_id}")

In [None]:
# Integrity check: a subscription must not start before its user's registration date.
def check_registration_before_subscription(users, subscriptions):
    """Print an error line for every subscription that violates the date order.

    A subscription is reported when its ``subscription_date`` is earlier than
    the ``registration_date`` of the user it belongs to. A subscription whose
    ``user_id`` matches no user is reported as well instead of aborting the
    whole check. Nothing is printed when all subscriptions are valid.

    Args:
        users: iterable of dicts providing ``user_id`` and ``registration_date``.
        subscriptions: iterable of dicts providing ``user_id`` and
            ``subscription_date``.

    Returns:
        None; findings are only printed.
    """
    # One pass over users replaces a linear scan per subscription.
    # setdefault keeps the first record for a repeated user_id.
    registration_by_user = {}
    for user in users:
        registration_by_user.setdefault(user["user_id"], user["registration_date"])

    for subscription in subscriptions:
        user_id = subscription["user_id"]
        if user_id not in registration_by_user:
            print(f"Error: Subscription references unknown user_id {user_id}")
            continue

        registration_date = registration_by_user[user_id]
        subscription_date = subscription["subscription_date"]

        if subscription_date < registration_date:
            print(f"Error: Subscription date {subscription_date} is before registration date {registration_date} for user_id {user_id}")

# Generate 10 synthetic users under a separate name so the main `users` /
# `subscriptions` datasets created earlier in the notebook are not overwritten
sample_users = generate_users(10)

# Generate subscriptions for the sample users
sample_subscriptions = generate_subscriptions(sample_users)

# Check each subscription to ensure registration date is before or equal to subscription date
check_registration_before_subscription(sample_users, sample_subscriptions)

# If no errors are printed, all sampled users have valid subscription dates.

In [None]:
# Show a short preview instead of dumping the entire referral list
referrals[:10]