In [187]:
import os
import random
import string
from datetime import datetime, timedelta

import numpy as np
import pandas as pd

In [188]:
# Constants
TEST_START = datetime(2025, 7, 21)
TEST_END = datetime(2025, 8, 4)
TOTAL_RECORDS = 26_000 * 14  # 26K per day * 14 days = 364,000

# For reproducibility: seed both generators used by this notebook
# (the stdlib `random` module and NumPy's legacy global RNG)
random.seed(2)
np.random.seed(2)

print(f"Generating {TOTAL_RECORDS:,} records for Jooble A/B test...")

Generating 364,000 records for Jooble A/B test...


In [206]:
# Helper functions
def generate_user_id():
    """Return a synthetic user ID of the form ``u_<8 random digits>``.

    The digits are drawn with the stdlib ``random`` module, so the result
    depends on that module's current seed/state.
    """
    digits = random.choices(string.digits, k=8)
    return "u_" + "".join(digits)

def generate_session_id(user_id, session_num, timestamp):
    """Build a session ID of the form ``s_<YYYYMMDD>_<user_id_suffix>_<session_num>``.

    Args:
        user_id: User ID containing at least one underscore; the piece right
            after the first underscore is used as the suffix.
        session_num: Ordinal of the session for this user.
        timestamp: Object exposing ``.date()`` (e.g. a ``datetime``); only the
            calendar day ends up in the ID.

    Returns:
        The formatted session ID string.
    """
    id_parts = user_id.split('_')
    user_suffix = id_parts[1]
    day_stamp = timestamp.date().strftime("%Y%m%d")
    return "s_{}_{}_{}".format(day_stamp, user_suffix, session_num)

def weighted_choice(choices, weights):
    """
    Select one item from ``choices`` with probability proportional to ``weights``.

    A position is drawn with NumPy's global RNG and used to index ``choices``,
    so the same generator is used for every element type (numbers, strings,
    tuples, dicts, ...) and the element is returned exactly as stored in
    ``choices`` rather than converted to a NumPy scalar.

    Args:
        choices: Non-empty indexable sequence of candidate items.
        weights: One non-negative number per item. They are treated as
            relative weights and normalised here, so they do not have to sum
            to exactly 1.

    Returns:
        The selected element of ``choices``.

    Raises:
        IndexError: If ``choices`` is empty.
        ValueError: If the number of weights differs from the number of
            choices, or the weights are negative, non-finite, or all zero.
    """
    if len(choices) == 0:
        raise IndexError("Cannot choose from an empty sequence")

    probabilities = np.asarray(weights, dtype=float)
    if probabilities.shape != (len(choices),):
        raise ValueError("weights must contain exactly one value per choice")

    total = probabilities.sum()
    if not np.isfinite(total) or total <= 0 or (probabilities < 0).any():
        raise ValueError("weights must be non-negative and have a positive, finite sum")

    # Draw a position instead of the item itself: passing a list of tuples or
    # other structured items to np.random.choice makes NumPy build a
    # multi-dimensional array and raise ValueError.
    index = np.random.choice(len(choices), p=probabilities / total)
    return choices[int(index)]


In [190]:
# Country distribution (based on Jooble's actual presence), written as
# country code -> share of users; insertion order defines the list order below
country_shares = {
    'PL': 0.12, 'IT': 0.12, 'DE': 0.11, 'UA': 0.10, 'ES': 0.09,
    'PT': 0.07, 'TR': 0.06, 'HU': 0.05, 'ID': 0.05, 'FR': 0.04,
    'UK': 0.04, 'RS': 0.035, 'US': 0.03, 'CA': 0.02, 'NL': 0.02,
    'SE': 0.015, 'CZ': 0.01, 'RO': 0.01, 'IN': 0.005, 'SK': 0.005,
}
countries = list(country_shares)
country_weights = list(country_shares.values())

# Device distribution (job boards are mostly mobile): mobile-first for job searching
device_shares = {'mobile': 0.55, 'desktop': 0.40, 'tablet': 0.05}
devices = list(device_shares)
device_weights = list(device_shares.values())

In [191]:
# Generate base users with sessions
print("Generating users and sessions...")
users_data = []  # not used in this cell; kept in case another cell reads it
user_sessions = {}  # user_id -> {'variant', 'num_sessions', 'country', 'device'}

# Control group: possible sessions per user and their probabilities
control_sessions_dist = [1, 2, 3, 4, 5, 6]
control_sessions_weights = [0.65, 0.22, 0.08, 0.03, 0.015, 0.005]

# Treatment group: possible sessions per user and their probabilities
treatment_sessions_dist = [1, 2, 3, 4, 5, 6]
treatment_sessions_weights = [0.52, 0.28, 0.12, 0.05, 0.02, 0.01]

# Estimate unique users (accounting for multiple sessions).
# The expected sessions per user of both groups are averaged with equal
# weight, matching the 50/50 variant split used in the loop below.
avg_sessions_control = sum(s * w for s, w in zip(control_sessions_dist, control_sessions_weights))
avg_sessions_treatment = sum(s * w for s, w in zip(treatment_sessions_dist, treatment_sessions_weights))
avg_sessions_total = (avg_sessions_control + avg_sessions_treatment) / 2

estimated_unique_users = int(TOTAL_RECORDS / avg_sessions_total)

# Generate users and their sessions
for _ in range(estimated_unique_users):
    # IDs are random, so two draws can produce the same value. Redraw until
    # the ID is unused; otherwise the later user would silently overwrite the
    # earlier one and the dict would end up with fewer users than intended.
    user_id = generate_user_id()
    while user_id in user_sessions:
        user_id = generate_user_id()

    variant = weighted_choice(['control', 'treatment'], [0.5, 0.5])

    # Determine number of sessions for this user from its group's distribution
    if variant == 'control':
        num_sessions = weighted_choice(control_sessions_dist, control_sessions_weights)
    else:
        num_sessions = weighted_choice(treatment_sessions_dist, treatment_sessions_weights)

    # Country and device are fixed per user and reused for all of their sessions
    user_sessions[user_id] = {
        'variant': variant,
        'num_sessions': num_sessions,
        'country': weighted_choice(countries, country_weights),
        'device': weighted_choice(devices, device_weights)
    }

Generating users and sessions...


In [None]:
# Generate all records
print("Generating session records...")
data = []
record_count = 0

# Length of the test window in whole days. TEST_END is treated as exclusive:
# the last day that can receive a session is the day before TEST_END.
test_days = (TEST_END - TEST_START).days

# Probability that a session starts in each hour of the day (index = hour 0-23)
hourly_weights = [0.02, 0.01, 0.01, 0.01, 0.01, 0.02, 0.04, 0.05,
                  0.06, 0.07, 0.08, 0.09, 0.08, 0.07, 0.06, 0.05,
                  0.05, 0.05, 0.04, 0.03, 0.03, 0.03, 0.02, 0.02]

# Possible values for the per-session count metrics
count_values = [0, 1, 2, 3, 4, 5, 6, 7]

# Session duration buckets in minutes. Both ends are inclusive when sampling,
# so a boundary value (2, 5, 10, ...) can come from either adjacent bucket.
duration_ranges = [(0, 2), (2, 5), (5, 10), (10, 20), (20, 40), (40, 60)]

# Per-variant probability tables, each aligned with the value list it is used with
variant_params = {
    'control': {
        'return_gap': [0.26, 0.74],      # P(next visit in 1-7 days), P(8+ days)
        'apply_click': [0.916, 0.084],   # P(0), P(1)
        'jobs_applied': [0.916, 0.050, 0.020, 0.008, 0.004, 0.0015, 0.0004, 0.0001],
        'job_saves': [0.68, 0.15, 0.08, 0.04, 0.025, 0.015, 0.007, 0.003],
        'email_signup': [0.875, 0.125],  # P(0), P(1)
        'create_cv': [0.935, 0.065],     # P(0), P(1)
        'duration': [0.35, 0.28, 0.20, 0.12, 0.04, 0.01],
    },
    'treatment': {
        'return_gap': [0.35, 0.65],
        'apply_click': [0.876, 0.124],
        'jobs_applied': [0.876, 0.058, 0.038, 0.02, 0.005, 0.002, 0.0008, 0.0002],
        'job_saves': [0.61, 0.16, 0.11, 0.055, 0.035, 0.020, 0.008, 0.002],
        'email_signup': [0.82, 0.18],
        'create_cv': [0.918, 0.082],
        'duration': [0.28, 0.26, 0.22, 0.16, 0.07, 0.01],
    },
}

for user_id, user_info in user_sessions.items():
    if record_count >= TOTAL_RECORDS:
        break

    # Resolve the variant once per user, before anything reads it, so the
    # follow-up session logic never depends on a value left over from an
    # earlier iteration or cell.
    variant = user_info['variant']
    # Anything that is not 'control' uses the treatment tables
    params = variant_params['control'] if variant == 'control' else variant_params['treatment']

    user_test_start_date = None  # Date of the user's first session
    last_session_timestamp = None  # Timestamp of the user's most recent session

    for session_num in range(1, user_info['num_sessions'] + 1):
        if record_count >= TOTAL_RECORDS:
            break

        # Pick the calendar day of this session
        if session_num == 1:
            # First session: random day within test period
            days_offset = np.random.randint(0, test_days)
            base_date = TEST_START
        else:
            # Subsequent sessions: count whole days from MIDNIGHT of the last
            # session's day. Starting from the full timestamp would stack the
            # new time-of-day on top of the previous one, skewing the hourly
            # distribution and pushing sessions past TEST_END.
            base_date = last_session_timestamp.replace(hour=0, minute=0, second=0, microsecond=0)
            max_days_remaining = (TEST_END - base_date).days - 1
            if max_days_remaining <= 0:
                break  # Can't add more sessions within test period
            if max_days_remaining > 7:
                # Choose between a return within a week and a later return,
                # then pick a day uniformly inside the chosen range
                gap_range = weighted_choice([(1, 7), (8, max_days_remaining)], params['return_gap'])
                days_offset = np.random.randint(gap_range[0], gap_range[1] + 1)
            else:
                days_offset = np.random.randint(1, max_days_remaining + 1)

        # Pick the time of day (realistic hourly distribution)
        hours_offset = np.random.choice(24, p=hourly_weights)
        minutes_offset = np.random.randint(0, 60)
        seconds_offset = np.random.randint(0, 60)

        timestamp = base_date + timedelta(days=days_offset, hours=hours_offset,
                                          minutes=minutes_offset, seconds=seconds_offset)

        # Test start date (when user first entered test): set on the first
        # session and reused for all of the user's later sessions
        if session_num == 1:
            user_test_start_date = timestamp.date()
        test_start_date = user_test_start_date
        last_session_timestamp = timestamp

        session_id = generate_session_id(user_id, session_num, timestamp)

        # Primary conversion
        apply_click_conversion = weighted_choice([0, 1], params['apply_click'])

        # jobs_applied_count is drawn independently and then forced to zero
        # when there was no apply click.
        # NOTE(review): because the two draws are independent, most converting
        # sessions still end up with jobs_applied_count == 0 - confirm this is intended.
        jobs_applied_count = weighted_choice(count_values, params['jobs_applied'])
        if jobs_applied_count > 0 and apply_click_conversion == 0:
            jobs_applied_count = 0

        # Secondary metrics
        job_saves_count = weighted_choice(count_values, params['job_saves'])
        email_signup = weighted_choice([0, 1], params['email_signup'])
        create_cv = weighted_choice([0, 1], params['create_cv'])

        # Session duration: pick a bucket, then a whole minute inside it
        selected_range = weighted_choice(duration_ranges, params['duration'])
        session_duration_min = np.random.randint(selected_range[0], selected_range[1] + 1)

        # return_visit_7d: a non-first session at most 7 calendar days after
        # the previous one
        return_visit_7d = 1 if (session_num != 1) and (days_offset <= 7) else 0

        # Append record
        data.append({
            'user_id': user_id,
            'session_id': session_id,
            'timestamp': timestamp,
            'test_start_date': test_start_date,
            'variant': variant,
            'country_code': user_info['country'],
            'device_type': user_info['device'],
            'apply_click_conversion': apply_click_conversion,
            'jobs_applied_count': jobs_applied_count,
            'job_saves_count': job_saves_count,
            'email_signup': email_signup,
            'create_cv': create_cv,
            'session_duration_min': session_duration_min,
            'return_visit_7d': return_visit_7d
        })

        record_count += 1

        if record_count % 50000 == 0:
            print(f"Generated {record_count:,} records...")

Generating session records...
Generated 50,000 records...
Generated 100,000 records...
Generated 150,000 records...
Generated 200,000 records...
Generated 250,000 records...
Generated 300,000 records...


In [328]:
# Create DataFrame
print("Creating DataFrame...")

# Build the frame from the collected record dicts, order the rows
# chronologically, and renumber the index from 0 after the sort
df = (
    pd.DataFrame(data)
    .sort_values('timestamp')
    .reset_index(drop=True)
)

Creating DataFrame...


In [None]:
# Convert data types
# test_start_date is stored as plain date objects by the generator; make it a datetime column
df['test_start_date'] = pd.to_datetime(df['test_start_date'])

# Store these integer metrics with pandas' nullable 32-bit integer dtype
for int_column in ('return_visit_7d', 'session_duration_min', 'jobs_applied_count'):
    df[int_column] = df[int_column].astype("Int32")

In [None]:
# Save to CSV
# The destination defaults to the original author's local path. Set the
# JOOBLE_AB_TEST_CSV environment variable to write somewhere else, which lets
# the notebook run on machines where that drive/folder does not exist.
output_csv_path = os.environ.get(
    'JOOBLE_AB_TEST_CSV',
    r'D:\High-usage\Data Science\Data Analyst Path\Projects\Analytical Projects\Jooble\AB Test Results\AB Test Projects\ML Search Result Ranking\jooble_ml_ab_test_data_generation.csv',
)

df.to_csv(
    output_csv_path,
    index=False,
    encoding='utf-8'
)

# Report the file name actually written rather than a hard-coded one
print(f"Data generation complete. Saved as '{os.path.basename(output_csv_path)}'.")