In [None]:
# Run once to install Jupyter and ipykernel
# %pip install ipykernel jupyter

In [None]:
# Only run first time to install dependencies
# %pip install -r requirements.txt

# Problem definition and data overview
**Objective:** Evaluate the A/B test to determine which variant (A or B) yields a higher session-based conversion rate (i.e. sessions leading to a purchase).

**Datasets Overview:**
- **events.csv**: Contains event-level data (session_id, user_id, variant, timestamp, event_type): one row per user interaction, so a single session can span several rows.
- **users.csv**: Contains user demographics (user_id, variant, country, device, age_group) that can help segment and explain behavior.

## Key Metrics
- **Primary Metrics:**
    - **Overall Session Conversion Rate**
    
        $\text{Conversion Rate} = \frac{\text{Number of Sessions with a Purchase Event}}{\text{Total Sessions}}$

    - **Conversion Rate per Variant:**
    Compute conversion rates separately for Variant A and Variant B.

    - **Percentage Difference:**

        $\text{Percentage Difference} = \frac{\text{Conversion Rate}_B - \text{Conversion Rate}_A}{\text{Conversion Rate}_A} \times 100$

- **Secondary Metrics:**
    - **Interaction-Based Impact:** Assess if specific events (e.g., “add to cart”, “page view”) correlate with higher conversion rates.
    - **Demographic Impact:** Analyze whether age, country, or device type significantly affect conversion rates.

## Data prep & cleaning

In [6]:
# Import required libraries
import pandas as pd

# Read the two raw inputs from the local data/ directory:
# the event log (events.csv) and the user table (users.csv)
df_events = pd.read_csv('data/events.csv')
df_users = pd.read_csv('data/users.csv')

# Preview the top of each frame as a quick sanity check of columns and values
for preview_title, preview_frame in (
    ("Events DataFrame (first 5 rows):", df_events),
    ("Users DataFrame (first 5 rows):", df_users),
):
    print(preview_title)
    print(preview_frame.head(), "\n")

Events DataFrame (first 5 rows):
   session_id  user_id    variant            timestamp          event_type
0     1055627     3283  variant_b  2023-01-16 19:00:00           page_view
1     1042101      606  variant_b  2023-01-20 17:00:00           page_view
2     1042101      606  variant_b  2023-01-20 17:00:10  purchase_completed
3     1042101      606  variant_b  2023-01-20 17:00:28    purchase_started
4     1052179     1048  variant_a  2023-01-02 09:00:00           page_view 

Users DataFrame (first 5 rows):
   user_id    variant  country   device age_group
0        1  variant_b  Germany  desktop     25-34
1        2  variant_a  Iceland  desktop     45-54
2        3  variant_a   France  desktop     35-44
3        4  variant_b  Iceland   tablet     35-44
4        5  variant_a  Iceland   mobile     45-54 



In [None]:
# ----------------------------
# 2. Data Cleaning
# ----------------------------
# Parse the raw 'timestamp' strings in events into pandas datetimes
df_events['timestamp'] = df_events['timestamp'].pipe(pd.to_datetime)
print("Converted 'timestamp' to datetime.\n")

# Report per-column null counts for both frames before anything is removed
events_null_counts = df_events.isna().sum()
users_null_counts = df_users.isna().sum()
print("Missing values in events:")
print(events_null_counts, "\n")
print("Missing values in users:")
print(users_null_counts, "\n")

# Discard every row that has a missing value in any column.
# Note: depending on the project, imputing may be preferable to dropping.
df_events = df_events.dropna(axis=0, how='any')
df_users = df_users.dropna(axis=0, how='any')
print("Dropped rows with missing values (if any).\n")

# Count rows that exactly repeat an earlier row (all columns equal)
duplicates_events = df_events.duplicated(keep='first').sum()
duplicates_users = df_users.duplicated(keep='first').sum()
print(f"Found {duplicates_events} duplicate rows in events.")
print(f"Found {duplicates_users} duplicate rows in users.\n")

# Keep only the first occurrence of each repeated row
df_events = df_events.drop_duplicates(keep='first')
df_users = df_users.drop_duplicates(keep='first')
print("Removed duplicate rows (if any).\n")

# ----------------------------
# 3. Merge Datasets
# ----------------------------
# Attach user attributes to every event. A 'left' join keeps all events.
# validate='many_to_one' makes pandas raise MergeError if a user_id occurs more
# than once in users; full-row de-duplication does not guarantee that, and a
# repeated user_id would silently multiply event rows and inflate counts.
# Both inputs carry a 'variant' column, so the result contains 'variant_x'
# (from events) and 'variant_y' (from users); the suffixes are spelled out
# here (they equal the pandas defaults) so the column names are not a surprise.
merged_df = pd.merge(
    df_events,
    df_users,
    on='user_id',
    how='left',
    suffixes=('_x', '_y'),
    validate='many_to_one',
)

# Quick check of the merged DataFrame
print("Merged DataFrame (first 5 rows):")
print(merged_df.head(), "\n")

# A/B sanity check: an event logged under a different variant than the one
# recorded for its user would contaminate the comparison between variants.
# Events without a matching user row (variant_y is NaN) are excluded here;
# they show up in the missing-value summary below instead.
variant_mismatch = (
    merged_df['variant_y'].notna()
    & (merged_df['variant_x'] != merged_df['variant_y'])
)
print(f"Events whose variant differs from the user's assigned variant: {variant_mismatch.sum()}\n")

# Optional: Verify if any rows in the merged DataFrame lack user info
missing_user_info = merged_df.isnull().sum()
print("Missing values in the merged DataFrame:")
print(missing_user_info)