In [1]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
print("Environment working")


Environment working


In [2]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine, text
from datetime import datetime, timedelta
import random
import uuid

engine = create_engine("postgresql+psycopg2://marketing_user:marketing_pass@localhost:5432/marketing_attribution")
print("Connected")

Connected


In [3]:
with engine.begin() as conn:
    conn.execute(text("TRUNCATE TABLE web_events"))
    conn.execute(text("TRUNCATE TABLE conversions"))
    conn.execute(text("TRUNCATE TABLE campaign_spend"))
print("Tables cleared")


Tables cleared


In [4]:
np.random.seed(42)
random.seed(42)

channels = ["Paid Search", "Social", "Email", "Display", "Organic"]
start_date = datetime(2025, 10, 1)
days = 90

base_spend = {
    "Paid Search": 1600,
    "Social": 1100,
    "Email": 220,
    "Display": 850,
    "Organic": 0
}

ctr_range = {
    "Paid Search": (0.02, 0.05),
    "Social": (0.008, 0.02),
    "Email": (0.015, 0.04),
    "Display": (0.002, 0.008),
    "Organic": (0.02, 0.06)
}

rows = []
for i in range(days):
    date = (start_date + timedelta(days=i)).date()

    weekday_factor = 0.9 if (start_date + timedelta(days=i)).weekday() >= 5 else 1.0

    for ch in channels:
        spend = max(0, base_spend[ch] * weekday_factor + np.random.normal(0, 180))
        spend = round(spend, 2)

        if ch == "Organic":
            impressions = int(90000 * weekday_factor + np.random.normal(0, 8000))
        else:
            impressions = int(spend * random.uniform(110, 220))

        ctr = random.uniform(*ctr_range[ch])
        clicks = int(impressions * ctr)

        rows.append([date, ch, f"{ch}_Campaign", spend, max(impressions,0), max(clicks,0)])

campaign_spend_df = pd.DataFrame(rows, columns=["date","channel","campaign","spend","impressions","clicks"])
campaign_spend_df.head()


Unnamed: 0,date,channel,campaign,spend,impressions,clicks
0,2025-10-01,Paid Search,Paid Search_Campaign,1689.41,304663,6321
1,2025-10-01,Social,Social_Campaign,1075.11,150787,1610
2,2025-10-01,Email,Email_Campaign,336.58,64290,2051
3,2025-10-01,Display,Display_Campaign,1124.15,233980,590
4,2025-10-01,Organic,Organic_Campaign,0.0,88126,3249


In [6]:
users = 6000                      # number of unique users over 90 days
avg_sessions_per_user = 2.2       # average number of sessions per user
conversion_rate = 0.045           # overall conversion rate across users

touch_probs = {
    "Paid Search": 0.22,
    "Social": 0.26,
    "Email": 0.10,
    "Display": 0.18,
    "Organic": 0.24
}

step_rates = {
    "Paid Search": {"v2c": 0.10, "c2p": 0.40},
    "Social":      {"v2c": 0.05, "c2p": 0.30},
    "Email":       {"v2c": 0.12, "c2p": 0.45},
    "Display":     {"v2c": 0.03, "c2p": 0.22},
    "Organic":     {"v2c": 0.08, "c2p": 0.35},
}

source_medium_map = {
    "Paid Search": "paid search / cpc",
    "Social": "social / paid",
    "Email": "email / newsletter",
    "Display": "display / cpm",
    "Organic": "organic / search"
}

def weighted_choice(d):
    keys = list(d.keys())
    weights = list(d.values())
    return random.choices(keys, weights=weights, k=1)[0]

web_rows = []
conv_rows = []

user_ids = [f"u{i}" for i in range(1, users+1)]
order_counter = 200000

for u in user_ids:
    n_sessions = max(1, int(np.random.poisson(avg_sessions_per_user)))

    user_converts = (random.random() < conversion_rate)

    session_days = np.random.choice(range(days), size=n_sessions, replace=True)
    session_days.sort()

    conversion_session_idx = n_sessions - 1 if user_converts else None

    for s_idx in range(n_sessions):
        session_id = str(uuid.uuid4())[:8]
        day_offset = int(session_days[s_idx])
        base_ts = start_date + timedelta(days=day_offset, hours=random.randint(8, 23), minutes=random.randint(0,59))

        ch = weighted_choice(touch_probs)
        sm = source_medium_map[ch]

        # always at least a visit
        web_rows.append([u, session_id, base_ts, "visit", sm])

        # add_to_cart?
        if random.random() < step_rates[ch]["v2c"]:
            web_rows.append([u, session_id, base_ts + timedelta(minutes=random.randint(2, 10)), "add_to_cart", sm])

            # purchase within session?
            made_purchase = False
            if random.random() < step_rates[ch]["c2p"]:
                purchase_ts = base_ts + timedelta(minutes=random.randint(8, 25))
                web_rows.append([u, session_id, purchase_ts, "purchase", sm])
                made_purchase = True

                # if this is the user's converting session, create conversion record
                if user_converts and s_idx == conversion_session_idx:
                    order_counter += 1
                    # revenue distribution: lognormal-ish, varies by channel
                    channel_multiplier = {
                        "Paid Search": 1.15,
                        "Social": 0.95,
                        "Email": 1.05,
                        "Display": 0.90,
                        "Organic": 1.00
                    }[ch]
                    revenue = float(np.random.lognormal(mean=4.4, sigma=0.45) * channel_multiplier)
                    revenue = round(min(max(revenue, 15), 450), 2)
                    conv_rows.append([u, purchase_ts, f"o{order_counter}", revenue])

            # If user is marked as converter but didn’t purchase in-session,
            # we’ll still allow later session to be the converting one.
        else:
            # no cart; still possible converter later session
            pass

web_events_df = pd.DataFrame(web_rows, columns=["user_id","session_id","timestamp","event_type","source_medium"])
conversions_df = pd.DataFrame(conv_rows, columns=["user_id","conversion_time","order_id","revenue"])

len(web_events_df), len(conversions_df), web_events_df.head(), conversions_df.head()


(15232,
 4,
   user_id session_id           timestamp event_type      source_medium
 0      u1   38670bf2 2025-10-24 18:55:00      visit  paid search / cpc
 1      u1   fd41c8d4 2025-11-04 20:14:00      visit   organic / search
 2      u1   d6505d22 2025-11-07 18:47:00      visit      social / paid
 3      u2   5d4b928d 2025-10-09 12:31:00      visit   organic / search
 4      u2   33f6acf7 2025-11-03 18:55:00      visit  paid search / cpc,
   user_id     conversion_time order_id  revenue
 0     u26 2025-12-24 23:57:00  o200001    80.10
 1   u1054 2025-11-13 21:25:00  o200002   113.64
 2   u1465 2025-11-30 11:03:00  o200003    88.97
 3   u3556 2025-10-14 13:02:00  o200004    87.05)

In [7]:
from sqlalchemy import create_engine, text

engine = create_engine("postgresql+psycopg2://marketing_user:marketing_pass@localhost:5432/marketing_attribution")

# Clear existing rows
with engine.begin() as conn:
    conn.execute(text("TRUNCATE TABLE web_events;"))
    conn.execute(text("TRUNCATE TABLE conversions;"))
    conn.execute(text("TRUNCATE TABLE campaign_spend;"))

# Insert fresh
campaign_spend_df.to_sql("campaign_spend", engine, if_exists="append", index=False)
web_events_df.to_sql("web_events", engine, if_exists="append", index=False)
conversions_df.to_sql("conversions", engine, if_exists="append", index=False)

print("Inserted ")
print("campaign_spend_df:", len(campaign_spend_df))
print("web_events_df:", len(web_events_df))
print("conversions_df:", len(conversions_df))

Inserted 
campaign_spend_df: 450
web_events_df: 15232
conversions_df: 4


In [8]:
with engine.connect() as conn:
    spend_rows = conn.execute(text("SELECT COUNT(*) FROM campaign_spend")).scalar()
    event_rows = conn.execute(text("SELECT COUNT(*) FROM web_events")).scalar()
    conv_rows = conn.execute(text("SELECT COUNT(*) FROM conversions")).scalar()

print("DB counts ")
print("campaign_spend:", spend_rows)
print("web_events:", event_rows)
print("conversions:", conv_rows)

DB counts 
campaign_spend: 450
web_events: 15232
conversions: 4
