In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from collections import defaultdict, Counter, defaultdict
import random
import warnings
warnings.filterwarnings("ignore")

# We can also use Snowpark for our analyses!
from snowflake.snowpark.context import get_active_session
session = get_active_session()

In [None]:
# =====================================================================================
# CONFIG / PARAMETERS (TUNE THESE)
# =====================================================================================

# Number of synthetic customers generated by default.
NUM_CUSTOMERS = 500
# Default date window handed to generate_synthetic_events (start_date / end_date).
START_DATE = datetime(2025, 10, 1)
END_DATE = datetime(2025, 11, 30)
CONVERSION_PROB = 0.25  # probability a customer converts in synthetic data

# Maps each raw event type to the journey state used by the Markov model.
# "purchase" is the only event type that maps to the "Conversion" state.
EVENT_STATE_MAPPING = {
    "email_open": "Email",
    "paid_search_click": "Paid_Search",
    "social_ad_click": "Social",
    "organic_search": "Organic",
    "homepage_visit": "Homepage",
    "product_page_view": "Product_Page",
    "pricing_page_view": "Pricing_Page",
    "cart_add": "Cart",
    "purchase": "Conversion",
    "support_contact": "Support",
}

# Terminal states of the chain. "Conversion" is listed first; keep it there for
# any code that reads the absorbing states positionally.
ABSORBING_STATES = ["Conversion", "Null"]
# Synthetic state prepended to every customer path.
START_STATE = "Start"

# 

In [None]:
# =====================================================================================
# STEP 1 – GENERATE SYNTHETIC EVENT DATA
# =====================================================================================

def generate_synthetic_events(
    num_customers=NUM_CUSTOMERS,
    start_date=START_DATE,
    end_date=END_DATE,
    conversion_prob=CONVERSION_PROB,
    event_state_mapping=EVENT_STATE_MAPPING,
):
    """
    Generate a reproducible synthetic event log for customer-journey analysis.

    Each customer gets between 1 and 10 timestamps, an awareness event as the
    first touch, a run of mid-funnel events, and (for converters with more than
    one timestamp) a final ``"purchase"`` event. Events are paired with the
    sorted timestamps in order, so the purchase is always the customer's last
    event.

    Parameters
    ----------
    num_customers : int
        Number of customers to simulate. IDs are ``CUST_0001``, ``CUST_0002``, ...
    start_date, end_date : datetime
        Date window for the simulation. The window is split in half at day
        granularity: a customer's first possible day is drawn from the first
        half and each event's day offset from the second half, so every event
        falls on a calendar day between ``start_date`` and ``end_date``
        (the random hour/minute may place it later on the ``end_date`` day).
    conversion_prob : float
        Probability that a customer is flagged as a converter.
    event_state_mapping : dict[str, str]
        Maps event types to journey states; must contain every event type
        emitted here.

    Returns
    -------
    pandas.DataFrame
        Columns ``customer_id``, ``event_time``, ``event_type``, ``state``,
        sorted by ``customer_id`` then ``event_time``. The columns are present
        even when ``num_customers`` is 0.

    Raises
    ------
    ValueError
        If ``end_date`` is earlier than ``start_date``.
    KeyError
        If an emitted event type is missing from ``event_state_mapping``.

    Notes
    -----
    - Both ``random`` and ``numpy.random`` are re-seeded with 42 on every call,
      which resets the *global* RNG state.
    - Mid-funnel events are drawn with relative weights 0.4 / 0.5 / 0.3 / 0.2
      (homepage, product page, pricing page, cart) plus a fallback bucket that
      picks ``email_open`` or ``support_contact``. The earlier implementation
      compared a [0, 1) draw against cumulative thresholds that summed to 1.4,
      which made ``cart_add`` and the fallback unreachable. The fallback weight
      of 0.1 is a choice made here; tune it as needed.
    - Because the journey list is zipped with the timestamps, a customer with
      ``n >= 2`` timestamps who does not convert ends up with ``n - 1`` events.
      A customer with a single timestamp never gets a purchase event.
    """
    total_days = (end_date - start_date).days
    if total_days < 0:
        raise ValueError("end_date must not be earlier than start_date")
    # First half of the window: when a customer may start.
    # Second half: how far after that start an event may land.
    start_window_days = total_days // 2
    event_window_days = total_days - start_window_days

    np.random.seed(42)
    random.seed(42)

    awareness_events = [
        "email_open",
        "paid_search_click",
        "social_ad_click",
        "organic_search",
    ]
    # ``None`` marks the fallback bucket, resolved to a concrete event below.
    mid_funnel_events = [
        "homepage_visit",
        "product_page_view",
        "pricing_page_view",
        "cart_add",
        None,
    ]
    mid_funnel_weights = [0.4, 0.5, 0.3, 0.2, 0.1]
    fallback_events = ["email_open", "support_contact"]
    columns = ["customer_id", "event_time", "event_type", "state"]

    all_events = []
    for i in range(1, num_customers + 1):
        customer_id = f"CUST_{str(i).zfill(4)}"
        num_events = random.randint(1, 10)

        customer_start = start_date + timedelta(
            days=random.randint(0, start_window_days)
        )
        timestamps = sorted(
            customer_start
            + timedelta(
                days=random.randint(0, event_window_days),
                hours=random.randint(0, 23),
                minutes=random.randint(0, 59),
            )
            for _ in range(num_events)
        )

        converted = random.random() < conversion_prob

        # Build a plausible journey: awareness -> mid-funnel -> optional purchase.
        journey_events = [random.choice(awareness_events)]
        for _ in range(max(num_events - 2, 0)):
            picked = random.choices(mid_funnel_events, weights=mid_funnel_weights)[0]
            if picked is None:
                picked = random.choice(fallback_events)
            journey_events.append(picked)

        if converted and num_events > 1:
            journey_events.append("purchase")

        # zip stops at the shorter sequence, so surplus timestamps are dropped.
        for ts, event_type in zip(timestamps, journey_events):
            all_events.append(
                {
                    "customer_id": customer_id,
                    "event_time": ts,
                    "event_type": event_type,
                    "state": event_state_mapping[event_type],
                }
            )

    # Passing ``columns`` keeps the schema intact when no events were produced.
    events_df = pd.DataFrame(all_events, columns=columns).sort_values(
        ["customer_id", "event_time"]
    )
    events_df["event_time"] = pd.to_datetime(events_df["event_time"])
    return events_df


# =====================================================================================
# STEP 2 – BUILD CUSTOMER PATHS
# =====================================================================================

def build_customer_paths(events_df, start_state=START_STATE):
    """
    Turn an event log into one state path per customer plus a summary table.

    For every ``customer_id`` group the observed ``state`` values are taken in
    row order, ``start_state`` is put in front, and ``"Null"`` is added at the
    end unless the last observed state is ``"Conversion"``.

    Parameters
    ----------
    events_df : pandas.DataFrame
        Event log with at least ``customer_id`` and ``state`` columns. Rows of
        each customer are used in the order they appear, so sort the frame
        chronologically beforehand (e.g. by ``["customer_id", "event_time"]``).
    start_state : str, optional
        Label placed at the front of every path (default: ``START_STATE``).

    Returns
    -------
    paths : list[list[str]]
        One state sequence per customer, in ``groupby`` order.
    paths_df : pandas.DataFrame
        One row per customer with the columns:

        - ``customer_id``: group key.
        - ``path_length``: number of entries in the path, including
          ``start_state`` and the trailing ``"Null"`` if present.
        - ``path``: the path joined with ``" → "``.
        - ``converted``: ``True`` when the last observed state is
          ``"Conversion"``.
        - ``num_states``: number of distinct states in the path other than
          ``start_state``, ``"Null"`` and ``"Conversion"``.

    Raises
    ------
    KeyError
        If ``customer_id`` or ``state`` is missing from ``events_df``.

    Examples
    --------
    >>> df = pd.DataFrame({
    ...     "customer_id": [1, 1, 2],
    ...     "state": ["Email", "Conversion", "Social"],
    ... })
    >>> paths, paths_df = build_customer_paths(df, start_state="Start")
    >>> paths
    [['Start', 'Email', 'Conversion'], ['Start', 'Social', 'Null']]
    >>> paths_df["converted"].tolist()
    [True, False]
    """
    # States that do not count as intermediate journey steps.
    non_journey_states = {start_state, "Null", "Conversion"}

    paths = []
    summaries = []
    for customer, customer_events in events_df.groupby("customer_id"):
        observed = customer_events["state"].tolist()
        did_convert = observed[-1] == "Conversion"

        full_path = [start_state, *observed]
        if not did_convert:
            # Non-converting journeys end in the "Null" absorbing state.
            full_path.append("Null")
        paths.append(full_path)

        summary = {
            "customer_id": customer,
            "path_length": len(full_path),
            "path": " → ".join(full_path),
            "converted": did_convert,
            "num_states": len(set(full_path) - non_journey_states),
        }
        summaries.append(summary)

    return paths, pd.DataFrame(summaries)

# =====================================================================================
# STEP 3 – COUNT TRANSITIONS & BUILD TRANSITION MATRIX
# =====================================================================================

def count_transitions(paths):
    """
    Tally how often each consecutive pair of states occurs across all paths.

    Parameters
    ----------
    paths : list of list of str
        One state sequence per customer.

    Returns
    -------
    transition_counts : dict of tuple[str, str] to int
        Plain ``dict`` mapping ``(from_state, to_state)`` to the number of times
        that step was observed, in order of first occurrence.
    all_states : set of str
        Every state that appears in any path.
    transition_records : list of dict
        The same counts as a list of ``{"from", "to", "count"}`` dicts.

    Notes
    -----
    - The tuple keys of ``transition_counts`` cannot be serialized to JSON
      directly; ``transition_records`` is the JSON-friendly form.
    - A path with fewer than two states adds its states to ``all_states`` but
      contributes no transitions.
    """
    pair_counts = Counter()
    seen_states = set()

    for journey in paths:
        seen_states.update(journey)
        # Pair every state with its successor.
        pair_counts.update(zip(journey, journey[1:]))

    records = []
    for (origin, target), occurrences in pair_counts.items():
        records.append({"from": origin, "to": target, "count": occurrences})

    return dict(pair_counts), seen_states, records

def build_transition_matrix(transition_counts, all_states, absorbing_states=ABSORBING_STATES, smoothing=0.0):
    """
    Build a row-normalized Markov transition probability matrix.

    A square matrix over ``all_states`` is initialized with ``smoothing`` in
    every cell, observed counts are added, and each row is divided by its sum.
    Finally every state in ``absorbing_states`` that is present in the matrix
    gets its row replaced by zeros with a 1 on the diagonal.

    Parameters
    ----------
    transition_counts : dict[tuple[str, str], int]
        Mapping from ``(from_state, to_state)`` to an observed count, usually
        produced by ``count_transitions(paths)``.
    all_states : set[str] or list[str]
        The states to include. Duplicates are dropped and the remaining states
        are sorted to define the row/column order.
    absorbing_states : Iterable[str], optional
        States to make absorbing. Defaults to ``ABSORBING_STATES``. States not
        present in ``all_states`` are skipped.
    smoothing : float, optional
        Baseline added to every cell before counts are added. ``0.0`` means no
        smoothing.

    Returns
    -------
    trans_mat : pandas.DataFrame
        Float matrix indexed and labelled by the sorted states. Rows of
        non-absorbing states sum to 1 unless their raw row sum was zero.

    Notes
    -----
    - A non-absorbing state with no outgoing counts and ``smoothing == 0.0``
      has a zero row sum, so its normalized row is all ``NaN``. Use a positive
      ``smoothing`` or fill the ``NaN`` values afterwards if that matters.
    - A transition whose source or target is not in ``all_states`` is ignored.

    Examples
    --------
    >>> transition_counts = {
    ...     ("Start", "Email"): 3,
    ...     ("Email", "Conversion"): 2,
    ...     ("Start", "Social"): 1,
    ...     ("Social", "Null"): 1,
    ... }
    >>> all_states = {"Start", "Email", "Social", "Conversion", "Null"}
    >>> trans_mat = build_transition_matrix(transition_counts, all_states,
    ...                                     absorbing_states=["Conversion", "Null"],
    ...                                     smoothing=0.0)
    >>> list(trans_mat.columns)
    ['Conversion', 'Email', 'Null', 'Social', 'Start']
    >>> trans_mat.loc["Start"].tolist()
    [0.0, 0.75, 0.0, 0.25, 0.0]
    >>> trans_mat.loc["Conversion"].tolist()
    [1.0, 0.0, 0.0, 0.0, 0.0]
    """
    # De-duplicate so label-based access below always addresses a single cell.
    states = sorted(set(all_states))
    known = set(states)

    # float() keeps the matrix floating point even for an integer ``smoothing``.
    mat = pd.DataFrame(float(smoothing), index=states, columns=states)

    for (from_s, to_s), c in transition_counts.items():
        # Skip transitions that reference states outside the requested universe.
        if from_s in known and to_s in known:
            mat.at[from_s, to_s] += c

    trans_mat = mat.div(mat.sum(axis=1), axis=0)

    for s in absorbing_states:
        if s in known:
            trans_mat.loc[s, :] = 0.0
            trans_mat.loc[s, s] = 1.0

    return trans_mat

# =====================================================================================
# STEP 4 – CONVERSION PROBABILITIES (ABSORBING MARKOV CHAIN)
# =====================================================================================

def calculate_conversion_probabilities(transition_matrix, absorbing_states=ABSORBING_STATES):
    """
    Compute, for every state, the probability of eventually reaching conversion.

    The transition matrix is treated as an absorbing Markov chain. With ``Q``
    the transient-to-transient block and ``r`` the column of one-step
    probabilities from each transient state into the conversion state, the
    absorption probabilities ``b`` satisfy ``(I - Q) b = r``. The system is
    solved directly; if ``I - Q`` is singular the Moore-Penrose pseudoinverse
    is used instead.

    Parameters
    ----------
    transition_matrix : pandas.DataFrame
        Transition probability matrix whose index and columns are states.
    absorbing_states : Iterable[str], optional
        The absorbing states. Defaults to ``ABSORBING_STATES``. The conversion
        state is ``"Conversion"`` when it is listed (in any position);
        otherwise the first listed state is used, which matches the previous
        positional behavior.

    Returns
    -------
    conv_probs : pandas.Series
        Conversion probability per state, sorted in descending order:

        - transient states: the solved absorption probability;
        - the conversion state: 1.0;
        - every other listed absorbing state: 0.0;
        - ``"Conversion"`` (1.0) and ``"Null"`` (0.0) are always present, even
          if they are not in the matrix, as in earlier versions.

    Raises
    ------
    KeyError
        If a listed absorbing state is not a column of ``transition_matrix``.
    IndexError
        If ``absorbing_states`` is empty.

    Notes
    -----
    - Every index entry not listed in ``absorbing_states`` is treated as
      transient.
    - When there are no transient states the result contains only the
      absorbing entries; no error is raised.
    - ``NaN`` entries in the matrix propagate into the result.

    Examples
    --------
    >>> states = ["Start", "Email", "Social", "Conversion", "Null"]
    >>> tm = pd.DataFrame(0.0, index=states, columns=states)
    >>> tm.loc["Start", ["Email", "Social"]] = [0.7, 0.3]
    >>> tm.loc["Email", ["Conversion", "Null"]] = [0.6, 0.4]
    >>> tm.loc["Social", ["Email", "Null"]] = [0.5, 0.5]
    >>> tm.loc["Conversion", "Conversion"] = 1.0
    >>> tm.loc["Null", "Null"] = 1.0
    >>> conv_probs = calculate_conversion_probabilities(
    ...     tm, absorbing_states=["Conversion", "Null"]
    ... )
    >>> round(float(conv_probs["Start"]), 2)
    0.51
    >>> conv_probs["Conversion"] == 1.0 and conv_probs["Null"] == 0.0
    True
    """
    # Materialize once: the argument may be a one-shot iterable.
    absorbing = list(absorbing_states)
    missing = [s for s in absorbing if s not in transition_matrix.columns]
    if missing:
        raise KeyError(f"absorbing states missing from transition matrix columns: {missing}")

    # Pick the conversion column by label, not by position in the list.
    target = "Conversion" if "Conversion" in absorbing else absorbing[0]

    trans_states = [s for s in transition_matrix.index if s not in absorbing]

    if trans_states:
        # Q[i, j]: probability of moving from transient state i to transient state j.
        Q = transition_matrix.loc[trans_states, trans_states].to_numpy(dtype=float)
        # r[i]: probability of moving from transient state i straight to conversion.
        r = transition_matrix.loc[trans_states, target].to_numpy(dtype=float)
        system = np.eye(len(trans_states)) - Q
        try:
            # Solving (I - Q) b = r avoids forming the full inverse.
            probs = np.linalg.solve(system, r)
        except np.linalg.LinAlgError:
            # I - Q is singular: fall back to the pseudoinverse.
            probs = np.linalg.pinv(system) @ r
    else:
        probs = np.empty(0, dtype=float)

    conv_probs = pd.Series(probs, index=trans_states, dtype=float)
    for s in absorbing:
        conv_probs[s] = 1.0 if s == target else 0.0
    # Historical convention: these two labels are always set explicitly.
    conv_probs["Conversion"] = 1.0
    conv_probs["Null"] = 0.0
    return conv_probs.sort_values(ascending=False)

# =====================================================================================
# STEP 5 – REMOVAL EFFECT ATTRIBUTION
# =====================================================================================

def calculate_removal_effect(transition_matrix, conversion_probs, paths, absorbing_states=ABSORBING_STATES):
    """
    Estimate each channel's contribution to conversions via a removal heuristic.

    For every channel (any state that is neither ``START_STATE`` nor absorbing)
    a copy of the transition matrix is modified in two steps:

    1. Inbound probability is redistributed: for each other row, the
       probability of moving to the channel is set to 0 and spread over the
       row's remaining columns in proportion to their current values. If the
       row had no other outgoing probability, the removed mass goes to
       ``"Null"`` (when that column exists) so the row still sums to 1.
    2. The channel's own row is zeroed and, when a ``"Null"`` column exists,
       given probability 1 of moving to ``"Null"``.

    Conversion probabilities are then recomputed with
    ``calculate_conversion_probabilities`` and the channel's score is the drop
    in total expected conversions.

    Parameters
    ----------
    transition_matrix : pandas.DataFrame
        Transition probability matrix with identical index and column labels.
        It is not modified; each channel works on a copy.
    conversion_probs : pandas.Series
        Baseline conversion probability per state, typically the output of
        ``calculate_conversion_probabilities`` for the same matrix. Must
        contain every state that occurs as ``path[1]`` in ``paths``.
    paths : list[list[str]]
        Customer paths. Only ``path[1]`` (the first state after the start
        state) is used; paths with fewer than two entries are ignored.
    absorbing_states : Iterable[str], optional
        Absorbing states of the chain. Defaults to ``ABSORBING_STATES``.

    Returns
    -------
    attribution_pct : pandas.Series
        Share of each channel in percent, sorted in descending order. Negative
        raw scores are clipped to 0 before normalizing, so the shares sum to
        100 whenever at least one score is positive; otherwise all are 0.0.
    scores : dict[str, float]
        Raw score per channel, ``baseline_total - new_total``. Negative values
        are kept here for diagnostics.

    Raises
    ------
    KeyError
        If an entry state from ``paths`` is missing from ``conversion_probs``
        or from the recomputed probabilities.

    Notes
    -----
    - Total expected conversions are the sum, over customers, of the
      conversion probability of their entry state ``path[1]``. Entry states
      are counted once up front, not re-scanned for every channel.
    - After step 1 nothing flows into the removed channel any more, so step 2
      only affects customers whose entry state is the removed channel.
    """
    absorbing = list(absorbing_states)
    excluded = {START_STATE, *absorbing}
    channels = [s for s in transition_matrix.index if s not in excluded]

    # How many customers enter the journey through each state.
    entry_counts = Counter(path[1] for path in paths if len(path) > 1)

    def expected_conversions(probs):
        # Sum of entry-state conversion probabilities over all customers.
        return sum(probs[state] * n for state, n in entry_counts.items())

    baseline_total = expected_conversions(conversion_probs)
    scores = {}

    for ch in channels:
        tm = transition_matrix.copy()
        remain_states = [s for s in tm.columns if s != ch]
        has_null = "Null" in remain_states

        # Redistribute transitions going to this channel.
        for from_s in tm.index:
            if from_s == ch:
                continue
            removed = tm.loc[from_s, ch]
            if removed <= 0:
                continue
            tm.loc[from_s, ch] = 0.0
            remain = tm.loc[from_s, remain_states]
            remain_sum = remain.sum()
            if remain_sum > 0:
                tm.loc[from_s, remain_states] = remain + removed * (remain / remain_sum)
            elif remain_sum == 0 and has_null:
                # Nowhere else to go: send the mass to Null so the row still
                # sums to 1 (the conversion probability stays 0 either way).
                tm.loc[from_s, "Null"] = removed

        # Channel becomes a dead end leading to Null.
        tm.loc[ch, :] = 0.0
        if "Null" in tm.columns:
            tm.loc[ch, "Null"] = 1.0

        new_conv_probs = calculate_conversion_probabilities(tm, absorbing)
        scores[ch] = baseline_total - expected_conversions(new_conv_probs)

    # Clip negatives for the percentage view only; raw scores stay untouched.
    positive_scores = {k: max(v, 0.0) for k, v in scores.items()}
    pos_total = sum(positive_scores.values())

    if pos_total > 0:
        pct = {k: v / pos_total * 100.0 for k, v in positive_scores.items()}
    else:
        pct = {k: 0.0 for k in scores}

    attribution_pct = pd.Series(pct, dtype=float).sort_values(ascending=False)
    return attribution_pct, scores  # keep raw scores (including negatives) for diagnostics

# =====================================================================================
# STEP 6 – SIMPLE EDA & SAVES
# =====================================================================================

def basic_eda(events_df, paths_df, transition_matrix, conversion_probs, attribution_pct):
    """
    Print a short exploratory summary of the attribution run.

    Reports the number of events, customers and distinct states, the number of
    non-zero cells in the transition matrix, the overall conversion rate, and
    then prints the state frequencies, the per-state conversion probabilities
    and the channel attribution percentages.

    Parameters
    ----------
    events_df : pandas.DataFrame
        Event log with ``customer_id`` and ``state`` columns.
    paths_df : pandas.DataFrame
        Per-customer summary with a boolean ``converted`` column.
    transition_matrix : pandas.DataFrame
        Square transition matrix; cells equal to 0 are counted as "no transition".
    conversion_probs, attribution_pct : pandas.Series
        Printed as-is.

    Returns
    -------
    None
    """
    event_count = len(events_df)
    customer_count = events_df["customer_id"].nunique()
    state_count = events_df["state"].nunique()

    # Cells that are exactly zero; everything else counts as a transition.
    zero_cells = (transition_matrix == 0).sum().sum()
    transition_count = len(transition_matrix) ** 2 - zero_cells

    conversion_rate = paths_df["converted"].mean()

    print("\n=== BASIC EDA ===")
    print(f"Events: {event_count}, Customers: {customer_count}")
    print(f"States: {state_count}, Unique transitions: {transition_count}")
    print(f"Conversion rate: {conversion_rate:.2%}")

    sections = (
        ("\nState distribution:", events_df["state"].value_counts()),
        ("\nConversion probability by state:", conversion_probs),
        ("\nChannel attribution (%):", attribution_pct),
    )
    for heading, payload in sections:
        print(heading)
        print(payload)

def save_all_outputs(
    events_df,
    paths_df,
    transition_matrix,
    conversion_probs,
    transition_counts,
    connectivity_df,
    attribution_pct,
    attribution_raw,
):
    """
    Write the analysis artefacts as seven numbered CSV files in the current directory.

    Files written:
    ``01_customer_events.csv``, ``02_customer_paths.csv``,
    ``03_transition_matrix.csv``, ``04_conversion_probabilities.csv``,
    ``05_channel_attribution.csv``, ``06_transition_counts.csv`` and
    ``07_state_connectivity.csv``.

    Parameters
    ----------
    events_df, paths_df : pandas.DataFrame
        Written unchanged, without the index.
    transition_matrix : pandas.DataFrame
        Written with its index; also used to derive the connectivity table.
    conversion_probs : pandas.Series
        Conversion probability per state; a percentage column is added.
    transition_counts : dict[tuple[str, str], int]
        ``(from_state, to_state)`` counts, written sorted by count descending.
    connectivity_df : Any
        Ignored. The connectivity table is always recomputed from
        ``transition_matrix``; the parameter is kept so existing callers
        keep working.
    attribution_pct : pandas.Series
        Attribution percentage per channel.
    attribution_raw : Mapping[str, float]
        Raw score per channel; must contain every label in ``attribution_pct``.

    Returns
    -------
    None
    """
    # 01-03: frames that are written as they are.
    events_df.to_csv("01_customer_events.csv", index=False)
    paths_df.to_csv("02_customer_paths.csv", index=False)
    transition_matrix.to_csv("03_transition_matrix.csv")

    # 04: conversion probabilities, also expressed in percent.
    prob_values = conversion_probs.values
    conversion_table = pd.DataFrame(
        {
            "State": conversion_probs.index,
            "Conversion_Probability": prob_values,
            "Conversion_Percentage": prob_values * 100,
        }
    )
    conversion_table.to_csv("04_conversion_probabilities.csv", index=False)

    # 05: attribution share next to the raw removal-effect score.
    channel_labels = attribution_pct.index
    raw_scores = [attribution_raw[label] for label in channel_labels]
    attribution_table = pd.DataFrame(
        {
            "Channel": channel_labels,
            "Attribution_Percentage": attribution_pct.values,
            "Raw_Score": raw_scores,
        }
    )
    attribution_table.to_csv("05_channel_attribution.csv", index=False)

    # 06: one row per observed transition, most frequent first.
    count_rows = [(src, dst, n) for (src, dst), n in transition_counts.items()]
    counts_table = pd.DataFrame(
        count_rows, columns=["From_State", "To_State", "Count"]
    ).sort_values("Count", ascending=False)
    counts_table.to_csv("06_transition_counts.csv", index=False)

    # 07: number of incoming / outgoing edges with positive probability.
    has_edge = transition_matrix > 0
    inbound = has_edge.sum(axis=0).values
    outbound = has_edge.sum(axis=1).values
    connectivity_table = pd.DataFrame(
        {
            "State": transition_matrix.index,
            "In_Degree": inbound,
            "Out_Degree": outbound,
            "Total_Connections": inbound + outbound,
        }
    )
    connectivity_table.to_csv("07_state_connectivity.csv", index=False)

    print("Saved CSV files 01–07 in current directory.")

# =====================================================================================
# STEP 7 – INTERACTIVE HELPERS
# =====================================================================================

def analyze_customer(customer_id, paths_df, events_df):
    """
    Print the journey summary and event timeline of one customer.

    The first row of ``paths_df`` matching ``customer_id`` supplies the path
    string, path length, conversion flag and number of unique intermediate
    states. The customer's rows of ``events_df`` are then listed in ascending
    ``event_time`` order, one line per event.

    Parameters
    ----------
    customer_id : str
        Identifier of the customer to show.
    paths_df : pandas.DataFrame
        Per-customer summaries with the columns ``customer_id``, ``path``,
        ``path_length``, ``converted`` and ``num_states`` (the shape returned
        by ``build_customer_paths``).
    events_df : pandas.DataFrame
        Event log with the columns ``customer_id``, ``event_time``,
        ``event_type`` (str) and ``state``.

    Returns
    -------
    None
        Everything is written to stdout. When ``customer_id`` has no row in
        ``paths_df`` a single line ``No data for {customer_id}`` is printed.

    Examples
    --------
    >>> analyze_customer("CUST_0001", paths_df, events_df)  # doctest: +SKIP
    <BLANKLINE>
    === CUSTOMER CUST_0001 ===
    Path: Start → Email → Conversion
    Path length: 3, Converted: True, Unique states: 1
    <BLANKLINE>
    Event timeline:
      2025-10-19 09:15:00 | email_open           | Email
      2025-10-19 09:30:00 | purchase             | Conversion
    """
    matches = paths_df[paths_df["customer_id"] == customer_id]
    if matches.empty:
        print(f"No data for {customer_id}")
        return

    summary = matches.iloc[0]
    print(f"\n=== CUSTOMER {customer_id} ===")
    print(f"Path: {summary['path']}")
    print(f"Path length: {summary['path_length']}, Converted: {summary['converted']}, Unique states: {summary['num_states']}")

    is_this_customer = events_df["customer_id"] == customer_id
    timeline = events_df[is_this_customer].sort_values("event_time")

    print("\nEvent timeline:")
    for event in timeline.to_dict("records"):
        when = event["event_time"]
        kind = event["event_type"]
        state = event["state"]
        # Event type is left-aligned in a 20-character column.
        print(f"  {when} | {kind:20s} | {state}")

def state_transition_analysis(from_state, transition_matrix):
    """
    Print where journeys go next after leaving a given state.

    Looks up the `from_state` row of the transition matrix, orders its
    entries from most to least likely, and writes every strictly positive
    probability to stdout as a percentage.

    Parameters
    ----------
    from_state : str
        Label of the state whose outgoing transitions are listed. It is
        looked up in `transition_matrix.index`.
    transition_matrix : pandas.DataFrame
        Transition probabilities with source states on the index and
        destination states on the columns. Typically produced by
        `build_transition_matrix()`.

    Returns
    -------
    None
        Output goes to stdout only. When `from_state` is absent from the
        index, a single "not in matrix" message is printed and nothing else.

    Notes
    -----
    - Entries that are not greater than zero are omitted.
    - A blank line is printed before the header.
    - Destination labels are left-aligned in a 15-character field and
      probabilities are rendered as percentages with two decimals.
    - Meant for interactive exploration; no structured data is returned.

    Examples
    --------
    >>> state_transition_analysis("Email", transition_matrix)
    === Transitions from Email ===
      → Conversion     : 60.00%
      → Null           : 40.00%
    """
    known_states = transition_matrix.index
    if from_state not in known_states:
        print(f"State '{from_state}' not in matrix.")
        return

    print(f"\n=== Transitions from {from_state} ===")

    # Sort first, then drop non-positive entries, so the display order is
    # exactly the descending order of the full row.
    ranked = transition_matrix.loc[from_state].sort_values(ascending=False)
    reachable = ranked[ranked > 0]
    for target, prob in reachable.items():
        pct = prob * 100
        print(f"  → {target:15s}: {pct:5.2f}%")


In [None]:
# =====================================================================================
# MAIN EXECUTION
# =====================================================================================

# End-to-end run of the attribution pipeline. Each step feeds the next, so the
# order matters. Every name assigned here is a notebook-level global; later
# cells read them directly (e.g. `events_df`).
if __name__ == "__main__":
    print("Generating synthetic data...")
    events_df = generate_synthetic_events()
    print("Building customer paths...")
    paths, paths_df = build_customer_paths(events_df)
    print("Counting transitions...")
    transition_counts, all_states, transition_records = count_transitions(paths) # transition_counts exists but does not render in notebook
    print("Building transition matrix...")
    transition_matrix = build_transition_matrix(transition_counts, all_states)
    print("Computing conversion probabilities...")
    conversion_probs = calculate_conversion_probabilities(transition_matrix)
    print("Computing removal-effect attribution...")
    attribution_pct, attribution_raw = calculate_removal_effect(
        transition_matrix, conversion_probs, paths
    )

    # Connectivity summary: for each state, count how many distinct states
    # have a non-zero probability of leading into it (column-wise count) and
    # how many it can lead to (row-wise count).
    in_deg = (transition_matrix > 0).sum(axis=0)
    out_deg = (transition_matrix > 0).sum(axis=1)
    # NOTE(review): `in_deg` is indexed by the matrix columns while "State" and
    # `out_deg` follow the row index; `.values` pairs them by position. This
    # assumes rows and columns list the states in the same order — confirm
    # against build_transition_matrix().
    # `connectivity_df` is not used again in this cell; presumably it is kept
    # as a global for later inspection.
    connectivity_df = pd.DataFrame(
        {
            "State": transition_matrix.index,
            "In_Degree": in_deg.values,
            "Out_Degree": out_deg.values,
            "Total_Connections": in_deg.values + out_deg.values,
        }
    )

    basic_eda(events_df, paths_df, transition_matrix, conversion_probs, attribution_pct)

    # Example interactive usage
    print("\nExample: analyze one customer:")
    analyze_customer("CUST_0002", paths_df, events_df)

    print("\nExample: transitions from Email:")
    state_transition_analysis("Email", transition_matrix)

    print("\nDone.")

In [None]:
# Preview the first 10 rows of the events table created in the main execution cell.
events_df.head(10)

## calculate_conversion_probabilities
What does this function do in plain business terms?  

It answers:  
**“If a customer starts in a certain state (like seeing an ad, visiting a webpage, or opening an email), what is the probability they will eventually convert?”**

Think of your customer journey as a map of possible steps (states). Some steps are transient (e.g., browsing, engaging with ads), and some are absorbing (end states like Conversion or Drop-off/Null). Once a customer reaches an absorbing state, they stay there.

## Business interpretation

- High conversion probability states = strong leverage points. Invest more here.
- Low conversion probability states = weak links. Maybe redesign or reduce spend.
- Journey insights: If “Social” has low conversion probability but often leads to “Email”, it might still be valuable as an assist channel.

### What does “Email = 0.232569” mean?
It means:  
**If a customer is currently in the “Email” step of the journey, there is about a 23% chance they will eventually convert (reach the Conversion state), considering all possible paths they might take from here.**

## Why This Method Beats First-Touch or Last-Touch Attribution
Traditional attribution models like first-touch (crediting the first interaction) or last-touch (crediting the final interaction) oversimplify the customer journey. They ignore the fact that most conversions happen after multiple interactions across channels.
Our approach uses an absorbing Markov chain, which:

- Considers the entire journey, not just the start or end.
- Accounts for all possible paths and loops a customer might take before converting or dropping off.
- Produces probabilities based on actual behavior, rather than arbitrary credit rules.

Business impact:

- You see which channels truly influence conversion—even if they’re not the first or last touch.
- You can identify assist channels that play a critical role in moving customers forward.
- Budget decisions become data-driven, focusing on steps that increase the likelihood of conversion.

# Channel Attribution Percent:
- These percentages are each channel’s share of the total positive impact on conversions, as measured by a “removal effect.”
- In plain terms: If we took a channel away and re-routed the journey based on observed behavior, how much would total conversions drop? The bigger the drop, the higher the attribution %.
- This is a **causal‑ish**, scenario-based measure of incremental contribution: it re-computes expected conversions after “removing” a channel (with probabilities re-allocated) and measures the drop.

## What `attribution_pct` Is
`attribution_pct` is a pandas Series that gives the percentage contribution of each channel (non-start, non-absorbing state) to total expected conversions, based on removal effects. It’s computed by:

1. Measuring how much total expected conversions would drop if a channel were removed (its removal effect score).
2. Normalizing those scores to percentages so they sum to 100% (when the total effect is positive).

This is a widely used approach in Markov chain attribution: **a channel’s importance is proportional to how much conversions would decrease if the channel didn’t exist in the journey.**