# Group 12 - Fingerhut FreshStart Customer Behavior Analysis

## Import Basic Libraries (specific libraries imported later)

In [None]:
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import mchmm as mc
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pydtmc

## EDA

In [None]:
# import datasets

fingerhut = pd.read_csv('export.csv')
event_def = pd.read_csv('event_definitions.csv')

In [None]:
# Count how many rows have event_name == "order_shipped"

len(fingerhut[fingerhut['event_name'] == 'order_shipped'])

In [None]:
len(event_def)

In [None]:
# find number of unique values in each column

fingerhut.nunique()

In [None]:
len(fingerhut)

In [None]:
# number of unique values in 'customer_id' column
print(fingerhut['customer_id'].nunique())

In [None]:
# number of unique values in 'customer_id' column
print(fingerhut['customer_id'].nunique())

# number of unique values in 'account_id' column
print(fingerhut['account_id'].nunique())

In [None]:
account_customer_grouped = fingerhut.groupby('account_id')['customer_id'].nunique()

sum(account_customer_grouped > 1)

In [None]:
customer_account_grouped = fingerhut.groupby('customer_id')['account_id'].nunique()

sum(customer_account_grouped > 1)

In [None]:
# find earliest and latest dates in 'event_timestamp' column

print(fingerhut['event_timestamp'].min())
print(fingerhut['event_timestamp'].max())

In [None]:
fingerhut['event_timestamp'] = pd.to_datetime(fingerhut['event_timestamp'])

In [None]:
print(fingerhut['event_timestamp'].groupby(pd.to_datetime(fingerhut['event_timestamp']).dt.year).agg('count'))

In [None]:
fingerhut['event_timestamp'].groupby(pd.to_datetime(fingerhut['event_timestamp']).dt.year).agg('count').plot(kind='bar')

In [None]:
print(fingerhut['event_timestamp'].groupby(pd.to_datetime(fingerhut['event_timestamp']).dt.month).agg('count'))

In [None]:
# extract number of rows for each month in 'event_timestamp' column, and create plot of results

fingerhut['event_timestamp'].groupby(pd.to_datetime(fingerhut['event_timestamp']).dt.month).agg('count').plot(kind='line')

In [None]:
# repeat above for just 2021 and 2022:

fingerhut_2021_2022 = fingerhut[(fingerhut['event_timestamp'] >= '2021-01-01') & (fingerhut['event_timestamp'] < '2023-01-01')]

print(fingerhut_2021_2022['event_timestamp'].groupby(fingerhut_2021_2022['event_timestamp'].dt.month).agg('count'))

fingerhut_2021_2022['event_timestamp'].groupby(fingerhut_2021_2022['event_timestamp'].dt.month).agg('count').plot(kind='line')
plt.xlabel('Total number of events by month (2021-2022)')


In [None]:
fingerhut_2021_2022.resample('M', on = 'event_timestamp').size().plot()
plt.xlabel('Total number of events by month (2021-2022)')

In [None]:
# Find max journey steps value for each unique customer, and store values in list

unique_accounts = fingerhut['account_id'].unique()

In [None]:
max_journey_steps = fingerhut.groupby('account_id')['journey_steps_until_end'].max()

In [None]:
# make boxplot of max journey steps values

max_journey_steps.plot(kind='box')

## Data Cleaning

In [None]:
# ignoring journey_steps_until_end, find all duplicate rows

figerhut_no_journey_steps_until_end = fingerhut.drop(columns=['journey_steps_until_end'])

duplicates = figerhut_no_journey_steps_until_end[figerhut_no_journey_steps_until_end.duplicated()]

duplicates

In [None]:
fingerhut_copy = fingerhut.copy()

In [None]:
# remove fingerhut from memory

del fingerhut

In [None]:
# finding number of customer_ids per account_id
account_customer_grouped = fingerhut_copy.groupby('account_id')['customer_id'].nunique()

# return only the accounts with more than one customer
account_customer_grouped = account_customer_grouped[account_customer_grouped > 1]

In [None]:
# finding number of account_ids per customer_id
customer_account_grouped = fingerhut_copy.groupby('customer_id')['account_id'].nunique()

# return only the customers with more than one account
customer_account_grouped = customer_account_grouped[customer_account_grouped > 1]

In [None]:
# delete journey_steps_until_end column

del fingerhut_copy['journey_steps_until_end']

In [None]:
# remove duplicate rows

fingerhut_copy = fingerhut_copy[~fingerhut_copy.duplicated()]

In [None]:
# pull out the accounts with more than one customer
# (.copy() yields an independent frame, so adding 'combined_id' to it later does not write into a slice)

multi_customer_mask = fingerhut_copy['account_id'].isin(account_customer_grouped.index)
fingerhut_many_customers = fingerhut_copy[multi_customer_mask].copy()

# pull out the customers with more than one account
# NOTE(review): a row whose account has several customers AND whose customer has several accounts
# lands in both subsets, so it shows up twice once the three frames are stacked — confirm this is intended

multi_account_mask = fingerhut_copy['customer_id'].isin(customer_account_grouped.index)
fingerhut_many_accounts = fingerhut_copy[multi_account_mask].copy()

# take out fingerhut_many_customers and fingerhut_many_accounts from fingerhut_copy

fingerhut_copy = fingerhut_copy[~(multi_customer_mask | multi_account_mask)].copy()

In [None]:
# reset index for all three dataframes

fingerhut_copy.reset_index(drop=True, inplace=True)
fingerhut_many_customers.reset_index(drop=True, inplace=True)
fingerhut_many_accounts.reset_index(drop=True, inplace=True)

In [None]:
# create new column in fingerhut_copy called 'combined_id' starting at 0 and incrementing by 1 for each new account_id

fingerhut_copy['combined_id'] = fingerhut_copy.groupby('account_id').ngroup()

In [None]:
# sort fingerhut_copy by combined_id

fingerhut_copy.sort_values(by=['combined_id'], inplace=True)

In [None]:
# create new column in fingerhut_many_customers called 'combined_id' starting at fingerhut_copy['combined_id'].max()

fingerhut_many_customers['combined_id'] = fingerhut_many_customers.groupby('account_id').ngroup() + fingerhut_copy['combined_id'].max() + 1

In [None]:
# sort fingerhut_many_customers by combined_id

fingerhut_many_customers.sort_values(by='combined_id', inplace=True)

In [None]:
# create new column in fingerhut_many_accounts called 'combined_id' starting at fingerhut_many_customers['combined_id'].max()

fingerhut_many_accounts['combined_id'] = fingerhut_many_accounts.groupby('customer_id').ngroup() + fingerhut_many_customers['combined_id'].max() + 1

In [None]:
# sort fingerhut_many_accounts by combined_id

fingerhut_many_accounts.sort_values(by='combined_id', inplace=True)

In [None]:
# append all three dataframes together
# (DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames in the same order and keeps their indices)

fingerhut_combined = pd.concat([fingerhut_copy, fingerhut_many_customers, fingerhut_many_accounts])

In [None]:
# for each combined_id, sort by event_timestamp

fingerhut_combined.sort_values(by=['combined_id', 'event_timestamp'], inplace=True)

In [None]:
# reset index of fingerhut_combined

fingerhut_combined.reset_index(drop=True, inplace=True)

In [None]:
# for every combined_id, add a column called 'journey_steps_until_end' that increments by 1 for each row
# (cumcount is 0-based, so the first event of every combined_id gets 0)
# NOTE(review): the df_dropped pipeline further down rebuilds this column with cumcount() + 1 (1-based) — confirm which convention is intended

fingerhut_combined['journey_steps_until_end'] = fingerhut_combined.groupby('combined_id').cumcount(ascending=True)

In [None]:
# export fingerhut_combined to csv

fingerhut_combined.to_csv('fingerhut_combined.csv', index=False)

In [None]:
fingerhut_combined = pd.read_csv('fingerhut_combined.csv')

In [None]:
event_definitions = pd.read_csv('event_definitions.csv')

In [None]:
# create dictionary based on 'stage' and 'event_definition_id' columns in event_definitions, using 'event_definition_id' as the key and 'stage' as the value

event_dict = event_definitions.set_index('event_definition_id')['stage'].to_dict()

# add additional key value pair
event_dict[1] = 'Promotion Created'
event_dict[24] = 'Campaignemail Clicked'

event_dict

In [None]:
# append a new column to fingerhut_combined called 'stage' that contains the value from the dictionary based on the 'ed_id' column

fingerhut_combined['stage'] = fingerhut_combined['ed_id'].map(event_dict)

fingerhut_combined

In [None]:
# count number of rows where 'stage' is NaN

fingerhut_combined['stage'].isna().sum()

In [None]:
# one row per combined_id; every indicator column starts out as NaN and is filled in by the cells below
customer_characteristics = pd.DataFrame({'combined_id': fingerhut_combined['combined_id'].unique()})

for flag_column in ('application', 'activation', 'promotion_exposure',
                    'place_order_web', 'place_order_phone', 'order_shipped'):
    customer_characteristics[flag_column] = np.nan

customer_characteristics

In [None]:
# check if there are any customers who do not have the 'Apply for Credit' stage in their journey, using groupby:

credit_applications = fingerhut_combined.groupby('combined_id')['stage'].apply(lambda x: 'Apply for Credit' in x.values)

In [None]:
sum(credit_applications)

In [None]:
# based on credit applications modify the customer_characteristics dataframe to add 0 to the 'application' column for customers who do not have the 'Apply for Credit' stage in their journey. Else add 1.

customer_characteristics['application'] = customer_characteristics['combined_id'].map(credit_applications)

customer_characteristics['application'] = customer_characteristics['application'].astype(int)

customer_characteristics

In [None]:
promotion_ids = [2, 9, 20, 21, 1, 24]

# check if there are any customers who have not been exposed to any promotions, using groupby (has at least one promotion_id as ed_id):

promotion_exposure = fingerhut_combined.groupby('combined_id')['ed_id'].apply(lambda x: any(i in promotion_ids for i in x.values))

sum(promotion_exposure)

In [None]:
# based on promotion exposure modify the customer_characteristics dataframe to add 1 to the 'promotion_exposure' column for customers who have been exposed to at least one promotion. Else add 0.

customer_characteristics['promotion_exposure'] = customer_characteristics['combined_id'].map(promotion_exposure)

customer_characteristics['promotion_exposure'] = customer_characteristics['promotion_exposure'].astype(int)

In [None]:
# CHANGING SYSTEM-WIDE TYPO

# change all values 'account_activitation' to 'account_activation' in the 'event_name' column

fingerhut_combined['event_name'] = fingerhut_combined['event_name'].replace('account_activitation', 'account_activation')

In [None]:
# check if event_name 'account_activation' (the corrected spelling) is in the journey of each customer using groupby:

account_activations = fingerhut_combined.groupby('combined_id')['event_name'].apply(lambda x: 'account_activation' in x.values)

account_activations

In [None]:
sum(account_activations)

In [None]:
# based on account_activations modify the customer_characteristics dataframe to add 0 to the 'activation' column for customers who do not have the 'account_activation' stage in their journey. Else add 1.

customer_characteristics['activation'] = customer_characteristics['combined_id'].map(account_activations)

customer_characteristics['activation'] = customer_characteristics['activation'].astype(int)

customer_characteristics

In [None]:
# check if 'place_order_phone' or 'place_order_web' is in the journey of each customer using groupby:

place_orders_web = fingerhut_combined.groupby('combined_id')['event_name'].apply(lambda x: 'place_order_web' in x.values)

place_orders_phone = fingerhut_combined.groupby('combined_id')['event_name'].apply(lambda x: 'place_order_phone' in x.values)

# a notebook cell only displays its last bare expression, so print both totals explicitly
print('customers with a web order:', sum(place_orders_web))
print('customers with a phone order:', sum(place_orders_phone))

In [None]:
# based on place_orders modify the customer_characteristics dataframe to add 0 to the 'place_order' column for customers who do not have the 'place_order_phone' or 'place_order_web' stage in their journey. Else add 1.

customer_characteristics['place_order_web'] = customer_characteristics['combined_id'].map(place_orders_web)
customer_characteristics['place_order_phone'] = customer_characteristics['combined_id'].map(place_orders_phone)

customer_characteristics['place_order_web'] = customer_characteristics['place_order_web'].astype(int)
customer_characteristics['place_order_phone'] = customer_characteristics['place_order_phone'].astype(int)

customer_characteristics

In [None]:
# check if 'order_shipped' is in the journey of each customer using groupby:

order_shipped = fingerhut_combined.groupby('combined_id')['event_name'].apply(lambda x: 'order_shipped' in x.values)

sum(order_shipped)

In [None]:
# based on order_shipped modify the customer_characteristics dataframe to add 0 to the 'order_shipped' column for customers who do not have the 'order_shipped' stage in their journey. Else add 1.

customer_characteristics['order_shipped'] = customer_characteristics['combined_id'].map(order_shipped)

customer_characteristics['order_shipped'] = customer_characteristics['order_shipped'].astype(int)

In [None]:
# export the customer_characteristics dataframe as a pickle file

customer_characteristics.to_pickle('customer_characteristics.pkl')

In [None]:
# groupby 'combined_id' and store the 'ed_id', 'stage' and 'event_timestamp' values of each customer as lists (the columns keep their original names)

fingerhut_combined_grouped = fingerhut_combined.groupby('combined_id').agg({'ed_id': list, 'stage': list, 'event_timestamp': list}).reset_index()

fingerhut_combined_grouped

In [None]:
# import customer_characteristics.pkl

customer_characteristics = pd.read_pickle('customer_characteristics.pkl')

In [None]:
# merge the fingerhut_combined_grouped dataframe with the customer_characteristics dataframe

fingerhut_combined_grouped = pd.merge(fingerhut_combined_grouped, customer_characteristics, on='combined_id')

In [None]:
# export fingerhut_combined_grouped to a pickle file (written to the current working directory)

fingerhut_combined_grouped.to_pickle('fingerhut_combined_grouped.pkl')

In [None]:
def_df = pd.read_csv("event_definitions.csv")
df = pd.read_csv("export.csv")

In [None]:
print(df.shape)

In [None]:
df['event_timestamp'] = pd.to_datetime(df['event_timestamp']) # convert to pd datetime

In [None]:
# remove duplicate rows - don't factor in journey_steps_until_end because this counts up even for duplicate rows
df_dropped = df.drop(['journey_steps_until_end'], axis = 1)
df_dropped = df_dropped.drop_duplicates()
# df_dropped = df.drop(['journey_steps_until_end'], axis = 1)
df_dropped.head(10)

In [None]:
print(df_dropped.shape)
df.shape[0] - df_dropped.shape[0] # number of duplicate rows removed ~8 million

In [None]:
df_dropped['journey_steps_until_end'] = df_dropped.groupby(['customer_id', 'account_id']).cumcount() + 1 # add journey_steps_until_end back in

In [None]:
df_dropped.reset_index(drop=True, inplace=True)

In [None]:
df_dropped.dtypes

In [None]:
# get count of account_activations for each customer/account pair
activation_counts = df_dropped.groupby(['customer_id', 'account_id'])['event_name'].apply(lambda x: x.str.count('account_activitation').sum())
activation_counts

In [None]:
# get count of account_activation occurrences
activation_counts.value_counts()

In [None]:
# just get those with multiple activations
multiple_activations = activation_counts[activation_counts > 1]
multiple_activations.value_counts()

In [None]:
mult_act_cust = list(multiple_activations.index) # list of tuples of (customer_id, account_id) with multiple activations

# create the filter for dataframe rows that correspond to customers with multiple activations
# (vectorised MultiIndex lookup instead of a row-wise apply that scanned the Python list for every row)
pair_index = pd.MultiIndex.from_frame(df_dropped[['customer_id', 'account_id']])
filt = pd.Series(~pair_index.isin(multiple_activations.index), index=df_dropped.index)
# remove customers with multiple activations; copy so that later column assignments do not target a slice
df_filtered = df_dropped[filt].copy()

In [None]:
# now make sure all remaining customer/account combinations have only 0 or 1 account_activations
df_filtered.groupby(['customer_id', 'account_id'])['event_name'].apply(lambda x: x.str.count('account_activitation').sum()).value_counts()

In [None]:
df_dropped[['customer_id', 'account_id']].drop_duplicates().shape # 1735767 unique combinations of customer/account id

In [None]:
print(2173 / 1735767) # ~0.13% (0.00125 as a fraction) of customer/account combinations would be removed if we removed those with multiple account activations
238751 / df_dropped.shape[0] # share of rows that would be dropped if we removed customers with multiple activations (0.4% of the dataset per the original note; the value is a fraction, not a percentage)
# It seems that multiple account activations just indicates multiple journeys under the same customer_id and account_id which is interesting

# Because such a small portion of the dataset has multiple activations, we will drop these rows for simplicity and for the sake of keeping the data uniform.

# We will proceed with attempting to remove "incomplete" customers in the data

In [None]:
### Before proceeding with turning data into wide format, first add 'stage' column based on definition df

stage_dict = {'Apply for Credit' : 1, 'Credit Account' : 2, 'Discover' : 3, 'Downpayment' : 4, 'First Purchase' : 5, 
              'Order Shipped' : 6, 'Prospecting' : 7} # create dict to map stage to int
def_df['stage_int'] = def_df['stage'].map(stage_dict)

In [None]:
def_df.sort_values('event_definition_id')


In [None]:
event_dict = def_df.set_index('event_definition_id')['stage'].to_dict()

# these are missing, so manually assign them
event_dict[1] = 'Promotion Created'
event_dict[24] = 'Campaignemail Clicked'

In [None]:
df_filtered['stage'] = df_filtered['ed_id'].map(event_dict)

In [None]:
df_filtered['stage'].isna().sum()

In [None]:
# Group by ('customer_id', 'account_id') and aggregate the other columns into lists
df_grouped_cust_acct = df_filtered.groupby(['customer_id', 'account_id']).agg({
    'ed_id': list,
    'event_name': list,
    'event_timestamp': list,
    'journey_steps_until_end': list,
    'stage' : list
}).reset_index()

In [None]:
# ## add columns on account activation and order status

# 1 if row activated account, 0 if not
df_grouped_cust_acct['account_activation'] = df_grouped_cust_acct['event_name'].apply(lambda x: 1 if 'account_activitation' in x else 0)

# 1 if row placed order, 0 if did not
df_grouped_cust_acct['place_order'] = df_grouped_cust_acct['event_name'].apply(lambda x: 1 if any([i in x for i in ['place_order_web', 'place_order_phone']]) else 0)

In [None]:
activated_no_order = df_grouped_cust_acct[(df_grouped_cust_acct['account_activation'] == 1) & (df_grouped_cust_acct['place_order'] == 0)][['customer_id', 'account_id']]
activated_no_order

In [None]:
### Now proceed with removing incomplete customers - those who activated within the last 60 days

cutoff_date = df_dropped['event_timestamp'].max() # extract last date in dataset

activation_events = df_filtered[df_filtered['event_name'] == 'account_activitation'] # get account activations

# Merge to keep only activation events of customer/account pairs that activated but never placed an order
df_merged = activation_events.merge(activated_no_order, on=['customer_id', 'account_id'])

# Group by 'customer_id' and 'account_id' to find the latest activation timestamp for each
activation_times = df_merged.groupby(['customer_id', 'account_id'])['event_timestamp'].max().reset_index()

# Calculate the number of days since activation for each customer/account pair
activation_times['days_since_activation'] = (cutoff_date - activation_times['event_timestamp']).dt.days

# Filter out rows where 'days_since_activation' is greater than 60
incomplete_cust_df = activation_times[activation_times['days_since_activation'] <= 60]

In [None]:
cutoff_date

In [None]:
incomplete_cust = set(zip(incomplete_cust_df['customer_id'], incomplete_cust_df['account_id']))
# vectorised (customer_id, account_id) membership test instead of a row-wise apply over every grouped row
grouped_pairs = pd.MultiIndex.from_frame(df_grouped_cust_acct[['customer_id', 'account_id']])
incomplete_pairs = pd.MultiIndex.from_frame(incomplete_cust_df[['customer_id', 'account_id']])
cleaned_df = df_grouped_cust_acct[~grouped_pairs.isin(incomplete_pairs)]

In [None]:
cleaned_df.to_pickle("cleaned_wide_format_data.pkl")

## Data Visualization

### Sankey Diagram

In [None]:
# import datasets

event_def = pd.read_csv('event_definitions.csv')
fingerhut_combined = pd.read_csv('fingerhut_combined.csv')

In [None]:
event_dict = {
    'Apply for Credit': [15, 17, 12, 14, 3, 19],
    'Account Activation': [29],
    'Fraud Review': [37],
    'Promotion and Discover': [1, 2, 9, 10, 22, 23, 20, 21, 24],
    'Downpayment': [27, 26, 8, 25],
    'Shopping': [11, 6, 4, 5],
    'Place Order': [18, 7],
    'Order Shipped': [28]
}

In [None]:
# merge event_def (event_definition_id column) with fingerhut_combined (ed_id column)

fingerhut_combined = pd.merge(fingerhut_combined, event_def, how='left', left_on='ed_id', right_on='event_definition_id')

In [None]:
# min and max values from 'combined_id' column

min_combined_id = fingerhut_combined['combined_id'].min()
max_combined_id = fingerhut_combined['combined_id'].max()

In [None]:
# draw 1000 distinct candidate combined_ids from the hard-coded range 1..1665429

import random

random.seed(12)  # fixed seed so the same ids are drawn on every run

# NOTE(review): the bounds are hard-coded instead of using min_combined_id / max_combined_id from the
# previous cell, and range(1, 1665430) can never yield id 0 — confirm this is intended
random_numbers = random.sample(range(1, 1665430), 1000)

In [None]:
len(random_numbers)

In [None]:
# keep only rows where combined_id is in random_numbers

fingerhut_small = fingerhut_combined[fingerhut_combined['combined_id'].isin(random_numbers)]

In [None]:
# groupby combined_id and only keep groups that has at least one instance of ed_id == 29 (account_activation) and does not have any instance of ed_id == 16 (application_phone_declined) or ed_id == 13 (application_web_declined)

fingerhut_active_account = fingerhut_small.groupby('combined_id').filter(lambda x: (x['ed_id'] == 29).any() & (x['ed_id'] != 16).all() & (x['ed_id'] != 13).all())

In [None]:
# only keep 'combined_id' and 'ed_id'

fingerhut_active_account = fingerhut_active_account[['combined_id', 'ed_id']]

In [None]:
# Use the event_dict to create a new column called 'event_type' in fingerhut_active_account. The value of 'event_type' should be the key of the event_dict, which has lists of ed_id as values.

fingerhut_active_account['event_type'] = fingerhut_active_account['ed_id'].map({v: k for k, l in event_dict.items() for v in l})

In [None]:
# remove ed_id column

fingerhut_active_account = fingerhut_active_account.drop(columns=['ed_id'])

# remove duplicates

fingerhut_active_account = fingerhut_active_account.drop_duplicates()

In [None]:
# for every combined_id whose last event_type is not 'Order Shipped', add one closing row with
# event_type 'No Order Made' (built with pd.concat: DataFrame.append was removed in pandas 2.0)

# last row of each journey, in the frame's current row order
last_rows = fingerhut_active_account.drop_duplicates(subset='combined_id', keep='last')
ids_without_shipment = last_rows.loc[last_rows['event_type'] != 'Order Shipped', 'combined_id']

closing_rows = pd.DataFrame({
    'combined_id': ids_without_shipment.to_numpy(),
    'event_type': 'No Order Made',
})

fingerhut_active_account = pd.concat([fingerhut_active_account, closing_rows], ignore_index=True)

# a stable sort keeps each journey's events in their original order and puts the closing row last

fingerhut_active_account = fingerhut_active_account.sort_values('combined_id', kind='stable')

# reset index

fingerhut_active_account = fingerhut_active_account.reset_index(drop=True)

In [None]:
# check for NA values

fingerhut_active_account.isna().sum()

In [None]:
fingerhut_active_account['sequence'] = fingerhut_active_account.groupby('combined_id').cumcount() + 1

In [None]:
pivot_fingerhut_active_account = fingerhut_active_account.pivot(index='combined_id', columns='sequence', values='event_type').reset_index(inplace=False)

In [None]:
pivot_fingerhut_active_account = pivot_fingerhut_active_account.fillna('')

In [None]:
def generate_sankey_chart_data(df: pd.DataFrame):
    """Turn a wide "one column per tier" table into Sankey node and link lists.

    Every column of ``df`` is one tier of the diagram. Each distinct value of a
    column becomes its own node, so the same value appearing in two columns
    yields two separate nodes. Each row contributes one link of weight 1
    between every pair of adjacent columns.

    Args:
        df: Table whose columns are the tiers, ordered left to right.

    Returns:
        A tuple ``(labels, sources, targets, weights)``: ``labels`` is a list
        of node labels in node-id order; ``sources``, ``targets`` and
        ``weights`` are equally long pandas Series with one entry per link
        (node ids for the first two, the constant 1 for ``weights``).
    """
    labels = []
    link_mappings = {}

    # Number the nodes tier by tier. The id of a node is its position in
    # ``labels``, so labels and ids are built together in a single pass
    # instead of flattening with the quadratic sum(list_of_lists, []).
    for col in df.columns:
        tier_nodes = list(df[col].unique())
        first_id = len(labels)
        link_mappings[col] = {
            node: first_id + offset for offset, node in enumerate(tier_nodes)
        }
        labels.extend(tier_nodes)

    # One (source id, target id, weight) triple per row and adjacent column pair.
    links = [
        (link_mappings[source_col][source], link_mappings[target_col][target], 1)
        for source_col, target_col in zip(df.columns[:-1], df.columns[1:])
        for source, target in zip(df[source_col], df[target_col])
    ]

    df_links = pd.DataFrame(links, columns=["source", "target", "weight"])

    return labels, df_links["source"], df_links["target"], df_links["weight"]


# Your DataFrame
# NOTE(review): after reset_index() column 0 is 'combined_id' and column 1 is sequence step 1, so
# iloc[:, 2:] also leaves the first journey step out of the diagram — confirm that is intended
df = pivot_fingerhut_active_account.iloc[:, 2:]

# Call the generate_sankey_chart_data function
labels, sources, targets, weights = generate_sankey_chart_data(df=df)


# Map colors to labels
label_colors = {
    'Promotion and Discover': 'pink',
    'Apply for Credit': 'green',
    'Shopping': 'blue',
    'Account Activation': 'cyan',
    'Fraud Review': 'brown',  # produced by event_dict (ed_id 37) but previously had no color -> KeyError
    'Downpayment': 'purple',
    'Place Order': 'yellow',
    'Order Shipped': 'orange',
    'No Order Made': 'red',
    '': 'black'  # Adjust this for labels with empty strings
}

# Any label without an explicit color falls back to grey instead of raising a KeyError
fallback_color = 'grey'

# Create a DataFrame to aggregate weights
df_links_aggregated = pd.DataFrame({'source': sources, 'target': targets, 'weight': weights})

# Aggregate weights for the same source and target pairs
df_links_aggregated = df_links_aggregated.groupby(['source', 'target'], as_index=False).agg({'weight': 'sum'})

# Create the Sankey diagram
fig = go.Figure(data=[go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=labels,
        color=[label_colors.get(label, fallback_color) for label in labels]  # Map colors based on the dictionary
    ),
    link=dict(
        source=df_links_aggregated['source'],
        target=df_links_aggregated['target'],
        value=df_links_aggregated['weight'],
        # Add labels for each link (source to target) with the total counts
        label=[f"Total Counts: {weight}" for weight in df_links_aggregated['weight']]
    )
)])

# The number of customers is read from the data (one pivot row per combined_id) rather than hard-coded
n_customers = len(df)
fig.update_layout(title_text=f"Journey flow of customers with account activation ({n_customers} random customers)", font_size=10)
fig.show()


### Markov Chains

In [None]:
pickle = pd.read_pickle('fingerhut_combined_grouped.pkl')

In [None]:
# Check whether the 'Prospecting' stage is present in any of the lists in the 'stage' column.
# The stage mapping used elsewhere in this notebook spells it 'Prospecting', so an exact match on
# lowercase 'prospecting' could never succeed; compare case-insensitively and skip non-string
# entries (e.g. NaN for events without a stage).
def _has_prospecting(stages):
    return any(isinstance(stage, str) and stage.lower() == 'prospecting' for stage in stages)

prospecting_absent = not pickle['stage'].apply(_has_prospecting).any()

if prospecting_absent:
    print("Confirmed: 'prospecting' is not present in any of the stage lists.")
else:
    print("Warning: 'prospecting' was found in one or more of the stage lists.")

In [None]:
# same lowercase file name as every other read of this file in the notebook
# (the mixed-case spelling fails on case-sensitive filesystems)
event_def = pd.read_csv('event_definitions.csv')

In [None]:
event_def['stage'].unique()

In [None]:
import random
random.seed(31524)
medium_pickle = pickle.sample(n=16000, random_state = 31524)

In [None]:
# fit one mchmm chain per sampled customer from that customer's stage sequence
markov_chains = [
    mc.MarkovChain().from_data(stage_sequence)
    for stage_sequence in medium_pickle['stage']
]

In [None]:
# collect every state that occurs in at least one fitted chain
unique_states = set()
for chain in markov_chains:  # not named `mc`: that would rebind the mchmm module alias
    unique_states.update(chain.states)

In [None]:
# Initialize an aggregated transition count matrix
n = len(unique_states)
aggregated_counts = np.zeros((n, n))

# Map each state to its index in the aggregated matrix
state_to_index = {state: i for i, state in enumerate(unique_states)}

for chain in markov_chains:
    # `chain.states` holds the chain's unique states, not the observed sequence, so walking it
    # pairwise does not count real transitions. The observed transition counts live in
    # `chain.observed_matrix` (rows = from-state, columns = to-state, ordered like `chain.states`).
    positions = [state_to_index[state] for state in chain.states]
    aggregated_counts[np.ix_(positions, positions)] += chain.observed_matrix

# Convert counts to probabilities by normalizing each row; rows without any outgoing transition
# stay NaN (as before) but no longer trigger a divide-by-zero warning
row_totals = aggregated_counts.sum(axis=1, keepdims=True)
aggregated_probs = np.divide(aggregated_counts, row_totals,
                             out=np.full_like(aggregated_counts, np.nan), where=row_totals > 0)

In [None]:
plt.figure(figsize=(12, 10))
sns.heatmap(aggregated_probs, annot=True, cmap='coolwarm', fmt=".2f",
            xticklabels=unique_states, yticklabels=unique_states)
plt.title("Aggregated Transition Probabilities")
plt.xlabel("To State")
plt.ylabel("From State")
plt.show()

In [None]:
# heatmap of the raw transition counts (whole numbers, so no decimals in the annotations)
plt.figure(figsize=(12, 10))
sns.heatmap(aggregated_counts, annot=True, cmap='coolwarm', fmt=".0f",
            xticklabels=unique_states, yticklabels=unique_states)
plt.title("Aggregated Transition Counts")
plt.xlabel("To State")
plt.ylabel("From State")
plt.show()

In [None]:
smaller_sample = pd.read_csv('smaller_sample.csv')
smaller_sample.sort_values(by=['customer_id', 'event_timestamp'], inplace=True)
unique_events = smaller_sample['event_name'].unique()
n_events = len(unique_events)

In [None]:
event_name_to_index = {event_name: index for index, event_name in enumerate(unique_events)}

In [None]:
transition_counts = np.zeros((n_events, n_events))

In [None]:
# count every consecutive (event -> next event) pair within each customer's event sequence
for _, journey in smaller_sample.groupby('customer_id'):
    codes = [event_name_to_index[name] for name in journey['event_name']]
    for src, dst in zip(codes[:-1], codes[1:]):
        transition_counts[src, dst] += 1

In [None]:
transition_probs = transition_counts / transition_counts.sum(axis=1, keepdims=True)

In [None]:
transition_probs[np.isnan(transition_probs)] = 0
np.fill_diagonal(transition_probs, 0) 

In [None]:
transition_df = pd.DataFrame(transition_probs, index=unique_events, columns=unique_events)

In [None]:
column_sums = transition_df.sum(axis=0)
zero_columns = column_sums == 0

# For columns that sum to 0, assign uniform probabilities
for col in transition_df.columns[zero_columns]:
    transition_df[col] = 1 / len(transition_df.columns)

transition_df = transition_df.div(transition_df.sum(axis=0), axis=1)

In [None]:
column_sums = transition_df.sum(axis=0)
print(column_sums)

In [None]:
names = transition_df.index.tolist()
print(names)

In [None]:
mc = pydtmc.MarkovChain(np.transpose(transition_df), names)

In [None]:
pydtmc.plot_eigenvalues(mc, dpi=300)

In [None]:
plt.figure(figsize=(15, 15))
sns.heatmap(transition_df, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Transition Matrix Heatmap')
plt.xlabel('To State')
plt.ylabel('From State')
plt.show()

## Modeling

### Logistic Regression - Promotions

In [None]:
# read in fingerhut_combined_grouped.pkl

fingerhut_combined_grouped = pd.read_pickle('../Dataset/fingerhut_combined_grouped.pkl')

In [None]:
# if either the place_order_web or place_order_phone is 1, then make new column place_order = 1, else 0

fingerhut_combined_grouped['place_order'] = np.where((fingerhut_combined_grouped['place_order_web'] == 1) | (fingerhut_combined_grouped['place_order_phone'] == 1), 1, 0)

In [None]:
# randomly sample 100000 rows from fingerhut_combined_grouped

# fingerhut_combined_grouped_sample = fingerhut_combined_grouped.sample(n=100000, random_state=0)

fingerhut_combined_grouped_sample = fingerhut_combined_grouped

In [None]:
# check how many rows have activation = 1

fingerhut_combined_grouped_sample['activation'].value_counts()

In [None]:
# check how many rows have place_order = 1 and activation = 1

ordered_activated = fingerhut_combined_grouped_sample[(fingerhut_combined_grouped_sample['place_order'] == 1) & (fingerhut_combined_grouped_sample['activation'] == 1)]
print(len(ordered_activated))

In [None]:
# check how many rows have place_order = 0 and activation = 0

no_ordered_no_activated = fingerhut_combined_grouped_sample[(fingerhut_combined_grouped_sample['place_order'] == 0) & (fingerhut_combined_grouped_sample['activation'] == 0)]
print(len(no_ordered_no_activated))

In [None]:
# check how many rows have place_order = 1 and activation = 0

ordered_no_activated = fingerhut_combined_grouped_sample[(fingerhut_combined_grouped_sample['place_order'] == 1) & (fingerhut_combined_grouped_sample['activation'] == 0)]
print(len(ordered_no_activated))

In [None]:
# check how many rows have place_order = 0 and activation = 1

no_ordered_activated = fingerhut_combined_grouped_sample[(fingerhut_combined_grouped_sample['place_order'] == 0) & (fingerhut_combined_grouped_sample['activation'] == 1)]
print(len(no_ordered_activated))

In [None]:
# sum of all the above 4 categories should equal the total number of rows (100000 only if the sampling line above is re-enabled)

len(ordered_activated) + len(no_ordered_no_activated) + len(ordered_no_activated) + len(no_ordered_activated)

In [None]:
# check how many rows have promotion_exposure = 1

fingerhut_combined_grouped_sample['promotion_exposure'].value_counts()[1]

In [None]:
# check how many rows have promotion_exposure = 1 and activation = 1

promotion_activated = fingerhut_combined_grouped_sample[(fingerhut_combined_grouped_sample['promotion_exposure'] == 1) & (fingerhut_combined_grouped_sample['activation'] == 1)]
print(len(promotion_activated))

In [None]:
# check how many rows have promotion_exposure = 1 and activation = 0

promotion_no_activated = fingerhut_combined_grouped_sample[(fingerhut_combined_grouped_sample['promotion_exposure'] == 1) & (fingerhut_combined_grouped_sample['activation'] == 0)]
print(len(promotion_no_activated))

In [None]:
promotion_ids = [2, 9, 20, 21, 1, 24]

In [None]:
# for each row, if the list object in the ed_id column contains any of the above 6 ids, then make a new column `promotion_type` that contains the list of ids that were found in the ed_id column

fingerhut_combined_grouped_sample['promotion_type'] = fingerhut_combined_grouped_sample['ed_id'].apply(lambda x: list(set(x).intersection(promotion_ids)))

In [None]:
# using the promotion_type column, create new columns for each of the 6 ids and set the value to 1 if the id is present in the promotion_type list, else 0

for i in promotion_ids:
    fingerhut_combined_grouped_sample[i] = fingerhut_combined_grouped_sample['promotion_type'].apply(lambda x: 1 if i in x else 0)

In [None]:
# promotion_y_n is 1 when at least one of the per-promotion indicator columns (2, 9, 20, 21, 1, 24) is 1, else 0

promotion_flag_columns = [2, 9, 20, 21, 1, 24]
any_promotion = (fingerhut_combined_grouped_sample[promotion_flag_columns] == 1).any(axis=1)
fingerhut_combined_grouped_sample['promotion_y_n'] = np.where(any_promotion, 1, 0)

In [None]:
fingerhut_combined_grouped_sample[[2, 9, 20, 21, 1, 24, 'promotion_y_n']]

In [None]:
# Predict the activation column from the promotion indicators:
# 80% of the rows go to the training set, 20% to the test set.

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# features: the six per-promotion indicators plus the combined promotion_y_n flag;
# the integer column labels are converted to strings so every label has the same type
X = fingerhut_combined_grouped_sample[[2, 9, 20, 21, 1, 24, 'promotion_y_n']].rename(columns=str)
y = fingerhut_combined_grouped_sample['activation']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Logistic regression on the training set with max_iter=1000, random_state=0
# and class_weight='balanced'.
logreg = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=0)
logreg.fit(X_train, y_train)

In [None]:
# Pair every feature with its fitted logistic-regression coefficient and odds ratio.
# Only the last expression of a notebook cell is displayed, so the coefficient
# ranking is printed explicitly.

feature_names = X_train.columns
feature_coefficients = dict(zip(feature_names, logreg.coef_[0]))

# Sort features by value of coefficient, in descending order
sorted_features = sorted(feature_coefficients.items(), key=lambda x: x[1], reverse=True)
print(sorted_features)

# exponentiate the coefficients to get the odds ratio
odds_ratio = {k: np.exp(v) for k, v in feature_coefficients.items()}

# order the odds ratio in descending order
sorted_odds_ratio = sorted(odds_ratio.items(), key=lambda x: x[1], reverse=True)
sorted_odds_ratio

In [None]:
# Predict activation for the held-out test set and report the share of correct predictions.

from sklearn.metrics import accuracy_score

y_pred = logreg.predict(X_test)

accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Using the test set, calculate the confusion matrix and classification report of the model.

from sklearn.metrics import confusion_matrix, classification_report

# printed explicitly: a bare expression that is not the last line of a cell is never displayed
print(confusion_matrix(y_test, y_pred))

print(classification_report(y_test, y_pred))

In [None]:
# try cross-validation in case the model is overfitting

from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(logreg, X, y, cv=10, scoring='accuracy')

# show the per-fold scores as well; a bare expression in the middle of a cell is not displayed
print(cv_scores)

# find mean of the cross-validation scores

np.mean(cv_scores)

#### Promotion Visualization

In [None]:
# keep only the rows where application = 1, activation = 1, place_order = 1
# .copy() makes the result an independent frame, so the columns added to it in later cells
# do not trigger SettingWithCopyWarning

fingerhut_combined_grouped_sample_activated_ordered = fingerhut_combined_grouped_sample[
    (fingerhut_combined_grouped_sample['application'] == 1)
    & (fingerhut_combined_grouped_sample['activation'] == 1)
    & (fingerhut_combined_grouped_sample['place_order'] == 1)
].copy()

In [None]:
apply_for_credit_ids = [3, 12, 13, 14, 15, 16, 17, 19]
account_activation_ids = [29]
place_order_ids = [7, 18]

In [None]:
# ed_id is a column containing lists of ids for each row. event_timestamp is a column containing lists of the same length as ed_id.
# For each milestone (apply for credit, account activation, place order) store, in its own column,
# the timestamp of the first event whose id belongs to that milestone's id list (None when there is no such event).

def _first_matching_timestamp(row, wanted_ids):
    """Return the timestamp of the first event in ``row`` whose id is in ``wanted_ids``, or None."""
    wanted = set(wanted_ids)
    for position, event_id in enumerate(row['ed_id']):
        if event_id in wanted:
            return row['event_timestamp'][position]
    return None


journeys = fingerhut_combined_grouped_sample_activated_ordered  # short alias for the same frame

milestone_columns = {
    'apply_for_credit_timestamp': apply_for_credit_ids,
    'account_activation_timestamp': account_activation_ids,
    'place_order_timestamp': place_order_ids,
}
for column_name, milestone_ids in milestone_columns.items():
    raw_timestamps = journeys.apply(_first_matching_timestamp, axis=1, wanted_ids=milestone_ids)
    journeys[column_name] = pd.to_datetime(raw_timestamps)

# Using the timestamps from the previous step, calculate the time (in whole days) it took for each customer
# to go from applying for credit to activating their account, and from activating their account to placing an order.

journeys['apply_for_credit_to_activation'] = (
    journeys['account_activation_timestamp'] - journeys['apply_for_credit_timestamp']
).dt.days
journeys['activation_to_place_order'] = (
    journeys['place_order_timestamp'] - journeys['account_activation_timestamp']
).dt.days



In [None]:
# extract apply_for_credit_to_activation and activation_to_place_order columns and save them to a new dataframe
# (.copy() so that adding a column to it later does not raise SettingWithCopyWarning)

fingerhut_combined_grouped_sample_activated_ordered_time = fingerhut_combined_grouped_sample_activated_ordered[
    ['apply_for_credit_to_activation', 'activation_to_place_order']
].copy()

In [None]:
# histogram of the delay, in days, between applying for credit and activating the account

import matplotlib.pyplot as plt

fingerhut_combined_grouped_sample_activated_ordered_time['apply_for_credit_to_activation'].plot.hist(bins=60)

# title and axis labels
plt.title('Days from Applying for Credit to Account Activation')
plt.xlabel('Days')
plt.ylabel('Frequency')

plt.show()

# the second histogram is currently disabled:
# fingerhut_combined_grouped_sample_activated_ordered_time['activation_to_place_order'].plot(kind='hist', bins=60)

# plt.show()

In [None]:
# copy promotion_y_n from fingerhut_combined_grouped_sample_activated_ordered into fingerhut_combined_grouped_sample_activated_ordered_time

fingerhut_combined_grouped_sample_activated_ordered_time['promotion_y_n'] = fingerhut_combined_grouped_sample_activated_ordered['promotion_y_n']

In [None]:
# drop outliers: keep only rows whose apply_for_credit_to_activation is below 600 days

fingerhut_combined_grouped_sample_activated_ordered_time = fingerhut_combined_grouped_sample_activated_ordered_time.loc[
    lambda frame: frame['apply_for_credit_to_activation'] < 600
]


In [None]:
# overlay two histograms of apply_for_credit_to_activation: promotion_y_n == 1 first, then promotion_y_n == 0

for promo_flag, legend_label in ((1, 'Promotion'), (0, 'No Promotion')):
    promo_subset = fingerhut_combined_grouped_sample_activated_ordered_time[
        fingerhut_combined_grouped_sample_activated_ordered_time['promotion_y_n'] == promo_flag
    ]
    promo_subset['apply_for_credit_to_activation'].plot(kind='hist', bins=60, alpha=0.5, label=legend_label)

plt.title('Days from Applying for Credit to Account Activation (outliers removed)')
plt.xlabel('Days')
plt.ylabel('Frequency')
plt.legend()

plt.show()

### Classification - Successful Journeys

In [None]:
df = pd.read_pickle("cleaned_wide_format_data.pkl")

In [None]:
print(df.shape)

In [None]:
df.columns

In [None]:
df.dtypes

In [None]:
# first and last event name of every journey (None when the event list is empty)
first_events = df['event_name'].map(lambda names: names[0] if names else None)
last_events = df['event_name'].map(lambda names: names[-1] if names else None)

In [None]:
first_events.value_counts().plot(kind = 'bar')
plt.title("First Event Counts")
plt.show()

In [None]:
last_events.value_counts().plot(kind = 'bar')
plt.title("Last Event Counts")
plt.show()

In [None]:
df_with_orders = df[df['event_name'].apply(lambda x: 'order_shipped' in x)]

In [None]:
print(df_with_orders.shape)
df_with_orders.head()

In [None]:
df_no_orders = df[df['event_name'].apply(lambda x: 'order_shipped' not in x)]

In [None]:
print(df_no_orders.shape)

In [None]:
def_df = pd.read_csv("event_definitions.csv")

In [None]:
stage_dict = {'Apply for Credit' : 1, 'Credit Account' : 2, 'Discover' : 3, 'Downpayment' : 4, 'First Purchase' : 5, 
              'Order Shipped' : 6, 'Prospecting' : 7}
def_df['stage_int'] = def_df['stage'].map(stage_dict)

In [None]:
stage_mapping_dict = def_df.set_index('event_definition_id')['stage_int'].to_dict()

In [None]:
# reproducible random sample of at most 500,000 journeys (the whole frame if it has fewer rows);
# the sample is pickled later, so the seed keeps that file stable between runs
sampled_df = df.sample(n=min(500000, len(df)), replace=False, random_state=42)

In [None]:
cutoff_date = pd.to_datetime('2023-09-20 12:29:58+0000', utc=True)
cutoff_date # this is like "today"

In [None]:
required_stages = {'Apply for Credit', 'First Purchase', 'Downpayment', 'Order Shipped'}
promotion_ids = [2, 9, 20, 21, 1, 24]


# Function to get the stage from the first id in the list
def get_stage_from_first_id(id_list, mapping=None):
    """Return the stage code mapped to the first id in ``id_list``.

    Args:
        id_list: Sequence of event definition ids for one journey.
        mapping: Optional id -> stage-code dictionary. When omitted, the
            module-level ``stage_mapping_dict`` is used.

    Returns:
        The mapped stage code, or 0 when ``id_list`` is empty or its first id
        has no entry in the mapping.
    """
    if mapping is None:
        mapping = stage_mapping_dict
    # an empty journey has no first id; report "no stage" instead of raising IndexError
    if len(id_list) == 0:
        return 0
    return mapping.get(id_list[0], 0)

def get_stage_from_last_id(id_list, mapping=None):
    """Return the stage code mapped to the last id in ``id_list``.

    Args:
        id_list: Sequence of event definition ids for one journey.
        mapping: Optional id -> stage-code dictionary. When omitted, the
            module-level ``stage_mapping_dict`` is used.

    Returns:
        The mapped stage code, or 0 when ``id_list`` is empty or its last id
        has no entry in the mapping.
    """
    if mapping is None:
        mapping = stage_mapping_dict
    # an empty journey has no last id; report "no stage" instead of raising IndexError
    if len(id_list) == 0:
        return 0
    return mapping.get(id_list[-1], 0)

def clean_df(df):
    """Turn a wide-format journey frame into the per-journey feature table.

    The list-valued columns ``ed_id``, ``event_timestamp`` and ``stage`` are
    summarised into scalar features, after which the raw journey columns are
    dropped. The module-level ``cutoff_date``, ``required_stages``,
    ``promotion_ids`` and ``get_stage_from_first_id`` are used.

    Args:
        df: DataFrame with one row per journey. It must contain every column
            that is read or dropped below.

    Returns:
        A new DataFrame (the argument itself is not modified) with the added
        columns ``first_event``, ``days_since_start``, ``first_stage``,
        ``first_event_month``, ``first_event_day``, ``first_event_hour``,
        ``ideal_journey`` and ``promotion_exposure``.
    """
    df = df.reset_index(drop=True)

    # built once so every row gets a set-based membership test instead of scanning the id list
    promotion_id_set = set(promotion_ids)

    df['first_event'] = df['ed_id'].apply(lambda x: x[0] if x else None)
    # whole days between the first event of the journey and cutoff_date
    df['days_since_start'] = df['event_timestamp'].apply(lambda x: (cutoff_date - x[0]).days)
    df['first_stage'] = df['ed_id'].apply(get_stage_from_first_id)

    df['first_event_month'] = df['event_timestamp'].apply(lambda x: x[0].month)
    df['first_event_day'] = df['event_timestamp'].apply(lambda x: x[0].day)
    df['first_event_hour'] = df['event_timestamp'].apply(lambda x: x[0].hour)

    # remember that ideal journey is defined as: Apply for credit > Make a first purchase > Make the down payment > Order Ships
    # ideal_journey is 1 when every stage in required_stages occurs in the journey (order is not checked)
    df['ideal_journey'] = df['stage'].apply(lambda x: int(required_stages.issubset(x)))
    # promotion_exposure is 1 when the journey contains at least one of the promotion ids
    df['promotion_exposure'] = df['ed_id'].apply(lambda x: int(not promotion_id_set.isdisjoint(x)))

    # the raw list columns and the place_order / account_activation columns are not kept
    return df.drop(
        columns=[
            'event_name',
            'event_timestamp',
            'ed_id',
            'journey_steps_until_end',
            'stage',
            'place_order',
            'account_activation',
        ]
    )

In [None]:
sampled_df = clean_df(sampled_df)
whole_sampled_df = clean_df(df)

In [None]:
sampled_df.to_pickle("updated_clustering_sample_df.pkl")

In [None]:
whole_sampled_df.isna().any() # check for NA values

In [None]:
whole_sampled_df.to_pickle(("updated_feature_engineered_data.pkl"))

In [None]:
model_df = whole_sampled_df.drop(['customer_id', 'account_id'], axis = 1)

In [None]:
plt.figure(figsize=(8, 6))
sns.heatmap(model_df.corr(), cmap='Blues', annot = True)
plt.show()

In [None]:
X = model_df.drop(['ideal_journey'], axis = 1)
y = whole_sampled_df['ideal_journey'] # we use ideal_journey rather than order_shipped etc because the two variables have a correlation of 0.99

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# scale data
# NOTE(review): the scaler is fitted on every row of X and the train/test split in a later cell
# is taken from X_scaled, so the test rows contribute to the scaling statistics — consider
# fitting the scaler on the training rows only.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# wrap the scaled values in a DataFrame again so that X's column names are kept
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.33, random_state=42)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
clf = LogisticRegression(max_iter = 10000).fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

In [None]:
print('accurary', accuracy)
print('precision', precision)
print('recall', recall)
print('f1', f1)

In [None]:
from sklearn.metrics import classification_report

In [None]:
target_names = ['unsuccessful', 'successful']
print(classification_report(y_test, y_pred, target_names=target_names))

In [None]:
### Now try with balanced data

In [None]:
clf = LogisticRegression(max_iter = 10000, class_weight = 'balanced').fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [None]:
print(classification_report(y_test, y_pred, target_names=target_names))

In [None]:
coefficients = clf.coef_[0]  
feature_importance = zip(X_train.columns, coefficients)
sorted_features = sorted(feature_importance, key=lambda x: abs(x[1]), reverse=True)
for feature, coef in sorted_features:
    print(f"{feature}: {coef}")

In [None]:
odds_ratios = np.exp(coefficients)
odds_df = pd.DataFrame({'Feature': X_train.columns, 'OddsRatio': odds_ratios})
odds_df_sorted = odds_df.sort_values(by='OddsRatio', ascending=True)

plt.figure(figsize=(8, 4))
barplot = plt.barh(odds_df_sorted['Feature'], odds_df_sorted['OddsRatio'])
for bar in barplot:
    plt.text(bar.get_width(), bar.get_y() + bar.get_height() / 2, 
             f"{bar.get_width():.2f}", va='center')

plt.xlabel('Odds Ratio')
plt.title('Sorted Feature Importance (Odds Ratios) For Logistic Regression')
plt.tight_layout() 
plt.show()

In [None]:
## Try cross fold validation in case overfitting

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

In [None]:
model = LogisticRegression(max_iter=10000)

In [None]:
# 10-fold cross-validation on the raw features. The scaler is part of the pipeline, so it is
# re-fitted on the training portion of every fold and the model sees scaled inputs, just like
# the classifier fitted on X_scaled above.
from sklearn.pipeline import make_pipeline

scaled_model = make_pipeline(StandardScaler(), model)
scores = cross_val_score(scaled_model, X, y, cv=10, scoring='accuracy') # 10 fold cv
print(f'Accuracy for each fold: {scores}')
print(f'Mean accuracy: {np.mean(scores)}')

In [None]:
## Look into other models

# first test on sample data

In [None]:
temp = sampled_df.drop(['customer_id', 'account_id'], axis = 1)

In [None]:
temp.shape

In [None]:
X_sample = temp.drop(['ideal_journey'], axis = 1)
y_sample = temp['ideal_journey']

In [None]:
scaler = StandardScaler()
X_sample_scaled = scaler.fit_transform(X_sample)
X_sample_scaled = pd.DataFrame(X_sample_scaled, columns=X_sample.columns)

In [None]:
X_train0, X_test0, y_train0, y_test0 = train_test_split(X_sample_scaled, y_sample, test_size=0.33, random_state=42)

In [None]:
clf = LogisticRegression(max_iter = 10000).fit(X_train0, y_train0)
y_pred = clf.predict(X_test0)
accuracy = accuracy_score(y_test0, y_pred)
precision = precision_score(y_test0, y_pred)
recall = recall_score(y_test0, y_pred)
f1 = f1_score(y_test0, y_pred)

In [None]:
print('accurary', accuracy)
print('precision', precision)
print('recall', recall)
print('f1', f1)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rf_classifier = RandomForestClassifier(n_estimators=100, max_depth=None, min_samples_split=2, 
                                       min_samples_leaf=2, max_features='sqrt', n_jobs=-1, 
                                       random_state=42, class_weight = 'balanced')
rf_classifier.fit(X_train, y_train)
y_pred = rf_classifier.predict(X_test)

In [None]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print('accurary', accuracy)
print('precision', precision)
print('recall', recall)
print('f1', f1)

print(classification_report(y_test, y_pred, target_names=target_names))

# setting class_weight to 'balanced' brought accuracy down from 82% to 64% but raised the other metrics significantly

In [None]:
rf_classifier = RandomForestClassifier(n_estimators=100, max_depth=None, min_samples_split=2, 
                                       min_samples_leaf=1, max_features='sqrt', n_jobs=-1, random_state=42)
# cross-validate the random forest defined on the line above
# (the previous version passed `model`, i.e. the logistic regression, by mistake)
scores = cross_val_score(rf_classifier, X_scaled, y, cv=10, scoring='accuracy')

print(f'Accuracy for each fold: {scores}')
print(f'Mean accuracy: {np.mean(scores)}')


In [None]:
y.value_counts(normalize = True)

In [None]:
X_scaled.shape

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
### Hyperparameter tuning for random forest classifier

X_sample_rf, _, y_sample_rf, _ = train_test_split(X, y, stratify=y, train_size=0.2, random_state=42)  # Sample 20% of the data

rf = RandomForestClassifier(n_jobs=-1, random_state=42, class_weight = 'balanced')
param_distributions = {'n_estimators': [100, 200], 'max_depth': [None, 10, 20],
                       'min_samples_split': [2, 5],'min_samples_leaf': [1, 2], 'max_features': ['sqrt', 'log2']}

random_search = RandomizedSearchCV(rf, param_distributions, n_iter=10, scoring='roc_auc', cv=3, verbose=2, random_state=42, n_jobs=-1)
random_search.fit(X_sample_rf, y_sample_rf)

print(f"Best parameters: {random_search.best_params_}")
print(f"Best AUC-ROC score: {random_search.best_score_}")


In [None]:
### Now use the best parameters found by the randomized search (RandomizedSearchCV) on the full dataset

rf_classifier = RandomForestClassifier(n_estimators = 200, min_samples_split = 5, min_samples_leaf = 1, max_features = 'log2', max_depth = 10,
                                       random_state=42, class_weight = 'balanced', n_jobs = -1)
rf_classifier.fit(X_train, y_train)
y_pred = rf_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print('accurary', accuracy)
print('precision', precision)
print('recall', recall)
print('f1', f1)

print(classification_report(y_test, y_pred, target_names=target_names))

In [None]:
from imblearn.over_sampling import RandomOverSampler
import tensorflow as tf
from tensorflow.keras import layers

# Hold out the test set first, so that duplicated minority rows created by the oversampler
# can never appear on both sides of the split; stratify keeps the class ratio in both parts.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)

# Perform oversampling on the training rows only to address class imbalance
# (random_state makes the resampling reproducible)
oversampler = RandomOverSampler(random_state=42)
X_resampled, y_resampled = oversampler.fit_resample(X_train, y_train)
X_train, y_train = X_resampled, y_resampled

# Define the neural network model; the input width follows the number of feature columns
model = tf.keras.Sequential([
    layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    layers.Dense(64, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model; validation uses the untouched (not oversampled) test rows
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

In [None]:
from tensorflow.keras.callbacks import ReduceLROnPlateau

# Split the original data first and oversample the training rows only: splitting the already
# oversampled data would put copies of the same minority rows into both train and test.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)
X_train, y_train = RandomOverSampler(random_state=42).fit_resample(X_train, y_train)

# Same architecture as before: two hidden layers of 64 units and a sigmoid output
model = tf.keras.Sequential([
    layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    layers.Dense(64, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

# Compile the model, this time with the adam optimizer
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Initialize the ReduceLROnPlateau callback (monitors val_loss)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10, min_lr=1e-6, verbose=1)

# Train the model with the callback; validation uses the untouched (not oversampled) test rows
history = model.fit(
    X_train, 
    y_train, 
    epochs=50, 
    batch_size=32, 
    validation_data=(X_test, y_test),
    callbacks=[reduce_lr] 
)

### Classification Visualizations

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [None]:
df = pd.read_pickle("updated_feature_engineered_data.pkl")
print(df.shape)
df.head()

In [None]:
def_df = pd.read_csv("/Users/alyssaliu/Desktop/StatsM148/Event Definitions.csv")

In [None]:
df.set_index(['customer_id', 'account_id'], inplace=True)

In [None]:
df['ideal_journey'] = df['ideal_journey'].map({0: 'unsuccessful', 1: 'successful'})


In [None]:
df['promotion_exposure'] = df['promotion_exposure'].map({0: 'no', 1: 'yes'})


In [None]:
stage_dict = {'Apply for Credit' : 1, 'Credit Account' : 2, 'Discover' : 3, 'Downpayment' : 4, 'First Purchase' : 5, 
              'Order Shipped' : 6, 'Prospecting' : 7}
stage_dict = {v:k for k,v in stage_dict.items()}

df['first_stage'] = df['first_stage'].map(stage_dict)

In [None]:
df['ideal_journey'].value_counts(normalize = True)

In [None]:
df['promotion_exposure'].value_counts(normalize = True)

In [None]:
sns.boxplot(data=df, x='ideal_journey', y='days_since_start')
plt.show()

In [None]:
sns.histplot(data=df, x='days_since_start', hue='ideal_journey', kde=True)
plt.show()

In [None]:
sns.barplot(x = 'ideal_journey', y = 'days_since_start', data = df)

In [None]:
proportions = df.groupby('ideal_journey')['promotion_exposure'].value_counts(normalize=True).rename('proportion').reset_index()
plt.figure(figsize=(8, 6))
ax = sns.barplot(x='ideal_journey', y='proportion', hue='promotion_exposure', data=proportions, errorbar = None)
ax.bar_label(ax.containers[0], fontsize=10)
plt.title('Normalized Count Plot by Target')
plt.ylabel('Proportion')
plt.show()


In [None]:
plt.figure(figsize=(12, 8))
# The hue order is fixed explicitly and the legend entries are left to seaborn, so every label
# is guaranteed to belong to its bars. Passing labels=['No', 'Yes'] to plt.legend matched labels
# to artists by position only and could swap or duplicate them.
ax = sns.countplot(x='first_stage', hue='ideal_journey', hue_order=['unsuccessful', 'successful'],
                   data=df, palette='Set2')
plt.title('Distribution of First Stages by Ideal Journey Status')
plt.xlabel('First Stage')
plt.ylabel('Count')
ax.get_legend().set_title('Ideal Journey')
plt.xticks(rotation=45)
plt.show()

In [None]:
# share of successful / unsuccessful journeys within every first stage (each row sums to 1)
proportions = df.groupby(['first_stage', 'ideal_journey']).size().unstack(fill_value=0)
proportions_normalized = proportions.div(proportions.sum(axis=1), axis=0)

# overall share of successful journeys, computed from the data instead of the hard-coded 0.183241
# NOTE(review): assumes the hard-coded value was the overall 'successful' share — confirm
overall_success_share = df['ideal_journey'].value_counts(normalize=True).get('successful', 0.0)

proportions_normalized.plot(kind='bar', stacked=True, figsize=(10, 6))
plt.title('Normalized Distribution of First Stages by Ideal Journey Status')
plt.xlabel('First Stage')
plt.ylabel('Proportion')
plt.axhline(y=overall_success_share, color='red', linestyle='--', label = 'true prop of successful/unsuccesful')
plt.legend(loc='upper right')
plt.xticks(rotation=45) 
plt.tight_layout()
plt.show()

In [None]:
df_ideal = df[df['ideal_journey'] == 'successful']
df_nonideal = df[df['ideal_journey'] == 'unsuccessful']

In [None]:
plt.figure(figsize = (6, 4))
# relative frequency of every first stage among the successful journeys
ideal_first_stage_share = df_ideal['first_stage'].value_counts(normalize=True)
ideal_first_stage_share.plot(kind='bar')
# write the value above each bar
for position, share in enumerate(ideal_first_stage_share):
    plt.text(position, share, str(round(share, 6)), ha='center', va='bottom')
plt.title("Proportion of first stage for customers with successful ideal journey")
plt.show()

In [None]:
plt.figure(figsize = (6, 4))
# relative frequency of every first stage among the unsuccessful journeys
nonideal_first_stage_share = df_nonideal['first_stage'].value_counts(normalize=True)
nonideal_first_stage_share.plot(kind='bar')

# write the value above each bar
for position, share in enumerate(nonideal_first_stage_share):
    plt.text(position, share, str(round(share, 6)), ha='center', va='bottom')
plt.title("Proportion of first stage for customers with unsuccessful ideal journey")
plt.show()

In [None]:
wide_df = pd.read_pickle("/Users/alyssaliu/Desktop/StatsM148/cleaned_wide_format_data.pkl")

In [None]:
wide_df.set_index(['customer_id', 'account_id'], inplace=True)

In [None]:
# wide_df['ideal_journey'] = df['ideal_journey']
wide_df['ideal_journey'] = wide_df.index.map(df['ideal_journey'])

In [None]:
# per-journey summary features derived from the list-valued columns
wide_df['event_count'] = wide_df['event_name'].apply(len)
wide_df['unique_event_count'] = wide_df['event_name'].apply(lambda x: len(set(x)))
wide_df['unique_stage_count'] = wide_df['stage'].apply(lambda x: len(set(x)))
wide_df['last_event'] = wide_df['event_name'].apply(lambda x: x[-1] if x else None)
wide_df['last_stage'] = wide_df['stage'].apply(lambda x: x[-1] if x else None)
wide_df['len_journey'] = wide_df['journey_steps_until_end'].apply(len)
# whole days between the first and the last event of the journey
wide_df['days_in_journey'] = wide_df['event_timestamp'].apply(lambda x: (x[-1] - x[0]).days)
# estimate for progression rate of journey (steps per day); journeys that start and end on the
# same day have days_in_journey == 0, so the rate is left as NaN instead of dividing by zero (inf)
wide_df['progression_rate'] = wide_df['len_journey'] / wide_df['days_in_journey'].replace(0, np.nan)

In [None]:
wide_df['first_purchase'] = wide_df['stage'].apply(lambda x: 1 if 'First Purchase' in x else 0)

In [None]:
event_id_dict = dict(zip(def_df['event_definition_id'], def_df['event_name']))


In [None]:
event_dict = def_df.set_index('event_definition_id')['stage'].to_dict()

# add additional key value pair
event_dict[1] = 'Promotion Created'
event_dict[24] = 'Campaignemail Clicked'

In [None]:
sns.kdeplot(data=wide_df, x='unique_event_count', hue='ideal_journey')
plt.show()

In [None]:
sns.countplot(data=wide_df, x='unique_event_count', hue='ideal_journey')
plt.tight_layout()  
plt.show()

In [None]:
ideal_df = wide_df[wide_df['ideal_journey'] == 'successful']
nonideal_df = wide_df[wide_df['ideal_journey'] == 'unsuccessful']

In [None]:
# share of successful / unsuccessful journeys within every last stage (each row sums to 1)
proportions = wide_df.groupby(['last_stage', 'ideal_journey']).size().unstack(fill_value=0)
proportions_normalized = proportions.div(proportions.sum(axis=1), axis=0)

# overall share of successful journeys, computed from the data instead of the hard-coded 0.183241
# NOTE(review): assumes the hard-coded value was the overall 'successful' share — confirm
overall_success_share = wide_df['ideal_journey'].value_counts(normalize=True).get('successful', 0.0)

proportions_normalized.plot(kind='bar', stacked=True, figsize=(10, 6))
plt.title('Normalized Distribution of Last Stages by Ideal Journey Status')
plt.xlabel('Last Stage')
plt.ylabel('Proportion')
plt.axhline(y=overall_success_share, color='red', linestyle='--', label = 'true prop of successful/unsuccesful')
plt.legend(loc='upper right')
plt.xticks(rotation=45)  
plt.tight_layout()
plt.show()

In [None]:
plt.figure(figsize = (6, 4))
# relative frequency of every last stage among the successful journeys
ideal_last_stage_share = ideal_df['last_stage'].value_counts(normalize=True)
ideal_last_stage_share.plot(kind='bar')
# write the value above each bar
for position, share in enumerate(ideal_last_stage_share):
    plt.text(position, share, str(round(share, 6)), ha='center', va='bottom')
plt.title("Proportion of last stage for customers with successful ideal journey")
plt.show()

In [None]:
plt.figure(figsize = (16, 8))
# relative frequency of every last event among the unsuccessful journeys
nonideal_last_event_share = nonideal_df['last_event'].value_counts(normalize=True)
nonideal_last_event_share.plot(kind='bar')
# write the value (two decimals) above each bar
for position, share in enumerate(nonideal_last_event_share):
    plt.text(position, share, str(round(share, 2)), ha='center', va='bottom')
plt.title("Proportion of last events for customers with unsuccessful ideal journey")
plt.show()

In [None]:
plt.figure(figsize = (8, 6))
# relative frequency of every last stage among the unsuccessful journeys
nonideal_last_stage_share = nonideal_df['last_stage'].value_counts(normalize=True)
nonideal_last_stage_share.plot(kind='bar')
# write the value above each bar
for position, share in enumerate(nonideal_last_stage_share):
    plt.text(position, share, str(round(share, 6)), ha='center', va='bottom')
plt.title("Proportion of last stage for customers with unsuccessful ideal journey")
plt.show()

In [None]:
plt.figure(figsize = (12, 8))
sns.countplot(data=wide_df, x='last_stage', hue='ideal_journey')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

In [None]:
proportions = wide_df.groupby(['last_event', 'ideal_journey']).size().unstack(fill_value=0)
proportions_normalized = proportions.div(proportions.sum(axis=1), axis=0)
proportions_normalized.plot(kind='bar', stacked=True, figsize=(10, 6))
plt.title('Normalized Distribution of Last Event by Ideal Journey Status')
plt.xlabel('Last Event')
plt.ylabel('Proportion')
plt.xticks(rotation=90) 
plt.show()

In [None]:
sns.boxplot(data=wide_df, x='ideal_journey', y='event_count')
plt.show()

In [None]:
from itertools import chain

In [None]:
# journeys whose final stage is 'First Purchase', and the distribution of their final event
last_stage_purchase = wide_df[wide_df['last_stage'] == 'First Purchase']
final_event_share = last_stage_purchase['last_event'].value_counts(normalize = True)
final_event_share.plot(kind = 'bar')
# write the value above each bar
for position, share in enumerate(final_event_share):
    plt.text(position, share, str(round(share, 6)), ha='center', va='bottom')
plt.title("Distribution of final event for customers with First Purchase as final stage")
plt.show()

In [None]:
# journeys that reach the 'First Purchase' stage at least once
# (fixes the `widf` typo, which raised NameError)
first_purchase_df = wide_df[wide_df['stage'].apply(lambda stages: 'First Purchase' in stages)]
# count the event names of those journeys only; the previous version counted every journey in
# wide_df and never used first_purchase_df
event_names = list(chain.from_iterable(first_purchase_df['event_name']))
event_name_counts = pd.Series(event_names).value_counts()
plt.figure(figsize=(10, 8))
sns.barplot(x=event_name_counts.values, y=event_name_counts.index)
plt.xlabel('Counts')
plt.ylabel('Event Names')
plt.title('Counts of Event Names Corresponding to "First Purchase" Stage')
plt.show()

In [None]:
sns.kdeplot(data=wide_df, x='len_journey', hue='ideal_journey')
plt.show()

In [None]:
sns.boxplot(data=wide_df, x='ideal_journey', y='len_journey')
plt.show()

In [None]:
wide_df['log_len_journey'] = np.log(wide_df['len_journey'] + 1) # log-transformed data
sns.boxplot(data=wide_df, x='ideal_journey', y='log_len_journey')
plt.title('Boxplot of log-transformed journey lengths')
plt.ylabel('Log of Journey Length')
plt.show()

### XGBoost & Neural Network

In [None]:
df = pd.read_pickle(("updated_feature_engineered_data.pkl"))

In [None]:
sampled_df = pd.read_pickle("updated_clustering_sample_df.pkl")

In [None]:
model_df = df.drop(['customer_id', 'account_id'], axis = 1)
X = model_df.drop(['ideal_journey'], axis = 1)
y = model_df['ideal_journey'] # we use ideal_journey rather than order_shipped etc because the two variables have a correlation of 0.99

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [None]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score

In [None]:
y_train.value_counts()

In [None]:
# weight the positive class by the ratio of label-0 rows to label-1 rows in the training set
class_counts = y_train.value_counts()
scale_pos_weight = class_counts[0] / class_counts[1] # scale weights of imbalanced data 

xgb_model = XGBClassifier(scale_pos_weight=scale_pos_weight, use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train, y_train)

# hard predictions for the report, probabilities of label 1 for the ROC AUC
y_pred = xgb_model.predict(X_test)
y_pred_proba = xgb_model.predict_proba(X_test)[:,1]

print(classification_report(y_test, y_pred))
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC AUC Score: {roc_auc}")

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

In [None]:
# Define a parameter grid to search
param_dist = {'n_estimators': randint(100, 500),'learning_rate': uniform(0.01, 0.3),'subsample': uniform(0.7, 0.3),'max_depth': randint(3, 10),
              'colsample_bytree': uniform(0.7, 0.3),'min_child_weight': randint(1, 6)}
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
random_search = RandomizedSearchCV(xgb_model, param_distributions=param_dist, n_iter=10, 
                                   scoring='roc_auc', error_score=0, verbose=3, n_jobs=-1, cv=3)
random_search.fit(X_train, y_train)
print(f"Best parameters found: {random_search.best_params_}")
print(f"Best ROC AUC found: {random_search.best_score_}")

### Kmeans

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the feature-engineered dataset used for clustering and report its size.
feature_data_path = "/Users/alyssaliu/Desktop/StatsM148/updated_feature_engineered_data.pkl"
df = pd.read_pickle(feature_data_path)
print(df.shape)

In [None]:
# Clustering inputs: every column except the two identifier columns.
# NOTE(review): 'ideal_journey' stays in X as a feature - confirm this is intended.
id_columns = ['customer_id', 'account_id']
X = df.drop(columns=id_columns)
y = df['ideal_journey']

In [None]:
# Standardize the clustering features and keep X's column names on the result.
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

In [None]:
# Elbow method: fit K-means for k = 1..19 and record the within-cluster
# sum of squares (inertia) of each fit.
cluster_counts = range(1, 20)
wcss = []
for i in cluster_counts:
    # n_init=10 runs ten initializations per k
    kmeans = KMeans(
        n_clusters=i,
        init='k-means++',
        max_iter=300,
        n_init=10,
        random_state=0,
    ).fit(X_scaled)
    wcss.append(kmeans.inertia_)

plt.plot(cluster_counts, wcss)
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

In [None]:
# Final model: six clusters, with each row's cluster label stored on df.
kmeans = KMeans(
    n_clusters=6,
    init='k-means++',
    max_iter=300,
    n_init=10,
    random_state=0,
).fit(X_scaled)
labels = kmeans.labels_
df['Cluster'] = labels

In [None]:
# Box plot of one feature per cluster.
feature_to_plot = 'first_event'

plt.figure(figsize=(10, 6))
sns.boxplot(data=df, x='Cluster', y=feature_to_plot)
plt.title(f'Distribution of {feature_to_plot} Across Clusters')
plt.show()

In [None]:
# Box plot of one feature per cluster.
feature_to_plot = 'first_event_month'

plt.figure(figsize=(10, 6))
sns.boxplot(data=df, x='Cluster', y=feature_to_plot)
plt.title(f'Distribution of {feature_to_plot} Across Clusters')
plt.show()

In [None]:
# Row counts per cluster, split by the 'first_stage' column.
# feature_to_plot is set explicitly so the title names the column that is
# actually plotted instead of whatever an earlier cell left in the variable.
feature_to_plot = 'first_stage'
plt.figure(figsize=(10,6))
sns.countplot(hue=feature_to_plot, x='Cluster', data=df)
plt.title(f'Distribution of {feature_to_plot} Across Clusters')
plt.show()

In [None]:
# Cluster centroids as a table: one row per cluster, one column per scaled feature.
centers = pd.DataFrame(
    data=kmeans.cluster_centers_,
    columns=X_scaled.columns,
)
print(centers)

In [None]:
# Annotated heatmap of the centroid table (rows: clusters, columns: features).
plt.figure(figsize=(10, 8))
sns.heatmap(data=centers, cmap='viridis', annot=True)
plt.xlabel('Feature')
plt.ylabel('Cluster')
plt.title('Centroid Values across Clusters')
plt.show()

In [None]:
# Rows labelled cluster 0, plus the customer ids they contain.
in_cluster_0 = df['Cluster'] == 0
cluster_0 = df[in_cluster_0]
cluster_0_cust = cluster_0['customer_id'].tolist()
print(cluster_0.shape)

In [None]:
# Rows labelled cluster 1, plus the customer ids they contain.
in_cluster_1 = df['Cluster'] == 1
cluster_1 = df[in_cluster_1]
cluster_1_cust = cluster_1['customer_id'].tolist()
print(cluster_1.shape)

In [None]:
# Rows labelled cluster 2, plus the customer ids they contain.
in_cluster_2 = df['Cluster'] == 2
cluster_2 = df[in_cluster_2]
cluster_2_cust = cluster_2['customer_id'].tolist()
print(cluster_2.shape)

In [None]:
# Rows labelled cluster 3, plus the customer ids they contain.
in_cluster_3 = df['Cluster'] == 3
cluster_3 = df[in_cluster_3]
cluster_3_cust = cluster_3['customer_id'].tolist()
print(cluster_3.shape)

In [None]:
# Rows labelled cluster 4, plus the customer ids they contain.
in_cluster_4 = df['Cluster'] == 4
cluster_4 = df[in_cluster_4]
cluster_4_cust = cluster_4['customer_id'].tolist()
print(cluster_4.shape)

In [None]:
# Rows labelled cluster 5, plus the customer ids they contain.
in_cluster_5 = df['Cluster'] == 5
cluster_5 = df[in_cluster_5]
cluster_5_cust = cluster_5['customer_id'].tolist()
print(cluster_5.shape)

In [None]:
## Inspecting entire dataframe per cluster
# Load the wide-format dataset that the cluster customer ids are matched against.
wide_data_path = "cleaned_wide_format_data.pkl"
df_whole = pd.read_pickle(wide_data_path)

In [None]:
# Wide-format rows for the customers of clusters 1-5 (cluster 0 is handled in its own cell).
whole_customer_ids = df_whole['customer_id']
cluster_1_df = df_whole[whole_customer_ids.isin(cluster_1_cust)]
cluster_2_df = df_whole[whole_customer_ids.isin(cluster_2_cust)]
cluster_3_df = df_whole[whole_customer_ids.isin(cluster_3_cust)]
cluster_4_df = df_whole[whole_customer_ids.isin(cluster_4_cust)]
cluster_5_df = df_whole[whole_customer_ids.isin(cluster_5_cust)]

In [None]:
# Wide-format rows for the customers of cluster 0, saved straight to disk.
in_cluster_0_customers = df_whole['customer_id'].isin(cluster_0_cust)
cluster_0_df = df_whole[in_cluster_0_customers]
cluster_0_df.to_pickle("cluster_0_df.pkl")

In [None]:
## export cluster dataframes for easy access
# Each frame is pickled under its own file name, in cluster order.
cluster_exports = {
    "cluster_1_df.pkl": cluster_1_df,
    "cluster_2_df.pkl": cluster_2_df,
    "cluster_3_df.pkl": cluster_3_df,
    "cluster_4_df.pkl": cluster_4_df,
    "cluster_5_df.pkl": cluster_5_df,
}
for pickle_name, cluster_frame in cluster_exports.items():
    cluster_frame.to_pickle(pickle_name)

In [None]:
def _count_transitions(df, path_column):
    """Count consecutive (source -> target) transitions in a column of paths.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``path_column`` holds one sliceable sequence (e.g. a
        list) of step labels per row.
    path_column : str
        Name of the column containing the paths.

    Returns
    -------
    pandas.DataFrame
        One row per distinct transition with columns ``source``, ``target``,
        ``value`` (number of occurrences), ``source_id`` and ``target_id``
        (integer ids of the two labels).
    """
    # Every adjacent pair of steps inside a path is one observed transition;
    # paths with fewer than two steps contribute nothing.
    source_target_pairs = [
        pair
        for path in df[path_column]
        for pair in zip(path[:-1], path[1:])
    ]
    pairs_df = pd.DataFrame(source_target_pairs, columns=['source', 'target'])
    flow_counts = pairs_df.groupby(['source', 'target']).size().reset_index(name='value')

    # Sort the labels before numbering them: iterating a bare set of strings
    # is not stable between interpreter runs, which made the ids change from
    # run to run.
    all_nodes = sorted(set(flow_counts['source']).union(flow_counts['target']))
    node_dict = {node: i for i, node in enumerate(all_nodes)}

    # Map to df
    flow_counts['source_id'] = flow_counts['source'].map(node_dict)
    flow_counts['target_id'] = flow_counts['target'].map(node_dict)

    return flow_counts


def create_flow_counts(df):
    """Return transition counts between consecutive entries of ``df['stage']``.

    See ``_count_transitions`` for the layout of the returned frame.
    """
    return _count_transitions(df, 'stage')


def create_flow_counts_events(df):
    """Return transition counts between consecutive entries of ``df['event_name']``.

    See ``_count_transitions`` for the layout of the returned frame.
    """
    return _count_transitions(df, 'event_name')

In [None]:
# Stage-to-stage transition counts for clusters 1-5, one frame per cluster.
# Cluster 0 is not processed in this cell.
clust_1_flow_counts = create_flow_counts(cluster_1_df)
clust_2_flow_counts = create_flow_counts(cluster_2_df)
clust_3_flow_counts = create_flow_counts(cluster_3_df)
clust_4_flow_counts = create_flow_counts(cluster_4_df)
clust_5_flow_counts = create_flow_counts(cluster_5_df)

### Flow counts

In [None]:
# Show the cluster 1 transitions whose source and target are the same stage.
clust_1_is_self_loop = clust_1_flow_counts['source'] == clust_1_flow_counts['target']
clust_1_flow_counts[clust_1_is_self_loop]

In [None]:
# Stage-to-stage transition counts over the whole wide-format dataset.
flow_counts = create_flow_counts(df_whole)

In [None]:
# Source x target matrix of transition counts; pairs that never occur become 0.
stage_flow_df = flow_counts.pivot_table(
    index='source',
    columns='target',
    values='value',
    aggfunc='sum',
).fillna(0)

plt.figure(figsize=(12, 10))
sns.heatmap(stage_flow_df, cmap='viridis', annot=True, fmt='g', linewidths=.5)
plt.xlabel('Target')
plt.ylabel('Source')
plt.title('Heatmap of Flow Count (for stages)')
plt.show()

In [None]:
# Show the transitions whose source and target are the same stage.
is_self_loop = flow_counts['source'] == flow_counts['target']
flow_counts[is_self_loop]

In [None]:
# Event-to-event transition counts over the whole wide-format dataset.
event_flow_counts = create_flow_counts_events(df_whole)

In [None]:
# Names of the events whose stage is 'First Purchase' in the event definitions.
def_df = pd.read_csv("event_definitions.csv")
is_first_purchase_stage = def_df['stage'] == 'First Purchase'
first_purchase = list(def_df[is_first_purchase_stage]['event_name'].values)

In [None]:
# Keep only the transitions where both ends are First Purchase events.
source_is_first_purchase = event_flow_counts['source'].isin(first_purchase)
target_is_first_purchase = event_flow_counts['target'].isin(first_purchase)
first_purchase_flow_df = event_flow_counts[source_is_first_purchase & target_is_first_purchase]
# Optional (disabled): also drop transitions from an event to itself.
# first_purchase_flow_df = first_purchase_flow_df[first_purchase_flow_df['source'] != first_purchase_flow_df['target']]

In [None]:
# Source x target matrix of First Purchase transition counts; missing pairs become 0.
fp_flow = first_purchase_flow_df.pivot_table(
    index='source',
    columns='target',
    values='value',
    aggfunc='sum',
).fillna(0)

plt.figure(figsize=(12, 10))
sns.heatmap(fp_flow, cmap='viridis', annot=True, fmt='g', linewidths=.5)
plt.xlabel('Target')
plt.ylabel('Source')
plt.title('Heatmap of Flow Count (for first purchase events)')
plt.show()