In [None]:
# Core libraries
import pandas as pd
import numpy as np
import math
from datetime import timedelta

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.subplots as sp
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from matplotlib.colors import LogNorm
import matplotlib.ticker as mticker
from matplotlib.cm import ScalarMappable
from matplotlib.colors import Normalize
from matplotlib.ticker import FuncFormatter
from matplotlib.colors import LinearSegmentedColormap

In [None]:
# Load the raw export and lower-case the column names so that later cells can
# reference columns without depending on the capitalisation used in the file.
df_raw = pd.read_csv('raw.csv')
df_raw.columns = df_raw.columns.str.lower()
print("shape of the data:", df_raw.shape)
df_raw.head()

# 1. Data cleaning

In [None]:
# Check the dtype pandas inferred for every column before any conversion is applied.
df_raw.dtypes

In [None]:
def convert_to_datetime(df, columns, errors='coerce', format=None):
    """Parse the given columns of ``df`` as datetimes, in place.

    Args:
        df: DataFrame to modify. The frame itself is changed, not a copy.
        columns: Iterable of column names to convert.
        errors: Forwarded to ``pd.to_datetime``.
        format: Forwarded to ``pd.to_datetime``.

    Returns:
        The same DataFrame object that was passed in.

    Names listed in ``columns`` that are not present in ``df`` are skipped
    and a warning line is printed for each of them.
    """
    for name in columns:
        if name not in df.columns:
            print(f"Warning: Column '{name}' not found in DataFrame.")
            continue
        df[name] = pd.to_datetime(df[name], errors=errors, format=format)
    return df

In [None]:
# Parse the four date columns; with the helper's default errors='coerce',
# values that cannot be parsed become NaT instead of raising.
df_raw = convert_to_datetime(df_raw, ['timestamp', 'converted_at', 'trial_start', 'trial_end'])
# Store the conversion flag as an integer so it can be compared and aggregated numerically.
df_raw['converted'] = df_raw['converted'].astype(int)
print(df_raw.dtypes)
df_raw.head()

In [None]:
# Count the missing values in every column.
df_raw.isnull().sum()

In [None]:
# Check that 'converted_at' is missing only for rows where converted is 0:
# count the rows with a null 'converted_at' for each value of the conversion flag.

df_nulls = df_raw[df_raw['converted_at'].isna()]
null_summary = df_nulls.groupby('converted').size().reset_index(name='null_count')
null_summary

Since organizations that did not convert churned by the end of trial, I will consider trial_end timestamp as their 'conversion into churn' time.

In [None]:
# Filling converted_at for churned organizations: rows that did not convert and
# have no conversion timestamp get their trial_end as the 'conversion into churn' time.
# Series.mask does this column-wise, without a row-by-row DataFrame.apply.
churned_without_date = df_raw['converted_at'].isna() & df_raw['converted'].eq(0)
df_raw['converted_at'] = df_raw['converted_at'].mask(churned_without_date, df_raw['trial_end'])

# Order events chronologically within each organization.
df_raw = df_raw.sort_values(by=['organization_id', 'timestamp']).reset_index(drop=True)
df_raw.head()

# 2. Finding and cleaning inconsistencies

- Converted value per organization should remain unchanged
- All the timestamps for not converted organizations must be within the trial period

In [None]:
def check_conversion_consistency(df):
    """Find organizations whose ``converted`` value is not constant.

    Args:
        df: DataFrame with ``organization_id`` and ``converted`` columns.

    Returns:
        A list of organization ids that carry more than one distinct
        ``converted`` value, or the string
        "No inconsistent conversion values" when there are none.
    """
    # Number of distinct 'converted' values seen for each organization.
    distinct_values = df.groupby('organization_id')['converted'].nunique()
    inconsistent_orgs = distinct_values.loc[distinct_values.gt(1)].index.tolist()

    return inconsistent_orgs if inconsistent_orgs else "No inconsistent conversion values"

In [None]:
# Prints either the list of organization ids with conflicting 'converted'
# values or the confirmation message returned by the check.
result = check_conversion_consistency(df_raw)
print(result)

In [None]:
def check_date_consistency(df, return_message=True):
    """Flag organizations whose date columns look inconsistent.

    For every organization the earliest and latest value of ``timestamp``,
    ``converted_at``, ``trial_start`` and ``trial_end`` are computed, and the
    following 0/1 flags are derived:

    - ``<col>_vary_flag`` for ``converted_at``, ``trial_start`` and
      ``trial_end``: the minimum differs from the maximum. Two missing values
      (NaT) also compare as different, so an organization with no value at
      all in the column is flagged as well.
    - ``timestamp_min_flag``: first activity at or before the trial start.
    - ``timestamp_max_flag``: last activity at or after the trial end.
    - ``converted_at_flag``: conversion earlier than the trial start.

    Args:
        df: DataFrame with ``organization_id`` and the four date columns.
        return_message: Controls what is returned when nothing is flagged.

    Returns:
        A DataFrame with the per-organization aggregates and flags, limited to
        organizations with at least one flag set. When no organization is
        flagged, the string "No inconsistent dates" if ``return_message`` is
        true, otherwise ``None``.
    """
    date_columns = ['timestamp', 'converted_at', 'trial_start', 'trial_end']

    # Earliest and latest value of each date column, one row per organization.
    date_check = df.groupby('organization_id').agg(
        {name: ['min', 'max'] for name in date_columns}
    )
    date_check.columns = ['_'.join(levels).strip() for levels in date_check.columns.values]
    date_check = date_check.reset_index()

    # converted_at, trial_start and trial_end must be unique per organization.
    for name in date_columns[1:]:
        differs = date_check[f"{name}_min"].ne(date_check[f"{name}_max"])
        date_check[f"{name}_vary_flag"] = differs.astype(int)

    # Activity and conversion timestamps must fall inside the allowed ranges.
    range_violations = {
        'timestamp_min_flag': date_check['timestamp_min'].le(date_check['trial_start_min']),
        'timestamp_max_flag': date_check['timestamp_max'].ge(date_check['trial_end_max']),
        'converted_at_flag': date_check['converted_at_min'].lt(date_check['trial_start_min']),
    }
    for flag_name, violated in range_violations.items():
        date_check[flag_name] = violated.astype(int)

    # Keep only the organizations with at least one flag set.
    flag_cols = [name for name in date_check.columns if name.endswith('_flag')]
    flagged_rows = date_check[flag_cols].sum(axis=1).gt(0)
    suspicious = date_check.loc[flagged_rows]

    if not suspicious.empty:
        return suspicious
    return "No inconsistent dates" if return_message else None

In [None]:
# Prints either the flagged organizations (with their min/max dates and flags)
# or the confirmation message returned by the check.
result = check_date_consistency(df_raw)
print(result)

# 3. Checking for duplicates

Checking for duplicated events using a combination of **organization_id**, **activity_name** and **timestamp** as a key

In [None]:
# Number of raw rows sharing the same (organization, activity, timestamp) key,
# ordered chronologically within each organization.
grouped_records = (
    df_raw
    .groupby(['organization_id', 'activity_name', 'timestamp'])
    .size()
    .reset_index(name='total_records')
    .sort_values(['organization_id', 'timestamp', 'activity_name'])
    .reset_index(drop=True)
)

In [None]:
# Overall size of the dataset.
num_orgs_total = df_raw['organization_id'].nunique()
num_rows_total = len(df_raw)

# A key (organization, activity, timestamp) that occurs more than once is treated as duplicated.
duplicates = grouped_records[grouped_records['total_records'] > 1]

num_orgs_with_duplicates = duplicates['organization_id'].nunique()
# Sum of raw rows behind all duplicated keys (the first occurrence of each key is included).
num_duplicated_rows = duplicates['total_records'].sum()

print("Total unique organizations:", num_orgs_total)
print("Total records:", num_rows_total)

print("Number of organizations with duplicates:", num_orgs_with_duplicates)
print("Total number of duplicated records:", num_duplicated_rows)

In [None]:
# Checking the number of duplicates per activity

duplicates_activity = (
    duplicates
    .groupby('activity_name')
    .agg(
        total_duplicate_instances=('total_records', 'size'),
        total_orgs=('organization_id', 'nunique'),
        min_total_records=('total_records', 'min'),
        percentile_25=('total_records', lambda x: x.quantile(0.25)),
        median_total_records=('total_records', 'median'),
        percentile_75=('total_records', lambda x: x.quantile(0.75)),
        percentile_90=('total_records', lambda x: x.quantile(0.90)),
        max_total_records=('total_records', 'max')
    )
    .sort_values('max_total_records', ascending=False)
    .reset_index()
)

duplicates_activity

#### Observations:

Looking at the median total records, I can see that most activities have 2-3 duplicates per organization and timestamp, so that could be a simple coincidence.

However, looking at max, there are 3 activities that stand out: 

- Scheduling.Shift.Created
- Scheduling.Shift.AssignmentChanged
- Scheduling.Availability.Set

I assume that these activities can be triggered in bulk. For example, an Employee can set availability for several days with one click, and an Admin can create several shifts using some presets or a template. 

I'm going to verify whether they are indeed triggered by something else.

In [None]:
# Identifying, for every activity, the grouped record with the most duplicates.

# idxmax returns the index label of the row with the highest total_records per activity.
idx = grouped_records.groupby('activity_name')['total_records'].idxmax()
max_per_act = grouped_records.loc[idx]
# Largest bursts first; the index is reset so rows can be addressed by rank (0 = largest).
max_per_act = max_per_act.sort_values('total_records', ascending=False).reset_index(drop=True)
max_per_act.head(10)

In [None]:
# When several activities share an organization and timestamp, list the template
# application first, then shift creation, then assignment changes; every other
# activity follows in alphabetical order.
activity_priority = {
    'Scheduling.Template.ApplyModal.Applied': 0,
    'Scheduling.Shift.Created': 1,
    'Scheduling.Shift.AssignmentChanged': 2
}

gr_sorted = (
    grouped_records
    # Activities without an explicit priority sort after the prioritised ones.
    .assign(activity_priority=lambda d: d['activity_name'].map(activity_priority).fillna(999))
    .sort_values(by=['organization_id', 'timestamp', 'activity_priority', 'activity_name'])
    .drop(columns='activity_priority')
    .reset_index(drop=True)
)

In [None]:
def get_records_within_interval(
    source_df: pd.DataFrame,
    index: int,
    search_df: pd.DataFrame,
    time_before: int = None,
    time_after: int = None
) -> pd.DataFrame:
    """Return the rows of ``search_df`` close in time to a reference record.

    The reference record is the row of ``source_df`` with index label
    ``index``; its ``organization_id`` and ``timestamp`` define the search.

    Args:
        source_df: Frame holding the reference record.
        index: Index label of the reference record in ``source_df``.
        search_df: Frame to search; needs ``organization_id`` and ``timestamp``.
        time_before: Minutes before the reference timestamp to include
            (inclusive bound). ``None`` leaves the lower bound open.
        time_after: Minutes after the reference timestamp to include
            (inclusive bound). ``None`` leaves the upper bound open.

    Returns:
        The matching rows of ``search_df`` with their original index. If the
        ``timestamp`` column had to be parsed, the rows come from a parsed copy
        and ``search_df`` itself is left untouched.
    """
    reference_org = source_df.loc[index, 'organization_id']
    reference_time = source_df.loc[index, 'timestamp']

    # Work on a parsed copy when the timestamps are not datetimes yet.
    if not pd.api.types.is_datetime64_any_dtype(search_df['timestamp']):
        search_df = search_df.copy()
        search_df['timestamp'] = pd.to_datetime(search_df['timestamp'])

    event_times = search_df['timestamp']
    keep = search_df['organization_id'].eq(reference_org)

    if time_before is not None:
        keep = keep & event_times.ge(reference_time - timedelta(minutes=time_before))

    if time_after is not None:
        keep = keep & event_times.le(reference_time + timedelta(minutes=time_after))

    return search_df.loc[keep]

In [None]:
# Get every grouped event of the same organization that happened within 60 minutes
# before or after the record at index 1 of max_per_act.
result = get_records_within_interval(
    source_df=max_per_act,
    index=1,
    time_before=60,
    time_after=60,
    search_df=gr_sorted
)

result.reset_index(drop=True)

In [None]:
def annotate_action_ever_used(
    source_df: pd.DataFrame,
    action_name: str = "Scheduling.Template.ApplyModal.Applied",
    max_records: int = 10
) -> pd.DataFrame:
    """Mark events that happened at or after an organization's first use of an action.

    Args:
        source_df: Frame with ``organization_id``, ``activity_name``,
            ``timestamp`` and ``total_records`` columns. It is not modified.
        action_name: Activity whose first occurrence per organization is used
            as the reference point.
        max_records: Despite its name this is a lower bound: only rows with
            ``total_records >= max_records`` are returned.

    Returns:
        A copy of the input sorted by organization and timestamp, with
        ``timestamp`` parsed as datetime and a 0/1 column
        ``template_used_ever`` that is 1 when the organization had used
        ``action_name`` at or before the row's timestamp. The result is
        filtered by ``max_records`` and carries a fresh 0..n-1 index.
    """
    df = source_df.copy()
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df = df.sort_values(['organization_id', 'timestamp'])

    # Timestamp of the organization's first use of the action, repeated on every
    # row of that organization (NaT when the action was never used).
    action_times = df['timestamp'].where(df['activity_name'] == action_name)
    first_use = action_times.groupby(df['organization_id']).transform('min')

    already_used = first_use.notna() & (first_use <= df['timestamp'])
    df['template_used_ever'] = already_used.astype(int)

    # Keep only the rows that reach the record threshold.
    return df[df['total_records'] >= max_records].reset_index(drop=True)

In [None]:
# Flag, for every grouped event, whether the organization had already applied a template.
annotated_df = annotate_action_ever_used(
    source_df=gr_sorted,
    action_name="Scheduling.Template.ApplyModal.Applied",
    max_records=50        # Keep only rows where total_records >= 50
)

annotated_df

In [None]:
# Keep the bulk events of organizations that had not applied a template by that point in time.
no_template_triggered = annotated_df[
    (annotated_df['template_used_ever'] == 0)
]

# Limit how many rows pandas renders for the tables below.
pd.set_option('display.max_rows', 50)

# For every organization, pick the event with the highest total_records
# and list the organizations from largest to smallest burst.
idx = no_template_triggered.groupby('organization_id')['total_records'].idxmax()
max_records_no_template = no_template_triggered.loc[idx].reset_index(drop=True)
max_records_no_template.sort_values('total_records', ascending=False).reset_index(drop=True)

In [None]:
# Summary of the bulk events that happened before any template was applied.
num_orgs = no_template_triggered['organization_id'].nunique()
num_rows = no_template_triggered['total_records'].sum()
# Largest number of identical records logged for one organization at one timestamp.
# Stored under its own name so the builtin `max` stays usable in later cells.
max_records_per_event = no_template_triggered['total_records'].max()

print("Total unique organizations:", num_orgs)
print("Total records:", num_rows)
print("Max records in a single event:", max_records_per_event)

In [None]:
# Spot check: all grouped events of one hand-picked organization between
# 2024-02-10 00:00 and 2024-02-22 20:00 (both bounds inclusive).
check = gr_sorted[
    (gr_sorted['organization_id'] == '154647fa6ad39ad1ea4dd6bdfd273679') &
    (gr_sorted['timestamp'] >= pd.to_datetime('2024-02-10 00:00:00'))&
    (gr_sorted['timestamp'] <= pd.to_datetime('2024-02-22 20:00:00'))
].reset_index(drop=True)

# Limit how many rows pandas renders from here on.
pd.set_option('display.max_rows', 20)

check

#### Observations:

- Scheduling.Shift.Created and Scheduling.Shift.AssignmentChanged- many bulk records are indeed triggered by Scheduling.Template.ApplyModal.Applied
- Some duplicated entries are created by organizations that never used the templates
- Duplicate entries with no templates applied have no more than 86 shifts booked in the same second
- Scheduling.Availability.Set - bulk records for this activity are not triggered by templates, so I assume an employee can just submit their availability for the whole year with one click.

I will assume that all the duplicated records are valid and keep them, but mark them as '.Bulk'.

In [None]:
# Condensing the dataset for easier readability

# Display order for activities that share an organization and timestamp.
activity_priority = {
    'Scheduling.Template.ApplyModal.Applied': 0,
    'Scheduling.Shift.Created': 1,
    'Scheduling.Shift.AssignmentChanged': 2,
    'Scheduling.Availability.Set': 3
}

df_grp = (
    df_raw
    # One row per (organization, activity, timestamp); 'records' counts the raw rows behind it.
    .groupby(['organization_id', 'activity_name', 'timestamp'], as_index=False)
    .agg(
        records=('activity_name', 'size'),
        converted=('converted', 'first'),
        converted_at=('converted_at', 'first'),
        trial_start=('trial_start', 'first'),
        trial_end=('trial_end', 'first')
    )
    # Activities without an explicit priority sort after the prioritised ones.
    .assign(activity_priority=lambda d: d['activity_name'].map(activity_priority).fillna(999))
    .sort_values(['organization_id', 'timestamp', 'activity_priority'])
    .drop(columns='activity_priority')
    .reset_index(drop=True)
    # More than 3 records on one key is the threshold for a bulk event;
    # bulk events get a '.Bulk' suffix on their activity name.
    .assign(
        activity_name_ext=lambda d: np.where(
            d['records'] > 3,
            d['activity_name'] + '.Bulk',
            d['activity_name']
        )
    )
)

df_grp

In [None]:
df_grp_sorted = df_grp.sort_values(['organization_id', 'timestamp']).reset_index(drop=True)

# Creating group_id for consecutive runs of the same organization, bulk-aware
# activity label and calendar day. A new run starts whenever any of the three
# differs from the previous row. The activity label is compared with its own
# shifted values so that bulk and non-bulk events of one activity form separate
# runs, while consecutive bulk events stay in the same run.
df_grp_sorted['group_id'] = (
    (df_grp_sorted['organization_id'] != df_grp_sorted['organization_id'].shift()) |
    (df_grp_sorted['activity_name_ext'] != df_grp_sorted['activity_name_ext'].shift()) |
    (df_grp_sorted['timestamp'].dt.date != df_grp_sorted['timestamp'].dt.date.shift())
).cumsum()

# Collapse every run into a single row.
df_short = (
    df_grp_sorted
    .groupby('group_id')
    .agg(
        organization_id=('organization_id', 'first'),
        activity_name=('activity_name', 'first'),
        activity_name_ext=('activity_name_ext', 'first'),
        ts_start=('timestamp', 'first'),
        ts_end=('timestamp', 'last'),
        events=('records', 'size'), # number of rows condensed
        records=('records', 'sum'), # number of records in rows condensed
        converted=('converted', 'first'),
        converted_at=('converted_at', 'first'),
        trial_start=('trial_start', 'first'),
        trial_end=('trial_end', 'first')
    )
    .reset_index(drop=True)
)
# Duration of the run in whole seconds (0 for a run made of a single timestamp).
df_short['time_diff_sec'] = (df_short['ts_end']-df_short['ts_start']).dt.total_seconds().astype(int)
df_short = df_short[['organization_id', 'activity_name', 'activity_name_ext', 'ts_start', 'ts_end', 'time_diff_sec', 'events', 'records', 'converted', 'converted_at', 'trial_start','trial_end']]
df_short

# 4. EDA

In [None]:
# Work on a copy so the condensed dataset (df_short) stays unchanged during the EDA.
df_eda = df_short.copy()
df_eda

#### Adding helper columns

In [None]:
def _whole_days_between(later, earlier):
    """Return the calendar-day difference ``later - earlier``, ignoring the time of day."""
    return (later.dt.normalize() - earlier.dt.normalize()).dt.days


def add_time_features(df):
    """Return a copy of ``df`` enriched with time-based helper columns.

    Args:
        df: Frame with ``organization_id`` and the datetime columns
            ``ts_start``, ``ts_end``, ``trial_start`` and ``converted_at``.
            The input frame is not modified.

    Returns:
        A new DataFrame with a fresh 0..n-1 index, the original columns and:

        Per organization (same value on every row of the organization):
            ``first_action_ts``, ``first_action_weekday``,
            ``first_action_hour``, ``last_action_ts``, ``days_to_action``
            (trial start to first action), ``active_span_days``,
            ``active_days`` and ``active_weeks`` (distinct days / trial weeks
            with activity), ``max_active_week`` and ``activity_density``
            (``active_weeks / active_days``).

        Per row:
            ``hours_since_first_action``, ``hours_since_trial_start``,
            ``days_since_first_action``, ``days_since_trial_start``,
            ``days_to_convert_tr`` / ``_fa`` / ``_la`` (conversion date minus
            trial start / first action / last action), ``trial_start_week``
            and ``converted_week``.

    Day differences are calendar-day differences. Week numbers are 1-based
    blocks of 7 days counted from the trial start; negative offsets are
    clipped into week 1. Calling the function on its own output recomputes
    every helper column.
    """
    # Remove aggregates left over from a previous run so they are re-added at the
    # end; reset_index returns a new frame, so the caller's object is not touched.
    stale_columns = ['activity_date', 'activity_week', 'active_days',
                     'active_weeks', 'max_active_week', 'activity_density']
    df = df.drop(columns=stale_columns, errors='ignore').reset_index(drop=True)

    by_org = df.groupby('organization_id')

    # Dimension columns
    df['first_action_ts'] = by_org['ts_start'].transform('min')
    df['first_action_weekday'] = df['first_action_ts'].dt.dayofweek
    df['first_action_hour'] = df['first_action_ts'].dt.hour
    df['last_action_ts'] = by_org['ts_end'].transform('max')
    df['days_to_action'] = _whole_days_between(df['first_action_ts'], df['trial_start'])
    df['active_span_days'] = (df['last_action_ts'] - df['first_action_ts']).dt.days + 1

    # Fact columns
    df['hours_since_first_action'] = np.floor(
        (df['ts_start'] - df['first_action_ts']).dt.total_seconds() / 3600
    ).astype(int)
    df['hours_since_trial_start'] = np.floor(
        (df['ts_start'] - df['trial_start']).dt.total_seconds() / 3600
    ).astype(int)

    df['days_since_first_action'] = _whole_days_between(df['ts_start'], df['first_action_ts'])
    df['days_since_trial_start'] = _whole_days_between(df['ts_start'], df['trial_start'])

    df['days_to_convert_tr'] = _whole_days_between(df['converted_at'], df['trial_start'])
    df['days_to_convert_fa'] = _whole_days_between(df['converted_at'], df['first_action_ts'])
    df['days_to_convert_la'] = _whole_days_between(df['converted_at'], df['last_action_ts'])

    # Weekly conversion info
    df['trial_start_week'] = df['trial_start'].dt.to_period('W').dt.start_time.dt.normalize()
    df['converted_week'] = (df['days_to_convert_tr'] // 7).clip(lower=0) + 1

    # Activity day and 1-based trial week of every row; used only for the
    # per-organization aggregates below and never stored as columns.
    activity_date = df['ts_start'].dt.normalize()
    activity_week = (df['days_since_trial_start'] // 7).clip(lower=0) + 1
    org_id = df['organization_id']

    df['active_days'] = activity_date.groupby(org_id).transform('nunique')
    df['active_weeks'] = activity_week.groupby(org_id).transform('nunique')
    df['max_active_week'] = activity_week.groupby(org_id).transform('max')

    # Final metrics
    df['activity_density'] = df['active_weeks'] / df['active_days']

    return df

In [None]:
# Adding helper columns (first/last action, day and week offsets, activity aggregates)
df_eda = add_time_features(df_eda)
df_eda

#### Exploratory analysis

In [None]:
# When do users convert?

# Adding a user group based on conversion status and conversion time.
# np.select assigns the label of the first condition that matches:
#   0 - not converted
#   1 - converted during trial and active after the conversion
#   2 - converted during trial and not active after the conversion
#   3 - converted at or after the trial end
#   4 - none of the above (invalid or missing values)
df_eda['user_group'] = np.select(
    [
        df_eda['converted'] == 0,  # Not converted
        (df_eda['converted'] == 1) & 
        (df_eda['converted_at'] < df_eda['trial_end'])&
        (df_eda['last_action_ts'] > df_eda['converted_at']),  # Converted during trial, active later
        (df_eda['converted'] == 1) & 
        (df_eda['converted_at'] < df_eda['trial_end'])&
        (df_eda['last_action_ts'] <= df_eda['converted_at']),  # Converted during trial, inactive later
        (df_eda['converted'] == 1) & 
        (df_eda['converted_at'] >= df_eda['trial_end']),  # Converted after trial
    ],
    [0, 1, 2, 3],  # Custom group labels
    default=4  # Catch-all for invalid/missing cases
)

# Checking that all organizations fall into valid groups (no organization in group 4)
orgs_in_groups = (
    df_eda
    .groupby(['user_group'])
    .agg(organization_count=('organization_id', 'nunique'))
    .sort_index()
    .reset_index()
)

orgs_in_groups

In [None]:
# Inspect the EDA frame with the helper columns and the user group added.
df_eda

#### Exploratory Analysis

In [None]:
# Setting custom color schemes for the heatmaps below.
# custom_cmap / custom_cmap2: diverging, blue -> white -> orange (custom_cmap2 uses a lighter blue).
# custom_cmap3 / custom_cmap4: sequential, white -> steelblue and white -> orange.
custom_cmap = LinearSegmentedColormap.from_list("custom_cmap", ["steelblue", "white", "orange"])
custom_cmap2 = LinearSegmentedColormap.from_list("custom_cmap", ["lightsteelblue", "white", "orange"])
custom_cmap3 = LinearSegmentedColormap.from_list("custom_cmap", ["white", "steelblue"])
custom_cmap4 = LinearSegmentedColormap.from_list("custom_cmap", ["white", "orange"])

In [None]:
# Visualizing conversions by the number of days between trial start and conversion

# One row per converted organization with its days-to-convert value.
converters = (
    df_eda[df_eda['converted'] == 1]
    .groupby('organization_id')
    .agg(days_to_convert=('days_to_convert_tr', 'min')) # swap days_to_convert_tr for days_to_convert_fa to measure from the first action instead
    .sort_index()
    .reset_index()
)

# Group by days_to_convert
time_to_convert = (
    converters
    .groupby('days_to_convert')
    .agg(total_orgs=('organization_id', 'nunique'))
    .sort_index()
    .reset_index()
)

# Add cumulative converters and % share
time_to_convert['cumulative_converters'] = time_to_convert['total_orgs'].cumsum()
total_converted = time_to_convert['total_orgs'].sum()
time_to_convert['cumulative_percent'] = (time_to_convert['cumulative_converters'] / total_converted * 100).round(1)

# Create figure with secondary y-axis
fig = go.Figure()

# Add absolute converters (left y-axis)
fig.add_trace(
    go.Bar(
        x=time_to_convert['days_to_convert'],
        y=time_to_convert['total_orgs'],
        name='Daily Converters',
        marker_color='steelblue',
        yaxis='y1'
    )
)

# Add cumulative % converters (right y-axis)
fig.add_trace(
    go.Scatter(
        x=time_to_convert['days_to_convert'],
        y=time_to_convert['cumulative_percent'],
        name='Cumulative %',
        mode='lines+markers',
        marker_color='orange',
        yaxis='y2'
    )
)

# Update layout with dual axes
fig.update_layout(
    title='Daily and Cumulative Conversion by Days Since Trial Start',
    xaxis=dict(title='Trial Day', dtick=1),
    yaxis=dict(
        title='Number of Converted Orgs',
        side='left',
        showgrid=False
    ),
    yaxis2=dict(
        title='Cumulative % of Conversions',
        overlaying='y',
        side='right',
        range=[0, 100]
    ),
    legend=dict(x=0.01, y=0.99),
    width=900,
    height=500
)

fig.show()

In [None]:
# Weekly cohorts: first day of the pandas weekly period ('W') in which the trial
# started and in which the conversion happened.
df_eda['trial_start_week'] = df_eda['trial_start'].dt.to_period('W').apply(lambda r: r.start_time).dt.normalize()
df_eda['conversion_week'] = df_eda['converted_at'].dt.to_period('W').apply(lambda r: r.start_time).dt.normalize()

# Whole weeks between the trial-start week and the conversion week (never below 0).
df_eda['weeks_since_trial_start'] = ((df_eda['conversion_week'] - df_eda['trial_start_week']).dt.days // 7).clip(lower=0)

converted = df_eda[df_eda['converted'] == 1].copy()

# Converted organizations per cohort and week offset.
conversion_counts = (
    converted.groupby(['trial_start_week', 'weeks_since_trial_start'])
    .agg(converted_orgs=('organization_id', 'nunique'))
    .reset_index()
)

# Cohort size: all organizations (converted or not) that started their trial in that week.
cohort_sizes = (
    df_eda.groupby('trial_start_week')
    .agg(total_orgs=('organization_id', 'nunique'))
    .reset_index()
)

# Conversion rate = converted organizations in the cell / size of the cohort.
conversion_counts = conversion_counts.merge(cohort_sizes, on='trial_start_week')
conversion_counts['conversion_rate'] = conversion_counts['converted_orgs'] / conversion_counts['total_orgs']

# Rows: cohort (trial start week); columns: weeks since trial start.
heatmap_data = conversion_counts.pivot(index='trial_start_week', columns='weeks_since_trial_start', values='conversion_rate')

heatmap_data = heatmap_data.sort_index()

plt.figure(figsize=(12, 4))
ax = sns.heatmap(heatmap_data, annot=True, fmt=".1%", cmap=custom_cmap, cbar_kws={'label': 'Conversion Rate'})

# Show the cohort dates without the time component.
ax.set_yticklabels([d.strftime('%Y-%m-%d') for d in heatmap_data.index], rotation=0)

plt.title('Weekly Cohort Conversion Rate Heatmap')
plt.ylabel('Trial Start Week')
plt.xlabel('Weeks Since Trial Start')
plt.tight_layout()
plt.show()

In [None]:
# Earliest and latest value of every date column, to see the time range covered by the data.
df_eda[['ts_start', 'trial_start', 'trial_end', 'converted_at']].agg(['min', 'max'])

### Observations:
- All converted organizations bought the subscription at least 2 weeks after trial start and first action
- Most of them did that within the last week of trial
- Around a half of converters did it after the trial end (30 days). Of those, most converted by day 45 since trial start.
- Some weeks were better than the others in terms of overall performance

#### Questions: 
- Does this mean that these organizations were consistently active during the first 2 weeks of trial?
- Was this conversion pattern consistent across install cohorts?

In [None]:
# Display names for the converter groups.
# NOTE(review): the totals in the labels are hard-coded; confirm them against
# orgs_in_groups whenever the underlying data changes.
user_group_aliases = {
    1: 'Converted during trial, active later (19 total)',
    2: 'Converted during trial, inactive later (80 total)',
    3: 'Converted after trial (107 total)'
}


def _format_share(value):
    """Format a share as a percentage label; zero shares get an empty label."""
    return f"{value:.1%}" if value > 0 else ""


user_groups = sorted(df_eda['user_group'].unique())

for group in user_groups:
    # Only converted organizations are plotted, so group 0 ends up empty and is skipped below.
    df_group = df_eda[(df_eda['converted'] == 1) & (df_eda['user_group'] == group)].copy()

    # Heatmap 1: converted_week vs active_weeks
    heatmap_data_1 = df_group.pivot_table(
        index='converted_week',
        columns='active_weeks',
        values='organization_id',
        aggfunc=pd.Series.nunique,
        fill_value=0
    )

    # Heatmap 2: converted_week vs max_active_week
    heatmap_data_2 = df_group.pivot_table(
        index='converted_week',
        columns='max_active_week',
        values='organization_id',
        aggfunc=pd.Series.nunique,
        fill_value=0
    )

    # Skip if both are empty or zero-sum
    if (heatmap_data_1.empty or heatmap_data_1.values.sum() == 0) and \
       (heatmap_data_2.empty or heatmap_data_2.values.sum() == 0):
        print(f"Skipping group '{group}' — no data to plot.")
        continue

    # Normalize for annotations: each cell shows its share of the group's organizations
    heatmap_norm_1 = heatmap_data_1 / heatmap_data_1.values.sum() if heatmap_data_1.values.sum() > 0 else heatmap_data_1
    heatmap_norm_2 = heatmap_data_2 / heatmap_data_2.values.sum() if heatmap_data_2.values.sum() > 0 else heatmap_data_2

    # Column-wise Series.map is used for the element-wise formatting because
    # DataFrame.applymap is deprecated in recent pandas releases.
    annot_1 = heatmap_norm_1.apply(lambda column: column.map(_format_share))
    annot_2 = heatmap_norm_2.apply(lambda column: column.map(_format_share))

    alias = user_group_aliases.get(group, str(group))

    fig, axes = plt.subplots(1, 2, figsize=(15, 3), sharey=True)

    sns.heatmap(
        heatmap_data_1,
        annot=annot_1,
        fmt='',
        cmap=custom_cmap,
        linewidths=0.5,
        linecolor='white',
        cbar_kws={'label': ''},
        ax=axes[0]
    )
    axes[0].set_title(f'Converted Week vs Active Weeks\n {alias}')
    axes[0].set_xlabel('Active Weeks')
    axes[0].set_ylabel('Converted Week')
    axes[0].invert_yaxis()

    sns.heatmap(
        heatmap_data_2,
        annot=annot_2,
        fmt='',
        cmap=custom_cmap,
        linewidths=0.5,
        linecolor='white',
        cbar_kws={'label': 'Unique Organizations'},
        ax=axes[1]
    )
    axes[1].set_title(f'Convertred Week vs Max Active Week\n {alias}')
    axes[1].set_xlabel('Max Active Week')
    axes[1].set_ylabel('')  # Shared y-axis, so leave blank
    axes[1].invert_yaxis()

    plt.tight_layout()
    plt.show()

#### Observations & Questions:
- The absolute majority of converters (groups 2 and 3) were only active in week 1 (0-6 days since trial start), but converted around the time of trial end.
- What is weekly retention like for converters and non-converters?

In [None]:
df_viz = df_eda.copy()

# Precompute total unique orgs per conversion group
total_orgs_per_group = df_viz.groupby('user_group')['organization_id'].nunique()

# Step 1: distribution of the number of distinct active days.
# active_days is a per-organization total, so each organization is counted
# once, under its own number of active days.
daily_counts = (
    df_viz
    .groupby(['active_days', 'user_group'])['organization_id']
    .nunique()
    .reset_index(name='active_orgs')
)

# Normalize: share of the group's organizations with that many active days
daily_counts['retention'] = (
    daily_counts['active_orgs'] / 
    daily_counts['user_group'].map(total_orgs_per_group)
)
daily_counts = daily_counts.rename(columns={'active_days': 'period'})

# Step 2: the same distribution for the number of distinct active weeks
weekly_counts = (
    df_viz
    .groupby(['active_weeks', 'user_group'])['organization_id']
    .nunique()
    .reset_index(name='active_orgs')
)

weekly_counts['retention'] = (
    weekly_counts['active_orgs'] /
    weekly_counts['user_group'].map(total_orgs_per_group)
)
weekly_counts = weekly_counts.rename(columns={'active_weeks': 'period'})

# Pivot for heatmap format: one row per user group, one column per period count
daily_pivot = daily_counts.pivot(index='user_group', columns='period', values='retention')
weekly_pivot = weekly_counts.pivot(index='user_group', columns='period', values='retention')

# Plot heatmaps
fig, axes = plt.subplots(1, 2, figsize=(14, 5), sharey=True)

hm = sns.heatmap(
    daily_pivot, 
    annot=True, fmt=".0%", cmap=custom_cmap3, ax=axes[0]
)

# Rotate annotation text by 90 degrees
for text in hm.texts:
    text.set_rotation(90)

axes[0].set_xlabel("Distinct Days Active")
axes[0].set_ylabel("User Group")

sns.heatmap(
    weekly_pivot, 
    annot=True, fmt=".0%", cmap=custom_cmap4, ax=axes[1]
)
axes[1].set_xlabel("Distinct Weeks Active")
axes[1].set_ylabel("")

plt.suptitle("Activity by Period and User Group", fontsize=14)
plt.tight_layout()
plt.show()

#### Observations:
- Organizations in **Group 1** were active on at least 3 distinct days and most of them were active on 3-5 distinct weeks.
- 89% of organizations in **Group 2** were active on just one week
- **Group 3** (converted after trial) and **Group 0** (not converted) show very similar daily and weekly engagement patterns. Therefore, these patterns are bad predictors for conversion, but engagement metrics of Group 3 could have the key to difference between converters and non-converters.

### Engagement rates per activity

In [None]:
# Previously I looked at all the data available, but for activity engagement I will look only at actions before conversion.
# For organizations that did not convert, converted_at was filled with trial_end earlier in the notebook,
# so for them this keeps the events that started up to the end of the trial.
#df_cap = df_eda[df_eda['days_since_first_action'] < 14].reset_index(drop=True)
df_cap = df_short.copy()
df_cap = df_cap[df_cap['ts_start'] <= df_cap['converted_at']].reset_index(drop=True)
df_cap

In [None]:
# Adding helper columns, recomputed on the pre-conversion subset only
df_cap_time = add_time_features(df_cap)
#df_cap_time.dtypes

In [None]:
# Merging user group from df_eda.
# One row per organization; if an organization carried several group labels,
# the sort makes .first() pick the lowest one.
org_user_group = (
    df_eda
    .sort_values(['organization_id', 'user_group'])
    .groupby('organization_id', as_index=False)['user_group']
    .first()
)

# Left merge keeps every row of df_cap_time and attaches the organization's group.
df_cap_time = df_cap_time.merge(org_user_group, on='organization_id', how='left')
df_cap_time.head()

### Identifying activities with the highest engagement and lowest speed of discovery

In [None]:
# Adding activity name mapping.
# The activities listed here are folded into three buckets (Other.Admin,
# Other.Employee, Other.Both); activities not listed keep their original name.

mapping = {
    "ShiftDetails.View.Opened": "Other.Both",
    "Scheduling.OpenShiftRequest.Created": "Other.Employee",
    "Absence.Request.Created": "Other.Employee",
    "Absence.Request.Approved": "Other.Admin",
    "PunchClockEndNote.Add.Completed": "Other.Employee",
    "PunchClockStartNote.Add.Completed": "Other.Employee",
    "Absence.Request.Rejected": "Other.Admin",
    "Scheduling.ShiftSwap.Created": "Other.Employee",
    "Scheduling.ShiftHandover.Created": "Other.Employee",
    "Timesheets.BulkApprove.Confirmed": "Other.Admin",
    "Scheduling.ShiftHandover.Accepted": "Other.Both",
    "PunchClock.PunchedOut": "Other.Employee",
    "PunchClock.Entry.Edited": "Other.Admin",
    "Integration.Xero.PayrollExport.Synced": "Other.Admin",
    "Break.Activate.Started": "Other.Employee",
    "Scheduling.ShiftSwap.Accepted": "Other.Both",
    "Break.Activate.Finished": "Other.Employee",
    "Revenue.Budgets.Created": "Other.Admin",
    "Scheduling.OpenShiftRequest.Approved": "Other.Admin",
    "Shift.View.Opened": "Other.Both"
}

# Adding new column to df_cap_time; unmapped activities fall back to activity_name
df_cap_time['activity_name_short'] = df_cap_time['activity_name'].map(mapping).fillna(df_cap_time['activity_name'])
df_cap_time.head()

In [None]:
def combined_engagement_and_speed_styled(
    df,
    category_col='activity_name',
    user_group_col='user_group',
    org_col='organization_id',
    time_col='ts_start',
    days_min=None,
    days_max=None,
    baseline='org_first_activity',
    custom_cmap=custom_cmap
):
    """Build a styled per-activity table of engagement, discovery delay and discovery order.

    NOTE(review): a function with the same name is defined again in a later cell.
    On a top-to-bottom run that later definition replaces this one, so the calls
    further down the notebook do not use this version.

    Parameters
    ----------
    df : DataFrame
        Activity rows. Must contain `category_col`, `user_group_col`, `org_col` and
        `time_col`, plus `hours_since_first_action` when `days_min`/`days_max` is given.
    category_col, user_group_col, org_col, time_col : str
        Column names for the activity, the user group, the organization and the
        activity timestamp.
    days_min, days_max : number, optional
        Inclusive bounds applied to `hours_since_first_action`.
        NOTE(review): the names say days but the filtered column is in hours —
        confirm the intended unit.
    baseline : str
        'org_first_activity' measures the discovery delay from the organization's
        earliest timestamp; any other value measures it from the earliest first-use
        timestamp found in the whole frame.
    custom_cmap : colormap, optional
        Colormap for the background gradient. The default is the module-level
        `custom_cmap` captured when the function is defined; None falls back to a
        light green seaborn palette.

    Returns
    -------
    pandas Styler
        Rows are activities with an engagement rate of at least 0.02 in some user
        group, ordered by total engaged organizations (descending). Columns are
        (metric, user_group) pairs for orgs_engaged, engagement_rate, median_days,
        mean_days and avg_order.
    """

    # --- Engagement part ---
    if days_min is not None:
        df = df[df['hours_since_first_action'] >= days_min]
    if days_max is not None:
        df = df[df['hours_since_first_action'] <= days_max]

    # Distinct organizations that logged each activity, per user group
    engagement_summary = (
        df.groupby([category_col, user_group_col])
          .agg(orgs_engaged=(org_col, 'nunique'))
          .reset_index()
    )

    pivot_orgs_engaged = engagement_summary.pivot(
        index=category_col,
        columns=user_group_col,
        values='orgs_engaged'
    ).fillna(0).astype(int)

    pivot_orgs_engaged.columns = pd.MultiIndex.from_product(
        [['orgs_engaged'], pivot_orgs_engaged.columns]
    )

    # Denominator of the engagement rate: distinct organizations per user group
    total_orgs_per_group = df.groupby(user_group_col)[org_col].nunique().to_dict()

    engagement_rate = pivot_orgs_engaged.copy()
    groups = engagement_rate.columns.get_level_values(1).unique()

    for group in groups:
        engagement_rate[('engagement_rate', group)] = (
            engagement_rate[('orgs_engaged', group)] / total_orgs_per_group.get(group, 1)
        )

    # Keep only the freshly added rate columns
    engagement_rate = engagement_rate.loc[:, engagement_rate.columns.get_level_values(0) == 'engagement_rate']

    engagement_df = pd.concat([pivot_orgs_engaged, engagement_rate], axis=1)

    # --- Discovery speed part ---
    # Copy before converting the timestamp column so the caller's frame is untouched
    df = df.copy()
    df[time_col] = pd.to_datetime(df[time_col])

    org_first_ts = df.groupby(org_col)[time_col].min().rename('org_first_ts')

    # First time each organization used each activity
    org_activity_first_ts = (
        df.groupby([org_col, user_group_col, category_col])[time_col]
        .min()
        .reset_index()
    )

    org_activity_first_ts = org_activity_first_ts.merge(org_first_ts, left_on=org_col, right_index=True)

    # Delay is expressed in days (seconds / 86400)
    if baseline == 'org_first_activity':
        org_activity_first_ts['discovery_delay'] = (
            org_activity_first_ts[time_col] - org_activity_first_ts['org_first_ts']
        ).dt.total_seconds() / (24 * 3600)
    else:
        org_activity_first_ts['discovery_delay'] = (
            org_activity_first_ts[time_col] - org_activity_first_ts[time_col].min()
        ).dt.total_seconds() / (24 * 3600)

    speed_summary = (
        org_activity_first_ts
        .groupby([user_group_col, category_col])['discovery_delay']
        .agg(median_days='median', mean_days='mean')
        .reset_index()
    )

    speed_melted = speed_summary.melt(
        id_vars=[user_group_col, category_col],
        value_vars=['median_days', 'mean_days'],
        var_name='metric',
        value_name='value'
    )

    speed_pivot = speed_melted.pivot_table(
        index=category_col,
        columns=['metric', user_group_col],
        values='value'
    ).round(2)

    # --- Average order of discovery per activity and user group ---
    # Within each (organization, user group), activities are numbered 1..n by first-use timestamp
    def assign_order(group):
        group = group.sort_values(time_col)
        group['order'] = range(1, len(group)+1)
        return group

    # Reset index after apply to avoid ambiguity in groupby
    org_activity_order = org_activity_first_ts.groupby([org_col, user_group_col]).apply(assign_order).reset_index(drop=True)

    avg_order_summary = (
        org_activity_order
        .groupby([user_group_col, category_col])['order']
        .mean()
        .reset_index()
    )

    avg_order_pivot = avg_order_summary.pivot(
        index=category_col,
        columns=user_group_col,
        values='order'
    ).round(2)

    avg_order_pivot.columns = pd.MultiIndex.from_product([['avg_order'], avg_order_pivot.columns])

    # --- Combine all data ---
    combined = pd.concat([engagement_df, speed_pivot, avg_order_pivot], axis=1)
    # Order rows by the number of engaged organizations summed over all user groups
    total_orgs = combined.loc[:, combined.columns.get_level_values(0) == 'orgs_engaged'].sum(axis=1)
    combined = combined.loc[total_orgs.sort_values(ascending=False).index]

    # --- Filter by engagement rate >= 0.02 in any user group ---
    engagement_rate_cols = [col for col in combined.columns if col[0] == 'engagement_rate']
    if engagement_rate_cols:
        combined = combined[combined[engagement_rate_cols].ge(0.02).any(axis=1)] # hard-coded cut-off; edit here to change it

    # --- Reorder columns ---
    metrics_order = ['orgs_engaged', 'engagement_rate', 'median_days', 'mean_days', 'avg_order']
    all_groups = sorted(combined.columns.get_level_values(1).unique())

    new_col_order = []
    for metric in metrics_order:
        for group in all_groups:
            if (metric, group) in combined.columns:
                new_col_order.append((metric, group))

    combined = combined[new_col_order]

    # --- Styling ---
    if custom_cmap is None:
        custom_cmap = sns.light_palette("green", as_cmap=True)

    format_dict = {
        **{('engagement_rate', group): '{:.1%}' for group in all_groups if ('engagement_rate', group) in combined.columns},
        **{('median_days', group): '{:.2f}' for group in all_groups if ('median_days', group) in combined.columns},
        **{('mean_days', group): '{:.2f}' for group in all_groups if ('mean_days', group) in combined.columns},
        **{('avg_order', group): '{:.2f}' for group in all_groups if ('avg_order', group) in combined.columns},
    }

    # Only the engagement_rate and avg_order columns receive the colour gradient
    subset_columns = [('engagement_rate', group) for group in all_groups if ('engagement_rate', group) in combined.columns]
    subset_columns += [('avg_order', group) for group in all_groups if ('avg_order', group) in combined.columns]
    #subset_columns += [('mean_days', group) for group in all_groups if ('mean_days', group) in combined.columns]

    styled_df = (
        combined.style
        .background_gradient(subset=subset_columns, cmap=custom_cmap) # pass axis=1 for a per-row color scale
        .format(format_dict)
        .set_caption("Engagement rate, discovery speed, and average order of discovery metrics")
    )

    return styled_df

In [None]:
def combined_engagement_and_speed_styled(
    df,
    category_col='activity_name',
    user_group_col='user_group',
    org_col='organization_id',
    time_col='ts_start',
    days_min=None,
    days_max=None,
    baseline='org_first_activity',
    metric_col='records',
    threshold=1,
    custom_cmap=custom_cmap,
    min_engagement_rate=0.02
):
    """Summarise engagement, usage volume and discovery order per activity and user group.

    Parameters
    ----------
    df : DataFrame
        Activity rows. Must contain `category_col`, `user_group_col`, `org_col`,
        `time_col` and `metric_col`, plus `days_since_first_action` when
        `days_min`/`days_max` is given.
    category_col, user_group_col, org_col, time_col : str
        Column names for the activity, the user group, the organization and the
        activity timestamp.
    days_min, days_max : number, optional
        Inclusive bounds applied to `days_since_first_action`.
    baseline : str
        'org_first_activity' measures the discovery delay from the organization's
        earliest timestamp; any other value measures it from the earliest first-use
        timestamp in the whole frame. The delay columns are computed but are not
        part of the returned table (see `metrics_order` below).
    metric_col : str
        Numeric column summed per organization and activity.
    threshold : number
        An organization counts as engaged with an activity when its summed
        `metric_col` for that activity is at least this value.
    custom_cmap : colormap, optional
        Colormap for the background gradient. The default is the module-level
        `custom_cmap` captured when the function is defined; None falls back to a
        light green seaborn palette.
    min_engagement_rate : float
        Activities are kept only if at least one user group reaches this engagement
        rate. Defaults to 0.02, the previously hard-coded cut-off.

    Returns
    -------
    pandas Styler
        Rows are activities ordered by total engaged organizations (descending).
        Columns are (metric, user_group) pairs for orgs_engaged, engagement_rate,
        avg_total_<metric_col>, avg_daily_<metric_col> and avg_order.
    """
    # --- Filter date range ---
    if days_min is not None:
        df = df[df['days_since_first_action'] >= days_min]
    if days_max is not None:
        df = df[df['days_since_first_action'] <= days_max]

    # Private copy: the timestamp conversion and the helper column added below
    # must not leak into the caller's frame.
    df = df.copy()
    df[time_col] = pd.to_datetime(df[time_col])

    per_org_keys = [category_col, user_group_col, org_col]

    # --- Engagement part with threshold ---
    # Summed metric per (activity, user group, organization); reused for the
    # engagement count and for the average totals further down.
    total_metric_per_org = df.groupby(per_org_keys)[metric_col].sum().reset_index()
    engaged_orgs = (
        total_metric_per_org[total_metric_per_org[metric_col] >= threshold]
        .groupby([category_col, user_group_col])[org_col]
        .nunique()
        .reset_index(name='orgs_engaged')
    )

    pivot_orgs_engaged = engaged_orgs.pivot(
        index=category_col,
        columns=user_group_col,
        values='orgs_engaged'
    ).fillna(0).astype(int)

    pivot_orgs_engaged.columns = pd.MultiIndex.from_product(
        [['orgs_engaged'], pivot_orgs_engaged.columns]
    )

    # Denominator of the engagement rate: distinct organizations per user group
    total_orgs_per_group = df.groupby(user_group_col)[org_col].nunique().to_dict()

    engagement_rate = pivot_orgs_engaged.copy()
    groups = engagement_rate.columns.get_level_values(1).unique()

    for group in groups:
        engagement_rate[('engagement_rate', group)] = (
            engagement_rate[('orgs_engaged', group)] / total_orgs_per_group.get(group, 1)
        )

    engagement_rate = engagement_rate.loc[:, engagement_rate.columns.get_level_values(0) == 'engagement_rate']
    engagement_df = pd.concat([pivot_orgs_engaged, engagement_rate], axis=1)

    # --- Average totals and daily averages ---
    avg_total = (
        total_metric_per_org.groupby([category_col, user_group_col])[metric_col].mean().reset_index()
    )
    avg_total_pivot = avg_total.pivot(index=category_col, columns=user_group_col, values=metric_col).round(2)
    avg_total_pivot.columns = pd.MultiIndex.from_product([[f'avg_total_{metric_col}'], avg_total_pivot.columns])

    # Daily average = summed metric / number of distinct calendar days on which the
    # organization logged that activity
    df['date_only'] = df[time_col].dt.date
    days_per_org = df.groupby(per_org_keys)['date_only'].nunique().reset_index(name='distinct_days')
    metric_and_days = total_metric_per_org.merge(days_per_org, on=per_org_keys)
    metric_and_days['avg_daily'] = metric_and_days[metric_col] / metric_and_days['distinct_days']
    avg_daily = (
        metric_and_days.groupby([category_col, user_group_col])['avg_daily'].mean().reset_index()
    )
    avg_daily_pivot = avg_daily.pivot(index=category_col, columns=user_group_col, values='avg_daily').round(2)
    avg_daily_pivot.columns = pd.MultiIndex.from_product([[f'avg_daily_{metric_col}'], avg_daily_pivot.columns])

    # --- Discovery speed ---
    org_first_ts = df.groupby(org_col)[time_col].min().rename('org_first_ts')
    org_activity_first_ts = (
        df.groupby([org_col, user_group_col, category_col])[time_col].min().reset_index()
    )
    org_activity_first_ts = org_activity_first_ts.merge(org_first_ts, left_on=org_col, right_index=True)

    # Delay is expressed in days (seconds / 86400)
    if baseline == 'org_first_activity':
        org_activity_first_ts['discovery_delay'] = (
            org_activity_first_ts[time_col] - org_activity_first_ts['org_first_ts']
        ).dt.total_seconds() / (24 * 3600)
    else:
        org_activity_first_ts['discovery_delay'] = (
            org_activity_first_ts[time_col] - org_activity_first_ts[time_col].min()
        ).dt.total_seconds() / (24 * 3600)

    speed_summary = (
        org_activity_first_ts.groupby([user_group_col, category_col])['discovery_delay']
        .agg(median_days='median', mean_days='mean')
        .reset_index()
    )

    # No `values=` here, so the pivot yields (metric, user_group) column pairs
    speed_pivot = speed_summary.pivot(index=category_col, columns=user_group_col).round(2)

    # --- Average order ---
    # Number the activities 1..n by first-use timestamp within each
    # (organization, user group). A stable sort followed by cumcount replaces the
    # former groupby(...).apply(...).reset_index(drop=True), which only worked while
    # pandas handed the grouping columns to the applied function (deprecated in 2.2).
    org_activity_order = org_activity_first_ts.sort_values(time_col, kind='mergesort')
    org_activity_order['order'] = (
        org_activity_order.groupby([org_col, user_group_col]).cumcount() + 1
    )
    avg_order_summary = (
        org_activity_order.groupby([user_group_col, category_col])['order'].mean().reset_index()
    )
    avg_order_pivot = avg_order_summary.pivot(index=category_col, columns=user_group_col, values='order').round(2)
    avg_order_pivot.columns = pd.MultiIndex.from_product([['avg_order'], avg_order_pivot.columns])

    # --- Combine ---
    combined = pd.concat([engagement_df, avg_total_pivot, avg_daily_pivot, speed_pivot, avg_order_pivot], axis=1)
    # Order rows by the number of engaged organizations summed over all user groups
    total_orgs = combined.loc[:, combined.columns.get_level_values(0) == 'orgs_engaged'].sum(axis=1)
    combined = combined.loc[total_orgs.sort_values(ascending=False).index]

    # --- Engagement rate filter ---
    engagement_rate_cols = [col for col in combined.columns if col[0] == 'engagement_rate']
    if engagement_rate_cols:
        combined = combined[combined[engagement_rate_cols].ge(min_engagement_rate).any(axis=1)]

    # --- Reorder ---
    # median_days / mean_days are intentionally left out of the displayed table
    #metrics_order = ['orgs_engaged', 'engagement_rate', f'avg_total_{metric_col}', f'avg_daily_{metric_col}', 'median_days', 'mean_days', 'avg_order']
    metrics_order = ['orgs_engaged', 'engagement_rate', f'avg_total_{metric_col}', f'avg_daily_{metric_col}', 'avg_order']
    all_groups = sorted(combined.columns.get_level_values(1).unique())
    new_col_order = [(m, g) for m in metrics_order for g in all_groups if (m, g) in combined.columns]
    combined = combined[new_col_order]

    # --- Style ---
    if custom_cmap is None:
        custom_cmap = sns.light_palette("green", as_cmap=True)
    format_dict = {
        **{('engagement_rate', g): '{:.1%}' for g in all_groups if ('engagement_rate', g) in combined.columns},
        **{(f'avg_total_{metric_col}', g): '{:.2f}' for g in all_groups if (f'avg_total_{metric_col}', g) in combined.columns},
        **{(f'avg_daily_{metric_col}', g): '{:.2f}' for g in all_groups if (f'avg_daily_{metric_col}', g) in combined.columns},
        **{('median_days', g): '{:.2f}' for g in all_groups if ('median_days', g) in combined.columns},
        **{('mean_days', g): '{:.2f}' for g in all_groups if ('mean_days', g) in combined.columns},
        **{('avg_order', g): '{:.2f}' for g in all_groups if ('avg_order', g) in combined.columns},
    }
    # Only the engagement_rate columns receive the colour gradient
    subset_columns = [('engagement_rate', g) for g in all_groups if ('engagement_rate', g) in combined.columns]
    #subset_columns += [('avg_order', g) for g in all_groups if ('avg_order', g) in combined.columns]

    return (
        combined.style
        .background_gradient(subset=subset_columns, cmap=custom_cmap)
        .format(format_dict)
        .set_caption(f"Engagement rate, discovery speed, and average order of discovery metrics ({metric_col})")
    )

In [None]:
# Engagement table restricted by days_min / days_max.
# NOTE(review): the most recent definition of combined_engagement_and_speed_styled
# compares days_min / days_max against `days_since_first_action` (days, not hours),
# so days_min=0, days_max=0 keeps day-0 activity only, not the first 2 weeks —
# confirm the intended window.

engagement = combined_engagement_and_speed_styled(
    df_cap_time, 
    user_group_col='user_group', 
    metric_col='events', 
    category_col='activity_name_ext', 
    threshold=2,
    days_min = 0, 
    days_max=0)

engagement

##### Group description 

| Group   | Description                               | Total organizations |
|---------|-------------------------------------------|----------------------|
| Group 0 | Not converted                             | 760                  |
| Group 1 | Converted during trial, active after      | 19                   |
| Group 2 | Converted during trial, active after      | 80                   |
| Group 3 | Converted after trial                     | 107                  |

### Observations 
#### Engagement
- **Scheduling.Shift.Created**: core admin feature, but only used by **~90%** of organizations.
- **Mobile.Schedule.Loaded**: core employee feature, used by **94.7% in Group 1**, and by only **40% in other** groups. This could mean that organizations that are likely to convert already run active businesses and have employees that check the schedules. Could also mean that these are big businesses.
- __Group 1:__  It is the smallest group, so I will only consider the activities with 50%+ engagement. This group has the highest engagement with these features:
    - Scheduling.Shift.AssignmentChanged
    - PunchClock.PunchedIn
    - Scheduling.Shift.Approved
- __Groups 0 and 3:__  these groups had very similar activity patterns, which could be explained by the fact that both didn't convert during trial. But Group 3 had **twice higher** engagement with **Scheduling.Template.ApplyModal.Applied** which could be one of the reasons why they eventually converted.

#### Unlock order
- __Group 1__ has a more or less defined order of discovering activities that follows the engagement rates: Scheduling.Shift.Created -> Mobile.Schedule.Loaded -> Scheduling.Shift.AssignmentChanged -> Communication.Message.Created -> PunchClock.PunchedIn
- __Group 2__ has the least defined unlock order with most actions being between 1st and 3rd    
- __Groups 0 and 3:__  again show very similar unlock patterns which might make it hard to predict conversion.

In [None]:
def engagement_summary_activity_filter(
    df,
    activities,
    filter_type='exclude',      # 'exclude' or 'include'
    match_mode='or',            # 'or' = any match, 'and' = all match
    category_col='activity_name',
    user_group_col='user_group',
    org_col='organization_id',
    time_col='ts_start',
    days_min=None,
    days_max=None,
    baseline='org_first_activity',
    custom_cmap=custom_cmap
):
    """Run the engagement summary on organizations selected by the activities they logged.

    An organization "matches" when it logged any (`match_mode='or'`) or all
    (`match_mode='and'`) of `activities`. With `filter_type='exclude'` the matching
    organizations are removed before the summary is computed; with
    `filter_type='include'` only they are kept. The remaining arguments are passed
    through to `combined_engagement_and_speed_styled`, whose result is returned.

    Raises
    ------
    ValueError
        If `match_mode` is not 'or'/'and' or `filter_type` is not 'exclude'/'include'.
    """
    # Set of distinct activities seen for every organization
    seen_by_org = df.groupby(org_col)[category_col].apply(set)

    if match_mode == 'or':
        is_match = seen_by_org.apply(lambda seen: not seen.isdisjoint(activities))
    elif match_mode == 'and':
        is_match = seen_by_org.apply(lambda seen: seen.issuperset(activities))
    else:
        raise ValueError("match_mode must be either 'or' or 'and'.")
    matching_orgs = seen_by_org[is_match].index

    # Row-level mask: does the row belong to a matching organization?
    row_in_match = df[org_col].isin(matching_orgs)
    if filter_type == 'exclude':
        keep_rows = ~row_in_match
    elif filter_type == 'include':
        keep_rows = row_in_match
    else:
        raise ValueError("filter_type must be either 'exclude' or 'include'.")

    # Fresh 0..n-1 index so later groupby operations see no leftover index labels
    df_filtered = df[keep_rows].copy().reset_index(drop=True)

    passthrough = dict(
        category_col=category_col,
        user_group_col=user_group_col,
        org_col=org_col,
        time_col=time_col,
        days_min=days_min,
        days_max=days_max,
        baseline=baseline,
        custom_cmap=custom_cmap,
    )
    return combined_engagement_and_speed_styled(df_filtered, **passthrough)

In [None]:
# Activities used to select organizations for exclusion: with filter_type='exclude'
# and match_mode='or', any organization that logged at least one of them is dropped.
#employee_activities = [ 'Mobile.Schedule.Loaded', 'PunchClock.PunchedIn', 'Absence.Request.Created', 'Scheduling.Availability.Set', 'Scheduling.OpenShiftRequest.Created']
employee_activities = ['Mobile.Schedule.Loaded']

# Engagement table for organizations that never logged Mobile.Schedule.Loaded
no_employee = engagement_summary_activity_filter(
    df_cap_time,
    activities=employee_activities,
    filter_type='exclude',
    match_mode='or'
)

no_employee

In [None]:
# Engagement table for organizations that logged neither Mobile.Schedule.Loaded
# nor Scheduling.Shift.Created (any org with at least one of the two is excluded)
no_shifts_views = engagement_summary_activity_filter(
    df_cap_time,
    activities=['Mobile.Schedule.Loaded', 'Scheduling.Shift.Created'],
    filter_type='exclude',
    match_mode='or'
)

no_shifts_views

In [None]:
# Engagement table for organizations that logged both Mobile.Schedule.Loaded
# and Scheduling.Shift.Created (match_mode='and' requires every listed activity)
shifts_and_views = engagement_summary_activity_filter(
    df_cap_time,
    activities=['Mobile.Schedule.Loaded', 'Scheduling.Shift.Created'],
    filter_type='include',
    match_mode='and'
)

shifts_and_views

In [None]:
#pd.set_option('display.max_rows', 50)
#df_cap.dtypes

#### Observations and assumptions: 
- The organizations that didn't engage in scheduling shifts mainly engaged in viewing the schedule on mobile and didn't engage in any meaningful activity involving scheduling or accounting. 4 converted organizations were engaged in punching the clock, but since there was no … (TODO: this sentence was left unfinished — complete the thought)
- Around 50% of total organizations (both converted and not) did not engage in any employee-driven activities. I assume that it is possible to use Planday in admin-only mode without explicit time tracking.
- Almost all organizations (except 14) engaged with either Scheduling.Shift.Created or Mobile.Schedule.Loaded 

#### Engagement depth per user group
- How many records per activity did they have?

In [None]:
import math
import warnings

def records_kde_by_activity(
    df,
    activity_col='activity_name',
    user_group_col='user_group',
    org_col='organization_id',
    records_col='records',
    ts_col='ts_start',
    min_days=None,
    max_days=None,
    plots_per_row=4,
    plot=True
):
    """Aggregate records per organization and activity and optionally plot KDEs per user group.

    Parameters
    ----------
    df : DataFrame
        Activity rows containing `activity_col`, `user_group_col`, `org_col`,
        `records_col` and `ts_col`, plus `days_since_first_action` when
        `min_days`/`max_days` is given. The frame is not modified.
    min_days, max_days : number, optional
        Inclusive bounds applied to `days_since_first_action` before aggregating.
    plots_per_row : int
        Number of subplot columns in the figure.
    plot : bool
        When False, skip plotting and only return the aggregated frame.

    Returns
    -------
    DataFrame
        One row per (activity, user group, organization) with the summed
        `records_col`, `days_active` (distinct calendar days on which the
        organization has any row, across all activities), `avg_daily`
        (sum / days_active) and the log1p transforms `log_total` and `log_avg_daily`.
    """
    # Prefilter dataset by days_since_first_action if provided
    if min_days is not None:
        df = df[df['days_since_first_action'] >= min_days]
    if max_days is not None:
        df = df[df['days_since_first_action'] <= max_days]

    # assign() returns a new frame, so the helper column is never written into the
    # caller's DataFrame (the previous in-place assignment leaked `activity_date`
    # into the input whenever no day filter was applied).
    df = df.assign(activity_date=pd.to_datetime(df[ts_col]).dt.date)

    # days_active per org: number of unique calendar days with any activity
    days_active = df.groupby(org_col)['activity_date'].nunique().rename('days_active')

    # Aggregate total records per org per activity & user group
    agg_df = (
        df.groupby([activity_col, user_group_col, org_col])[records_col]
        .sum()
        .reset_index()
        .merge(days_active.reset_index(), on=org_col, how='left')
    )

    # Calculate avg_daily records per org
    agg_df['avg_daily'] = agg_df[records_col] / agg_df['days_active']

    # Apply log transform (log1p = log(1 + x), defined at 0)
    agg_df['log_total'] = np.log1p(agg_df[records_col])
    agg_df['log_avg_daily'] = np.log1p(agg_df['avg_daily'])

    if not plot:
        return agg_df

    # Collect only the (activity, metric) pairs for which at least one user group
    # has more than one data point.
    plots_info = []  # tuples: (activity, metric column, rows of that activity)
    for activity in sorted(agg_df[activity_col].unique()):
        subset = agg_df[agg_df[activity_col] == activity]
        for metric in ['log_avg_daily']:  # ['log_total', 'log_avg_daily']
            group_sizes = subset.groupby(user_group_col)[metric].size()
            if (group_sizes > 1).any():
                plots_info.append((activity, metric, subset))

    total_plots = len(plots_info)
    if total_plots == 0:
        print("No valid data to plot.")
        return agg_df

    rows = math.ceil(total_plots / plots_per_row)
    # squeeze=False always yields a 2-D array of axes, so flatten() works for any
    # grid shape (wrapping the result in a list broke the single-plot case when
    # plots_per_row > 1, because subplots then already returns an array).
    fig, axes = plt.subplots(
        rows, plots_per_row, figsize=(5 * plots_per_row, 4 * rows), squeeze=False
    )
    axes = axes.flatten()

    for ax, (activity, metric, subset) in zip(axes, plots_info):
        for group in sorted(subset[user_group_col].unique()):
            group_data = subset[subset[user_group_col] == group][metric]
            # A KDE needs at least two points with non-zero variance
            if len(group_data) > 1 and group_data.var() > 0:
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")  # suppress KDE warnings
                    sns.kdeplot(group_data, label=str(group), fill=True, alpha=0.4, ax=ax)
        title_metric = "Log Total Records" if metric == 'log_total' else "Log Avg Daily Records"
        ax.set_title(f"{activity} ({title_metric})", fontsize=11)
        ax.set_xlabel(f"{title_metric} per Org + 1")
        ax.set_ylabel("Density")
        handles, labels = ax.get_legend_handles_labels()
        if handles:
            ax.legend(title=user_group_col)

    # Hide unused subplots
    for unused_ax in axes[total_plots:]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.show()

    return agg_df

In [None]:
# KDE plots of log average daily records per short activity name, using rows with days_since_first_action between 0 and 1
agg_records = records_kde_by_activity(df_cap_time, activity_col='activity_name_short', plots_per_row=4, min_days=0, max_days=1)

#### Observations and assumptions: 
- From the distribution of max daily Mobile.Schedule.Loaded activities per user_group we can see that **Group 1** has the most daily views, which could mean that these are larger organizations.

In [None]:
# Aggregation only (plot=False), grouped by `converted` instead of user_group, using rows with days_since_first_action between 0 and 6
df_agg = records_kde_by_activity(df_cap_time, plot=False, user_group_col='converted', activity_col='activity_name', plots_per_row=4, min_days=0, max_days=6)
df_agg

In [None]:
# Column dtypes of the pre-conversion activity frame
df_cap.dtypes
#df_agg

In [None]:
# One row per organization, one column per activity holding log_avg_daily
# (0 where the organization never logged the activity)
feature_table = df_agg.pivot_table(
    index=['organization_id', 'converted'], 
    columns='activity_name', 
    values='log_avg_daily', 
    aggfunc='first'  # or 'mean' if multiple rows per org-activity
).reset_index().fillna(0)

# Convert avg_daily values to binary flags: 1 if > 0 else 0
activity_cols = feature_table.columns.difference(['organization_id', 'converted'])
#feature_table[activity_cols] = (feature_table[activity_cols] > 0).astype(int)

# organization_id is an identifier, not a feature: leave it out of the correlation
# so it neither shows up as a bar in the chart nor breaks .corr() if it is non-numeric.
corr_matrix = feature_table.drop(columns='organization_id').corr()

corr_with_converted = corr_matrix['converted'].drop('converted').sort_values(ascending = False)

plt.figure(figsize=(10, 8))

# Barplot with horizontal bars, positive to right, negative to left
sns.barplot(x=corr_with_converted.values, y=corr_with_converted.index, palette='viridis')

plt.axvline(0, color='black', linewidth=1)  # vertical line at 0 for reference
plt.title('Correlation of Week 1 Activity Engagement with "Converted"')
plt.xlabel('Correlation')
plt.ylabel('Features')

plt.tight_layout()
plt.show()

#### Observations:
The correlation of every feature with `converted` is very weak (less than 0.07).

### State Transition probabilities: last actions before conversion/churn

In [None]:
def plot_reverse_transition_heatmap(df, title):
    """Plot P(previous activity | current activity) as a Plotly heatmap.

    Parameters
    ----------
    df : DataFrame
        Rows with `organization_id` and `activity_name`. The previous activity is
        taken from the preceding row of the same organization, so the rows are
        expected to already be in chronological order within each organization —
        NOTE(review): the function does not sort; confirm callers do.
        The frame is not modified.
    title : str
        Figure title.

    Returns
    -------
    None
        The figure is displayed with `fig.show()`.
    """
    # assign() builds a new frame, so the helper column never appears on the caller's data
    df = df.assign(prev_state=df.groupby('organization_id')['activity_name'].shift(1))

    # Count (current, previous) pairs; the first row of each org has no previous state
    transitions = (
        df.dropna(subset=['prev_state'])
        .groupby(['activity_name', 'prev_state'])
        .size()
        .reset_index(name='count')
    )

    all_states = sorted(set(df['activity_name']).union(set(df['prev_state'].dropna())))

    # Pivot so rows = current state, columns = previous state; reindex to a square
    # matrix over every state so both axes list the same activities
    transition_matrix = transitions.pivot(index='activity_name', columns='prev_state', values='count').fillna(0)
    transition_matrix = transition_matrix.reindex(index=all_states, columns=all_states, fill_value=0)

    # Normalize by current state (rows) to get P(prev_state | current_state);
    # rows without any transition divide by zero and are reset to 0
    transition_prob = transition_matrix.div(transition_matrix.sum(axis=1), axis=0).fillna(0)

    fig = px.imshow(
        transition_prob,
        labels=dict(x="Previous Activity", y="Current Activity", color="Reverse Transition Probability"),
        x=transition_prob.columns,
        y=transition_prob.index,
        color_continuous_scale='Blues',
        title=title,
        aspect="auto",
        width=900,
        height=900
    )

    fig.update_layout(yaxis_autorange='reversed')
    fig.show()

In [None]:
admin_activities = ['Absence.Request.Approved', 'Absence.Request.Rejected', 'Integration.Xero.PayrollExport.Synced', 'Timesheets.BulkApprove.Confirmed',
                    'PunchClock.Entry.Edited', 'Revenue.Budgets.Created', 'Scheduling.OpenShiftRequest.Approved',
                    'Scheduling.Template.ApplyModal.Applied', 'Communication.Message.Created', 'Scheduling.Shift.Approved',
                    'Scheduling.Shift.HandoverAccepted'
                   ]

# Keep admin activities only. The explicit .copy() makes `df` an independent frame,
# so adding a column below does not raise SettingWithCopyWarning.
df = df_cap[df_cap['activity_name'].isin(admin_activities)].copy()


# Custom sorting priority used as a tie-breaker for rows sharing the same timestamp.
# NOTE(review): after the admin filter above only
# 'Scheduling.Template.ApplyModal.Applied' can match; the other two keys are not in
# admin_activities, so every other row falls back to 999.
activity_priority = {
    'Scheduling.Template.ApplyModal.Applied': 0,
    'Scheduling.Shift.Created': 1,
    'Scheduling.Shift.AssignmentChanged': 2
}

df['activity_priority'] = df['activity_name'].map(activity_priority).fillna(999)
df_sorted = df.sort_values(['organization_id', 'ts_start', 'activity_priority']).reset_index(drop=True)

# group_id increases whenever the organization or the activity changes from the
# previous row, so each run of consecutive identical (org, activity) rows shares one id
df_sorted['group_id'] = (
    (df_sorted['organization_id'] != df_sorted['organization_id'].shift()) |
    (df_sorted['activity_name'] != df_sorted['activity_name'].shift()) 
).cumsum()

# Collapse each run to its first row; the aggregation already yields exactly the
# columns organization_id, activity_name, ts_start, converted, converted_at
df_rollup = (
    df_sorted
    .groupby('group_id')
    .agg(
        organization_id=('organization_id', 'first'),
        activity_name=('activity_name', 'first'),
        ts_start=('ts_start', 'first'),
        converted=('converted', 'first'),
        converted_at=('converted_at', 'first')
    )
    .reset_index(drop=True)
)
df_rollup

In [None]:
# Split the rolled-up activity sequence by converted status

#activities_to_exclude = ['Scheduling.Shift.Created', 'Mobile.Schedule.Loaded', 'Scheduling.Shift.AssignmentChanged'] 
# Activity lists by role. Neither list is applied in this cell: both filters below are commented out.
employee_activities = [ 'PunchClock.PunchedIn', 'PunchClock.PunchedOut', 'Absence.Request.Created', 'Scheduling.Availability.Set', 
                        'Scheduling.OpenShiftRequest.Created', 'Break.Activate.Finished', 'Break.Activate.Started',
                        'PunchClockEndNote.Add.Completed',  'PunchClockStartNote.Add.Completed', 'Scheduling.ShiftSwap.Created'
                      ]
# Same admin list as in the roll-up cell above
admin_activities = ['Absence.Request.Approved', 'Absence.Request.Rejected', 'Integration.Xero.PayrollExport.Synced', 'Timesheets.BulkApprove.Confirmed',
                    'PunchClock.Entry.Edited', 'Revenue.Budgets.Created', 'Scheduling.OpenShiftRequest.Approved',
                    'Scheduling.Template.ApplyModal.Applied', 'Communication.Message.Created', 'Scheduling.Shift.Approved',
                    'Scheduling.Shift.HandoverAccepted'
                   ]

df = df_rollup.copy()
#df = df[~df['activity_name'].isin(employee_activities)]
#df = df[df['activity_name'].isin(admin_activities)]

df_converted = df[df['converted'] == 1]
df_not_converted = df[df['converted'] == 0]

# Prepare chains for each group
# NOTE(review): prepare_chains is not defined in this part of the notebook and the
# heatmap calls below are disabled, so this cell currently produces no figure.
#df_chains_converted = prepare_chains(df_converted)
#df_chains_not_converted = prepare_chains(df_not_converted)

# Plot heatmaps
#plot_reverse_transition_heatmap(df_chains_converted, "Reverse Transition - Converted")
#plot_reverse_transition_heatmap(df_chains_not_converted, "Reverse Transition - Not Converted")

#### Observations

- Most employee-driven activities result in browsing mobile schedule which could mean that this is the default screen in the mobile app.
- Since Admins are the ones making the decision about conversion, I excluded employee-only activities and scheduling shifts from the list to get the most common conversion and churn drivers.

## Feature engineering and prediction

In [None]:
#pd.set_option('display.max_rows', 100)
# Working copy of df_cap_time for the feature-engineering section
df_fe = df_cap_time.copy()
df_fe

In [None]:
# Organization-level dimensions: one row per (organization_id, converted).
# Each listed column is reduced with 'first' within its group.
dim_columns = [
    'days_to_action',
    'first_action_weekday',
    'first_action_hour',
    'active_span_days',
    'active_days',
    'active_weeks',
    'activity_density',
]
df_dim = (
    df_fe
    .groupby(['organization_id', 'converted'])
    .agg(**{column: (column, 'first') for column in dim_columns})
    .sort_values(['organization_id', 'converted'])
    .reset_index()
)
#df_dim['first_action_week'] = df_fe['first_action_ts'].dt.isocalendar().week.astype(int)

# d1_eng = 1 when the organization has at least one row with
# days_since_first_action == 1, otherwise 0
on_day_one = df_fe['days_since_first_action'] == 1
d1_flag = df_fe[on_day_one].groupby('organization_id').size().to_frame('d1_eng')
d1_flag['d1_eng'] = 1

df_dim = df_dim.merge(d1_flag[['d1_eng']], how='left', on='organization_id')
df_dim['d1_eng'] = df_dim['d1_eng'].fillna(0).astype(int)

df_dim

In [None]:
df_fe_agg = records_kde_by_activity(df_fe, plot=False, user_group_col='converted', 
                                    activity_col='activity_name_short', plots_per_row=4, 
                                    min_days=0, max_days=31) # NOTE(review): max_days=31 keeps days 0-31, not "the first 2 weeks" as the old comment said — confirm the window
df_fe_agg

In [None]:
# One row per organization, one column per short activity name holding avg_daily
# (0 where the organization has no row for that activity).
# NOTE(review): the previous comment said "just 2 days of engagement", but df_fe_agg
# is built with min_days=0, max_days=31 in the cell above — confirm the window.
metrics_df = df_fe_agg.pivot_table(
        index='organization_id',
        columns='activity_name_short',
        values='avg_daily',
        aggfunc='mean',
        fill_value=0
    ).reset_index()

metrics_df

In [None]:
# Columns of metrics_df whose values never vary (at most one distinct value)
zero_var_cols = metrics_df.columns[metrics_df.nunique() <= 1]
print("Columns with zero or one unique value:", zero_var_cols.tolist())

In [None]:
# Model input: organization dimensions plus per-activity avg_daily metrics.
# Left join keeps every df_dim row; organizations without a metrics_df row get NaN metrics.
df_feat = df_dim.merge(metrics_df, on='organization_id', how='left')
df_feat.head()

In [None]:
#print(df_feat.isnull().sum())

In [None]:
#print(df_feat.head())
# List the feature columns that go into the models
print(df_feat.columns)

In [None]:
#pip install scikit-learn imbalanced-learn xgboost

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, classification_report, roc_auc_score, roc_curve
)
from imblearn.over_sampling import SMOTE

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

df=df_feat.copy()

X = df.drop(columns=['converted', 'organization_id'])
y = df['converted']

#categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
#X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)  

# Split BEFORE scaling: fitting the scaler on the full dataset would let the test
# rows influence the means / standard deviations used for training (data leakage).
# The partition itself depends only on y, test_size and random_state.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Scaler statistics come from the training rows only and are then applied unchanged
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train_raw), columns=X.columns, index=X_train_raw.index)
X_test = pd.DataFrame(scaler.transform(X_test_raw), columns=X.columns, index=X_test_raw.index)
#X_scaled = pd.DataFrame(scaler.fit_transform(X_encoded), columns=X_encoded.columns)
# Full matrix scaled with the train-fitted scaler, kept under its previous name
# in case later cells reference it
X_scaled = pd.DataFrame(scaler.transform(X), columns=X.columns, index=X.index)

# Apply SMOTE on training data only, so the test set keeps its real class balance
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Define models
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'XGBoost': XGBClassifier(eval_metric='logloss', random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'KNN': KNeighborsClassifier()
}

# Train, predict, and evaluate
for name, model in models.items():
    print(f"\n=== {name} ===")
    model.fit(X_train_res, y_train_res)
    
    y_pred = model.predict(X_test)
    y_proba = None
    # For AUROC, need predicted probabilities for the positive class
    if hasattr(model, "predict_proba"):
        y_proba = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):  # fallback
        y_proba = model.decision_function(X_test)
    
    # Accuracy
    acc = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {acc:.4f}")
    
    # Detailed classification report
    print("Classification report:")
    print(classification_report(y_test, y_pred, digits=4))
    
    # AUROC
    if y_proba is not None:
        auroc = roc_auc_score(y_test, y_proba)
        print(f"AUROC: {auroc:.4f}")
    else:
        print("AUROC: N/A (no probability estimates available)")

#### Prediction result

The XGBoost model has shown the best accuracy overall. Although more feature engineering and pruning are needed to improve the prediction of Converters, we can already look at the most important features.

In [None]:
import shap

# Refit a standalone XGBoost classifier with the same settings as in the model
# comparison, so it can be explained independently of the `models` dict.
xgb_params = {'eval_metric': 'logloss', 'random_state': 42}
xgb_model = XGBClassifier(**xgb_params)
xgb_model.fit(X_train_res, y_train_res)

# Build the SHAP explainer, using the resampled training data as background.
explainer = shap.Explainer(xgb_model, X_train_res)

# SHAP values for every row of the (SMOTE-resampled) training set.
shap_values = explainer(X_train_res)

In [None]:
# Summary plot of the SHAP values computed on X_train_res; show at most 30 features.
shap.summary_plot(shap_values, X_train_res, max_display=30)

#### Features
The model was trained on the first 2 weeks of the trial. According to the model, these are the most important features that had a clear positive or negative contribution:

Independent metrics:

- _Active span days_: organizations with a longer active span are less likely to convert.
- _Activity density_: organizations with higher activity density (ratio of active days to active weeks) are more likely to convert.

Note: the daily metrics were capped (first 3 days only), whereas the metrics above are historic.

App metrics:

- Organizations with a high _Shift assignment change rate_ are less likely to convert
- Organizations with higher _Punch in to View_ ratio are more likely to convert
- Organizations with higher _Template usage_ are more likely to convert
- Organizations with a low ratio of _Views to Shifts_ scheduled are more likely to convert -> this could indicate that they actively run a business and already have real employees
- Organizations with a high number of _Messages Created_ are less likely to convert -> this could indicate that the messaging tool is not convenient, or that users don't understand how to use other functionality

Punch in to punch out rate is quite low and often 0, so I assume that this is an optional action and employees are punched out automatically when their shift ends.

### Trial goals

Based on the model and EDA results, I am going to monitor these trial goals:

Core functionality:
Scheduling.Shift.Created -> Mobile.Schedule.Loaded -> Scheduling.Shift.Approved

Admin functionality:
Scheduling.Template.ApplyModal.Applied -> Scheduling.Shift.Created ->  Scheduling.Shift.AssignmentChanged -> Scheduling.Shift.Approved

Since there is no hard sequence of events, I ordered them in the order of the most intuitive discovery, in my opinion.