<h4><b>Importing necessary libraries and connecting to the database</b></h4>

In [1]:
import sqlite3
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Open the SQLite database that the query cells below read from via `conn`.
DB_PATH = "../saas.db"
conn = sqlite3.connect(DB_PATH)

<h4><b>Fetching Trial Account data from database</b></h4>

In [2]:
# Trial accounts (rows whose is_trial column equals the text 'True') together
# with the observation window used by the later cells: it opens on the signup
# date and closes 30 days after it.
TRIAL_ACCOUNTS_SQL = "SELECT account_id,signup_date FROM accounts WHERE is_trial='True'"

trial_acc = (
    pd.read_sql(TRIAL_ACCOUNTS_SQL, conn)
    .assign(signup_date=lambda acc: pd.to_datetime(acc["signup_date"]))
    .assign(
        # window_start is a copy of signup_date, kept only for naming convenience
        window_start=lambda acc: acc["signup_date"],
        window_end=lambda acc: acc["signup_date"] + pd.Timedelta(days=30),
    )
)
trial_acc.head()

Unnamed: 0,account_id,signup_date,window_start,window_end
0,A-1f0ac7,2023-08-27,2023-08-27,2023-09-26
1,A-6c093d,2023-04-14,2023-04-14,2023-05-14
2,A-462d45,2024-01-15,2024-01-15,2024-02-14
3,A-832ec2,2023-11-19,2023-11-19,2023-12-19
4,A-00cac8,2023-09-15,2023-09-15,2023-10-15


<h4><b>Fetching Subscriptions, Churn Date and Feature Usage data from the database</b></h4>

In [3]:
# Earliest churn date per account: the query collapses the churn table to one
# row per account_id, and the date text is parsed into datetimes.
churn = pd.read_sql(
    "SELECT account_id, MIN(churn_date) AS churn_date FROM churn GROUP BY account_id",
    conn,
).assign(churn_date=lambda events: pd.to_datetime(events["churn_date"]))

# Subscription periods with both boundaries parsed as datetimes. A missing
# end_date becomes NaT, which the overlap filter later treats as open-ended.
subs = pd.read_sql(
    "SELECT subscription_id, account_id , start_date , end_date FROM subscriptions ",
    conn,
).assign(
    start_date=lambda periods: pd.to_datetime(periods["start_date"]),
    end_date=lambda periods: pd.to_datetime(periods["end_date"]),
)

# Every column of the feature_usage table, with usage_date parsed as a datetime.
feature_usage = pd.read_sql("SELECT * FROM feature_usage", conn).assign(
    usage_date=lambda events: pd.to_datetime(events["usage_date"])
)

<h4><b>Joining Trial Accounts data to Subscriptions data</b></h4>

In [13]:
# Attach each trial account's subscriptions, then keep only the subscriptions
# that overlap the account's window: they must start on or before window_end
# and either have no end date or end on or after window_start.
# Comparisons against NaT are False, so accounts that matched no subscription
# in the left merge are dropped by the filter as well.
trial_subs = (
    trial_acc
    .merge(subs, how="left", on="account_id")
    .loc[
        lambda pairs: (pairs["start_date"] <= pairs["window_end"])
        & (
            pairs["end_date"].isna()
            | (pairs["end_date"] >= pairs["window_start"])
        )
    ]
)
trial_subs.head()

Unnamed: 0,account_id,signup_date,window_start,window_end,subscription_id,start_date,end_date
15,A-6c093d,2023-04-14,2023-04-14,2023-05-14,S-0ff481,2023-04-30,NaT
21,A-462d45,2024-01-15,2024-01-15,2024-02-14,S-9dda85,2024-01-16,2024-05-29
22,A-462d45,2024-01-15,2024-01-15,2024-02-14,S-5a4fb5,2024-02-13,NaT
46,A-00cac8,2023-09-15,2023-09-15,2023-10-15,S-2e965d,2023-09-16,NaT
60,A-f446b6,2023-05-12,2023-05-12,2023-06-11,S-19b415,2023-05-28,NaT


<h4><b>Joining Trial Subscription data with Feature Usage data</b> (within the window period)</h4>

In [25]:
# Bring in the usage events of every in-window subscription and keep only the
# events dated inside the account's window (both boundaries inclusive).
# Rows without a usage_date (subscriptions with no usage) fail the range test
# and are therefore removed too.
trial_usage = (
    trial_subs
    .merge(feature_usage, how="left", on="subscription_id")
    .loc[
        lambda events: events["usage_date"].between(
            events["window_start"], events["window_end"]
        )
    ]
)
trial_usage.head()

Unnamed: 0,account_id,signup_date,window_start,window_end,subscription_id,start_date,end_date,usage_id,usage_date,feature_name,usage_count,usage_duration_secs,error_count,is_beta_feature
11,A-462d45,2024-01-15,2024-01-15,2024-02-14,S-5a4fb5,2024-02-13,NaT,U-d9d788,2024-01-15,feature_12,11.0,1760.0,0.0,False
43,A-3b5cd1,2023-09-12,2023-09-12,2023-10-12,S-b4ac47,2023-10-07,NaT,U-00d094,2023-09-29,feature_27,11.0,3157.0,2.0,False
44,A-3b5cd1,2023-09-12,2023-09-12,2023-10-12,S-b4ac47,2023-10-07,NaT,U-5dae6a,2023-09-29,feature_16,13.0,2535.0,0.0,True
56,A-9badbd,2024-06-19,2024-06-19,2024-07-19,S-f421c8,2024-06-29,NaT,U-ab2170,2024-07-01,feature_20,14.0,294.0,1.0,False
59,A-9badbd,2024-06-19,2024-06-19,2024-07-19,S-f421c8,2024-06-29,NaT,U-e6e444,2024-07-02,feature_33,9.0,1809.0,0.0,True


<h4><b>Aggregating the Trial Account's Usage data to get Behavioral Features data in the 30 days window </b></h4>

In [6]:
# Per-account summaries of the in-window usage events:
# output column -> (source column, aggregation).
WINDOW_AGGREGATIONS = {
    "usage_events_30d": ("usage_count", "sum"),
    "active_days_30d": ("usage_date", "nunique"),
    "features_used_30d": ("feature_name", "nunique"),
    "total_errors_30d": ("error_count", "sum"),
    "total_usage_time_30d": ("usage_duration_secs", "sum"),
}

# NOTE(review): only accounts that still have at least one row in trial_usage
# appear here; trial accounts with no usage inside their window are absent
# rather than present with zeros. Confirm this is the intended cohort.
usage_by_account = (
    trial_usage
    .groupby(["account_id", "signup_date"])
    .agg(**WINDOW_AGGREGATIONS)
    .reset_index()
)

# Attach each account's churn date; accounts that never churned keep NaT.
behavioral_features = usage_by_account.merge(churn, how="left", on="account_id")
behavioral_features.head()

Unnamed: 0,account_id,signup_date,usage_events_30d,active_days_30d,features_used_30d,total_errors_30d,total_usage_time_30d,churn_date
0,A-139c3b,2024-11-21,31.0,3,3,0.0,11593.0,NaT
1,A-1e50e0,2023-02-21,9.0,1,1,4.0,2331.0,2023-11-01
2,A-3b5cd1,2023-09-12,24.0,1,2,2.0,5692.0,2024-01-21
3,A-417d2f,2023-07-04,16.0,1,1,0.0,7680.0,2024-12-30
4,A-462d45,2024-01-15,11.0,1,1,0.0,1760.0,2024-05-17


<h4><b>Adding an early-churn flag to the behavioral data </b>(churn within 90 days of signup counts as early churn in this case)</h4><p>NOTE: We used 90 days as the threshold because the steepest retention drop was observed in the 1–3 month period, especially for trial accounts.</p>

In [7]:
# An account is an early churner when its churn date falls on or before the
# 90th day after signup. Accounts without a churn date (NaT) compare as False,
# so the result is already a plain boolean column.
early_churn_deadline = behavioral_features["signup_date"] + pd.Timedelta(days=90)
behavioral_features["early_churn"] = behavioral_features["churn_date"].le(
    early_churn_deadline
)
behavioral_features.head()

Unnamed: 0,account_id,signup_date,usage_events_30d,active_days_30d,features_used_30d,total_errors_30d,total_usage_time_30d,churn_date,early_churn
0,A-139c3b,2024-11-21,31.0,3,3,0.0,11593.0,NaT,False
1,A-1e50e0,2023-02-21,9.0,1,1,4.0,2331.0,2023-11-01,False
2,A-3b5cd1,2023-09-12,24.0,1,2,2.0,5692.0,2024-01-21,False
3,A-417d2f,2023-07-04,16.0,1,1,0.0,7680.0,2024-12-30,False
4,A-462d45,2024-01-15,11.0,1,1,0.0,1760.0,2024-05-17,False


<h4><b>Calculating Early Churn Rate for activated and non-activated users (active users have active_days_30d>=2)</b></h4>

In [8]:
# "Active" means usage on at least two distinct days inside the 30-day window.
behavioral_features["is_active"] = behavioral_features["active_days_30d"].ge(2)

# Share of early churners within each activation group.
active_early_churn = (
    behavioral_features
    .groupby("is_active")["early_churn"]
    .mean()
    .reset_index(name="early_churn_rate")
)
active_early_churn

Unnamed: 0,is_active,early_churn_rate
0,False,0.529412
1,True,0.25


<h5>Active accounts have a lower early churn rate (about 25% vs. 53% for non-active accounts)</h5>

<h4><b>Calculating Early Churn Rate for High Usage and Low Usage Accounts</b></h4>

In [9]:
# Median split on total usage time; accounts exactly at the median count as
# high usage.
usage_time_30d = behavioral_features["total_usage_time_30d"]
behavioral_features["high_usage"] = usage_time_30d.ge(usage_time_30d.median())

# Share of early churners within each usage group.
high_usage_churn = (
    behavioral_features
    .groupby("high_usage")["early_churn"]
    .mean()
    .reset_index(name="early_churn_rate")
)
high_usage_churn

Unnamed: 0,high_usage,early_churn_rate
0,False,0.5
1,True,0.454545


## Conclusion
<h5>Behavioral cohort analysis among trial users indicates that early activation, defined as engagement across multiple days within the first 30 days, is strongly associated with lower early churn. In contrast, overall usage volume (total usage time) alone does not meaningfully differentiate churn risk. This suggests that early habit formation and repeated engagement, rather than sheer activity intensity, appear to be the primary drivers of trial retention.</h5>
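<p><b>Note:</b> Given the small sample size, these results are interpreted directionally and used to guide further analysis rather than to draw causal conclusions. The cohort also contains only trial accounts with at least one usage event recorded inside their 30-day window; accounts with no usage in that window are not represented.</p>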

<h4>Saving the behavioral_features DataFrame as a CSV for modelling</h4>

In [None]:
# Persist the per-account behavioral features for the modelling step.
# DataFrame.to_csv does not create missing parent folders, so make sure the
# output directory exists first; otherwise this cell fails on a fresh checkout.
features_csv_path = Path("../data/processed/trial_behavioral_features.csv")
features_csv_path.parent.mkdir(parents=True, exist_ok=True)
behavioral_features.to_csv(features_csv_path, index=False)
