In [429]:
import pandas as pd
import numpy as np
import sqlite3
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

#Set plotting style
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('deep')

In [430]:
conn = sqlite3.connect('papcorns.sqlite')

In [431]:
#Import Users Table into dataframe

users_df = pd.read_sql_query("SELECT*FROM users;",conn)

In [448]:
#Import Users Event Table into dataframe

events_df = pd.read_sql_query("SELECT*FROM user_events;",conn)

In [433]:
#Check columns and records's amount of Users df

users_df.shape

(1002, 5)

In [434]:
#Check columns and records's amount of Users Event df

events_df.shape

(3486, 5)

In [435]:
users_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1002 entries, 0 to 1001
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  1002 non-null   int64 
 1   created_at          1002 non-null   object
 2   attribution_source  1002 non-null   object
 3   country             1002 non-null   object
 4   name                1002 non-null   object
dtypes: int64(1), object(4)
memory usage: 39.3+ KB


In [436]:
events_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3486 entries, 0 to 3485
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          3486 non-null   int64  
 1   created_at  3486 non-null   object 
 2   user_id     3486 non-null   int64  
 3   event_name  3486 non-null   object 
 4   amount_usd  1231 non-null   float64
dtypes: float64(1), int64(2), object(2)
memory usage: 136.3+ KB


In [437]:
#5 head records of Users df

users_df.head()

Unnamed: 0,id,created_at,attribution_source,country,name
0,1,2024-05-07T00:00:00,instagram,US,Eve Brown
1,2,2024-10-12T00:00:00,instagram,NL,Frank Moore
2,3,2024-10-15T00:00:00,tiktok,TR,Ivy Anderson
3,4,2024-08-28T00:00:00,tiktok,TR,Alice Brown
4,5,2024-04-03T00:00:00,organic,NL,Bob Moore


In [438]:
#5 head records of Users Event df

events_df.head()

Unnamed: 0,id,created_at,user_id,event_name,amount_usd
0,1,2024-05-07T00:00:00,1,app_install,
1,2,2024-05-12T00:00:00,1,trial_started,
2,3,2024-05-24T00:00:00,1,trial_cancelled,
3,4,2024-10-12T00:00:00,2,app_install,
4,5,2024-10-13T00:00:00,2,trial_started,


In [452]:
#lets remove id in events_df. user_id is reference od id in users_df


events_df = events_df.drop('id',axis = 1, inplace = True)

In [439]:
#Check Missing values by columns Users df

#Note  No missing values in Users df

users_df.isnull().sum()

id                    0
created_at            0
attribution_source    0
country               0
name                  0
dtype: int64

In [440]:
#Check Missing values by columns Users Event df

#Note!... 2255 of 3486 records of amount_usd column are null that is too high
#Take a look and give address what category have missing values 'most'

events_df.isnull().sum()

id               0
created_at       0
user_id          0
event_name       0
amount_usd    2255
dtype: int64

In [379]:
#There is a problem with created column cronologically. There are some date at the future so those values should be removed from dataset

events_df['created_at'] = pd.to_datetime(events_df['created_at'])


In [441]:
events_df.shape

(3486, 5)

In [450]:
with pd.option_context('display.max_rows',None):
    print(events_df)

        id           created_at  user_id              event_name  amount_usd
0        1  2024-05-07T00:00:00        1             app_install         NaN
1        2  2024-05-12T00:00:00        1           trial_started         NaN
2        3  2024-05-24T00:00:00        1         trial_cancelled         NaN
3        4  2024-10-12T00:00:00        2             app_install         NaN
4        5  2024-10-13T00:00:00        2           trial_started         NaN
5        6  2024-10-20T00:00:00        2    subscription_started        8.99
6        7  2024-11-19T00:00:00        2    subscription_renewed        8.99
7        8  2024-12-19T00:00:00        2    subscription_renewed        8.99
8        9  2025-01-18T00:00:00        2    subscription_renewed        8.99
9       10  2025-02-12T00:00:00        2  subscription_cancelled         NaN
10      11  2024-10-15T00:00:00        3             app_install         NaN
11      12  2024-10-19T00:00:00        3           trial_started         NaN

In [449]:
events_df

Unnamed: 0,id,created_at,user_id,event_name,amount_usd
0,1,2024-05-07T00:00:00,1,app_install,
1,2,2024-05-12T00:00:00,1,trial_started,
2,3,2024-05-24T00:00:00,1,trial_cancelled,
3,4,2024-10-12T00:00:00,2,app_install,
4,5,2024-10-13T00:00:00,2,trial_started,
...,...,...,...,...,...
3481,3482,2025-02-25T00:00:00,1000,trial_cancelled,
3482,3483,2025-02-25T00:00:00,1001,app_install,
3483,3484,2025-02-25T00:00:00,1001,trial_started,
3484,3485,2025-02-25T00:00:00,1001,subscription_started,9.99


In [381]:
events_df.drop(events_df[events_df['created_at']>pd.Timestamp.now()].index,axis = 0,inplace=True)

In [382]:
events_df.shape

(3293, 5)

In [383]:
print(3486-3293,' rows are dropped from events_df')

193  rows are dropped from events_df


In [384]:
#Value count by columns for Users df


for  i in users_df.columns:
    print("-"*18+"  ",i,"  "+"-"*18)
    print('There are ',users_df[i].nunique(),f' unique records of {i}',end= '\n\n')
    print(users_df[i].value_counts(ascending=False),end='\n\n\n\n')
    

------------------   id   ------------------
There are  1002  unique records of id

id
1       1
673     1
660     1
661     1
662     1
       ..
340     1
341     1
342     1
343     1
1002    1
Name: count, Length: 1002, dtype: int64



------------------   created_at   ------------------
There are  384  unique records of created_at

created_at
2024-07-01T00:00:00    8
2024-10-24T00:00:00    8
2024-08-15T00:00:00    7
2024-11-13T00:00:00    7
2024-07-06T00:00:00    6
                      ..
2024-03-03T00:00:00    1
2024-12-12T00:00:00    1
2024-06-06T00:00:00    1
2024-08-02T00:00:00    1
2024-10-13T00:00:00    1
Name: count, Length: 384, dtype: int64



------------------   attribution_source   ------------------
There are  3  unique records of attribution_source

attribution_source
tiktok       352
organic      344
instagram    306
Name: count, dtype: int64



------------------   country   ------------------
There are  3  unique records of country

country
TR    354
US    340
NL

In [385]:
#Value count by columns for Users Eventdf

for  i in events_df.columns:
    print("-"*18+"  ",i,"  "+"-"*18)
    print('There are ',events_df[i].nunique(),f' unique records of {i}',end= '\n\n')
    print(events_df[i].value_counts(ascending=False),end='\n\n\n\n')

------------------   id   ------------------
There are  3293  unique records of id

id
1       1
2315    1
2302    1
2303    1
2304    1
       ..
1166    1
1167    1
1168    1
1169    1
3481    1
Name: count, Length: 3293, dtype: int64



------------------   created_at   ------------------
There are  418  unique records of created_at

created_at
2025-02-15    17
2024-05-14    17
2025-02-13    16
2025-01-05    16
2025-02-01    16
              ..
2025-01-20     2
2024-03-22     2
2024-01-26     2
2024-01-06     1
2024-01-16     1
Name: count, Length: 418, dtype: int64



------------------   user_id   ------------------
There are  1000  unique records of user_id

user_id
359    8
102    8
311    8
24     8
819    8
      ..
220    1
786    1
224    1
387    1
504    1
Name: count, Length: 1000, dtype: int64



------------------   event_name   ------------------
There are  6  unique records of event_name

event_name
app_install               1000
trial_started              681
subscri

In [386]:
#Create new dataframe having missing values of amount_usd column
# Addressing missing values in Users event df

null_df = events_df[events_df['amount_usd'].isnull()]

In [387]:
#groupby created_at

#There is no make sense result grouping by created_at !... 

null_df.groupby(['created_at'])['created_at'].count()

created_at
2024-01-01    3
2024-01-02    2
2024-01-03    3
2024-01-04    3
2024-01-05    4
             ..
2025-02-17    1
2025-02-18    2
2025-02-19    2
2025-02-20    1
2025-02-21    1
Name: created_at, Length: 417, dtype: int64

In [388]:
#groupby event_name

#In this analysis we realize that in those subcategory of 
#event_name (app_install,subscription_cancelled,trial_cancelled,trial_started) No Cost
#that s why the values are null 

null_df.groupby(['event_name'])['event_name'].count()

event_name
app_install               1000
subscription_cancelled     300
trial_cancelled            197
trial_started              681
Name: event_name, dtype: int64

## Core Tasks 

#### Join the table for handling core  tasks

In [389]:
#Join dataframe to combine country and amount_usd at the same dataframe
#this allow calculate revenue by country
#As noted in the document user_id in Events Tbale is reference of id in Users table so mergen based on those columns

In [390]:
df = pd.merge(users_df,events_df, how = 'inner', left_on = 'id',right_on = 'user_id')

In [391]:
df.head()

Unnamed: 0,id_x,created_at_x,attribution_source,country,name,id_y,created_at_y,user_id,event_name,amount_usd
0,1,2024-05-07T00:00:00,instagram,US,Eve Brown,1,2024-05-07,1,app_install,
1,1,2024-05-07T00:00:00,instagram,US,Eve Brown,2,2024-05-12,1,trial_started,
2,1,2024-05-07T00:00:00,instagram,US,Eve Brown,3,2024-05-24,1,trial_cancelled,
3,2,2024-10-12T00:00:00,instagram,NL,Frank Moore,4,2024-10-12,2,app_install,
4,2,2024-10-12T00:00:00,instagram,NL,Frank Moore,5,2024-10-13,2,trial_started,


In [392]:
#remove  idle created columns after merging

df.drop(['id_y','created_at_y'],axis =1,inplace=True)

In [393]:
#rename suffixed columns into original

df.rename({'id_x':'id','created_at_x':'created_at'},inplace=True,axis=1)

In [394]:
#keep a copy of dataframe just in case

df_backup = df.copy()

In [395]:
df

Unnamed: 0,id,created_at,attribution_source,country,name,user_id,event_name,amount_usd
0,1,2024-05-07T00:00:00,instagram,US,Eve Brown,1,app_install,
1,1,2024-05-07T00:00:00,instagram,US,Eve Brown,1,trial_started,
2,1,2024-05-07T00:00:00,instagram,US,Eve Brown,1,trial_cancelled,
3,2,2024-10-12T00:00:00,instagram,NL,Frank Moore,2,app_install,
4,2,2024-10-12T00:00:00,instagram,NL,Frank Moore,2,trial_started,
...,...,...,...,...,...,...,...,...
3288,999,2024-12-24T00:00:00,organic,NL,Charlie Davis,999,trial_started,
3289,999,2024-12-24T00:00:00,organic,NL,Charlie Davis,999,subscription_started,8.99
3290,999,2024-12-24T00:00:00,organic,NL,Charlie Davis,999,subscription_renewed,8.99
3291,1000,2025-02-13T00:00:00,organic,NL,Jack Anderson,1000,app_install,


In [396]:
#Convert created_at column  into datetime format

df['created_at'] = pd.to_datetime(df['created_at'])

In [397]:
#check missing values after joining. This might be to confirm if joining is correct or not

df.isnull().sum()

id                       0
created_at               0
attribution_source       0
country                  0
name                     0
user_id                  0
event_name               0
amount_usd            2178
dtype: int64

### 1 -  Calculate the total revenue generated from subscriptions for each country

In [398]:
df.groupby('country')['amount_usd'].sum()

country
NL    3164.48
TR    1976.04
US    3666.33
Name: amount_usd, dtype: float64

### 2 - Calculate the total number of trials given to users who came from instagram

In [399]:
df[(df['attribution_source']=='instagram')&(df['event_name']=='trial_started')]

Unnamed: 0,id,created_at,attribution_source,country,name,user_id,event_name,amount_usd
1,1,2024-05-07,instagram,US,Eve Brown,1,trial_started,
4,2,2024-10-12,instagram,NL,Frank Moore,2,trial_started,
44,12,2024-01-08,instagram,US,Frank Miller,12,trial_started,
63,15,2024-07-10,instagram,US,Bob Miller,15,trial_started,
66,16,2024-04-26,instagram,NL,Alice Brown,16,trial_started,
...,...,...,...,...,...,...,...,...
3238,981,2024-04-16,instagram,US,Eve Wilson,981,trial_started,
3262,989,2025-01-12,instagram,NL,Alice Jones,989,trial_started,
3265,990,2024-11-24,instagram,TR,David Davis,990,trial_started,
3270,992,2025-02-07,instagram,US,Grace Jones,992,trial_started,


In [400]:
#Check each trial record belongs to unique user_id

df[(df['attribution_source']=='instagram')&(df['event_name']=='trial_started')]['user_id'].nunique()

209

In [401]:
print('Amount of trials from intagram :',df[(df['attribution_source']=='instagram')&(df['event_name']=='trial_started')]['event_name'].count())

Amount of trials from intagram : 209


### 3 - Create a new column named 'acquisition_channel' by categorizing users based on their 'attribution_source'

In [402]:
#Use list comprehension based on attribution_source values 'Paid' for instagram and tiktok and Organic for organic
#Crete new column

df['acquisition_channel'] = pd.Series(['Organic' if i=='organic' else 'Paid' for i in df['attribution_source']])

In [403]:
df.sample(20)

Unnamed: 0,id,created_at,attribution_source,country,name,user_id,event_name,amount_usd,acquisition_channel
459,145,2024-03-25,tiktok,NL,Alice Williams,145,app_install,,Paid
1022,309,2024-05-25,instagram,US,Charlie Smith,309,subscription_renewed,9.99,Paid
573,179,2024-04-04,organic,US,Frank Brown,179,subscription_started,9.99,Organic
1872,568,2024-04-01,instagram,US,Frank Williams,568,subscription_renewed,9.99,Paid
1562,472,2024-10-29,organic,TR,Jack Smith,472,subscription_cancelled,,Organic
1767,531,2024-09-28,instagram,TR,Jack Moore,531,subscription_renewed,4.99,Paid
347,109,2024-08-29,tiktok,US,Ivy Williams,109,app_install,,Paid
209,61,2024-12-15,tiktok,US,David Taylor,61,trial_started,,Paid
1456,444,2025-01-08,organic,NL,Charlie Jones,444,trial_started,,Organic
254,77,2025-01-05,organic,NL,Bob Wilson,77,trial_cancelled,,Organic


 ### 4 - Analyze the trial-to-subscription conversion rate : 

- Calculate the overall conversion rate
- Break down the conversion rate by attribution_source

In [404]:
df['event_name'].value_counts()

event_name
app_install               1000
trial_started              681
subscription_renewed       639
subscription_started       476
subscription_cancelled     300
trial_cancelled            197
Name: count, dtype: int64

In [405]:
#For calculation the trial to subscription overall. just divide subscription_started by trial_started values

print('Overall conversion rate of trial-to-subscription : ', f'%{round((481/682)*100,2)}')

Overall conversion rate of trial-to-subscription :  %70.53


In [406]:
#Groupby the dataframe based on attribution_source and return value_counts
#Then find trial to subscription rate by subcategory of attribution_source

df.groupby('attribution_source')[['attribution_source','event_name']].value_counts()

attribution_source  event_name            
instagram           app_install               305
                    trial_started             209
                    subscription_renewed      198
                    subscription_started      148
                    subscription_cancelled     95
                    trial_cancelled            60
organic             app_install               343
                    subscription_renewed      238
                    trial_started             236
                    subscription_started      167
                    subscription_cancelled    107
                    trial_cancelled            66
tiktok              app_install               352
                    trial_started             236
                    subscription_renewed      203
                    subscription_started      161
                    subscription_cancelled     98
                    trial_cancelled            71
Name: count, dtype: int64

In [407]:
#As seen the calculation, type of attribution sources do not influent conversion rate specifically.


print('Conversion rate of trial-to-subscription by "instagram" : ', f'%{round((149/210)*100,2)}',end='\n\n')
print('Conversion rate of trial-to-subscription by "organic" : ', f'%{round((169/236)*100,2)}',end='\n\n')
print('Conversion rate of trial-to-subscription by "tiktok" : ', f'%{round((163/236)*100,2)}')

Conversion rate of trial-to-subscription by "instagram" :  %70.95

Conversion rate of trial-to-subscription by "organic" :  %71.61

Conversion rate of trial-to-subscription by "tiktok" :  %69.07


 ### 5 - Calculate the median subscription duration (in months) for each country

In [408]:
#convert created_at column of event_df dataframe



In [410]:
#Create pivot table on events_df. This allow us compare date of events according to userd_id
#We will see each event date under a column named with events for each user_id 

df_pivot = events_df.pivot_table(index='user_id', columns='event_name', values='created_at')

In [411]:
#Adding countries into pivot table by merging based on user_id and id in users_Df

df_pivot = pd.merge(users_df,df_pivot, how = 'inner', left_on = 'id',right_on = 'user_id')

In [412]:
df_pivot

Unnamed: 0,id,created_at,attribution_source,country,name,app_install,subscription_cancelled,subscription_renewed,subscription_started,trial_cancelled,trial_started
0,1,2024-05-07T00:00:00,instagram,US,Eve Brown,2024-05-07,NaT,NaT,NaT,2024-05-24,2024-05-12
1,2,2024-10-12T00:00:00,instagram,NL,Frank Moore,2024-10-12,2025-02-12,2024-12-19,2024-10-20,NaT,2024-10-13
2,3,2024-10-15T00:00:00,tiktok,TR,Ivy Anderson,2024-10-15,2025-01-20,2024-12-20,2024-10-21,NaT,2024-10-19
3,4,2024-08-28T00:00:00,tiktok,TR,Alice Brown,2024-08-28,NaT,NaT,NaT,2024-09-06,2024-08-31
4,5,2024-04-03T00:00:00,organic,NL,Bob Moore,2024-04-03,NaT,NaT,NaT,NaT,NaT
...,...,...,...,...,...,...,...,...,...,...,...
995,996,2025-01-28T00:00:00,organic,TR,Jack Anderson,2025-01-28,NaT,NaT,2025-02-06,NaT,2025-02-01
996,997,2024-03-06T00:00:00,organic,NL,Bob Jones,2024-03-06,NaT,NaT,NaT,2024-03-16,2024-03-08
997,998,2025-02-01T00:00:00,instagram,TR,Bob Davis,2025-02-01,NaT,NaT,NaT,NaT,NaT
998,999,2024-12-24T00:00:00,organic,NL,Charlie Davis,2024-12-24,NaT,2025-02-02,2025-01-03,NaT,2024-12-29


In [413]:
#Create new df_pivot just store subscription based columns and essential columns

df_pivot_subscription = df_pivot[['id','country','subscription_cancelled','subscription_started']]

In [414]:
#For subscription not canceled , we calculate duration through subtraction 'subscription_started' from 'now'

now = pd.Timestamp.now()

df_pivot_subscription['duration-InMonth'] = round(
    ((df_pivot_subscription['subscription_cancelled'].fillna(now) - df_pivot_subscription['subscription_started']).dt.days) / 30, 2
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_pivot_subscription['duration-InMonth'] = round(


In [415]:
#id stands for user_id aswell so when merging we create our dataframe based on id and user_id

df_pivot_subscription

Unnamed: 0,id,country,subscription_cancelled,subscription_started,duration-InMonth
0,1,US,NaT,NaT,
1,2,NL,2025-02-12,2024-10-20,3.83
2,3,TR,2025-01-20,2024-10-21,3.03
3,4,TR,NaT,NaT,
4,5,NL,NaT,NaT,
...,...,...,...,...,...
995,996,TR,NaT,2025-02-06,0.50
996,997,NL,NaT,NaT,
997,998,TR,NaT,NaT,
998,999,NL,NaT,2025-01-03,1.63


In [416]:
#Groupby on country and return median value of 'duration-InMonth' for each country

df_pivot_subscription[df_pivot_subscription['duration-InMonth'].notna()].groupby('country')['duration-InMonth'].median()

country
NL    2.515
TR    2.400
US    2.315
Name: duration-InMonth, dtype: float64

### 6 - Calculate the Average Lifetime Value (LTV) by country

In [417]:
#Calculate average revenue per user. Find total revenue and unique amount of users


total_revenue = df['amount_usd'].sum()
total_users = df['user_id'].nunique()  
rev_by_user = total_revenue / total_users

In [418]:
#Calculate "average lifespan" from duration-in month - so revenue taken only from subscription 
#We can use our "df_pivot_subscription" dataframe

average_lifespan = df_pivot_subscription['duration-InMonth'].mean()

In [419]:
average_lifespan

3.347752100840336

In [420]:
#Now Calculate LTV - Lifetiem Values by multiplication average lifespan with average revenue by user

LTV = round(rev_by_user*average_lifespan,2)

In [421]:
print('Average Lifetime Value(LTV) :',f'${LTV}')

Average Lifetime Value(LTV) : $29.48


## BONUS Tasks 

### 7 - Predict the churn probability for use #1002 (Clark Kent)

- Use any relevant features from the dataset
- Explain your model selection and feature engineering process
- Provide confidence intervals if applicaple

In [422]:
df_pivot.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   id                      1000 non-null   int64         
 1   created_at              1000 non-null   object        
 2   attribution_source      1000 non-null   object        
 3   country                 1000 non-null   object        
 4   name                    1000 non-null   object        
 5   app_install             1000 non-null   datetime64[ns]
 6   subscription_cancelled  300 non-null    datetime64[ns]
 7   subscription_renewed    336 non-null    datetime64[ns]
 8   subscription_started    476 non-null    datetime64[ns]
 9   trial_cancelled         197 non-null    datetime64[ns]
 10  trial_started           681 non-null    datetime64[ns]
dtypes: datetime64[ns](6), int64(1), object(4)
memory usage: 86.1+ KB


In [423]:
events_df.groupby('user_id').count()

Unnamed: 0_level_0,id,created_at,event_name,amount_usd
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,3,3,3,0
2,7,7,7,4
3,7,7,7,4
4,3,3,3,0
5,1,1,1,0
...,...,...,...,...
996,3,3,3,1
997,3,3,3,0
998,1,1,1,0
999,4,4,4,2


In [424]:
df[df['user_id'].isin([2,3])]

Unnamed: 0,id,created_at,attribution_source,country,name,user_id,event_name,amount_usd,acquisition_channel
3,2,2024-10-12,instagram,NL,Frank Moore,2,app_install,,Paid
4,2,2024-10-12,instagram,NL,Frank Moore,2,trial_started,,Paid
5,2,2024-10-12,instagram,NL,Frank Moore,2,subscription_started,8.99,Paid
6,2,2024-10-12,instagram,NL,Frank Moore,2,subscription_renewed,8.99,Paid
7,2,2024-10-12,instagram,NL,Frank Moore,2,subscription_renewed,8.99,Paid
8,2,2024-10-12,instagram,NL,Frank Moore,2,subscription_renewed,8.99,Paid
9,2,2024-10-12,instagram,NL,Frank Moore,2,subscription_cancelled,,Paid
10,3,2024-10-15,tiktok,TR,Ivy Anderson,3,app_install,,Paid
11,3,2024-10-15,tiktok,TR,Ivy Anderson,3,trial_started,,Paid
12,3,2024-10-15,tiktok,TR,Ivy Anderson,3,subscription_started,4.99,Paid


In [425]:
events_df[events_df['user_id'].isin([2,3])]

Unnamed: 0,id,created_at,user_id,event_name,amount_usd
3,4,2024-10-12,2,app_install,
4,5,2024-10-13,2,trial_started,
5,6,2024-10-20,2,subscription_started,8.99
6,7,2024-11-19,2,subscription_renewed,8.99
7,8,2024-12-19,2,subscription_renewed,8.99
8,9,2025-01-18,2,subscription_renewed,8.99
9,10,2025-02-12,2,subscription_cancelled,
10,11,2024-10-15,3,app_install,
11,12,2024-10-19,3,trial_started,
12,13,2024-10-21,3,subscription_started,4.99


In [426]:
df_pivot[df_pivot['id'].isin([2,3])]

Unnamed: 0,id,created_at,attribution_source,country,name,app_install,subscription_cancelled,subscription_renewed,subscription_started,trial_cancelled,trial_started
1,2,2024-10-12T00:00:00,instagram,NL,Frank Moore,2024-10-12,2025-02-12,2024-12-19,2024-10-20,NaT,2024-10-13
2,3,2024-10-15T00:00:00,tiktok,TR,Ivy Anderson,2024-10-15,2025-01-20,2024-12-20,2024-10-21,NaT,2024-10-19


In [427]:
df_pivot['trial_cancelled'].value_counts(dropna= True)

trial_cancelled
2024-07-20    3
2024-07-17    3
2024-06-19    3
2025-01-14    3
2024-10-19    2
             ..
2024-05-03    1
2024-12-06    1
2024-07-14    1
2024-09-11    1
2024-03-16    1
Name: count, Length: 164, dtype: int64

In [428]:
events_df

Unnamed: 0,id,created_at,user_id,event_name,amount_usd
0,1,2024-05-07,1,app_install,
1,2,2024-05-12,1,trial_started,
2,3,2024-05-24,1,trial_cancelled,
3,4,2024-10-12,2,app_install,
4,5,2024-10-13,2,trial_started,
...,...,...,...,...,...
3474,3475,2024-12-29,999,trial_started,
3475,3476,2025-01-03,999,subscription_started,8.99
3476,3477,2025-02-02,999,subscription_renewed,8.99
3479,3480,2025-02-13,1000,app_install,


In [451]:
df_pivot

Unnamed: 0,id,created_at,attribution_source,country,name,app_install,subscription_cancelled,subscription_renewed,subscription_started,trial_cancelled,trial_started
0,1,2024-05-07T00:00:00,instagram,US,Eve Brown,2024-05-07,NaT,NaT,NaT,2024-05-24,2024-05-12
1,2,2024-10-12T00:00:00,instagram,NL,Frank Moore,2024-10-12,2025-02-12,2024-12-19,2024-10-20,NaT,2024-10-13
2,3,2024-10-15T00:00:00,tiktok,TR,Ivy Anderson,2024-10-15,2025-01-20,2024-12-20,2024-10-21,NaT,2024-10-19
3,4,2024-08-28T00:00:00,tiktok,TR,Alice Brown,2024-08-28,NaT,NaT,NaT,2024-09-06,2024-08-31
4,5,2024-04-03T00:00:00,organic,NL,Bob Moore,2024-04-03,NaT,NaT,NaT,NaT,NaT
...,...,...,...,...,...,...,...,...,...,...,...
995,996,2025-01-28T00:00:00,organic,TR,Jack Anderson,2025-01-28,NaT,NaT,2025-02-06,NaT,2025-02-01
996,997,2024-03-06T00:00:00,organic,NL,Bob Jones,2024-03-06,NaT,NaT,NaT,2024-03-16,2024-03-08
997,998,2025-02-01T00:00:00,instagram,TR,Bob Davis,2025-02-01,NaT,NaT,NaT,NaT,NaT
998,999,2024-12-24T00:00:00,organic,NL,Charlie Davis,2024-12-24,NaT,2025-02-02,2025-01-03,NaT,2024-12-29
