In [412]:
import pandas as pd
import numpy as np
import sqlite3
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime



In [413]:
conn = sqlite3.connect('papcorns.sqlite')

In [414]:
#Import Users Table into dataframe

users_df = pd.read_sql_query("SELECT*FROM users;",conn)

In [653]:
#Import Users Event Table into dataframe

events_df = pd.read_sql_query("SELECT*FROM user_events;",conn)

In [416]:
#Check columns and records's amount of Users df

users_df.shape

(1002, 5)

In [417]:
#Check columns and records's amount of Users Event df

events_df.shape

(3486, 5)

In [418]:
users_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1002 entries, 0 to 1001
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  1002 non-null   int64 
 1   created_at          1002 non-null   object
 2   attribution_source  1002 non-null   object
 3   country             1002 non-null   object
 4   name                1002 non-null   object
dtypes: int64(1), object(4)
memory usage: 39.3+ KB


In [419]:
events_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3486 entries, 0 to 3485
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          3486 non-null   int64  
 1   created_at  3486 non-null   object 
 2   user_id     3486 non-null   int64  
 3   event_name  3486 non-null   object 
 4   amount_usd  1231 non-null   float64
dtypes: float64(1), int64(2), object(2)
memory usage: 136.3+ KB


In [420]:
#5 head records of Users df

users_df.head()

Unnamed: 0,id,created_at,attribution_source,country,name
0,1,2024-05-07T00:00:00,instagram,US,Eve Brown
1,2,2024-10-12T00:00:00,instagram,NL,Frank Moore
2,3,2024-10-15T00:00:00,tiktok,TR,Ivy Anderson
3,4,2024-08-28T00:00:00,tiktok,TR,Alice Brown
4,5,2024-04-03T00:00:00,organic,NL,Bob Moore


In [421]:
#5 head records of Users Event df

events_df.head()

Unnamed: 0,id,created_at,user_id,event_name,amount_usd
0,1,2024-05-07T00:00:00,1,app_install,
1,2,2024-05-12T00:00:00,1,trial_started,
2,3,2024-05-24T00:00:00,1,trial_cancelled,
3,4,2024-10-12T00:00:00,2,app_install,
4,5,2024-10-13T00:00:00,2,trial_started,


In [422]:
#lets remove id in events_df. user_id is reference od id in users_df


events_df = events_df.drop(columns='id',axis = 1)

In [423]:
#Check Missing values by columns Users df

#Note  No missing values in Users df

users_df.isnull().sum()

id                    0
created_at            0
attribution_source    0
country               0
name                  0
dtype: int64

In [424]:
#Check Missing values by columns Users Event df

#Note!... 2255 of 3486 records of amount_usd column are null that is too high
#Take a look and give address what category have missing values 'most'

events_df.isnull().sum()

created_at       0
user_id          0
event_name       0
amount_usd    2255
dtype: int64

In [425]:
#There is a problem with created column cronologically. There are some date at the future so those values should be removed from dataset

events_df['created_at'] = pd.to_datetime(events_df['created_at'])


In [426]:
events_df.shape

(3486, 4)

In [427]:
with pd.option_context('display.max_rows',None):
    print(events_df)

     created_at  user_id              event_name  amount_usd
0    2024-05-07        1             app_install         NaN
1    2024-05-12        1           trial_started         NaN
2    2024-05-24        1         trial_cancelled         NaN
3    2024-10-12        2             app_install         NaN
4    2024-10-13        2           trial_started         NaN
5    2024-10-20        2    subscription_started        8.99
6    2024-11-19        2    subscription_renewed        8.99
7    2024-12-19        2    subscription_renewed        8.99
8    2025-01-18        2    subscription_renewed        8.99
9    2025-02-12        2  subscription_cancelled         NaN
10   2024-10-15        3             app_install         NaN
11   2024-10-19        3           trial_started         NaN
12   2024-10-21        3    subscription_started        4.99
13   2024-11-20        3    subscription_renewed        4.99
14   2024-12-20        3    subscription_renewed        4.99
15   2025-01-19        3

In [428]:
events_df.drop(events_df[events_df['created_at']>pd.Timestamp.now()].index,axis = 0,inplace=True)

In [429]:
events_df.shape

(3310, 4)

In [430]:
print(3486-3293,' rows are dropped from events_df')

193  rows are dropped from events_df


In [431]:
users_df

Unnamed: 0,id,created_at,attribution_source,country,name
0,1,2024-05-07T00:00:00,instagram,US,Eve Brown
1,2,2024-10-12T00:00:00,instagram,NL,Frank Moore
2,3,2024-10-15T00:00:00,tiktok,TR,Ivy Anderson
3,4,2024-08-28T00:00:00,tiktok,TR,Alice Brown
4,5,2024-04-03T00:00:00,organic,NL,Bob Moore
...,...,...,...,...,...
997,998,2025-02-01T00:00:00,instagram,TR,Bob Davis
998,999,2024-12-24T00:00:00,organic,NL,Charlie Davis
999,1000,2025-02-13T00:00:00,organic,NL,Jack Anderson
1000,1001,2025-02-16T00:00:00,instagram,US,Bruce Wayne


In [432]:
#Value count by columns for Users df


for  i in users_df.columns:
    print("-"*18+"  ",i,"  "+"-"*18)
    print('There are ',users_df[i].nunique(),f' unique records of {i}',end= '\n\n')
    print(users_df[i].value_counts(ascending=False),end='\n\n\n\n')
    

------------------   id   ------------------
There are  1002  unique records of id

1       1
673     1
660     1
661     1
662     1
       ..
340     1
341     1
342     1
343     1
1002    1
Name: id, Length: 1002, dtype: int64



------------------   created_at   ------------------
There are  384  unique records of created_at

2024-07-01T00:00:00    8
2024-10-24T00:00:00    8
2024-08-15T00:00:00    7
2024-11-13T00:00:00    7
2024-07-06T00:00:00    6
                      ..
2024-03-03T00:00:00    1
2024-12-12T00:00:00    1
2024-06-06T00:00:00    1
2024-08-02T00:00:00    1
2024-10-13T00:00:00    1
Name: created_at, Length: 384, dtype: int64



------------------   attribution_source   ------------------
There are  3  unique records of attribution_source

tiktok       352
organic      344
instagram    306
Name: attribution_source, dtype: int64



------------------   country   ------------------
There are  3  unique records of country

TR    354
US    340
NL    308
Name: country, dty

In [433]:
#Value count by columns for Users Eventdf

for  i in events_df.columns:
    print("-"*18+"  ",i,"  "+"-"*18)
    print('There are ',events_df[i].nunique(),f' unique records of {i}',end= '\n\n')
    print(events_df[i].value_counts(ascending=False),end='\n\n\n\n')

------------------   created_at   ------------------
There are  422  unique records of created_at

2024-05-14    17
2025-02-15    17
2025-02-01    16
2025-02-13    16
2025-01-05    16
              ..
2024-01-02     2
2025-01-20     2
2024-01-26     2
2024-01-06     1
2024-01-16     1
Name: created_at, Length: 422, dtype: int64



------------------   user_id   ------------------
There are  1002  unique records of user_id

262     8
17      8
780     8
327     8
158     8
       ..
492     1
489     1
477     1
469     1
1002    1
Name: user_id, Length: 1002, dtype: int64



------------------   event_name   ------------------
There are  6  unique records of event_name

app_install               1002
trial_started              682
subscription_renewed       644
subscription_started       479
subscription_cancelled     304
trial_cancelled            199
Name: event_name, dtype: int64



------------------   amount_usd   ------------------
There are  3  unique records of amount_usd

4.99

In [434]:
#Create new dataframe having missing values of amount_usd column
# Addressing missing values in Users event df

null_df = events_df[events_df['amount_usd'].isnull()]

In [435]:
#groupby created_at

#There is no make sense result grouping by created_at !... 

null_df.groupby(['created_at'])['created_at'].count()

created_at
2024-01-01    3
2024-01-02    2
2024-01-03    3
2024-01-04    3
2024-01-05    4
             ..
2025-02-20    1
2025-02-21    1
2025-02-22    3
2025-02-24    2
2025-02-25    4
Name: created_at, Length: 420, dtype: int64

In [436]:
#groupby event_name

#In this analysis we realize that in those subcategory of 
#event_name (app_install,subscription_cancelled,trial_cancelled,trial_started) No Cost
#that s why the values are null 

null_df.groupby(['event_name'])['event_name'].count()

event_name
app_install               1002
subscription_cancelled     304
trial_cancelled            199
trial_started              682
Name: event_name, dtype: int64

## Core Tasks 

In [437]:
#Join dataframe to combine country and amount_usd at the same dataframe
#this allow calculate revenue by country
#As noted in the document user_id in Events Tbale is reference of id in Users table so mergen based on those columns

In [438]:
df = pd.merge(users_df,events_df, how = 'inner', left_on = 'id',right_on = 'user_id')

In [439]:
df.head()

Unnamed: 0,id,created_at_x,attribution_source,country,name,created_at_y,user_id,event_name,amount_usd
0,1,2024-05-07T00:00:00,instagram,US,Eve Brown,2024-05-07,1,app_install,
1,1,2024-05-07T00:00:00,instagram,US,Eve Brown,2024-05-12,1,trial_started,
2,1,2024-05-07T00:00:00,instagram,US,Eve Brown,2024-05-24,1,trial_cancelled,
3,2,2024-10-12T00:00:00,instagram,NL,Frank Moore,2024-10-12,2,app_install,
4,2,2024-10-12T00:00:00,instagram,NL,Frank Moore,2024-10-13,2,trial_started,


In [440]:
#remove  idle created columns after merging

df.drop(['created_at_y'],axis =1,inplace=True)

In [441]:
#rename suffixed columns into original

df.rename({'created_at_x':'created_at'},inplace=True,axis=1)

In [442]:
#keep a copy of dataframe just in case

df_backup = df.copy()

In [443]:
df

Unnamed: 0,id,created_at,attribution_source,country,name,user_id,event_name,amount_usd
0,1,2024-05-07T00:00:00,instagram,US,Eve Brown,1,app_install,
1,1,2024-05-07T00:00:00,instagram,US,Eve Brown,1,trial_started,
2,1,2024-05-07T00:00:00,instagram,US,Eve Brown,1,trial_cancelled,
3,2,2024-10-12T00:00:00,instagram,NL,Frank Moore,2,app_install,
4,2,2024-10-12T00:00:00,instagram,NL,Frank Moore,2,trial_started,
...,...,...,...,...,...,...,...,...
3305,1000,2025-02-13T00:00:00,organic,NL,Jack Anderson,1000,trial_cancelled,
3306,1001,2025-02-16T00:00:00,instagram,US,Bruce Wayne,1001,app_install,
3307,1001,2025-02-16T00:00:00,instagram,US,Bruce Wayne,1001,trial_started,
3308,1001,2025-02-16T00:00:00,instagram,US,Bruce Wayne,1001,subscription_started,9.99


In [444]:
#check missing values after joining. This might be to confirm if joining is correct or not

df.isnull().sum()

id                       0
created_at               0
attribution_source       0
country                  0
name                     0
user_id                  0
event_name               0
amount_usd            2187
dtype: int64

### 1 -  Calculate the total revenue generated from subscriptions for each country

In [445]:
df.groupby('country')['amount_usd'].sum()

country
NL    3173.47
TR    1991.01
US    3706.29
Name: amount_usd, dtype: float64

### 2 - Calculate the total number of trials given to users who came from instagram

In [446]:
df[(df['attribution_source']=='instagram')&(df['event_name']=='trial_started')]

Unnamed: 0,id,created_at,attribution_source,country,name,user_id,event_name,amount_usd
1,1,2024-05-07T00:00:00,instagram,US,Eve Brown,1,trial_started,
4,2,2024-10-12T00:00:00,instagram,NL,Frank Moore,2,trial_started,
44,12,2024-01-08T00:00:00,instagram,US,Frank Miller,12,trial_started,
63,15,2024-07-10T00:00:00,instagram,US,Bob Miller,15,trial_started,
66,16,2024-04-26T00:00:00,instagram,NL,Alice Brown,16,trial_started,
...,...,...,...,...,...,...,...,...
3274,989,2025-01-12T00:00:00,instagram,NL,Alice Jones,989,trial_started,
3277,990,2024-11-24T00:00:00,instagram,TR,David Davis,990,trial_started,
3282,992,2025-02-07T00:00:00,instagram,US,Grace Jones,992,trial_started,
3287,995,2024-09-30T00:00:00,instagram,US,Frank Smith,995,trial_started,


In [447]:
#Check each trial record belongs to unique user_id

df[(df['attribution_source']=='instagram')&(df['event_name']=='trial_started')]['user_id'].nunique()

210

In [448]:
print('Amount of trials from intagram :',df[(df['attribution_source']=='instagram')&(df['event_name']=='trial_started')]['event_name'].count())

Amount of trials from intagram : 210


### 3 - Create a new column named 'acquisition_channel' by categorizing users based on their 'attribution_source'

In [449]:
#Use list comprehension based on attribution_source values 'Paid' for instagram and tiktok and Organic for organic
#Crete new column

df['acquisition_channel'] = pd.Series(['Organic' if i=='organic' else 'Paid' for i in df['attribution_source']])

In [450]:
df.sample(20)

Unnamed: 0,id,created_at,attribution_source,country,name,user_id,event_name,amount_usd,acquisition_channel
985,296,2024-12-14T00:00:00,organic,US,Charlie Davis,296,trial_cancelled,,Organic
1563,472,2024-10-29T00:00:00,organic,TR,Jack Smith,472,subscription_renewed,4.99,Organic
1886,571,2024-05-14T00:00:00,instagram,NL,Frank Williams,571,app_install,,Paid
2253,680,2024-11-22T00:00:00,organic,US,Charlie Jones,680,trial_started,,Organic
1016,307,2024-08-10T00:00:00,tiktok,NL,Henry Moore,307,trial_started,,Paid
1152,348,2025-01-07T00:00:00,instagram,NL,Eve Anderson,348,subscription_started,8.99,Paid
1395,424,2025-02-10T00:00:00,instagram,TR,David Jones,424,trial_started,,Paid
1573,474,2024-11-08T00:00:00,organic,NL,Charlie Miller,474,subscription_started,8.99,Organic
1869,567,2024-11-26T00:00:00,tiktok,US,Alice Moore,567,subscription_renewed,9.99,Paid
1219,366,2024-10-24T00:00:00,tiktok,TR,Frank Wilson,366,subscription_renewed,4.99,Paid


 ### 4 - Analyze the trial-to-subscription conversion rate : 

- Calculate the overall conversion rate
- Break down the conversion rate by attribution_source

In [451]:
df['event_name'].value_counts()

app_install               1002
trial_started              682
subscription_renewed       644
subscription_started       479
subscription_cancelled     304
trial_cancelled            199
Name: event_name, dtype: int64

In [452]:
#For calculation the trial to subscription overall. just divide subscription_started by trial_started values

print('Overall conversion rate of trial-to-subscription : ', f'%{round((481/682)*100,2)}')

Overall conversion rate of trial-to-subscription :  %70.53


In [453]:
df.groupby('attribution_source')[['attribution_source','event_name']].value_counts().reset_index()

Unnamed: 0,attribution_source,event_name,0
0,instagram,app_install,306
1,instagram,trial_started,210
2,instagram,subscription_renewed,201
3,instagram,subscription_started,149
4,instagram,subscription_cancelled,96
5,instagram,trial_cancelled,61
6,organic,app_install,344
7,organic,subscription_renewed,239
8,organic,trial_started,236
9,organic,subscription_started,168


In [454]:
#Groupby the dataframe based on attribution_source and return value_counts
#Then find trial to subscription rate by subcategory of attribution_source

df.groupby('attribution_source')[['attribution_source','event_name']].value_counts()

attribution_source  event_name            
instagram           app_install               306
                    trial_started             210
                    subscription_renewed      201
                    subscription_started      149
                    subscription_cancelled     96
                    trial_cancelled            61
organic             app_install               344
                    subscription_renewed      239
                    trial_started             236
                    subscription_started      168
                    subscription_cancelled    109
                    trial_cancelled            67
tiktok              app_install               352
                    trial_started             236
                    subscription_renewed      204
                    subscription_started      162
                    subscription_cancelled     99
                    trial_cancelled            71
dtype: int64

In [455]:
conv_df = df.groupby('attribution_source')[['attribution_source','event_name']].value_counts().reset_index()

In [456]:
conv_df[(conv_df['attribution_source']=='instagram')&(conv_df['event_name']=='subscription_started')].iloc[:,-1].values[0]

149

In [457]:
conv_df[(conv_df['attribution_source']=='instagram')&(conv_df['event_name']=='trial_started')].iloc[:,-1].values[0]

210

In [458]:
#As seen at the calculation, type of attribution sources do not influent conversion rate specifically.
#At the end if user start to use trial , about 470 of them start subscription


print('Conversion rate of trial-to-subscription by "instagram" : ', f"%{round((conv_df[(conv_df['attribution_source']=='instagram')&(conv_df['event_name']=='subscription_started')].iloc[:,-1].values[0]/conv_df[(conv_df['attribution_source']=='instagram')&(conv_df['event_name']=='trial_started')].iloc[:,-1].values[0])*100,2)}",end='\n\n')
print('Conversion rate of trial-to-subscription by "organic" : ', f"%{round((conv_df[(conv_df['attribution_source']=='organic')&(conv_df['event_name']=='subscription_started')].iloc[:,-1].values[0]/conv_df[(conv_df['attribution_source']=='organic')&(conv_df['event_name']=='trial_started')].iloc[:,-1].values[0])*100,2)}",end='\n\n')
print('Conversion rate of trial-to-subscription by "tiktok" : ', f"%{round((conv_df[(conv_df['attribution_source']=='tiktok')&(conv_df['event_name']=='subscription_started')].iloc[:,-1].values[0]/conv_df[(conv_df['attribution_source']=='tiktok')&(conv_df['event_name']=='trial_started')].iloc[:,-1].values[0])*100,2)}")

Conversion rate of trial-to-subscription by "instagram" :  %70.95

Conversion rate of trial-to-subscription by "organic" :  %71.19

Conversion rate of trial-to-subscription by "tiktok" :  %68.64


In [459]:
{round(conv_df[(conv_df['attribution_source']=='instagram')&(conv_df['event_name']=='subscription_started')].iloc[:,-1].values[0]/conv_df[(conv_df['attribution_source']=='instagram')&(conv_df['event_name']=='trial_started')].iloc[:,-1].values[0],2)}

{0.71}

 ### 5 - Calculate the median subscription duration (in months) for each country

In [460]:
#convert created_at column of event_df dataframe



In [461]:
#Create pivot table on events_df. This allow us compare date of events according to userd_id
#We will see each event date under a column named with events for each user_id 

df_pivot = events_df.pivot_table(index='user_id', columns='event_name', values='created_at')

In [462]:
#Adding countries into pivot table by merging based on user_id and id in users_Df

df_pivot = pd.merge(users_df,df_pivot, how = 'inner', left_on = 'id',right_on = 'user_id')

In [463]:
df_pivot

Unnamed: 0,id,created_at,attribution_source,country,name,app_install,subscription_cancelled,subscription_renewed,subscription_started,trial_cancelled,trial_started
0,1,2024-05-07T00:00:00,instagram,US,Eve Brown,2024-05-07,NaT,NaT,NaT,2024-05-24,2024-05-12
1,2,2024-10-12T00:00:00,instagram,NL,Frank Moore,2024-10-12,2025-02-12,2024-12-19,2024-10-20,NaT,2024-10-13
2,3,2024-10-15T00:00:00,tiktok,TR,Ivy Anderson,2024-10-15,2025-01-20,2024-12-20,2024-10-21,NaT,2024-10-19
3,4,2024-08-28T00:00:00,tiktok,TR,Alice Brown,2024-08-28,NaT,NaT,NaT,2024-09-06,2024-08-31
4,5,2024-04-03T00:00:00,organic,NL,Bob Moore,2024-04-03,NaT,NaT,NaT,NaT,NaT
...,...,...,...,...,...,...,...,...,...,...,...
997,998,2025-02-01T00:00:00,instagram,TR,Bob Davis,2025-02-01,NaT,NaT,NaT,NaT,NaT
998,999,2024-12-24T00:00:00,organic,NL,Charlie Davis,2024-12-24,NaT,2025-02-02,2025-01-03,NaT,2024-12-29
999,1000,2025-02-13T00:00:00,organic,NL,Jack Anderson,2025-02-13,NaT,NaT,NaT,2025-02-25,2025-02-15
1000,1001,2025-02-16T00:00:00,instagram,US,Bruce Wayne,2025-02-25,NaT,NaT,2025-02-25,NaT,2025-02-25


In [464]:
#Create new df_pivot just store subscription based columns and essential columns

df_pivot_subscription = df_pivot[['id','country','subscription_cancelled','subscription_started']]

In [465]:
#For subscription not canceled , we calculate duration through subtraction 'subscription_started' from 'now'

now = pd.Timestamp.now()

df_pivot_subscription['duration-InMonth'] = round(
    ((df_pivot_subscription['subscription_cancelled'].fillna(now) - df_pivot_subscription['subscription_started']).dt.days) / 30, 2
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_pivot_subscription['duration-InMonth'] = round(


In [466]:
#id stands for user_id aswell so when merging we create our dataframe based on id and user_id

df_pivot_subscription

Unnamed: 0,id,country,subscription_cancelled,subscription_started,duration-InMonth
0,1,US,NaT,NaT,
1,2,NL,2025-02-12,2024-10-20,3.83
2,3,TR,2025-01-20,2024-10-21,3.03
3,4,TR,NaT,NaT,
4,5,NL,NaT,NaT,
...,...,...,...,...,...
997,998,TR,NaT,NaT,
998,999,NL,NaT,2025-01-03,1.77
999,1000,NL,NaT,NaT,
1000,1001,US,NaT,2025-02-25,0.00


In [467]:
#Groupby on country and return median value of 'duration-InMonth' for each country

df_pivot_subscription[df_pivot_subscription['duration-InMonth'].notna()].groupby('country')['duration-InMonth'].median()

country
NL    2.50
TR    2.43
US    2.33
Name: duration-InMonth, dtype: float64

### 6 - Calculate the Average Lifetime Value (LTV) by country

In [468]:
#Calculate average revenue per user. Find total revenue and unique amount of users


total_revenue = df['amount_usd'].sum()
total_users = df['user_id'].nunique()  
rev_by_user = total_revenue / total_users

In [469]:
#Calculate "average lifespan" from duration-in month - so revenue taken only from subscription 
#We can use our "df_pivot_subscription" dataframe

average_lifespan = df_pivot_subscription['duration-InMonth'].mean()

In [470]:
average_lifespan

3.3752609603340287

In [471]:
#Now Calculate LTV - Lifetiem Values by multiplication average lifespan with average revenue by user

LTV = round(rev_by_user*average_lifespan,2)

In [472]:
print('Average Lifetime Value(LTV) :',f'${LTV}')

Average Lifetime Value(LTV) : $29.88
