In [277]:
import pandas as pd
import numpy as np
import sqlite3
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime



In [278]:
conn = sqlite3.connect('papcorns.sqlite')

In [279]:
#Import Users Table into dataframe

users_df = pd.read_sql_query("SELECT*FROM users;",conn)

In [280]:
#Import User Events Table into dataframe

events_df = pd.read_sql_query("SELECT*FROM user_events;",conn)

In [281]:
#Check columns and records's amount of Users df

users_df.shape

(1002, 5)

In [282]:
#Check columns and records's amount of Users Event df
#We clean some of the record of events_df so let keep first shape with the name of event_shape1
#That allow us to display how many records will be deleted

event_shape1 = events_df.shape
event_shape1 

(3486, 5)

In [283]:
users_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1002 entries, 0 to 1001
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  1002 non-null   int64 
 1   created_at          1002 non-null   object
 2   attribution_source  1002 non-null   object
 3   country             1002 non-null   object
 4   name                1002 non-null   object
dtypes: int64(1), object(4)
memory usage: 39.3+ KB


In [284]:
events_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3486 entries, 0 to 3485
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          3486 non-null   int64  
 1   created_at  3486 non-null   object 
 2   user_id     3486 non-null   int64  
 3   event_name  3486 non-null   object 
 4   amount_usd  1231 non-null   float64
dtypes: float64(1), int64(2), object(2)
memory usage: 136.3+ KB


In [285]:
#5 head records of Users df

users_df.head()

Unnamed: 0,id,created_at,attribution_source,country,name
0,1,2024-05-07T00:00:00,instagram,US,Eve Brown
1,2,2024-10-12T00:00:00,instagram,NL,Frank Moore
2,3,2024-10-15T00:00:00,tiktok,TR,Ivy Anderson
3,4,2024-08-28T00:00:00,tiktok,TR,Alice Brown
4,5,2024-04-03T00:00:00,organic,NL,Bob Moore


In [286]:
#5 head records of Users Event df

events_df.head()

Unnamed: 0,id,created_at,user_id,event_name,amount_usd
0,1,2024-05-07T00:00:00,1,app_install,
1,2,2024-05-12T00:00:00,1,trial_started,
2,3,2024-05-24T00:00:00,1,trial_cancelled,
3,4,2024-10-12T00:00:00,2,app_install,
4,5,2024-10-13T00:00:00,2,trial_started,


In [287]:
#lets remove id in events_df. user_id is reference of id in users_df


events_df = events_df.drop(columns='id',axis = 1)

In [288]:
#Check Missing values by columns Users df

#Note  No missing values in Users df

users_df.isnull().sum()

id                    0
created_at            0
attribution_source    0
country               0
name                  0
dtype: int64

In [289]:
#Check Missing values by columns Users Event df

#Note!... 2255 of 3486 records of amount_usd column are null that is too high
#Take a look and give address what category have missing values 'most'

events_df.isnull().sum()

created_at       0
user_id          0
event_name       0
amount_usd    2255
dtype: int64

In [291]:
#There are some date at the future. That s a mistake. We need to check and clean them in the dataset

events_df['created_at'].agg([min,max])

min   2024-01-01
max   2025-07-12
Name: created_at, dtype: datetime64[ns]

In [292]:
#There are records of future dates such as app install and trial started etc. on 2025-02-25, 2025-06-28 ...
#This is impossible . So we need to filter and clean them

events_df[events_df['created_at']>pd.Timestamp.now()]

Unnamed: 0,created_at,user_id,event_name,amount_usd
108,2025-03-20,28,subscription_renewed,9.99
109,2025-04-19,28,subscription_renewed,9.99
110,2025-05-19,28,subscription_renewed,9.99
111,2025-06-18,28,subscription_renewed,9.99
112,2025-06-28,28,subscription_cancelled,
...,...,...,...,...
3481,2025-02-25,1000,trial_cancelled,
3482,2025-02-25,1001,app_install,
3483,2025-02-25,1001,trial_started,
3484,2025-02-25,1001,subscription_started,9.99


In [293]:
#Clean rows having date of future

events_df.drop(events_df[events_df['created_at']>pd.Timestamp.now()].index,axis = 0,inplace=True)

In [296]:
#Filter and drop rows in the dataset which are wrong date

events_df.drop(events_df[events_df['created_at']>pd.Timestamp.now()].index,axis = 0,inplace=True)

In [300]:
print(event_shape1[0]-event_shape2[0],' rows are dropped from events_df due to wrong date')

183  rows are dropped from events_df due to wrong date


In [22]:
#Value count by columns for Users df


for  i in users_df.columns:
    print("-"*18+"  ",i,"  "+"-"*18)
    print('There are ',users_df[i].nunique(),f' unique records of {i}',end= '\n\n')
    print(users_df[i].value_counts(ascending=False),end='\n\n\n\n')
    

------------------   id   ------------------
There are  1002  unique records of id

1       1
673     1
660     1
661     1
662     1
       ..
340     1
341     1
342     1
343     1
1002    1
Name: id, Length: 1002, dtype: int64



------------------   created_at   ------------------
There are  384  unique records of created_at

2024-07-01T00:00:00    8
2024-10-24T00:00:00    8
2024-08-15T00:00:00    7
2024-11-13T00:00:00    7
2024-07-06T00:00:00    6
                      ..
2024-03-03T00:00:00    1
2024-12-12T00:00:00    1
2024-06-06T00:00:00    1
2024-08-02T00:00:00    1
2024-10-13T00:00:00    1
Name: created_at, Length: 384, dtype: int64



------------------   attribution_source   ------------------
There are  3  unique records of attribution_source

tiktok       352
organic      344
instagram    306
Name: attribution_source, dtype: int64



------------------   country   ------------------
There are  3  unique records of country

TR    354
US    340
NL    308
Name: country, dty

In [23]:
#Value count by columns for Users Eventdf

for  i in events_df.columns:
    print("-"*18+"  ",i,"  "+"-"*18)
    print('There are ',events_df[i].nunique(),f' unique records of {i}',end= '\n\n')
    print(events_df[i].value_counts(ascending=False),end='\n\n\n\n')

------------------   created_at   ------------------
There are  419  unique records of created_at

2025-02-15    17
2024-05-14    17
2025-02-13    16
2025-01-05    16
2025-02-01    16
              ..
2025-01-20     2
2024-03-22     2
2024-01-26     2
2024-01-06     1
2024-01-16     1
Name: created_at, Length: 419, dtype: int64



------------------   user_id   ------------------
There are  1000  unique records of user_id

872    8
17     8
845    8
158    8
36     8
      ..
245    1
246    1
249    1
504    1
187    1
Name: user_id, Length: 1000, dtype: int64



------------------   event_name   ------------------
There are  6  unique records of event_name

app_install               1000
trial_started              681
subscription_renewed       639
subscription_started       477
subscription_cancelled     303
trial_cancelled            197
Name: event_name, dtype: int64



------------------   amount_usd   ------------------
There are  3  unique records of amount_usd

4.99    396
9.9

In [24]:
#Create new dataframe having missing values of amount_usd column
# Addressing missing values in Users event df

null_df = events_df[events_df['amount_usd'].isnull()]

In [25]:
#groupby created_at

#There is no make sense result grouping by created_at !... 

null_df.groupby(['created_at'])['created_at'].count()

created_at
2024-01-01    3
2024-01-02    2
2024-01-03    3
2024-01-04    3
2024-01-05    4
             ..
2025-02-18    2
2025-02-19    2
2025-02-20    1
2025-02-21    1
2025-02-22    3
Name: created_at, Length: 418, dtype: int64

In [26]:
#groupby event_name

#In this analysis we realize that in those subcategory of 
#event_name (app_install,subscription_cancelled,trial_cancelled,trial_started) No Cost
#that s why the values are null 
#we convert them into 0 at the next steps

null_df.groupby(['event_name'])['event_name'].count()

event_name
app_install               1000
subscription_cancelled     303
trial_cancelled            197
trial_started              681
Name: event_name, dtype: int64

## Core Tasks 

#### Join the table for handling core  tasks

In [27]:
#Join dataframe to combine country and amount_usd at the same dataframe
#this allow calculate revenue by country
#As noted in the document user_id in Events Tbale is reference of id in Users table so mergen based on those columns

In [28]:
df = pd.merge(users_df,events_df, how = 'inner', left_on = 'id',right_on = 'user_id')

In [29]:
df.head()

Unnamed: 0,id,created_at_x,attribution_source,country,name,created_at_y,user_id,event_name,amount_usd
0,1,2024-05-07T00:00:00,instagram,US,Eve Brown,2024-05-07,1,app_install,
1,1,2024-05-07T00:00:00,instagram,US,Eve Brown,2024-05-12,1,trial_started,
2,1,2024-05-07T00:00:00,instagram,US,Eve Brown,2024-05-24,1,trial_cancelled,
3,2,2024-10-12T00:00:00,instagram,NL,Frank Moore,2024-10-12,2,app_install,
4,2,2024-10-12T00:00:00,instagram,NL,Frank Moore,2024-10-13,2,trial_started,


In [30]:
#remove  idle created columns after merging

df.drop(['created_at_y'],axis =1,inplace=True)

In [31]:
#rename suffixed columns into original

df.rename({'created_at_x':'created_at'},inplace=True,axis=1)

In [32]:
#keep a copy of dataframe just in case

df_backup = df.copy()

In [33]:
df

Unnamed: 0,id,created_at,attribution_source,country,name,user_id,event_name,amount_usd
0,1,2024-05-07T00:00:00,instagram,US,Eve Brown,1,app_install,
1,1,2024-05-07T00:00:00,instagram,US,Eve Brown,1,trial_started,
2,1,2024-05-07T00:00:00,instagram,US,Eve Brown,1,trial_cancelled,
3,2,2024-10-12T00:00:00,instagram,NL,Frank Moore,2,app_install,
4,2,2024-10-12T00:00:00,instagram,NL,Frank Moore,2,trial_started,
...,...,...,...,...,...,...,...,...
3292,999,2024-12-24T00:00:00,organic,NL,Charlie Davis,999,trial_started,
3293,999,2024-12-24T00:00:00,organic,NL,Charlie Davis,999,subscription_started,8.99
3294,999,2024-12-24T00:00:00,organic,NL,Charlie Davis,999,subscription_renewed,8.99
3295,1000,2025-02-13T00:00:00,organic,NL,Jack Anderson,1000,app_install,


In [34]:
#Convert created_at column  into datetime format

df['created_at'] = pd.to_datetime(df['created_at'])

In [258]:
df['created_at'].agg([max, min])

max   2025-02-15
min   2024-01-01
Name: created_at, dtype: datetime64[ns]

In [35]:
#check missing values after joining. This might be to confirm if joining is correct or not

df.isnull().sum()

id                       0
created_at               0
attribution_source       0
country                  0
name                     0
user_id                  0
event_name               0
amount_usd            2181
dtype: int64

### 1 -  Calculate the total revenue generated from subscriptions for each country

In [36]:
df.groupby('country')['amount_usd'].sum()

country
NL    3173.47
TR    1976.04
US    3666.33
Name: amount_usd, dtype: float64

### 2 - Calculate the total number of trials given to users who came from instagram

In [37]:
df[(df['attribution_source']=='instagram')&(df['event_name']=='trial_started')]

Unnamed: 0,id,created_at,attribution_source,country,name,user_id,event_name,amount_usd
1,1,2024-05-07,instagram,US,Eve Brown,1,trial_started,
4,2,2024-10-12,instagram,NL,Frank Moore,2,trial_started,
44,12,2024-01-08,instagram,US,Frank Miller,12,trial_started,
63,15,2024-07-10,instagram,US,Bob Miller,15,trial_started,
66,16,2024-04-26,instagram,NL,Alice Brown,16,trial_started,
...,...,...,...,...,...,...,...,...
3242,981,2024-04-16,instagram,US,Eve Wilson,981,trial_started,
3266,989,2025-01-12,instagram,NL,Alice Jones,989,trial_started,
3269,990,2024-11-24,instagram,TR,David Davis,990,trial_started,
3274,992,2025-02-07,instagram,US,Grace Jones,992,trial_started,


In [38]:
#Check each trial record belongs to unique user_id

df[(df['attribution_source']=='instagram')&(df['event_name']=='trial_started')]['user_id'].nunique()

209

In [39]:
print('Amount of trials from intagram :',df[(df['attribution_source']=='instagram')&(df['event_name']=='trial_started')]['event_name'].count())

Amount of trials from intagram : 209


### 3 - Create a new column named 'acquisition_channel' by categorizing users based on their 'attribution_source'

In [40]:
#Use list comprehension based on attribution_source values 'Paid' for instagram and tiktok and Organic for organic
#Crete new column

df['acquisition_channel'] = pd.Series(['Organic' if i=='organic' else 'Paid' for i in df['attribution_source']])

In [41]:
df.sample(20)

Unnamed: 0,id,created_at,attribution_source,country,name,user_id,event_name,amount_usd,acquisition_channel
1676,505,2024-01-18,organic,NL,Ivy Brown,505,subscription_renewed,8.99,Organic
3003,911,2024-12-31,organic,TR,Jack Miller,911,trial_started,,Organic
2178,659,2024-09-08,instagram,TR,Alice Miller,659,trial_started,,Paid
2446,738,2024-11-08,tiktok,NL,Grace Miller,738,trial_started,,Paid
1728,519,2024-08-03,instagram,US,Jack Miller,519,trial_started,,Paid
602,190,2025-01-29,organic,TR,David Moore,190,app_install,,Organic
1315,398,2024-04-09,tiktok,NL,Charlie Miller,398,subscription_started,8.99,Paid
2195,662,2024-04-08,instagram,TR,David Jones,662,subscription_started,4.99,Paid
986,298,2024-05-16,instagram,TR,Bob Anderson,298,subscription_started,4.99,Paid
811,250,2024-11-16,tiktok,US,Jack Smith,250,app_install,,Paid


 ### 4 - Analyze the trial-to-subscription conversion rate : 

- Calculate the overall conversion rate
- Break down the conversion rate by attribution_source

In [42]:
df['event_name'].value_counts()

app_install               1000
trial_started              681
subscription_renewed       639
subscription_started       477
subscription_cancelled     303
trial_cancelled            197
Name: event_name, dtype: int64

In [43]:
#For calculation the trial to subscription overall. just divide subscription_started by trial_started values

print('Overall conversion rate of trial-to-subscription : ', f'%{round((481/682)*100,2)}')

Overall conversion rate of trial-to-subscription :  %70.53


In [44]:
#Groupby the dataframe based on attribution_source and return value_counts
#Then find trial to subscription rate by subcategory of attribution_source

df.groupby('attribution_source')[['attribution_source','event_name']].value_counts()

attribution_source  event_name            
instagram           app_install               305
                    trial_started             209
                    subscription_renewed      198
                    subscription_started      148
                    subscription_cancelled     95
                    trial_cancelled            60
organic             app_install               343
                    subscription_renewed      238
                    trial_started             236
                    subscription_started      167
                    subscription_cancelled    109
                    trial_cancelled            66
tiktok              app_install               352
                    trial_started             236
                    subscription_renewed      203
                    subscription_started      162
                    subscription_cancelled     99
                    trial_cancelled            71
dtype: int64

In [45]:
#As seen at the calculation, type of attribution sources do not influent conversion rate specifically.
#At the end if user start to use trial , about 470 of them start subscription


print('Conversion rate of trial-to-subscription by "instagram" : ', f'%{round((149/210)*100,2)}',end='\n\n')
print('Conversion rate of trial-to-subscription by "organic" : ', f'%{round((169/236)*100,2)}',end='\n\n')
print('Conversion rate of trial-to-subscription by "tiktok" : ', f'%{round((163/236)*100,2)}')

Conversion rate of trial-to-subscription by "instagram" :  %70.95

Conversion rate of trial-to-subscription by "organic" :  %71.61

Conversion rate of trial-to-subscription by "tiktok" :  %69.07


 ### 5 - Calculate the median subscription duration (in months) for each country

In [46]:
#convert created_at column of event_df dataframe



In [47]:
#Create pivot table on events_df. This allow us compare date of events according to userd_id
#We will see each event date under a column named with events for each user_id 

df_pivot = events_df.pivot_table(index='user_id', columns='event_name', values='created_at')

In [48]:
#Adding countries into pivot table by merging based on user_id and id in users_Df

df_pivot = pd.merge(users_df,df_pivot, how = 'inner', left_on = 'id',right_on = 'user_id')

In [49]:
df_pivot

Unnamed: 0,id,created_at,attribution_source,country,name,app_install,subscription_cancelled,subscription_renewed,subscription_started,trial_cancelled,trial_started
0,1,2024-05-07T00:00:00,instagram,US,Eve Brown,2024-05-07,NaT,NaT,NaT,2024-05-24,2024-05-12
1,2,2024-10-12T00:00:00,instagram,NL,Frank Moore,2024-10-12,2025-02-12,2024-12-19,2024-10-20,NaT,2024-10-13
2,3,2024-10-15T00:00:00,tiktok,TR,Ivy Anderson,2024-10-15,2025-01-20,2024-12-20,2024-10-21,NaT,2024-10-19
3,4,2024-08-28T00:00:00,tiktok,TR,Alice Brown,2024-08-28,NaT,NaT,NaT,2024-09-06,2024-08-31
4,5,2024-04-03T00:00:00,organic,NL,Bob Moore,2024-04-03,NaT,NaT,NaT,NaT,NaT
...,...,...,...,...,...,...,...,...,...,...,...
995,996,2025-01-28T00:00:00,organic,TR,Jack Anderson,2025-01-28,NaT,NaT,2025-02-06,NaT,2025-02-01
996,997,2024-03-06T00:00:00,organic,NL,Bob Jones,2024-03-06,NaT,NaT,NaT,2024-03-16,2024-03-08
997,998,2025-02-01T00:00:00,instagram,TR,Bob Davis,2025-02-01,NaT,NaT,NaT,NaT,NaT
998,999,2024-12-24T00:00:00,organic,NL,Charlie Davis,2024-12-24,NaT,2025-02-02,2025-01-03,NaT,2024-12-29


In [50]:
#Create new df_pivot just store subscription based columns and essential columns

df_pivot_subscription = df_pivot[['id','country','subscription_cancelled','subscription_started']]

In [51]:
#For subscription not canceled , we calculate duration through subtraction 'subscription_started' from 'now'

now = pd.Timestamp.now()

df_pivot_subscription['duration-InMonth'] = round(
    ((df_pivot_subscription['subscription_cancelled'].fillna(now) - df_pivot_subscription['subscription_started']).dt.days) / 30, 2
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_pivot_subscription['duration-InMonth'] = round(


In [52]:
#id stands for user_id aswell so when merging we create our dataframe based on id and user_id

df_pivot_subscription

Unnamed: 0,id,country,subscription_cancelled,subscription_started,duration-InMonth
0,1,US,NaT,NaT,
1,2,NL,2025-02-12,2024-10-20,3.83
2,3,TR,2025-01-20,2024-10-21,3.03
3,4,TR,NaT,NaT,
4,5,NL,NaT,NaT,
...,...,...,...,...,...
995,996,TR,NaT,2025-02-06,0.53
996,997,NL,NaT,NaT,
997,998,TR,NaT,NaT,
998,999,NL,NaT,2025-01-03,1.67


In [53]:
#Groupby on country and return median value of 'duration-InMonth' for each country

df_pivot_subscription[df_pivot_subscription['duration-InMonth'].notna()].groupby('country')['duration-InMonth'].median()

country
NL    2.500
TR    2.415
US    2.330
Name: duration-InMonth, dtype: float64

### 6 - Calculate the Average Lifetime Value (LTV) by country

In [54]:
#Calculate average revenue per user. Find total revenue and unique amount of users


total_revenue = df['amount_usd'].sum()
total_users = df['user_id'].nunique()  
rev_by_user = total_revenue / total_users

In [55]:
#Calculate "average lifespan" from duration-in month - so revenue taken only from subscription 
#We can use our "df_pivot_subscription" dataframe

average_lifespan = df_pivot_subscription['duration-InMonth'].mean()

In [56]:
average_lifespan

3.3528721174004175

In [57]:
#Now Calculate LTV - Lifetiem Values by multiplication average lifespan with average revenue by user

LTV = round(rev_by_user*average_lifespan,2)

In [58]:
print('Average Lifetime Value(LTV) :',f'${LTV}')

Average Lifetime Value(LTV) : $29.56


## BONUS Tasks 

### 7 - Predict the churn probability for use #1002 (Clark Kent)

- Use any relevant features from the dataset
- Explain your model selection and feature engineering process
- Provide confidence intervals if applicaple

In [59]:
users_df.head()

Unnamed: 0,id,created_at,attribution_source,country,name
0,1,2024-05-07T00:00:00,instagram,US,Eve Brown
1,2,2024-10-12T00:00:00,instagram,NL,Frank Moore
2,3,2024-10-15T00:00:00,tiktok,TR,Ivy Anderson
3,4,2024-08-28T00:00:00,tiktok,TR,Alice Brown
4,5,2024-04-03T00:00:00,organic,NL,Bob Moore


In [60]:
events_df.head()

Unnamed: 0,created_at,user_id,event_name,amount_usd
0,2024-05-07,1,app_install,
1,2024-05-12,1,trial_started,
2,2024-05-24,1,trial_cancelled,
3,2024-10-12,2,app_install,
4,2024-10-13,2,trial_started,


In [61]:
events_df.columns

Index(['created_at', 'user_id', 'event_name', 'amount_usd'], dtype='object')

In [62]:
events_df['event_name'].value_counts(dropna=False).index

Index(['app_install', 'trial_started', 'subscription_renewed',
       'subscription_started', 'subscription_cancelled', 'trial_cancelled'],
      dtype='object')

In [63]:
events_df['event_name'] = events_df['event_name'].astype('category')

# Count 'subscription_renewed' events per user_id
renewed_counts = events_df[events_df['event_name'] == 'subscription_renewed'].groupby('user_id').size().reset_index(name='subscription_renewed')


In [64]:
#renewed_counts

In [65]:
x_df = events_df[['user_id']].drop_duplicates()  # Ensure all users are included
df_subscription_renewed = x_df.merge(renewed_counts, on='user_id', how='left')

In [66]:
df_subscription_renewed

Unnamed: 0,user_id,subscription_renewed
0,1,
1,2,3.0
2,3,3.0
3,4,
4,5,
...,...,...
995,996,
996,997,
997,998,
998,999,1.0


In [67]:
filtered_df = events_df[events_df['event_name']!='subscription_renewed']

In [68]:
filtered_df['event_name'].value_counts()

app_install               1000
trial_started              681
subscription_started       477
subscription_cancelled     303
trial_cancelled            197
subscription_renewed         0
Name: event_name, dtype: int64

In [69]:
df_pivot_no_renewed = filtered_df.pivot_table(index='user_id', columns='event_name', values='created_at')

In [70]:
df_pivot_no_renewed = df_pivot_no_renewed.reset_index()

In [71]:
events_df[events_df['user_id']==1002]

Unnamed: 0,created_at,user_id,event_name,amount_usd


In [72]:
df_final = df_pivot_no_renewed.merge(df_subscription_renewed, on='user_id', how='inner')

In [73]:
df_final

Unnamed: 0,user_id,app_install,subscription_cancelled,subscription_started,trial_cancelled,trial_started,subscription_renewed
0,1,2024-05-07,NaT,NaT,2024-05-24,2024-05-12,
1,2,2024-10-12,2025-02-12,2024-10-20,NaT,2024-10-13,3.0
2,3,2024-10-15,2025-01-20,2024-10-21,NaT,2024-10-19,3.0
3,4,2024-08-28,NaT,NaT,2024-09-06,2024-08-31,
4,5,2024-04-03,NaT,NaT,NaT,NaT,
...,...,...,...,...,...,...,...
995,996,2025-01-28,NaT,2025-02-06,NaT,2025-02-01,
996,997,2024-03-06,NaT,NaT,2024-03-16,2024-03-08,
997,998,2025-02-01,NaT,NaT,NaT,NaT,
998,999,2024-12-24,NaT,2025-01-03,NaT,2024-12-29,1.0


In [74]:
users_df

Unnamed: 0,id,created_at,attribution_source,country,name
0,1,2024-05-07T00:00:00,instagram,US,Eve Brown
1,2,2024-10-12T00:00:00,instagram,NL,Frank Moore
2,3,2024-10-15T00:00:00,tiktok,TR,Ivy Anderson
3,4,2024-08-28T00:00:00,tiktok,TR,Alice Brown
4,5,2024-04-03T00:00:00,organic,NL,Bob Moore
...,...,...,...,...,...
997,998,2025-02-01T00:00:00,instagram,TR,Bob Davis
998,999,2024-12-24T00:00:00,organic,NL,Charlie Davis
999,1000,2025-02-13T00:00:00,organic,NL,Jack Anderson
1000,1001,2025-02-16T00:00:00,instagram,US,Bruce Wayne


In [75]:
df_final =  pd.merge(df_final,users_df[['attribution_source','country']], left_on=df_final['user_id'],right_on =users_df['id'], how='inner').drop(columns=['key_0'])

In [76]:
df_final

Unnamed: 0,user_id,app_install,subscription_cancelled,subscription_started,trial_cancelled,trial_started,subscription_renewed,attribution_source,country
0,1,2024-05-07,NaT,NaT,2024-05-24,2024-05-12,,instagram,US
1,2,2024-10-12,2025-02-12,2024-10-20,NaT,2024-10-13,3.0,instagram,NL
2,3,2024-10-15,2025-01-20,2024-10-21,NaT,2024-10-19,3.0,tiktok,TR
3,4,2024-08-28,NaT,NaT,2024-09-06,2024-08-31,,tiktok,TR
4,5,2024-04-03,NaT,NaT,NaT,NaT,,organic,NL
...,...,...,...,...,...,...,...,...,...
995,996,2025-01-28,NaT,2025-02-06,NaT,2025-02-01,,organic,TR
996,997,2024-03-06,NaT,NaT,2024-03-16,2024-03-08,,organic,NL
997,998,2025-02-01,NaT,NaT,NaT,NaT,,instagram,TR
998,999,2024-12-24,NaT,2025-01-03,NaT,2024-12-29,1.0,organic,NL


In [77]:
df_final = df_final.drop(columns=['TDAYtrialStart_trialCancel'])

KeyError: "['TDAYtrialStart_trialCancel'] not found in axis"

In [78]:
df_final['DAY_appinstall_trialStart'] = (df_final['trial_started']-df_final['app_install']).dt.days

In [79]:
#App indirenlerin 3 te 2 si trial basliyor
#Trial ilk 6 gunde kullanmaya her gune ortlama yaklasik esit dagilacak sekilde 
#6 gun gectikten sonra trial e kullanan yok

df_final['TDAY_appinstall_trialStart'].value_counts(dropna =False).sort_index()

1.0    113
2.0    121
3.0    113
4.0    118
5.0    105
6.0    111
NaN    319
Name: TDAY_appinstall_trialStart, dtype: int64

In [80]:
#Trial started yapmadan subscription yapan yok

df_final[(df_final['trial_started'].isnull())&(df_final['subscription_started'].notna())]

Unnamed: 0,user_id,app_install,subscription_cancelled,subscription_started,trial_cancelled,trial_started,subscription_renewed,attribution_source,country,TDAY_appinstall_trialStart


In [81]:
df_final['DAY_trialStart_trialCancel'] = (df_final['trial_cancelled']-df_final['trial_started']).dt.days	

In [82]:
#Trial started yapanlarin buyuk kismi subscription yapiyor
#2 hafta trial olan kesin subscription yapiyor

df_final['DAY_trialStart_trialCancel'].value_counts(dropna =False).sort_index()

1.0      19
2.0      12
3.0      12
4.0      18
5.0      12
6.0      17
7.0      19
8.0      16
9.0      19
10.0     11
11.0     12
12.0     16
13.0     14
NaN     803
Name: TDAY_trialStart_trialCancel, dtype: int64

In [83]:
events_df[events_df['event_name']=='trial_started']['event_name'].value_counts(dropna = False).sort_index()

app_install                 0
subscription_cancelled      0
subscription_renewed        0
subscription_started        0
trial_cancelled             0
trial_started             681
Name: event_name, dtype: int64

In [84]:
events_df[events_df['event_name']=='trial_started']['event_name'].value_counts(dropna = False).sort_index()

app_install                 0
subscription_cancelled      0
subscription_renewed        0
subscription_started        0
trial_cancelled             0
trial_started             681
Name: event_name, dtype: int64

In [85]:
len(events_df[events_df['event_name']=='subscription_started']['user_id'].isin(events_df[events_df['event_name']=='trial_started']['user_id']))

477

In [86]:
#Trial e baslayanlarin yarisi subscription oluyor
#App indrenlerinde 3 te 2 si trial oluyordu
#Dolayisiyla App indirenlerin 3 te 1 subscription oluyor

len(events_df[events_df['event_name']=='subscription_started']['user_id']\
    .isin(events_df[events_df['event_name']=='trial_started']['user_id']))/events_df['user_id'].nunique()

0.477

In [87]:
df_final['DAY_trialStart_trialCancel'] = (df_final['trial_cancelled']-df_final['trial_started']).dt.days

In [88]:
#

sum((df_final['subscription_started']-df_final['trial_started']).dt.days.value_counts(dropna = False).values[1:])

477

In [89]:
events_df.columns

Index(['created_at', 'user_id', 'event_name', 'amount_usd'], dtype='object')

In [90]:
users_df.columns

Index(['id', 'created_at', 'attribution_source', 'country', 'name'], dtype='object')

In [111]:
dic = {i: events_df[events_df['user_id']==i].event_name.unique().to_list() for i in events_df.user_id.unique()}

In [119]:
[i for i in dic.keys() if 'subscription_renewed' in dic[i]]

336

In [118]:
df_final[df_final['subscription_renewed'].notna()].user_id

1        2
2        3
7        8
8        9
9       10
      ... 
979    980
985    986
987    988
994    995
998    999
Name: user_id, Length: 336, dtype: int64

In [121]:
df_final[df_final.subscription_renewed.notna()].user_id

1        2
2        3
7        8
8        9
9       10
      ... 
979    980
985    986
987    988
994    995
998    999
Name: user_id, Length: 336, dtype: int64

In [186]:
df_final

Unnamed: 0,user_id,app_install,subscription_cancelled,subscription_started,trial_cancelled,trial_started,subscription_renewed,attribution_source,country,TDAY_appinstall_trialStart,TDAY_trialStart_trialCancel,Ever_Subscripted,Still_Subscripted,subscription_duration
0,1,2024-05-07,NaT,NaT,2024-05-24,2024-05-12,,instagram,US,5.0,12.0,0,0,0
1,2,2024-10-12,2025-02-12,2024-10-20,NaT,2024-10-13,3.0,instagram,NL,1.0,,1,0,115
2,3,2024-10-15,2025-01-20,2024-10-21,NaT,2024-10-19,3.0,tiktok,TR,4.0,,1,0,91
3,4,2024-08-28,NaT,NaT,2024-09-06,2024-08-31,,tiktok,TR,3.0,6.0,0,0,0
4,5,2024-04-03,NaT,NaT,NaT,NaT,,organic,NL,,,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,996,2025-01-28,NaT,2025-02-06,NaT,2025-02-01,,organic,TR,4.0,,1,1,16
996,997,2024-03-06,NaT,NaT,2024-03-16,2024-03-08,,organic,NL,2.0,8.0,0,0,0
997,998,2025-02-01,NaT,NaT,NaT,NaT,,instagram,TR,,,0,0,0
998,999,2024-12-24,NaT,2025-01-03,NaT,2024-12-29,1.0,organic,NL,5.0,,1,1,50


In [107]:
events_df[events_df['user_id']==1].event_name.unique().to_list()

['app_install', 'trial_started', 'trial_cancelled']

In [113]:
dic[1]

['app_install', 'trial_started', 'trial_cancelled']

In [129]:
type(df_final['subscription_started'][0])

pandas._libs.tslibs.nattype.NaTType

In [131]:
#Check csncle subscription ever

[0 if pd.isnull(i) else 1 for i in df_final['subscription_started']]

[0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,


In [None]:
[1 if (pd.isnull(i) and pd.isnull(k)) or  (pd.isnull(i) and pd.isnull(k))  else 0 for i,k in zip(df_final['trial_started'],df_final['subscription_cancelled'])]

In [135]:
#Check still subscripted. If there is 'Nat' value then it is still kept subscrition

[1 if pd.isnull(i) else 0 for i in df_final['subscription_cancelled'] for y in df_final['trial_started'] ]

[1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,


In [132]:
df_final['Ever_Subscripted'] = pd.Series([0 if pd.isnull(i) else 1 for i in df_final['subscription_started']])

In [177]:
#At some case subscription has started less than 1 month. This column solve problem if sebscription_renewed has 0

#df_final[(df_final['trial_cancelled'].notnull())&(df_final['subscription_started'].notnull())]
#Bu kod trial cancel olmadan subscription started olan var mi ona bakiyor. Boyle bir durum yok

#df_final[(df_final['trial_started'].isnull())&(df_final['subscription_started'].notnull())]
#Bu kod ise trial olmadan dogrudan subscript var mi onu cek ediyor. Boyle bir durum da yok

#Bu check kodlari is_still sutunu olusturmak icin lazim

df_final['Still_Subscripted'] = pd.Series([1 if pd.notnull(i)==True and pd.isnull(k)==True\
                                           else 0 for i,k in\
                                           zip(df_final['subscription_started'],df_final['subscription_cancelled'])
  ])

In [143]:
events_df[events_df.user_id==14]['amount_usd'].sum()

26.97

In [187]:
df_final.corr()

Unnamed: 0,user_id,subscription_renewed,TDAY_appinstall_trialStart,TDAY_trialStart_trialCancel,Ever_Subscripted,Still_Subscripted,subscription_duration
user_id,1.0,-0.081518,-0.008646,-0.157066,-0.020907,-0.009375,-0.01826
subscription_renewed,-0.081518,1.0,0.028464,,,-0.106199,0.959459
TDAY_appinstall_trialStart,-0.008646,0.028464,1.0,-0.04033,-0.037626,-0.028201,-0.01942
TDAY_trialStart_trialCancel,-0.157066,,-0.04033,1.0,,,
Ever_Subscripted,-0.020907,,-0.037626,,1.0,0.480592,0.640209
Still_Subscripted,-0.009375,-0.106199,-0.028201,,0.480592,1.0,0.581366
subscription_duration,-0.01826,0.959459,-0.01942,,0.640209,0.581366,1.0


In [144]:
df_final['subscription_renewed'].value_counts(dropna=False)

NaN    664
1.0    136
2.0    121
3.0     55
4.0     24
Name: subscription_renewed, dtype: int64

In [168]:
df_final[(df_final['trial_cancelled'].notnull())&(df_final['subscription_started'].notnull())]

Unnamed: 0,user_id,app_install,subscription_cancelled,subscription_started,trial_cancelled,trial_started,subscription_renewed,attribution_source,country,TDAY_appinstall_trialStart,TDAY_trialStart_trialCancel,Ever_Subscripted,Still_Subscripted


In [170]:
df_final[(df_final['trial_started'].isnull())&(df_final['subscription_started'].notnull())]

Unnamed: 0,user_id,app_install,subscription_cancelled,subscription_started,trial_cancelled,trial_started,subscription_renewed,attribution_source,country,TDAY_appinstall_trialStart,TDAY_trialStart_trialCancel,Ever_Subscripted,Still_Subscripted


In [172]:
[1 if pd.notnull(i)==True and pd.isnull(k)==True else 0 for i,k in zip(df_final['subscription_started'],df_final['subscription_cancelled'])
  ]

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [179]:
df_final[['trial_started','trial_cancelled','subscription_started','subscription_cancelled','Still_Subscripted']]

Unnamed: 0,trial_started,trial_cancelled,subscription_started,subscription_cancelled,Still_Subscripted
0,2024-05-12,2024-05-24,NaT,NaT,0
1,2024-10-13,NaT,2024-10-20,2025-02-12,0
2,2024-10-19,NaT,2024-10-21,2025-01-20,0
3,2024-08-31,2024-09-06,NaT,NaT,0
4,NaT,NaT,NaT,NaT,0
...,...,...,...,...,...
995,2025-02-01,NaT,2025-02-06,NaT,1
996,2024-03-08,2024-03-16,NaT,NaT,0
997,NaT,NaT,NaT,NaT,0
998,2024-12-29,NaT,2025-01-03,NaT,1


In [185]:
#Subscription duration cahnge depending on subscription ongoing, calceled or never subscripted

df_final['subscription_duration'] = (
    (now - df_final['subscription_started']).dt.days.where(df_final['subscription_cancelled'].isna(), 
    (df_final['subscription_cancelled'] - df_final['subscription_started']).dt.days)
)

# If user never subscribed, set duration to 0
df_final['subscription_duration'] = df_final['subscription_duration'].fillna(0).astype(int)

In [188]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 999
Data columns (total 14 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   user_id                      1000 non-null   int64         
 1   app_install                  1000 non-null   datetime64[ns]
 2   subscription_cancelled       303 non-null    datetime64[ns]
 3   subscription_started         477 non-null    datetime64[ns]
 4   trial_cancelled              197 non-null    datetime64[ns]
 5   trial_started                681 non-null    datetime64[ns]
 6   subscription_renewed         336 non-null    float64       
 7   attribution_source           1000 non-null   object        
 8   country                      1000 non-null   object        
 9   TDAY_appinstall_trialStart   681 non-null    float64       
 10  TDAY_trialStart_trialCancel  197 non-null    float64       
 11  Ever_Subscripted             1000 non-null  

In [189]:
df_final[df_final.trial_cancelled.isnull()]

Unnamed: 0,user_id,app_install,subscription_cancelled,subscription_started,trial_cancelled,trial_started,subscription_renewed,attribution_source,country,TDAY_appinstall_trialStart,TDAY_trialStart_trialCancel,Ever_Subscripted,Still_Subscripted,subscription_duration
1,2,2024-10-12,2025-02-12,2024-10-20,NaT,2024-10-13,3.0,instagram,NL,1.0,,1,0,115
2,3,2024-10-15,2025-01-20,2024-10-21,NaT,2024-10-19,3.0,tiktok,TR,4.0,,1,0,91
4,5,2024-04-03,NaT,NaT,NaT,NaT,,organic,NL,,,0,0,0
6,7,2024-10-21,NaT,NaT,NaT,NaT,,instagram,US,,,0,0,0
7,8,2024-08-14,2024-10-16,2024-08-28,NaT,2024-08-15,1.0,tiktok,TR,1.0,,1,0,49
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,995,2024-09-30,2025-01-08,2024-10-17,NaT,2024-10-04,2.0,instagram,US,4.0,,1,0,83
995,996,2025-01-28,NaT,2025-02-06,NaT,2025-02-01,,organic,TR,4.0,,1,1,16
997,998,2025-02-01,NaT,NaT,NaT,NaT,,instagram,TR,,,0,0,0
998,999,2024-12-24,NaT,2025-01-03,NaT,2024-12-29,1.0,organic,NL,5.0,,1,1,50


In [190]:
df_final.columns

Index(['user_id', 'app_install', 'subscription_cancelled',
       'subscription_started', 'trial_cancelled', 'trial_started',
       'subscription_renewed', 'attribution_source', 'country',
       'TDAY_appinstall_trialStart', 'TDAY_trialStart_trialCancel',
       'Ever_Subscripted', 'Still_Subscripted', 'subscription_duration'],
      dtype='object')

In [193]:
#Trial duration is calculated depending on 3 situation
#1- never trial return 0
#ever trial but not subcripted trial_cancelled -trial_started
#when go into subscrition started then subscription_start-trial_started

df_final['trial_duration'] = (df_final['trial_cancelled'] - df_final['trial_started']).dt.days

# Case 2: If user subscribed before canceling trial (without trial cancellation), use subscription_started instead
df_final.loc[
    (df_final['Ever_Subscripted']) & (df_final['trial_cancelled'].isna()), 
    'trial_duration'
] = (df_final['subscription_started'] - df_final['trial_started']).dt.days

# Case 3: If trial was never started, return 0
df_final['trial_duration'] = df_final['trial_duration'].fillna(0).astype(int)

In [195]:
df_final = df_final.drop(columns='TDAY_trialStart_trialCancel')

In [196]:
df_final

Unnamed: 0,user_id,app_install,subscription_cancelled,subscription_started,trial_cancelled,trial_started,subscription_renewed,attribution_source,country,TDAY_appinstall_trialStart,Ever_Subscripted,Still_Subscripted,subscription_duration,trial_duration
0,1,2024-05-07,NaT,NaT,2024-05-24,2024-05-12,,instagram,US,5.0,0,0,0,12
1,2,2024-10-12,2025-02-12,2024-10-20,NaT,2024-10-13,3.0,instagram,NL,1.0,1,0,115,7
2,3,2024-10-15,2025-01-20,2024-10-21,NaT,2024-10-19,3.0,tiktok,TR,4.0,1,0,91,2
3,4,2024-08-28,NaT,NaT,2024-09-06,2024-08-31,,tiktok,TR,3.0,0,0,0,6
4,5,2024-04-03,NaT,NaT,NaT,NaT,,organic,NL,,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,996,2025-01-28,NaT,2025-02-06,NaT,2025-02-01,,organic,TR,4.0,1,1,16,5
996,997,2024-03-06,NaT,NaT,2024-03-16,2024-03-08,,organic,NL,2.0,0,0,0,8
997,998,2025-02-01,NaT,NaT,NaT,NaT,,instagram,TR,,0,0,0,0
998,999,2024-12-24,NaT,2025-01-03,NaT,2024-12-29,1.0,organic,NL,5.0,1,1,50,5


In [None]:
df_final['amount_usd'] = 

In [197]:
df_final.corr()

Unnamed: 0,user_id,subscription_renewed,TDAY_appinstall_trialStart,Ever_Subscripted,Still_Subscripted,subscription_duration,trial_duration
user_id,1.0,-0.081518,-0.008646,-0.020907,-0.009375,-0.01826,-0.06112
subscription_renewed,-0.081518,1.0,0.028464,,-0.106199,0.959459,0.021765
TDAY_appinstall_trialStart,-0.008646,0.028464,1.0,-0.037626,-0.028201,-0.01942,0.02955
Ever_Subscripted,-0.020907,,-0.037626,1.0,0.480592,0.640209,0.491347
Still_Subscripted,-0.009375,-0.106199,-0.028201,0.480592,1.0,0.581366,0.236361
subscription_duration,-0.01826,0.959459,-0.01942,0.640209,0.581366,1.0,0.321319
trial_duration,-0.06112,0.021765,0.02955,0.491347,0.236361,0.321319,1.0


In [198]:
df_final.groupby('attribution_source')['subscription_duration'].mean()

attribution_source
instagram    42.563934
organic      48.906706
tiktok       51.772727
Name: subscription_duration, dtype: float64

In [203]:
#Users still subscripted are long time user

df_final.groupby(['attribution_source','Still_Subscripted'])['subscription_duration'].mean()

attribution_source  Still_Subscripted
instagram           0                     28.357143
                    1                    110.113208
organic             0                     28.901754
                    1                    147.206897
tiktok              0                     24.058824
                    1                    178.904762
Name: subscription_duration, dtype: float64

In [204]:
#Long time user have a rate  1-of

df_final.groupby(['attribution_source','Still_Subscripted'])['subscription_duration'].count()

attribution_source  Still_Subscripted
instagram           0                    252
                    1                     53
organic             0                    285
                    1                     58
tiktok              0                    289
                    1                     63
Name: subscription_duration, dtype: int64

In [205]:
#Still subscripted user are longterm users
#And, the rate 1 of 3 of Ever_subscripted are still subscripted and as result they are longterm depending on former analysis

df_final.groupby(['attribution_source','Ever_Subscripted','Still_Subscripted'])['subscription_duration'].count()

attribution_source  Ever_Subscripted  Still_Subscripted
instagram           0                 0                    157
                    1                 0                     95
                                      1                     53
organic             0                 0                    176
                    1                 0                    109
                                      1                     58
tiktok              0                 0                    190
                    1                 0                     99
                                      1                     63
Name: subscription_duration, dtype: int64

In [199]:
df_final.groupby('attribution_source')['trial_duration'].mean()

attribution_source
instagram    4.819672
organic      4.545190
tiktok       4.713068
Name: trial_duration, dtype: float64

In [202]:
df_final.groupby('attribution_source')['Ever_Subscripted'].count()

attribution_source
instagram    305
organic      343
tiktok       352
Name: Ever_Subscripted, dtype: int64

In [253]:
df_final.groupby('country')['subscription_duration'].mean()

country
NL    53.240260
TR    45.759207
US    45.516224
Name: subscription_duration, dtype: float64

In [255]:
df_final.groupby(['country','Still_Subscripted'])['subscription_duration'].count()

country  Still_Subscripted
NL       0                    252
         1                     56
TR       0                    288
         1                     65
US       0                    286
         1                     53
Name: subscription_duration, dtype: int64

In [257]:
df_final.groupby(['country','attribution_source'])['attribution_source'].count()

country  attribution_source
NL       instagram             104
         organic               102
         tiktok                102
TR       instagram             101
         organic               131
         tiktok                121
US       instagram             100
         organic               110
         tiktok                129
Name: attribution_source, dtype: int64

In [208]:
events_df.groupby('user_id')['amount_usd'].sum().corr(df_final.subscription_renewed.fillna(0))

-0.0012888325237101054

In [221]:
events_df['amount_usd'] = events_df['amount_usd'].fillna(0)

In [236]:
pd.concat([events_df.groupby('user_id')[['amount_usd']].sum(),df_final.subscription_renewed],axis=1)

Unnamed: 0,amount_usd,subscription_renewed
1,0.00,3.0
2,35.96,3.0
3,19.96,
4,0.00,
5,0.00,
...,...,...
997,0.00,
998,0.00,1.0
999,17.98,
1000,0.00,


In [237]:
events_df.groupby('user_id')['amount_usd'].sum().corr(df_final.subscription_renewed.fillna(0))

-0.0012888325237101054

In [209]:
df_final.subscription_renewed

0      NaN
1      3.0
2      3.0
3      NaN
4      NaN
      ... 
995    NaN
996    NaN
997    NaN
998    1.0
999    NaN
Name: subscription_renewed, Length: 1000, dtype: float64

In [212]:
events_df.tail(50)

Unnamed: 0,created_at,user_id,event_name,amount_usd
3413,2024-08-03,983,app_install,
3414,2024-08-09,983,trial_started,
3415,2024-08-10,983,trial_cancelled,
3416,2024-08-24,984,app_install,
3417,2024-11-04,985,app_install,
3418,2024-11-09,985,trial_started,
3419,2024-11-19,985,trial_cancelled,
3420,2024-08-27,986,app_install,
3421,2024-08-28,986,trial_started,
3422,2024-08-30,986,subscription_started,4.99


In [244]:
#Put spent usd per user_id into df_final

df_final['amount_usd']=events_df.groupby('user_id')['amount_usd'].sum()

In [249]:
df_final = df_final.drop(columns='amount_usd')

"""
he issue arises because groupby() returns a Series with a different index (grouped by user_id),
while df_final expects the same index alignment. When you assign directly, pandas attempts to align values by index,
leading to NaN values for users that are missing in df_final.

"""

In [251]:
#Let create it  properly 

df_final['amount_usd'] = df_final['user_id'].map(events_df.groupby('user_id')['amount_usd'].sum())


In [245]:
#fill Nan value with 0 at subscription_renewed so if it is Nana then no renewed

df_final.subscription_renewed = df_final.subscription_renewed.fillna(0)

In [242]:
events_df[['user_id','amount_usd']].head(50)

Unnamed: 0,user_id,amount_usd
0,1,0.0
1,1,0.0
2,1,0.0
3,2,0.0
4,2,0.0
5,2,8.99
6,2,8.99
7,2,8.99
8,2,8.99
9,2,0.0


In [252]:
#expected high corelation between subscription renewed and amount_usd

df_final.corr()

Unnamed: 0,user_id,subscription_renewed,TDAY_appinstall_trialStart,Ever_Subscripted,Still_Subscripted,subscription_duration,trial_duration,amount_usd
user_id,1.0,-0.025864,-0.008646,-0.020907,-0.009375,-0.01826,-0.06112,-0.028537
subscription_renewed,-0.025864,1.0,-0.002734,0.640684,-0.146962,0.294741,0.320415,0.904747
TDAY_appinstall_trialStart,-0.008646,-0.002734,1.0,-0.037626,-0.028201,-0.01942,0.02955,-0.005258
Ever_Subscripted,-0.020907,0.640684,-0.037626,1.0,0.480592,0.640209,0.491347,0.782037
Still_Subscripted,-0.009375,-0.146962,-0.028201,0.480592,1.0,0.581366,0.236361,0.048665
subscription_duration,-0.01826,0.294741,-0.01942,0.640209,0.581366,1.0,0.321319,0.420162
trial_duration,-0.06112,0.320415,0.02955,0.491347,0.236361,0.321319,1.0,0.3844
amount_usd,-0.028537,0.904747,-0.005258,0.782037,0.048665,0.420162,0.3844,1.0


In [248]:
df_final[['user_id','subscription_renewed','amount_usd']]

Unnamed: 0,user_id,subscription_renewed,amount_usd
0,1,0.0,
1,2,3.0,0.00
2,3,3.0,35.96
3,4,0.0,19.96
4,5,0.0,0.00
...,...,...,...
995,996,0.0,29.97
996,997,0.0,4.99
997,998,0.0,0.00
998,999,1.0,0.00
