In [44]:
import pandas as pd
import numpy as np
import sqlite3
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime



In [45]:
conn = sqlite3.connect('papcorns.sqlite')

In [46]:
#Import Users Table into dataframe

users_df = pd.read_sql_query("SELECT*FROM users;",conn)

In [47]:
#Import Users Event Table into dataframe

events_df = pd.read_sql_query("SELECT*FROM user_events;",conn)

In [48]:
#Check columns and records's amount of Users df

users_df.shape

(1002, 5)

In [49]:
#Check columns and records's amount of Users Event df

events_df.shape

(3486, 5)

In [50]:
users_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1002 entries, 0 to 1001
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  1002 non-null   int64 
 1   created_at          1002 non-null   object
 2   attribution_source  1002 non-null   object
 3   country             1002 non-null   object
 4   name                1002 non-null   object
dtypes: int64(1), object(4)
memory usage: 39.3+ KB


In [51]:
events_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3486 entries, 0 to 3485
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          3486 non-null   int64  
 1   created_at  3486 non-null   object 
 2   user_id     3486 non-null   int64  
 3   event_name  3486 non-null   object 
 4   amount_usd  1231 non-null   float64
dtypes: float64(1), int64(2), object(2)
memory usage: 136.3+ KB


In [52]:
#5 head records of Users df

users_df.head()

Unnamed: 0,id,created_at,attribution_source,country,name
0,1,2024-05-07T00:00:00,instagram,US,Eve Brown
1,2,2024-10-12T00:00:00,instagram,NL,Frank Moore
2,3,2024-10-15T00:00:00,tiktok,TR,Ivy Anderson
3,4,2024-08-28T00:00:00,tiktok,TR,Alice Brown
4,5,2024-04-03T00:00:00,organic,NL,Bob Moore


In [53]:
#5 head records of Users Event df

events_df.head()

Unnamed: 0,id,created_at,user_id,event_name,amount_usd
0,1,2024-05-07T00:00:00,1,app_install,
1,2,2024-05-12T00:00:00,1,trial_started,
2,3,2024-05-24T00:00:00,1,trial_cancelled,
3,4,2024-10-12T00:00:00,2,app_install,
4,5,2024-10-13T00:00:00,2,trial_started,


In [54]:
#lets remove id in events_df. user_id is reference od id in users_df


events_df = events_df.drop(columns='id',axis = 1)

In [55]:
#Check Missing values by columns Users df

#Note  No missing values in Users df

users_df.isnull().sum()

id                    0
created_at            0
attribution_source    0
country               0
name                  0
dtype: int64

In [56]:
#Check Missing values by columns Users Event df

#Note!... 2255 of 3486 records of amount_usd column are null that is too high
#Take a look and give address what category have missing values 'most'

events_df.isnull().sum()

created_at       0
user_id          0
event_name       0
amount_usd    2255
dtype: int64

In [57]:
#There is a problem with created column cronologically. There are some date at the future so those values should be removed from dataset

events_df['created_at'] = pd.to_datetime(events_df['created_at'])


In [58]:
events_df.shape

(3486, 4)

In [59]:
with pd.option_context('display.max_rows',None):
    print(events_df)

     created_at  user_id              event_name  amount_usd
0    2024-05-07        1             app_install         NaN
1    2024-05-12        1           trial_started         NaN
2    2024-05-24        1         trial_cancelled         NaN
3    2024-10-12        2             app_install         NaN
4    2024-10-13        2           trial_started         NaN
5    2024-10-20        2    subscription_started        8.99
6    2024-11-19        2    subscription_renewed        8.99
7    2024-12-19        2    subscription_renewed        8.99
8    2025-01-18        2    subscription_renewed        8.99
9    2025-02-12        2  subscription_cancelled         NaN
10   2024-10-15        3             app_install         NaN
11   2024-10-19        3           trial_started         NaN
12   2024-10-21        3    subscription_started        4.99
13   2024-11-20        3    subscription_renewed        4.99
14   2024-12-20        3    subscription_renewed        4.99
15   2025-01-19        3

In [60]:
row1 = events_df.shape[0]

In [61]:
events_df.drop(events_df[events_df['created_at']>pd.Timestamp.now()].index,axis = 0,inplace=True)

In [62]:
row2 = events_df.shape[0]

In [63]:
events_df.shape

(3318, 4)

In [64]:
print(row1-row2,' rows are dropped from events_df')

168  rows are dropped from events_df


In [65]:
events_df['event_name'] = events_df['event_name'].astype('category')

# Count 'subscription_renewed' events per user_id
renewed_counts = events_df[events_df['event_name'] == 'subscription_renewed'].groupby('user_id').size().reset_index(name='subscription_renewed')


In [66]:
#renewed_counts

In [67]:
x_df = events_df[['user_id']].drop_duplicates()  # Ensure all users are included
df_subscription_renewed = x_df.merge(renewed_counts, on='user_id', how='left')

In [68]:
df_subscription_renewed

Unnamed: 0,user_id,subscription_renewed
0,1,
1,2,3.0
2,3,3.0
3,4,
4,5,
...,...,...
997,998,
998,999,1.0
999,1000,
1000,1001,


In [69]:
filtered_df = events_df[events_df['event_name']!='subscription_renewed']

In [70]:
filtered_df['event_name'].value_counts()

app_install               1002
trial_started              682
subscription_started       480
subscription_cancelled     307
trial_cancelled            201
subscription_renewed         0
Name: event_name, dtype: int64

In [71]:
df_pivot_no_renewed = filtered_df.pivot_table(index='user_id', columns='event_name', values='created_at')

In [72]:
df_pivot_no_renewed = df_pivot_no_renewed.reset_index()

In [73]:
events_df[events_df['user_id']==1002]

Unnamed: 0,created_at,user_id,event_name,amount_usd
3485,2025-02-25,1002,app_install,


In [74]:
df_final = df_pivot_no_renewed.merge(df_subscription_renewed, on='user_id', how='inner')

In [75]:
df_final

Unnamed: 0,user_id,app_install,subscription_cancelled,subscription_started,trial_cancelled,trial_started,subscription_renewed
0,1,2024-05-07,NaT,NaT,2024-05-24,2024-05-12,
1,2,2024-10-12,2025-02-12,2024-10-20,NaT,2024-10-13,3.0
2,3,2024-10-15,2025-01-20,2024-10-21,NaT,2024-10-19,3.0
3,4,2024-08-28,NaT,NaT,2024-09-06,2024-08-31,
4,5,2024-04-03,NaT,NaT,NaT,NaT,
...,...,...,...,...,...,...,...
997,998,2025-02-01,NaT,NaT,NaT,NaT,
998,999,2024-12-24,NaT,2025-01-03,NaT,2024-12-29,1.0
999,1000,2025-02-13,NaT,NaT,2025-02-25,2025-02-15,
1000,1001,2025-02-25,NaT,2025-02-25,NaT,2025-02-25,


In [76]:
users_df

Unnamed: 0,id,created_at,attribution_source,country,name
0,1,2024-05-07T00:00:00,instagram,US,Eve Brown
1,2,2024-10-12T00:00:00,instagram,NL,Frank Moore
2,3,2024-10-15T00:00:00,tiktok,TR,Ivy Anderson
3,4,2024-08-28T00:00:00,tiktok,TR,Alice Brown
4,5,2024-04-03T00:00:00,organic,NL,Bob Moore
...,...,...,...,...,...
997,998,2025-02-01T00:00:00,instagram,TR,Bob Davis
998,999,2024-12-24T00:00:00,organic,NL,Charlie Davis
999,1000,2025-02-13T00:00:00,organic,NL,Jack Anderson
1000,1001,2025-02-16T00:00:00,instagram,US,Bruce Wayne


In [77]:
df_final =  pd.merge(df_final,users_df[['attribution_source','country','name']], left_on=df_final['user_id'],right_on =users_df['id'], how='inner').drop(columns=['key_0'])

In [78]:
df_final

Unnamed: 0,user_id,app_install,subscription_cancelled,subscription_started,trial_cancelled,trial_started,subscription_renewed,attribution_source,country,name
0,1,2024-05-07,NaT,NaT,2024-05-24,2024-05-12,,instagram,US,Eve Brown
1,2,2024-10-12,2025-02-12,2024-10-20,NaT,2024-10-13,3.0,instagram,NL,Frank Moore
2,3,2024-10-15,2025-01-20,2024-10-21,NaT,2024-10-19,3.0,tiktok,TR,Ivy Anderson
3,4,2024-08-28,NaT,NaT,2024-09-06,2024-08-31,,tiktok,TR,Alice Brown
4,5,2024-04-03,NaT,NaT,NaT,NaT,,organic,NL,Bob Moore
...,...,...,...,...,...,...,...,...,...,...
997,998,2025-02-01,NaT,NaT,NaT,NaT,,instagram,TR,Bob Davis
998,999,2024-12-24,NaT,2025-01-03,NaT,2024-12-29,1.0,organic,NL,Charlie Davis
999,1000,2025-02-13,NaT,NaT,2025-02-25,2025-02-15,,organic,NL,Jack Anderson
1000,1001,2025-02-25,NaT,2025-02-25,NaT,2025-02-25,,instagram,US,Bruce Wayne


In [79]:
pd.notna(df_final.loc[4,'trial_started'])

False

In [80]:
df_final['Duration_app_install_to_trial'] = (df_final['trial_started']-df_final['app_install']).dt.days

In [81]:
#App indirenlerin 3 te 2 si trial basliyor
#Trial ilk 6 gunde kullanmaya her gune ortlama yaklasik esit dagilacak sekilde 
#6 gun gectikten sonra trial e kullanan yok

df_final['Duration_app_install_to_trial'].value_counts(dropna =False).sort_index()

0.0      1
1.0    113
2.0    121
3.0    113
4.0    118
5.0    105
6.0    111
NaN    320
Name: Duration_app_install_to_trial, dtype: int64

In [82]:
#There is no subscription_started without trial_strated
#Then we can calculate directly app_install conversion into subscription_started as well.

df_final[(df_final['trial_started'].isnull())&(df_final['subscription_started'].notna())]

Unnamed: 0,user_id,app_install,subscription_cancelled,subscription_started,trial_cancelled,trial_started,subscription_renewed,attribution_source,country,name,Duration_app_install_to_trial


In [83]:
df_final['DAY_trialStart_trialCancel'] = (df_final['trial_cancelled']-df_final['trial_started']).dt.days	

In [84]:
#Trial started yapanlarin buyuk kismi subscription yapiyor


df_final['DAY_trialStart_trialCancel'].value_counts(dropna =False).sort_index()

1.0      19
2.0      12
3.0      12
4.0      18
5.0      12
6.0      17
7.0      19
8.0      16
9.0      19
10.0     12
11.0     13
12.0     17
13.0     15
NaN     801
Name: DAY_trialStart_trialCancel, dtype: int64

In [86]:
ax = df_final[df_final['subscription_started'].notna()]['trial_duration'].value_counts(dropna =False).sort_index().plot(kind='bar')
ax.bar_label(container =ax.containers[0] )
plt.title('Subscription Started by Trial Duration (Day)')

KeyError: 'trial_duration'

In [97]:
a = df_final[df_final['subscription_started'].notna()][['trial_duration','Still_Subscripted']].groupby('trial_duration')['Still_Subscripted'].sum()
a

KeyError: "['trial_duration'] not in index"

In [None]:
b  = df_final[df_final['subscription_started'].notna()][['trial_duration','Still_Subscripted']].groupby('trial_duration')['Still_Subscripted'].count()
b

In [None]:
df_ab = pd.concat([a,b], axis =1)

In [None]:
df_ab.columns ='ongoing','total'

In [None]:
import seaborn as sns

In [None]:
df_ab['sub_cancelled'] = df_ab.total - df_ab.ongoing

In [None]:
df_ab.columns

In [None]:
sns.barplot(x = df_ab.index)

In [None]:
plt.figure(figsize=(10, 6))

# Define bar width
bar_width = 0.4  

# Create positions for bars
x = np.arange(len(df_ab))  # X positions for bars
plt.xticks(x, df_ab.index, rotation=45)  # Set x-axis labels

# Plot bars side by side
ax1 = plt.bar(x - bar_width/2, df_ab['ongoing'], width=bar_width, color='blue', label='Ongoing')
ax2 = plt.bar(x + bar_width/2, df_ab['sub_cancelled'], width=bar_width, color='red', label='Sub Cancelled')

# Add labels on bars
plt.bar_label(ax1, fmt='%.0f', padding=3)
plt.bar_label(ax2, fmt='%.0f', padding=3)

# Labels, title, and legend
plt.xlabel("Index")
plt.ylabel("Count")
plt.title("Ongoing vs Subscription Cancelled")
plt.legend()

# Show the plot

In [None]:
df_ab

In [None]:
df_final[['Ever_Subscripted','Still_Subscripted']].groupby('trial_duration')['Still_Subscripted'].sum()

In [None]:
df_final[df_final['subscription_started'].notna()][['trial_duration','Still_Subscripted']].groupby('trial_duration')['Still_Subscripted'].sum()

In [None]:
events_df[events_df['event_name']=='trial_started']['event_name'].value_counts(dropna = False).sort_index()

In [None]:
events_df[events_df['event_name']=='trial_started']['event_name'].value_counts(dropna = False).sort_index()

In [None]:
len(events_df[events_df['event_name']=='subscription_started']['user_id'].isin(events_df[events_df['event_name']=='trial_started']['user_id']))

In [None]:
#Trial e baslayanlarin %70 i  subscription oluyor


len(events_df[events_df['event_name']=='subscription_started']['user_id']\
    .isin(events_df[events_df['event_name']=='trial_started']['user_id']))/events_df[events_df['event_name']=='trial_started']['user_id'].nunique()

In [None]:
#App insatall edenlerin %48 si subscripted oluyor

len(events_df[events_df['event_name']=='subscription_started']['user_id']\
    .isin(events_df[events_df['event_name']=='app_install']['user_id']))/events_df[events_df['event_name']=='app_install']['user_id'].nunique()

In [None]:
events_df[events_df['event_name']=='app_install']['user_id'].nunique()

In [None]:
## df_final['DAY_trialStart_trialCancel'] = (df_final['trial_cancelled']-df_final['trial_started']).dt.days

In [None]:
#

sum((df_final['subscription_started']-df_final['trial_started']).dt.days.value_counts(dropna = False).values[1:])

In [None]:
events_df.columns

In [None]:
users_df.columns

In [None]:
dic = {i: events_df[events_df['user_id']==i].event_name.unique().to_list() for i in events_df.user_id.unique()}

In [None]:
dic

In [None]:
#Just in case to check the code block belove

[i for i in dic.keys() if 'subscription_renewed' in dic[i]]

In [None]:
df_final[df_final['subscription_renewed'].notna()].user_id

In [None]:
df_final[df_final.subscription_renewed.notna()].user_id

In [None]:
df_final

In [None]:
events_df[events_df['user_id']==1].event_name.unique().to_list()

In [None]:
dic[1]

In [None]:
type(df_final['subscription_started'][0])

In [None]:
#Check csncle subscription ever

[0 if pd.isnull(i) else 1 for i in df_final['subscription_started']]

In [None]:
[1 if (pd.isnull(i) and pd.isnull(k)) or  (pd.isnull(i) and pd.isnull(k))  else 0 for i,k in zip(df_final['trial_started'],df_final['subscription_cancelled'])]

In [None]:
#Check still subscripted. If there is 'Nat' value then it is still kept subscrition

[1 if pd.isnull(i) else 0 for i in df_final['subscription_cancelled'] for y in df_final['trial_started'] ]

In [87]:
df_final['Ever_Subscripted'] = pd.Series([0 if pd.isnull(i) else 1 for i in df_final['subscription_started']])

In [88]:
#At some case subscription has started less than 1 month. This column solve problem if sebscription_renewed has 0

#df_final[(df_final['trial_cancelled'].notnull())&(df_final['subscription_started'].notnull())]
#Bu kod trial cancel olmadan subscription started olan var mi ona bakiyor. Boyle bir durum yok

#df_final[(df_final['trial_started'].isnull())&(df_final['subscription_started'].notnull())]
#Bu kod ise trial olmadan dogrudan subscript var mi onu cek ediyor. Boyle bir durum da yok

#Bu check kodlari is_still sutunu olusturmak icin lazim

df_final['Still_Subscripted'] = pd.Series([1 if pd.notnull(i)==True and pd.isnull(k)==True\
                                           else 0 for i,k in\
                                           zip(df_final['subscription_started'],df_final['subscription_cancelled'])
  ])

In [89]:
events_df[events_df.user_id==14]['amount_usd'].sum()

26.97

In [90]:
df_final.corr()

Unnamed: 0,user_id,subscription_renewed,Duration_app_install_to_trial,DAY_trialStart_trialCancel,Ever_Subscripted,Still_Subscripted
user_id,1.0,-0.07008,-0.01371,-0.163208,-0.014186,0.003245
subscription_renewed,-0.07008,1.0,0.036483,,,-0.125734
Duration_app_install_to_trial,-0.01371,0.036483,1.0,-0.04729,-0.037507,-0.027491
DAY_trialStart_trialCancel,-0.163208,,-0.04729,1.0,,
Ever_Subscripted,-0.014186,,-0.037507,,1.0,0.476388
Still_Subscripted,0.003245,-0.125734,-0.027491,,0.476388,1.0


In [91]:
df_final['subscription_renewed'].value_counts(dropna=False)

NaN    662
1.0    140
2.0    118
3.0     58
4.0     24
Name: subscription_renewed, dtype: int64

In [92]:
df_final[(df_final['trial_cancelled'].notnull())&(df_final['subscription_started'].notnull())]

Unnamed: 0,user_id,app_install,subscription_cancelled,subscription_started,trial_cancelled,trial_started,subscription_renewed,attribution_source,country,name,Duration_app_install_to_trial,DAY_trialStart_trialCancel,Ever_Subscripted,Still_Subscripted


In [93]:
df_final[(df_final['trial_started'].isnull())&(df_final['subscription_started'].notnull())]

Unnamed: 0,user_id,app_install,subscription_cancelled,subscription_started,trial_cancelled,trial_started,subscription_renewed,attribution_source,country,name,Duration_app_install_to_trial,DAY_trialStart_trialCancel,Ever_Subscripted,Still_Subscripted


In [94]:
[1 if pd.notnull(i)==True and pd.isnull(k)==True else 0 for i,k in zip(df_final['subscription_started'],df_final['subscription_cancelled'])
  ]

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [95]:
df_final[['trial_started','trial_cancelled','subscription_started','subscription_cancelled','Still_Subscripted']]

Unnamed: 0,trial_started,trial_cancelled,subscription_started,subscription_cancelled,Still_Subscripted
0,2024-05-12,2024-05-24,NaT,NaT,0
1,2024-10-13,NaT,2024-10-20,2025-02-12,0
2,2024-10-19,NaT,2024-10-21,2025-01-20,0
3,2024-08-31,2024-09-06,NaT,NaT,0
4,NaT,NaT,NaT,NaT,0
...,...,...,...,...,...
997,NaT,NaT,NaT,NaT,0
998,2024-12-29,NaT,2025-01-03,NaT,1
999,2025-02-15,2025-02-25,NaT,NaT,0
1000,2025-02-25,NaT,2025-02-25,NaT,1


In [96]:
#Subscription duration cahnge depending on subscription ongoing, calceled or never subscripted

df_final['subscription_duration'] = (
    (now - df_final['subscription_started']).dt.days.where(df_final['subscription_cancelled'].isna(), 
    (df_final['subscription_cancelled'] - df_final['subscription_started']).dt.days)
)

# If user never subscribed, set duration to 0
df_final['subscription_duration'] = df_final['subscription_duration'].fillna(0).astype(int)

NameError: name 'now' is not defined

In [None]:
df_final.info()

In [None]:
df_final[df_final.trial_cancelled.isnull()]

In [None]:
df_final.columns

In [None]:
#Trial duration is calculated depending on 3 situation
#1- never trial return 0
#ever trial but not subcripted trial_cancelled -trial_started
#when go into subscrition started then subscription_start-trial_started

df_final['trial_duration'] = (df_final['trial_cancelled'] - df_final['trial_started']).dt.days

# Case 2: If user subscribed before canceling trial (without trial cancellation), use subscription_started instead
df_final.loc[
    (df_final['Ever_Subscripted']) & (df_final['trial_cancelled'].isna()), 
    'trial_duration'
] = (df_final['subscription_started'] - df_final['trial_started']).dt.days

# Case 3: If trial was never started, return 0
df_final['trial_duration'] = df_final['trial_duration'].fillna(0).astype(int)

In [None]:
df_final

In [None]:
df_final.corr()

In [None]:
df_final.groupby('attribution_source')['subscription_duration'].mean()

In [None]:
#Users still subscripted are long time user
#Users who cancelled subscription do it about end of the month of in average


df_final.groupby(['attribution_source','Still_Subscripted'])['subscription_duration'].mean()

In [None]:
#Long time user have a rate  1-of

df_final.groupby(['attribution_source','Still_Subscripted'])['subscription_duration'].count()

In [None]:
#Still subscripted user are longterm users
#And, the rate 1 of 3 of Ever_subscripted are still subscripted and as result they are longterm depending on former analysis

df_final.groupby(['attribution_source','Ever_Subscripted','Still_Subscripted'])['subscription_duration'].count()

In [None]:
df_final.groupby('attribution_source')['trial_duration'].mean()

In [None]:
df_final.groupby('attribution_source')['Ever_Subscripted'].count()

In [None]:
df_final.groupby('country')['subscription_duration'].mean()

In [None]:
df_final.groupby(['country','Still_Subscripted'])['subscription_duration'].count()

In [None]:
df_final.groupby(['country','attribution_source'])['attribution_source'].count()

In [None]:
events_df.groupby('user_id')['amount_usd'].sum().corr(df_final.subscription_renewed.fillna(0))

In [None]:
events_df['amount_usd'] = events_df['amount_usd'].fillna(0)

In [None]:
pd.concat([events_df.groupby('user_id')[['amount_usd']].sum(),df_final.subscription_renewed],axis=1)

In [None]:
events_df.groupby('user_id')['amount_usd'].sum().corr(df_final.subscription_renewed.fillna(0))

In [None]:
df_final.subscription_renewed

In [None]:
events_df.tail(50)

In [None]:
#Put spent usd per user_id into df_final

df_final['amount_usd']=events_df.groupby('user_id')['amount_usd'].sum()

In [None]:
df_final = df_final.drop(columns='amount_usd')

"""
he issue arises because groupby() returns a Series with a different index (grouped by user_id),
while df_final expects the same index alignment. When you assign directly, pandas attempts to align values by index,
leading to NaN values for users that are missing in df_final.

"""

In [None]:
#Let create it  properly 

df_final['amount_usd'] = df_final['user_id'].map(events_df.groupby('user_id')['amount_usd'].sum())


In [None]:
#fill Nan value with 0 at subscription_renewed so if it is Nana then no renewed

df_final.subscription_renewed = df_final.subscription_renewed.fillna(0)

In [None]:
events_df[['user_id','amount_usd']].head(50)

In [None]:
events_df[['user_id','amount_usd']].tail(50)

In [None]:
#expected high corelation between subscription renewed and amount_usd

df_final.corr()

In [None]:
df_final[['user_id','subscription_renewed','amount_usd']]

In [None]:
df_final.info()

In [None]:
df_final

In [None]:
df_final.app_install.dt.day_name()

In [None]:
xyz = df_final.copy()

In [None]:
xyz['day_name']=xyz.app_install.dt.day_name()

In [None]:
xyz.groupby('day_name')[['Ever_Subscripted']].count()

In [None]:
xyz[xyz['Ever_Subscripted']==1].groupby('day_name')[['Ever_Subscripted']].count()

In [None]:
xyz.groupby('day_name').trial_started.count().sort_index()

In [None]:
df_final

In [None]:
events_df[events_df['event_name']=='subscription_renewed'].groupby('user_id')['event_name'].count()

In [None]:
events_df.tail(50)