<a href="#measuring-churn-rate">Measuring Churn Rate</a>
- <a href="#standard-based-churn-rate">Standard Churn Rate (Subscription)</a>
- <a href="#activity-based-churn-rate">Activity Churn Rate (Non-Subscription)</a>

<a href="#measuring-customers">Measuring Customers</a>
- <a href="#common-summay-metrics">Common Summary Metrics (Count, Total, Avg, etc)</a>
- <a href="#account-tenure">Account Tenure Metrics</a>

<a href="#observing-churn">Observing Churn</a>
- <a href="#observation-datetime-picking">Observation Datetime Picking and Outcome</a>
- <a href="#analytic-data">Features (Metrics) for each observation instance. Analytic summary data</a>

In [1648]:
# import libraries
import pandas as pd
import numpy as np
from datetime import datetime
from datetime import timedelta
import math


In [1084]:
df = pd.read_json('data/mini_sparkify_event_data.json', orient='records', lines=True)
df.head(5)

Unnamed: 0,ts,userId,sessionId,page,auth,method,status,level,itemInSession,location,userAgent,lastName,firstName,registration,gender,artist,song,length
0,1538352117000,30,29,NextSong,Logged In,PUT,200,paid,50,"Bakersfield, CA",Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) G...,Freeman,Colin,1538173000000.0,M,Martha Tilston,Rockpools,277.89016
1,1538352180000,9,8,NextSong,Logged In,PUT,200,free,79,"Boston-Cambridge-Newton, MA-NH","""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",Long,Micah,1538332000000.0,M,Five Iron Frenzy,Canada,236.09424
2,1538352394000,30,29,NextSong,Logged In,PUT,200,paid,51,"Bakersfield, CA",Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) G...,Freeman,Colin,1538173000000.0,M,Adam Lambert,Time For Miracles,282.8273
3,1538352416000,9,8,NextSong,Logged In,PUT,200,free,80,"Boston-Cambridge-Newton, MA-NH","""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",Long,Micah,1538332000000.0,M,Enigma,Knocking On Forbidden Doors,262.71302
4,1538352676000,30,29,NextSong,Logged In,PUT,200,paid,52,"Bakersfield, CA",Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) G...,Freeman,Colin,1538173000000.0,M,Daft Punk,Harder Better Faster Stronger,223.60771


In [1085]:
df = df.drop(np.where(df.userId == '')[0])
df.head(5)

Unnamed: 0,ts,userId,sessionId,page,auth,method,status,level,itemInSession,location,userAgent,lastName,firstName,registration,gender,artist,song,length
0,1538352117000,30,29,NextSong,Logged In,PUT,200,paid,50,"Bakersfield, CA",Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) G...,Freeman,Colin,1538173000000.0,M,Martha Tilston,Rockpools,277.89016
1,1538352180000,9,8,NextSong,Logged In,PUT,200,free,79,"Boston-Cambridge-Newton, MA-NH","""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",Long,Micah,1538332000000.0,M,Five Iron Frenzy,Canada,236.09424
2,1538352394000,30,29,NextSong,Logged In,PUT,200,paid,51,"Bakersfield, CA",Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) G...,Freeman,Colin,1538173000000.0,M,Adam Lambert,Time For Miracles,282.8273
3,1538352416000,9,8,NextSong,Logged In,PUT,200,free,80,"Boston-Cambridge-Newton, MA-NH","""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",Long,Micah,1538332000000.0,M,Enigma,Knocking On Forbidden Doors,262.71302
4,1538352676000,30,29,NextSong,Logged In,PUT,200,paid,52,"Bakersfield, CA",Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) G...,Freeman,Colin,1538173000000.0,M,Daft Punk,Harder Better Faster Stronger,223.60771


In [1086]:
# Convert User Id to numeric type
df.userId = df.userId.astype('int')

# Drop 'useless' attributes
df = df.drop(['registration', 'userAgent', 'status', 'method', ], axis=1)
df.head(10)

# Join First Name and Last Name to Fullname column, then drop First Name, Last Name
df['fullname'] = df['firstName'] + ' ' + df['lastName']
df = df.drop(['firstName', 'lastName'], axis=1)

In [1087]:
# Create 'datetime' column, based on timpstamp value, convert UTC time.
# Convert type to pandas datetime type

df['datetime'] = df['ts'].apply(lambda x: datetime.utcfromtimestamp(x/1000).isoformat())
df['datetime'] = df['datetime'].astype('datetime64[ns]')

df['date'] = df.ts.apply(lambda x: datetime.utcfromtimestamp(x/1000).isoformat()[:10])

In [1088]:
# Check earlist date and latest date in the dataset

print(np.min(df['datetime']))
print(np.max(df['datetime']))

2018-10-01 00:01:57
2018-12-03 01:11:16


In [1089]:
# Date range in the dataset
np.max(df['datetime']) - np.min(df['datetime'])

Timedelta('63 days 01:09:19')

In [1090]:
# Helper Function. get timedelta
# Get a user's timerange of event. (delta of earlist event date to latest event date)

def get_timedelta_by_user(user_id, data=df):
    min_datetime = np.min(data[data.userId == user_id]['datetime'])
    max_datetime = np.max(data[data.userId == user_id]['datetime'])
    d = max_datetime - min_datetime
    return d

# Spot Check
get_timedelta_by_user(152)

Timedelta('31 days 17:01:23')

In [1091]:
# Spot Check: Cancellation Event Count
df[(df.page == 'Cancel') | (df.page == 'Cancellation Confirmation')]['page'].value_counts()

Cancel                       52
Cancellation Confirmation    52
Name: page, dtype: int64

In [1092]:
# Spot Check: Cancellation Event Count (string method)
df[(df.page.str.startswith('Cancel'))]['page'].value_counts()

Cancel                       52
Cancellation Confirmation    52
Name: page, dtype: int64

In [1093]:
# Number of event records
print(df.shape[0])

# Number of distinct users
print(df.userId.nunique())

278154
225


In [1094]:
# Number of 'authentication' values check

df['auth'].value_counts()

Logged In    278102
Cancelled        52
Name: auth, dtype: int64

---

<a id="measuring-churn-rate"></a>

## Measuring Churn Rate

---

observation date recall that follow weekly cycles, so pick a last sunday (or monday 0:00) date as end day

In [1095]:
# Set the observation timeframe we want to do churn model.
# 4 Weeks. 

observation_end_date = pd.Timestamp('2018-12-03') # 49th Monday of 2018.
observation_start_date = observation_end_date - pd.Timedelta("28 days") # 45th Monday of 2018

In [1096]:
def check_last_event(ids, data=df):
    # Check the last event for a list of users.
    
    # return (page, auth, level, time)
    
    l = []
    
    for i in ids:
        page = data[data.userId == i].tail(1).page.values[0]
        auth = data[data.userId == i].tail(1).auth.values[0]
        level = data[data.userId == i].tail(1).level.values[0]
        dt = pd.Timestamp(data[data.userId == i].tail(1).datetime.values[0])
        
        l.append((i, page, auth, level, dt))
        
    return l


---
<a id="standard-based-churn-rate"></a>

### Standard account-basd churn rate

Glance of churn rate for 2018 Nov full calendar.

Since the limitation of event data, we dont have access to Subscription database (information such as sub id, renew date, therefore, to utilize the event data, we calculate the standard account-based churn rate (for Nov 2018) as:

Number of users whose first event is a paid status last event is also a paid status and not cancel auth
/
Total distinct number of users in the month




In [1097]:
# Define 2018 Nov start and end dates
nov_start_date = '2018-11-01T00:00:00'
nov_end_date = '2018-12-01T00:00:00'

# 2018 Nov Data Frame
df_nov = df[(df.datetime > nov_start_date) & (df.datetime < nov_end_date)]

# Distinct user ids in Nov 2018
nov_uids = df_nov.userId.value_counts().index 

# Numbers of user ids in Nov 2018
n_nov_uids = len(nov_uids) 

In [1098]:
# The VERY FIRST event for each user in Nov 2018
df_nov_first = df_nov.drop_duplicates('userId', keep='first').\
sort_values('userId')[['userId', 'auth', 'level', 'datetime']]

df_nov_first = df_nov_first.set_index('userId')

In [1099]:
# Users that first event in Nov 2018 is under 'paid' status. (Subscriber already)
df_nov_starts_with_paid = df_nov_first[df_nov_first['level'] == 'paid']
df_nov_starts_with_paid

Unnamed: 0_level_0,auth,level,datetime
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,Logged In,paid,2018-11-02 14:49:33
4,Logged In,paid,2018-11-04 13:23:21
6,Logged In,paid,2018-11-01 06:40:13
9,Logged In,paid,2018-11-01 01:49:01
10,Logged In,paid,2018-11-03 14:27:42
...,...,...,...
300020,Logged In,paid,2018-11-13 12:06:12
300021,Logged In,paid,2018-11-01 13:10:20
300022,Logged In,paid,2018-11-05 01:08:50
300023,Logged In,paid,2018-11-01 05:47:58


In [1100]:
# The VERY LAST event for each user in Nov 2018

df_nov_last = df_nov.drop_duplicates('userId', keep='last').\
sort_values('userId')[['userId', 'auth', 'level', 'datetime']]

df_nov_last = df_nov_last.set_index('userId')
df_nov_last.head(10)

Unnamed: 0_level_0,auth,level,datetime
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,Logged In,paid,2018-11-21 22:30:04
4,Logged In,paid,2018-11-30 16:32:28
5,Logged In,free,2018-11-08 00:33:21
6,Logged In,paid,2018-11-29 22:48:26
7,Logged In,free,2018-11-23 06:46:51
8,Logged In,free,2018-11-30 13:54:57
9,Logged In,paid,2018-11-30 14:19:12
10,Logged In,paid,2018-11-19 12:49:48
11,Logged In,paid,2018-11-27 09:08:22
13,Logged In,free,2018-11-29 17:35:09


In [1101]:
# Merger userFirst and userLast. Then check who churn under condition: 
# 1 - if begin with paid, end with free. (cancelled before the very last), OR
# 2 - if begin with paid, end with paid but it is ALSO the last paid event (cancel is the last event)

df_nov_first_last = pd.merge(df_nov_first, df_nov_last, how='inner', on='userId')

df_nov_churn = df_nov_first_last[
    ((df_nov_first_last.level_x == 'paid') & (df_nov_first_last.level_y == 'free')) | 
    ((df_nov_first_last.level_x == 'paid') & (df_nov_first_last.auth_y == 'Cancelled'))
]

df_nov_churn

Unnamed: 0_level_0,auth_x,level_x,datetime_x,auth_y,level_y,datetime_y
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
28,Logged In,paid,2018-11-01 00:00:06,Cancelled,paid,2018-11-06 00:20:32
29,Logged In,paid,2018-11-01 00:03:11,Cancelled,paid,2018-11-14 07:37:11
35,Logged In,paid,2018-11-01 05:23:02,Logged In,free,2018-11-30 10:24:09
53,Logged In,paid,2018-11-04 00:20:10,Cancelled,paid,2018-11-19 12:34:56
54,Logged In,paid,2018-11-01 00:02:21,Cancelled,paid,2018-11-12 19:40:08
70,Logged In,paid,2018-11-02 01:20:40,Cancelled,paid,2018-11-14 22:34:27
92,Logged In,paid,2018-11-01 05:09:45,Logged In,free,2018-11-30 23:34:25
103,Logged In,paid,2018-11-03 08:27:37,Cancelled,paid,2018-11-04 14:01:31
106,Logged In,paid,2018-11-02 13:07:17,Cancelled,paid,2018-11-02 13:47:04
109,Logged In,paid,2018-11-05 17:12:05,Logged In,free,2018-11-16 12:33:04


In [1102]:
# Calculate Nov 2018 Churn Rate based on Standard account-based (paid-status or cancellation event)

print(f"Number of customers at the start of nov: {df_nov_starts_with_paid.shape[0]}")
print(f"Number of customers at the end of nov: {df_nov_churn.shape[0]}")

nov_churn_rate = df_nov_churn.shape[0] / df_nov_starts_with_paid.shape[0]

print(f"2018 Nov churn rate: {nov_churn_rate}")
print(f"2018 Nov retention rate: {1 - nov_churn_rate}")

Number of customers at the start of nov: 102
Number of customers at the end of nov: 21
2018 Nov churn rate: 0.20588235294117646
2018 Nov retention rate: 0.7941176470588236


---
<a id="activity-based-churn-rate"></a>
### Activity Based Churn for non-subscriber

In addtion to standard churn rate calculation, I would also calculate churn rate for non-paid ('free') user, based on activity
 measure for Nov 2018.
 
We can obtain an overview of the engagement, utilization of free user. 

How many of free user discontine to use the free service. 

Based on activity recency. Define an inactive time interval, eg 1 month.
Short gaps can be ignore



1. Observe on the start day. (Nov 1), get all free uses have activities within oct 1 - nov 1. Get the head count. 
2. Observe on the end day. (Before Dec 1), get all users have activities within nov 1 - dec 1. Get the head count. 
3. Calculation: End head count / Start head count  (Retaintion Rate)
4. Churn Rate + Retaintion Rate = 1

In [1103]:
# Two observations: 
# 1. Observation on start date of the month 
# 2. Observation on end date of the month
# Recency interval: 30 days

nov_start_date = '2018-11-01T00:00:00'
nov_end_date = '2018-12-01T00:00:00'
recency_interval = '30 days'

# Convert to pd timestamp type
free_nov_ob_start_date = pd.Timestamp(nov_start_date)
free_nov_ob_end_date = pd.Timestamp(nov_end_date)

# Calcuate the inactivity limit for start and end
recency_start_date = free_nov_ob_start_date - pd.Timedelta(recency_interval)
recency_end_date = free_nov_ob_end_date - pd.Timedelta(recency_interval)

print(recency_start_date)
print(recency_end_date)

2018-10-02 00:00:00
2018-11-01 00:00:00


In [1104]:
# Free users count at the start

free_start_count = df[
    (df.datetime >= recency_start_date) & 
    (df.datetime < recency_end_date) & 
    (df.level == 'free')
].groupby('userId').size().shape[0]

free_start_count

173

In [1105]:
# Free users count at the end

free_end_count = df[
    (df.datetime >= recency_end_date) & 
    (df.datetime < nov_end_date) & 
    (df.level == 'free')
].groupby('userId').size().shape[0]

free_end_count # retain

103

In [1106]:
print(f"Number of free users churn for 2018 Nov (Based on activity recency): {free_start_count - free_end_count}")

print(f"Free Users Churn Rate for 2018 Nov (Based on activity recency): \
{(free_start_count - free_end_count) / free_start_count}")

print(f"Free Users Retain Rate for 2018 Nov (Based on activity recency): \
{free_end_count / free_start_count}")

Number of free users churn for 2018 Nov (Based on activity recency): 70
Free Users Churn Rate for 2018 Nov (Based on activity recency): 0.4046242774566474
Free Users Retain Rate for 2018 Nov (Based on activity recency): 0.5953757225433526


---

<a id="measuring-customers"></a>

## Measuring Customers (Feature Engineering)


- Measuring counts, averages, and totals of customer events 
- Measuring how long a customer has been using a service 
- Measuring subscription metrics

---

In [1107]:
# The measuring metrics based on this timeframe (4 weeks)

print(observation_start_date)  # 45th Monday of 2018
print(observation_end_date)  # 49th Monday of 2018.

# 4 weeks in between
observation_end_date - observation_start_date

2018-11-05 00:00:00
2018-12-03 00:00:00


Timedelta('28 days 00:00:00')

In [1108]:
# Softcopy for periods subset to work on.

df_p = df.copy()
df_p.head()

Unnamed: 0,ts,userId,sessionId,page,auth,level,itemInSession,location,gender,artist,song,length,fullname,datetime,date
0,1538352117000,30,29,NextSong,Logged In,paid,50,"Bakersfield, CA",M,Martha Tilston,Rockpools,277.89016,Colin Freeman,2018-10-01 00:01:57,2018-10-01
1,1538352180000,9,8,NextSong,Logged In,free,79,"Boston-Cambridge-Newton, MA-NH",M,Five Iron Frenzy,Canada,236.09424,Micah Long,2018-10-01 00:03:00,2018-10-01
2,1538352394000,30,29,NextSong,Logged In,paid,51,"Bakersfield, CA",M,Adam Lambert,Time For Miracles,282.8273,Colin Freeman,2018-10-01 00:06:34,2018-10-01
3,1538352416000,9,8,NextSong,Logged In,free,80,"Boston-Cambridge-Newton, MA-NH",M,Enigma,Knocking On Forbidden Doors,262.71302,Micah Long,2018-10-01 00:06:56,2018-10-01
4,1538352676000,30,29,NextSong,Logged In,paid,52,"Bakersfield, CA",M,Daft Punk,Harder Better Faster Stronger,223.60771,Colin Freeman,2018-10-01 00:11:16,2018-10-01


In [1109]:
# Utility Function: Assign period number. 

def period_from_datetime(x):
    """
    Assign each log a period number based on date. 
    Default 0.
    '4' is actually the latest 4 full weeks (28 days).
    `period 3` is 7-days before `period 4`. (28 days).
    `period 2` is 7-days before `period 3`. (28 days).
    `period 1` is 7-days before `period 2`. (28 days).
    
    Each period is 28 days, moving 7 days ahead from its previous period. 
    (moving 7 days)
    """
    period = 0 # default 0: out of window concerned. to filter out.
    
    if (x >= pd.Timestamp('2018-10-15')) & (x < pd.Timestamp('2018-11-12')): 
        period = 1
    
    if (x >= pd.Timestamp('2018-10-22')) & (x < pd.Timestamp('2018-11-19')):
        period = 2
    
    if (x >= pd.Timestamp('2018-10-29')) & (x < pd.Timestamp('2018-11-26')):
        period = 3
        
    if (x >= pd.Timestamp('2018-11-05')) & (x < pd.Timestamp('2018-12-03')):
        period = 4
        
    return period


# df_p[-20:]['datetime'].apply(period_from_datetime)

In [1110]:
# Add a column of period, assign each event to a period base on its datetime.

df_p['period'] = df_p['datetime'].apply(period_from_datetime)

In [1111]:
# p1,2,3,4 is what we concern. 0 is out of timeframe we don't consider. 

df_p[df_p.period.isin([1,2,3,4])].head(5)

Unnamed: 0,ts,userId,sessionId,page,auth,level,itemInSession,location,gender,artist,song,length,fullname,datetime,date,period
43747,1539561613000,133,167,Home,Logged In,free,0,"San Diego-Carlsbad, CA",M,,,,Liam Fleming,2018-10-15 00:00:13,2018-10-15,1
43748,1539561620000,9,583,NextSong,Logged In,paid,96,"Boston-Cambridge-Newton, MA-NH",M,Edward Sharpe & The Magnetic Zeros,Up From Below,250.27873,Micah Long,2018-10-15 00:00:20,2018-10-15,1
43749,1539561692000,133,167,NextSong,Logged In,free,1,"San Diego-Carlsbad, CA",M,Starflyer 59,When You Feel The Mess,376.86812,Liam Fleming,2018-10-15 00:01:32,2018-10-15,1
43750,1539561762000,54,782,NextSong,Logged In,paid,243,"Spokane-Spokane Valley, WA",F,Aphex Twin,Xtal,294.05995,Alexi Warren,2018-10-15 00:02:42,2018-10-15,1
43751,1539561778000,61,766,NextSong,Logged In,free,18,"San Francisco-Oakland-Hayward, CA",M,Ludovico Einaudi,Nefeli,266.94485,Parker Williams,2018-10-15 00:02:58,2018-10-15,1


In [1112]:
# Spot Check: Distinct Events

df['page'].value_counts()

NextSong                     228108
Thumbs Up                     12551
Home                          10082
Add to Playlist                6526
Add Friend                     4277
Roll Advert                    3933
Logout                         3226
Thumbs Down                    2546
Downgrade                      2055
Settings                       1514
Help                           1454
Upgrade                         499
About                           495
Save Settings                   310
Error                           252
Submit Upgrade                  159
Submit Downgrade                 63
Cancel                           52
Cancellation Confirmation        52
Name: page, dtype: int64

<a id="common-summay-metrics"></a>
### Common Summary Metrics

e.g Count, Total, Avg, Sum

In [1113]:
# Summary df for n_like within 4 moving period, each period is 4 weeks

df_n_like = df_p[df_p.period.isin([1,2,3,4])]
df_n_like = df_n_like[df_n_like.page == 'Thumbs Up']
ds_like_count = df_n_like.groupby(['userId', 'period'])['page'].count()

# turn groupby output to dataframe
df_n_like_count = pd.DataFrame(ds_like_count).reset_index()
df_n_like_count.columns = ['userId', 'period', 'n_like']
df_n_like_count.head(10)

Unnamed: 0,userId,period,n_like
0,2,2,1
1,2,3,3
2,2,4,8
3,3,2,10
4,3,3,3
5,4,1,5
6,4,2,12
7,4,3,25
8,4,4,53
9,5,2,5


In [1114]:
# Summary df for n_songplay within 4 moving period, each period is 4 weeks

df_n_play = df_p[df_p.period.isin([1,2,3,4])]
df_n_play = df_n_play[df_n_play.page == 'NextSong']
ds_nplay_count = df_n_play.groupby(['userId', 'period'])['page'].count()

# turn groupby output to dataframe
df_n_play_count = pd.DataFrame(ds_nplay_count).reset_index()
df_n_play_count.columns = ['userId', 'period', 'n_songplay']
df_n_play_count.head(10)

Unnamed: 0,userId,period,n_songplay
0,2,2,50
1,2,3,99
2,2,4,247
3,3,2,154
4,3,3,38
5,4,1,102
6,4,2,315
7,4,3,576
8,4,4,1025
9,5,2,78


In [1115]:
# Summary df for n_displike within 4 moving period, each period is 4 weeks

df_n_dislike = df_p[df_p.period.isin([1,2,3,4])]
df_n_dislike = df_n_dislike[df_n_dislike.page == 'Thumbs Down']
ds_ndislike_count = df_n_dislike.groupby(['userId', 'period'])['page'].count()

# turn groupby output to dataframe
df_n_dislike_count = pd.DataFrame(ds_ndislike_count).reset_index()
df_n_dislike_count.columns = ['userId', 'period', 'n_dislike']
df_n_dislike_count.head(10)

Unnamed: 0,userId,period,n_dislike
0,2,4,3
1,3,2,3
2,4,1,4
3,4,2,1
4,4,3,10
5,4,4,11
6,6,1,2
7,6,2,4
8,6,3,10
9,6,4,11


In [1116]:
# Summary df for n_addplaylist within 4 moving period, each period is 4 weeks

df_n_addplaylist = df_p[df_p.period.isin([1,2,3,4])]
df_n_addplaylist = df_n_addplaylist[df_n_addplaylist.page == 'Add to Playlist']
ds_n_addplaylist = df_n_addplaylist.groupby(['userId', 'period'])['page'].count()

# turn groupby output to dataframe
df_n_addplaylist_count = pd.DataFrame(ds_n_addplaylist).reset_index()
df_n_addplaylist_count.columns = ['userId', 'period', 'n_addplaylist']
df_n_addplaylist_count.head(10)

Unnamed: 0,userId,period,n_addplaylist
0,2,3,1
1,2,4,7
2,3,2,3
3,3,3,1
4,4,1,4
5,4,2,7
6,4,3,13
7,4,4,35
8,5,2,4
9,5,4,2


In [1117]:
# Summary df for n_addfriend within 4 moving period, each period is 4 weeks

df_n_addfriend = df_p[df_p.period.isin([1,2,3,4])]
df_n_addfriend = df_n_addfriend[df_n_addfriend.page == 'Add Friend']
ds_n_addfriend = df_n_addfriend.groupby(['userId', 'period'])['page'].count()

# turn groupby output to dataframe
df_n_addfriend_count = pd.DataFrame(ds_n_addfriend).reset_index()
df_n_addfriend_count.columns = ['userId', 'period', 'n_addfriend']
df_n_addfriend_count.head(10)

Unnamed: 0,userId,period,n_addfriend
0,2,2,2
1,2,3,4
2,2,4,7
3,3,2,1
4,4,2,6
5,4,3,12
6,4,4,27
7,5,2,2
8,5,4,1
9,6,1,1


In [1118]:
# Summary df for n_adview within 4 moving period, each period is 4 weeks

df_n_adview = df_p[df_p.period.isin([1,2,3,4])]
df_n_adview = df_n_adview[df_n_adview.page == 'Roll Advert']
ds_n_adview = df_n_adview.groupby(['userId', 'period'])['page'].count()

# turn groupby output to dataframe
df_n_adview_count = pd.DataFrame(ds_n_adview).reset_index()
df_n_adview_count.columns = ['userId', 'period', 'n_adview']
df_n_adview_count.head(10)

Unnamed: 0,userId,period,n_adview
0,3,3,1
1,4,1,4
2,5,2,4
3,5,4,3
4,6,1,1
5,6,2,1
6,6,3,1
7,6,4,2
8,7,1,2
9,7,2,9


In [1119]:
# Summary df for n_session within 4 moving period, each period is 4 weeks

df_n_session = df_p[df_p.period.isin([1,2,3,4])]

df_n_session = \
pd.DataFrame(df_n_session.groupby(['userId', 'period', 'sessionId']).size()).reset_index().drop([0], axis=1)

df_n_session_count = pd.DataFrame(df_n_session.groupby(['userId','period'])['sessionId'].count()).reset_index()
df_n_session_count.columns = ['userId', 'period', 'n_session']
df_n_session_count


Unnamed: 0,userId,period,n_session
0,2,2,1
1,2,3,2
2,2,4,3
3,3,2,2
4,3,3,1
...,...,...,...
612,300024,1,1
613,300025,1,2
614,300025,2,3
615,300025,3,1


In [1120]:
# Summary df for n_itemsession within 4 moving period, each period is 4 weeks
# Total items in all sessions per period per user (represents usage, shoule be corelated to song play)

ds_n_itemsession = pd.DataFrame(
    df_p[df_p.period.isin([1,2,3,4])].groupby(['userId', 'period','sessionId'])['itemInSession'].count()
).reset_index().groupby(['userId', 'period'])['itemInSession'].sum()

df_n_itemsession_count = pd.DataFrame(ds_n_itemsession).reset_index()
df_n_itemsession_count.columns = ['userId', 'period', 'n_itemsession']

df_n_itemsession_count 


Unnamed: 0,userId,period,n_itemsession
0,2,2,55
1,2,3,118
2,2,4,298
3,3,2,183
4,3,3,47
...,...,...,...
612,300024,1,102
613,300025,1,42
614,300025,2,139
615,300025,3,185


In [1121]:
# Summary. Sessions Count, Session Items Count, Items per session each period. 

df_session_summary = pd.merge(df_n_session_count, df_n_itemsession_count)
df_session_summary['items_per_session'] = df_session_summary['n_itemsession'] / df_session_summary['n_session']
df_session_summary

Unnamed: 0,userId,period,n_session,n_itemsession,items_per_session
0,2,2,1,55,55.000000
1,2,3,2,118,59.000000
2,2,4,3,298,99.333333
3,3,2,2,183,91.500000
4,3,3,1,47,47.000000
...,...,...,...,...,...
612,300024,1,1,102,102.000000
613,300025,1,2,42,21.000000
614,300025,2,3,139,46.333333
615,300025,3,1,185,185.000000


In [1122]:
# Summary. Total listen duration for each user per period. 

df_duration = df_p[(df_p['page'] == 'NextSong') & df_p.period.isin([1,2,3,4])][['userId', 'period', 'length']]

df_duration_total = pd.DataFrame(
    df_duration.groupby(['userId', 'period'])['length'].sum()
).reset_index()

df_duration_total.columns = ['userId', 'period', 'total_duration']
df_duration_total


Unnamed: 0,userId,period,total_duration
0,2,2,11945.95611
1,2,3,23915.42291
2,2,4,62124.84475
3,3,2,38639.29017
4,3,3,10421.45543
...,...,...,...
611,300024,1,22698.64417
612,300025,1,7703.44387
613,300025,2,27444.63957
614,300025,3,39919.26219


In [1123]:
# Summary. Playback duration per session.

df_duration_m = \
df_p[(df_p['page'] == 'NextSong') & df_p.period.isin([1,2,3,4])][['userId','period','sessionId','length']]

df_duration_n_ses = pd.DataFrame(
    pd.DataFrame(
        df_duration_m.groupby(['userId', 'period', 'sessionId'])['length'].sum()
    ).reset_index()\
        .groupby(['userId', 'period'])['sessionId'].size()
        ).reset_index()

df_duration_total = pd.DataFrame(
    df_duration_m.groupby(['userId', 'period'])['length'].sum()
).reset_index()

df_duration_summary = pd.merge(df_duration_n_ses, df_duration_total)
df_duration_summary['avg_play_duration'] = df_duration_summary['length'] / df_duration_summary['sessionId']
df_duration_summary.columns = ['userId','period','n_play_session', 'total_playback_duration', 'avg_playback_duration']
df_duration_summary


Unnamed: 0,userId,period,n_play_session,total_playback_duration,avg_playback_duration
0,2,2,1,11945.95611,11945.956110
1,2,3,2,23915.42291,11957.711455
2,2,4,3,62124.84475,20708.281583
3,3,2,2,38639.29017,19319.645085
4,3,3,1,10421.45543,10421.455430
...,...,...,...,...,...
611,300024,1,1,22698.64417,22698.644170
612,300025,1,2,7703.44387,3851.721935
613,300025,2,3,27444.63957,9148.213190
614,300025,3,1,39919.26219,39919.262190


<a id="account-tenure"></a>
### Account Tenure

tenure should measure only the current subscription.



In [1124]:
user4 = df[df.userId == 4]

In [1125]:
np.where(user4.groupby('level').cumcount() == 0)

(array([  0, 141]),)

In [1126]:
# experiment cumcount() shift()
# https://stackoverflow.com/questions/25119524/pandas-conditional-rolling-count

dft = pd.DataFrame({
    'x': [1,2,3,4,5,6,7,8,9,10,11],
    'y': ['a','a', 'a', 'b', 'b','b','b','b','a','a','a']
})

dft.groupby(
    (dft['y'] != dft['y'].shift(1)).cumsum()
).cumcount()

0     0
1     1
2     2
3     0
4     1
5     2
6     3
7     4
8     0
9     1
10    2
dtype: int64

In [1127]:
# Experiment on Tenure calculation with User ID 4

user4['cumcount'] = user4.groupby(
    (user4['level'] != user4['level'].shift(1)).cumsum()
).cumcount()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user4['cumcount'] = user4.groupby(


In [1128]:
user4[['level', 'cumcount', 'datetime']]

# Start Date of latest subscription
user4[(user4.cumcount == 0) & (user4.level == 'paid')].tail(1).iloc[0].datetime

# End Date of latest subscription
user4[
    (user4.level == 'paid') & (~user4.page.str.startswith('Cancel'))
].tail(1).iloc[0].datetime

# Tenure Days
(user4[
    (user4.level == 'paid') & (~user4.page.str.startswith('Cancel'))
].tail(1).iloc[0].datetime) - (user4[(user4.cumcount == 0) & (user4.level == 'paid')].tail(1).iloc[0].datetime)

Timedelta('40 days 05:03:25')

In [1129]:
# Users in Observation timeframe [Timestamp('2018-11-05 00:00:00') to Timestamp('2018-12-03 00:00:00')]
# that also have 'paid' status. (subscriber)
# These are the users we concern currently.

uids_obs = list(df_p[
        (df_p.period == 4) & 
        (df_p.level == 'paid')
    ].groupby('userId').size().index)

len(uids_obs)

132

In [1310]:
# Utility Function. Get tunure days
def get_tenures(ids=uids_obs):
     
    """
    Get user's tenure days.
    Key value pair.
    
    Input: list of user ids. Assuming that argument ids are paid user. 
    Output: dict of key value pairs. key: user, value: tunure
    """
    
    dfsub = df[df.datetime < observation_end_date]
    
    tenures = {}
    
    for i in ids:
        dfu = dfsub[dfsub.userId == i]
        
        # Make a cumulative count column
        dfu['cumcount'] = dfu.groupby(
            (dfu['level'] != dfu['level'].shift(1)).cumsum()
        ).cumcount()
        
        # Start Date of latest subscription
        start = dfu[(dfu.cumcount == 0) & (dfu.level == 'paid')].tail(1).iloc[0].datetime
        start_date = str(start)[:10]
        
        # End Date of latest subscription
        end = dfu[
            (dfu.level == 'paid') & (~dfu.page.str.startswith('Cancel'))
        ].tail(1).iloc[0].datetime
        end_date = str(end)[:10]
        
        diff = pd.Timestamp(end_date) - pd.Timestamp(start_date)
#         print(diff.days)
        
        
        # append result, first day starts with 0
        tenures[i] = diff.days
        
    return tenures

tenures = get_tenures()
tenures

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfu['cumcount'] = dfu.groupby(


{2: 44,
 4: 40,
 6: 54,
 9: 30,
 10: 42,
 11: 11,
 13: 0,
 14: 49,
 15: 55,
 16: 57,
 20: 10,
 21: 34,
 23: 2,
 25: 10,
 26: 0,
 28: 8,
 29: 37,
 30: 2,
 33: 57,
 35: 13,
 36: 17,
 37: 43,
 38: 37,
 39: 0,
 40: 30,
 41: 51,
 42: 54,
 45: 39,
 46: 29,
 49: 0,
 50: 9,
 52: 26,
 53: 26,
 54: 14,
 55: 0,
 56: 22,
 59: 24,
 60: 46,
 61: 26,
 62: 50,
 65: 53,
 66: 35,
 67: 31,
 69: 47,
 70: 42,
 71: 26,
 74: 27,
 75: 52,
 77: 35,
 79: 15,
 81: 42,
 82: 23,
 83: 24,
 85: 33,
 86: 16,
 88: 34,
 89: 22,
 91: 35,
 92: 46,
 93: 3,
 96: 4,
 97: 32,
 98: 36,
 99: 23,
 100: 16,
 104: 36,
 108: 23,
 109: 36,
 111: 27,
 113: 58,
 114: 30,
 115: 54,
 118: 49,
 120: 60,
 124: 60,
 126: 42,
 127: 56,
 128: 45,
 131: 49,
 132: 50,
 136: 45,
 137: 1,
 138: 39,
 139: 20,
 140: 10,
 141: 41,
 142: 60,
 145: 57,
 147: 16,
 151: 0,
 152: 15,
 153: 22,
 155: 19,
 100002: 38,
 100004: 2,
 100007: 58,
 100008: 42,
 100014: 41,
 100015: 22,
 100016: 53,
 100018: 37,
 200002: 28,
 200003: 4,
 200004: 55,
 200005: 2

In [1311]:
tenures[25]

10

---

<a id="observing-churn"></a>

## Observing Churn


### Approach Overview
- Picking observation dates from subscribers 
- **Creating an analytic dataset by flattening metric data**
- Exporting a current customer list for segmentation

Forming Churn analysis dataset — A dataset in which every row represents a customer facing the decision to churn or stay. 

The outcome is what they do. The facts about these situations are the customer’s behavioral metric measurements.

Observe them all at the interval **relative to their life cycle** with the product, which makes them comparable.


#### About LeadTime

Since we have not details access for every user's exact sub/renew date (although, some can be inferenced from event, 
evidence of first transit from free to paid during timeframe. **However**, some users whose first event in log are already with 'paid', means that they are already ongoing subscribers we don't know when exactly they sub).

Therefore we won't have all sub/renew date for subscribers. Not a good idea to user LeadTime which suggests
observe at one-week before monthly renewal.
For this case, event log only data, to make it consistent for known retain | churn users, I use **Sequence of Regularly Spaced** Observation dates that **relative** to each distinct user. 

#### Observation DateRange and Outcome Set

Make an observation date range dataset. Each row is a sub user's observation date range and is_churn outcome.
(a user can have multiple observation date range)

USER | OBSERVATION_TIME | OUTCOME

(OBSERVATION_DATE represents the end date of the observation with interval of 14 days. Garantee to have interval (e.g 14 days), therefor no need to input observationFrom date. If not enough 14 days, I will not make it an observation 

#### Relative Observations Parameters (adjustable)

- Observation Interval(Length): 14 days (observation_date_from to observation_date: 14 days within)
- Frequency: 14 days (observe every 14 days)
- Initial is_churn outcome. (the latest of 'paid' event, ongoing or churn)
- Iterate backwards to make possible observations

#### Variables
- FirstEventDate: a user's very first shown log date. No matter under 'free' or 'paid.
- TenureFromDate: The start paid day of a user's latest subscription's from date.
- TenureToDate: The latest paid day of a user's latest subscription's from date. (It can be the churn day or sub ongoing day)


#### Users to select
- Select the users from the **latest 4 weeks**. Range: (**>= 45th Monday of 2018 AND < 49th Monday of 2018**)
that is \[>= 2018-11-05 to < 2018-12-03\]
- Users who are ongoing subscribers OR are churn in this timeframe
- (Why not use a full month calendar, e.g Nov 2018? We can. But again, the dataset is a event log, latest records are not completed in a full month Dec instances, so use relative of latest. Use **units of full weeks**. Comsumer Human Behaviors follows WEEK cycle: for example, monday to friday busy, sat, sun more usage for consumer service)

In [1455]:
# Snippet. Datetime 

the_day_dt = datetime.fromtimestamp(df.loc[182974].ts/1000)

# Use utc timestamp for consistency
tsgmt = np.max(df.ts)/1000
tutc = datetime.utcfromtimestamp(tsgmt)

# Datetime Readable (both works)
tutc.isoformat()
str(tutc)

# 'on-the-day' plus one day and trim to 00:00, then can timeframe filter set condition before 
# the begining of next day, which also means taking all time on-the-day itself. 

the_day_plus1_dt = the_day_dt + timedelta(days=1)

# Trim hour/min/sec info, so default to 00:00
the_day_plus1_dt = the_day_plus1_dt.isoformat()[:10]

# Convert to pd.Timestamp
pd.Timestamp(new_dt.isoformat()[:10])


Timestamp('2018-12-01 00:00:00')

<a id="observation-datetime-picking"></a>

In [1456]:
# Subset timeframe 

# Set the timeframe we want to do churn model.
# 4 Weeks. 

range_to = pd.Timestamp('2018-12-03') # 49th Monday of 2018.
range_from = range_to - pd.Timedelta("28 days") # 45th Monday of 2018

# Make a subset to extract user ids that we concern.
dfu = df[(df.datetime >= range_from) & (df.datetime < range_to)]

# Get all subscriber users ids within this range (subscribers)
uids = dfu[dfu['level'] == 'paid'].drop_duplicates('userId', keep='last').sort_values(by='userId')['userId'].values

# These subsribers and their last events within the range.
uids_last_e = check_last_event(uids, data=dfu)

# Separate Retain uids and Churn uids. Condition to check:
# 1) if the last e is 'free', means user trainsited from paid to free within the timeframe, OR
# 2) even tho last is 'paid', but its a 'Cancelled' auth, means the 'paid' user submit cancel. 

# Churn users ids
u_churn = [x[0] for x in uids_last_e if (x[3] == 'free') or (x[2] == 'Cancelled')]
print(len(u_churn))

# Retain users
u_retain = [x[0] for x in uids_last_e if (x[3] == 'paid') and (x[2] != 'Cancelled')]
print(len(u_retain))




21
111


In [1457]:
# Make a tenure dataframe base on above
df_tenure = pd.DataFrame(
    {
        "userId": [x[0] for x in uids_last_e],
        "page": [x[1] for x in uids_last_e],
        "auth": [x[2] for x in uids_last_e],
        "level": [x[3] for x in uids_last_e],
        "latest_log_time": [x[4] for x in uids_last_e],
    }
)

# Add info to df_tenure)
# is_churn: DONE
# tenure_days: 
# tenure_from:
# tenure_to:
# first_log_time: 

def _get_is_churn(l, a):
    
    if (l == 'free') or (a == 'Cancelled'):
        return 1
    else:
        return 0
    
def _get_tenure_to_date(x):
    return dfu[(dfu.userId == x) & (dfu.level=='paid')].iloc[-1].datetime

def _get_tenure_from_time(u, t):
    
    tenure_days = tenures[u]
    dfx = df[df.userId == u]
    from_date = (t - pd.Timedelta(f'{tenure_days} days')).isoformat()[:10]
    tenure_from = dfx[(dfx.level=='paid') & (dfx.date==from_date)].iloc[0].datetime
    
    return tenure_from

def _get_first_log_date(x):
    return df[df.userId==x].iloc[0].datetime
    

    
# Add Columns

df_tenure['is_churn'] = df_tenure.apply(lambda x: _get_is_churn(x.level, x.auth), axis=1)
df_tenure['tenure_days'] = df_tenure['userId'].apply(lambda x: tenures[x])
df_tenure['tenure_to'] = df_tenure['userId'].apply(_get_tenure_to_date)
df_tenure['tenure_from'] = df_tenure.apply(lambda x: _get_tenure_from_date(x.userId, x.tenure_to), axis=1)
df_tenure['first_log_time'] = df_tenure['userId'].apply(_get_first_log_date)


In [1458]:
df_tenure.head(5)

Unnamed: 0,userId,page,auth,level,latest_log_time,is_churn,tenure_days,tenure_to,tenure_from,first_log_time
0,2,Add Friend,Logged In,paid,2018-11-21 22:30:04,0,44,2018-11-21 22:30:04,2018-10-08 04:49:55,2018-10-08 04:49:55
1,4,Add to Playlist,Logged In,paid,2018-11-30 16:32:28,0,40,2018-11-30 16:32:28,2018-10-21 11:29:03,2018-10-01 01:17:30
2,6,Home,Logged In,paid,2018-11-29 22:48:26,0,54,2018-11-29 22:48:26,2018-10-06 15:37:21,2018-10-01 15:11:43
3,9,Home,Logged In,paid,2018-11-30 14:19:12,0,30,2018-11-30 14:19:12,2018-10-31 03:25:12,2018-10-01 00:03:00
4,10,Downgrade,Logged In,paid,2018-11-19 12:49:48,0,42,2018-11-19 12:49:48,2018-10-08 02:20:20,2018-10-08 02:20:20


Define date picking constant parameter (adjustable)
- frequency = pd.Timedelta('14 days')
- interval = pd.Timedelta('21 days')

Boundary datetime range for user within its tenure range to iterate. (tenure range).

For every iteration of observation_time (It is the TO date, NOT from). And it must be within the tenure range.

**Date Picking Algorithm**

- For ongoing/retain users, is_churn ALWAYS False, any iteration of observation's outcome is_churn ALL False.
- For churn users, the initial observation is_churn is True, any following iteration outcomes (if exist, means they were under the 'paid' level) is_churn are False.


1) Condition: If an observation is within tenure range, record it. Loop to next frequency, check agian if is within range, record then loop next.

Retain Users: It is guarentee to be NotChurn since it is within his/her the tenure range, so that no worry if it has record on the date or nearest date. 

Churn Users: intial outcome always IsChurn, but any following iterations of observation are not.


2) Condition: If a observation date - user's first event date >= interval (means enough observation interval)



In [1570]:
# pseudo 

# observations = []
# intial ob_date = tenure_end_date
#
# while (ob_date isin tenure daterange), AND (ob_data - first_event_date >= interval)
#    ob_date_from = ob_date - interval
#    ob_churn_outcome = False
#    obervations.append([ob_data_from, ob_date, ob_churn_outcome])
#    ob_date -= frequency
#    

# Define Parameters. (Adjustable)
frequency = pd.Timedelta('14 days') # observation frequency. every x days. 
interval = pd.Timedelta('28 days') # length of each observation


def get_observations(userId):
    """
    Make observations of datetime range and churn outcome for a subscriber.
    
    Output: list of tuples. 
    """
    
    ds = df_tenure[df_tenure.userId==userId].iloc[0]
    
    first_log_time = ds.first_log_time
    tenure_from = ds.tenure_from
    
    tenure_to = ds.tenure_to

    # Initial observation time is tenure_to time
    obsevation_time = tenure_to

    # Initializer
    observations = []
    
    while (tenure_from <= obsevation_time <= tenure_to) & (obsevation_time - interval >= first_log_time):
        observations.append(obsevation_time)
        obsevation_time -= frequency
        
    # If user is in churn user list, initial observation always True, otherwise False
    if (userId in u_churn):
        res = [(userId, x, 1) if x == tenure_to else (userId, x, 0) for x in observations]
    else:
        res = [(userId, x, 0) for x in observations]
        
    return res
    
    
# Spot Check
get_observations(300004)

[(300004, Timestamp('2018-11-14 19:49:26'), 1)]

In [1582]:
# Generate observation daterange outcome dataset

def make_datetime_picking_outcome_dateset(sub_ids=u_churn + u_retain):
#     sub_ids = u_churn + u_retain
    
    userIds = []
    observation_times = []
    outcomes = []
    
    for i in sub_ids:
        for k in get_observations(i):
            userIds.append(k[0])
            observation_times.append(k[1])
            outcomes.append(k[2])
            
    res = pd.DataFrame({
        'userId': userIds,
        'observation_time': observation_times,
        'is_churn': outcomes
    })
    
    return res


dfobs = make_datetime_picking_outcome_dateset()
dfobs = dfobs.sort_values(['userId', 'observation_time'], ascending=[True, True]).reset_index().drop('index', axis=1)

# Spot Check
dfobs

Unnamed: 0,userId,observation_time,is_churn
0,2,2018-11-07 22:30:04,0
1,2,2018-11-21 22:30:04,0
2,4,2018-11-02 16:32:28,0
3,4,2018-11-16 16:32:28,0
4,4,2018-11-30 16:32:28,0
...,...,...,...
245,300022,2018-11-27 17:28:28,0
246,300023,2018-11-29 17:49:21,0
247,300025,2018-11-02 07:12:50,0
248,300025,2018-11-16 07:12:50,0


<a id="analytic-data"></a>
---

Common Summary Metrics

e.g Count, Total, Avg, Sum

In [1583]:
# Subset dataset only contain the selected within-range (latest 4 weeks) subscriber (uids)
# Full date range as original set. (Oct, Nov, Dec2days)
dfs = df[df.userId.isin(uids)]
dfs.head()

Unnamed: 0,ts,userId,sessionId,page,auth,level,itemInSession,location,gender,artist,song,length,fullname,datetime,date
0,1538352117000,30,29,NextSong,Logged In,paid,50,"Bakersfield, CA",M,Martha Tilston,Rockpools,277.89016,Colin Freeman,2018-10-01 00:01:57,2018-10-01
1,1538352180000,9,8,NextSong,Logged In,free,79,"Boston-Cambridge-Newton, MA-NH",M,Five Iron Frenzy,Canada,236.09424,Micah Long,2018-10-01 00:03:00,2018-10-01
2,1538352394000,30,29,NextSong,Logged In,paid,51,"Bakersfield, CA",M,Adam Lambert,Time For Miracles,282.8273,Colin Freeman,2018-10-01 00:06:34,2018-10-01
3,1538352416000,9,8,NextSong,Logged In,free,80,"Boston-Cambridge-Newton, MA-NH",M,Enigma,Knocking On Forbidden Doors,262.71302,Micah Long,2018-10-01 00:06:56,2018-10-01
4,1538352676000,30,29,NextSong,Logged In,paid,52,"Bakersfield, CA",M,Daft Punk,Harder Better Faster Stronger,223.60771,Colin Freeman,2018-10-01 00:11:16,2018-10-01


In [1584]:
# for every observation instance, calculate metrics, e.g number of song play, n like etc
dfobs.head()

Unnamed: 0,userId,observation_time,is_churn
0,2,2018-11-07 22:30:04,0
1,2,2018-11-21 22:30:04,0
2,4,2018-11-02 16:32:28,0
3,4,2018-11-16 16:32:28,0
4,4,2018-11-30 16:32:28,0


In [1585]:
# Convert to np array

npobs = np.array(dfobs)
npobs

array([[2, Timestamp('2018-11-07 22:30:04'), 0],
       [2, Timestamp('2018-11-21 22:30:04'), 0],
       [4, Timestamp('2018-11-02 16:32:28'), 0],
       [4, Timestamp('2018-11-16 16:32:28'), 0],
       [4, Timestamp('2018-11-30 16:32:28'), 0],
       [6, Timestamp('2018-11-01 22:48:26'), 0],
       [6, Timestamp('2018-11-15 22:48:26'), 0],
       [6, Timestamp('2018-11-29 22:48:26'), 0],
       [9, Timestamp('2018-11-02 14:19:12'), 0],
       [9, Timestamp('2018-11-16 14:19:12'), 0],
       [9, Timestamp('2018-11-30 14:19:12'), 0],
       [10, Timestamp('2018-11-05 12:49:48'), 0],
       [10, Timestamp('2018-11-19 12:49:48'), 0],
       [11, Timestamp('2018-11-27 09:08:22'), 0],
       [13, Timestamp('2018-11-08 07:44:01'), 1],
       [14, Timestamp('2018-11-13 23:48:14'), 0],
       [14, Timestamp('2018-11-27 23:48:14'), 0],
       [15, Timestamp('2018-11-11 04:56:58'), 0],
       [15, Timestamp('2018-11-25 04:56:58'), 0],
       [16, Timestamp('2018-11-02 02:20:11'), 0],
       [16,

In [1716]:
# Computing Metrics functions

def get_n_songplay(uid, d_from, d_to):
    ds = dfs[(dfs.userId==uid) & (dfs.datetime > d_from) & (dfs.datetime <= d_to)]\
.page.value_counts()
    
    try:
        res = ds['NextSong']
    except:
        res = 0
        
    return res


def get_n_like(uid, d_from, d_to):
    ds = dfs[(dfs.userId==uid) & (dfs.datetime > d_from) & (dfs.datetime <= d_to)]\
.page.value_counts()
    
    try:
        res = ds['Thumbs Up']
    except:
        res = 0
        
    return res


def get_n_dislike(uid, d_from, d_to):
    ds = dfs[(dfs.userId==uid) & (dfs.datetime > d_from) & (dfs.datetime <= d_to)]\
.page.value_counts()
    
    try:
        res = ds['Thumbs Down']
    except:
        res = 0
        
    return res


def get_n_addtoplaylist(uid, d_from, d_to):
    ds = dfs[(dfs.userId==uid) & (dfs.datetime > d_from) & (dfs.datetime <= d_to)]\
.page.value_counts()
    
    try:
        res = ds['Add to Playlist']
    except:
        res = 0
        
    return res

def get_n_addfriend(uid, d_from, d_to):
    ds = dfs[(dfs.userId==uid) & (dfs.datetime > d_from) & (dfs.datetime <= d_to)]\
.page.value_counts()
    
    try:
        res = ds['Add Friend']
    except:
        res = 0
        
    return res


def get_n_adview(uid, d_from, d_to):
    ds = dfs[(dfs.userId==uid) & (dfs.datetime > d_from) & (dfs.datetime <= d_to)]\
.page.value_counts()
    
    try:
        res = ds['Roll Advert']
    except:
        res = 0
        
    return res


def get_n_viewdowngrade(uid, d_from, d_to):
    ds = dfs[(dfs.userId==uid) & (dfs.datetime > d_from) & (dfs.datetime <= d_to)]\
.page.value_counts()
    
    try:
        res = ds['Downgrade']
    except:
        res = 0
        
    return res


def get_n_session(uid, d_from, d_to):
    
    try:
        res = dfs[(dfs.userId==uid) & (dfs.datetime > d_from) & (dfs.datetime <= d_to)].sessionId.nunique()

    except:
        res = 0
        
    return res



def get_total_playlength(uid, d_from, d_to):
    
    try:
        res = math.ceil(dfs[(dfs.userId==uid) & (dfs.datetime > d_from) & (dfs.datetime <= d_to)]['length'].sum())

    except:
        res = 0
        
    return res

def get_tenure_days(uid, u_to):
    res = (pd.Timestamp(str(d_to)[:10]) - \
           pd.Timestamp(str(df_tenure[df_tenure.userId==uid].tenure_from.iloc[0])[:10])).days
    
    return res

def get_songplay_change_perc(uid, d_from, d_to):
    first_log_time = df_tenure[df_tenure.userId==uid]['first_log_time'].iloc[0]
    delta = frequency
    end = get_n_songplay(uid, d_from, d_to)
    start = get_n_songplay(uid, d_from - delta, d_to - delta) + 1 # plus 1 in case denominator 0 to avoid error
    perc = (end / start) - 1
    perc = round(perc, 4)
    
    if (d_from - delta < first_log_time):
        # not enough timerange data to compare with, 
        # which means 'unfair' comparation with previous timeframe
        # default to 0% change.
        perc = 0
    
    return perc


In [1721]:
df_tenure[df_tenure.userId==2]['first_log_time'].iloc[0] < d_to

True

In [1722]:
# 

"""observe to: 2018-11-07 22:30:04"""

print(get_n_songplay(2, pd.Timestamp('2018-10-24 22:30:04'), pd.Timestamp('2018-11-21 22:30:04')))


print(get_n_songplay(2, pd.Timestamp('2018-10-24 22:30:04') - interval, pd.Timestamp('2018-11-21 22:30:04') - interval))

396
359


In [1723]:
dfs.page.value_counts()

NextSong                     194005
Thumbs Up                     10882
Home                           8347
Add to Playlist                5581
Add Friend                     3616
Logout                         2610
Roll Advert                    2378
Thumbs Down                    2023
Downgrade                      1810
Settings                       1243
Help                           1229
About                           431
Upgrade                         337
Save Settings                   249
Error                           212
Submit Upgrade                  132
Submit Downgrade                 52
Cancel                           10
Cancellation Confirmation        10
Name: page, dtype: int64

In [1724]:
# Get metrics for every observation instance

for i in npobs:
    uid = i[0]
    d_to = i[1]
    d_from = d_to - interval
    is_churn = i[2]
    
    n_songplay = get_n_songplay(uid, d_from, d_to)
    song_play_change_perc = get_songplay_change_perc(uid, d_from, d_to)
    n_session = get_n_session(uid, d_from, d_to)
    total_playlength = get_total_playlength(uid, d_from, d_to)
    n_addtoplaylist = get_n_addtoplaylist(uid, d_from, d_to)
    n_addfriend = get_n_addfriend(uid, d_from, d_to)
    n_like = get_n_like(uid, d_from, d_to)
    n_dislike = get_n_dislike(uid, d_from, d_to)
    n_adview = get_n_adview(uid, d_from, d_to)
    n_viewdowngrade = get_n_viewdowngrade(uid, d_from, d_to)
    avg_length_per_session = round(total_playlength / n_session, 2)
    tenure = get_tenure_days(uid, d_to)
    
    print()
    print(f"uid: {uid}")
    print(f"observe from: {d_from}")
    print(f"observe to: {d_to}")
    print(f"n_songplay: {n_songplay}")
    print(f"song_play_change_perc: {song_play_change_perc}")
    print(f"n_session: {n_session}")
    print(f"total_playlength: {total_playlength}")
    print(f"n_addtoplaylist: {n_addtoplaylist}")
    print(f"n_addfriend: {n_addfriend}")
    print(f"n_like: {n_like}")
    print(f"n_dislike: {n_dislike}")
    print(f"n_adview: {n_adview}")
    print(f"n_viewdowngrade: {n_viewdowngrade}")
    print(f"avg_length_per_session: {avg_length_per_session}")
    print(f"tenure: {tenure} days")
    print(f"is_churn: {is_churn}")
    
    



uid: 2
observe from: 2018-10-10 22:30:04
observe to: 2018-11-07 22:30:04
n_songplay: 154
song_play_change_perc: 0
n_session: 4
total_playlength: 36986
n_addtoplaylist: 2
n_addfriend: 6
n_like: 4
n_dislike: 0
n_adview: 0
n_viewdowngrade: 2
avg_length_per_session: 9246.5
tenure: 30 days
is_churn: 0

uid: 2
observe from: 2018-10-24 22:30:04
observe to: 2018-11-21 22:30:04
n_songplay: 396
song_play_change_perc: 1.5714
n_session: 6
total_playlength: 97987
n_addtoplaylist: 8
n_addfriend: 13
n_like: 12
n_dislike: 3
n_adview: 0
n_viewdowngrade: 7
avg_length_per_session: 16331.17
tenure: 44 days
is_churn: 0

uid: 4
observe from: 2018-10-05 16:32:28
observe to: 2018-11-02 16:32:28
n_songplay: 846
song_play_change_perc: 0
n_session: 10
total_playlength: 209280
n_addtoplaylist: 21
n_addfriend: 15
n_like: 34
n_dislike: 13
n_adview: 4
n_viewdowngrade: 8
avg_length_per_session: 20928.0
tenure: 12 days
is_churn: 0

uid: 4
observe from: 2018-10-19 16:32:28
observe to: 2018-11-16 16:32:28
n_songplay: 1


uid: 29
observe from: 2018-10-03 07:37:11
observe to: 2018-10-31 07:37:11
n_songplay: 2067
song_play_change_perc: 0
n_session: 23
total_playlength: 519695
n_addtoplaylist: 63
n_addfriend: 34
n_like: 104
n_dislike: 15
n_adview: 11
n_viewdowngrade: 13
avg_length_per_session: 22595.43
tenure: 23 days
is_churn: 0

uid: 29
observe from: 2018-10-17 07:37:11
observe to: 2018-11-14 07:37:11
n_songplay: 1792
song_play_change_perc: -0.133
n_session: 18
total_playlength: 446082
n_addtoplaylist: 50
n_addfriend: 22
n_like: 97
n_dislike: 14
n_adview: 6
n_viewdowngrade: 16
avg_length_per_session: 24782.33
tenure: 37 days
is_churn: 1

uid: 30
observe from: 2018-11-02 23:58:40
observe to: 2018-11-30 23:58:40
n_songplay: 760
song_play_change_perc: 0.3547
n_session: 15
total_playlength: 190310
n_addtoplaylist: 24
n_addfriend: 15
n_like: 29
n_dislike: 13
n_adview: 37
n_viewdowngrade: 3
avg_length_per_session: 12687.33
tenure: 2 days
is_churn: 0

uid: 33
observe from: 2018-10-02 13:48:42
observe to: 2018-


uid: 53
observe from: 2018-10-22 12:34:56
observe to: 2018-11-19 12:34:56
n_songplay: 1405
song_play_change_perc: 1.4058
n_session: 12
total_playlength: 349065
n_addtoplaylist: 34
n_addfriend: 20
n_like: 59
n_dislike: 14
n_adview: 1
n_viewdowngrade: 13
avg_length_per_session: 29088.75
tenure: 26 days
is_churn: 1

uid: 54
observe from: 2018-10-01 19:40:08
observe to: 2018-10-29 19:40:08
n_songplay: 1761
song_play_change_perc: 0
n_session: 23
total_playlength: 437724
n_addtoplaylist: 43
n_addfriend: 22
n_like: 93
n_dislike: 16
n_adview: 46
n_viewdowngrade: 16
avg_length_per_session: 19031.48
tenure: 0 days
is_churn: 0

uid: 54
observe from: 2018-10-15 19:40:08
observe to: 2018-11-12 19:40:08
n_songplay: 1400
song_play_change_perc: -0.205
n_session: 25
total_playlength: 356228
n_addtoplaylist: 33
n_addfriend: 12
n_like: 94
n_dislike: 20
n_adview: 42
n_viewdowngrade: 20
avg_length_per_session: 14249.12
tenure: 14 days
is_churn: 1

uid: 55
observe from: 2018-10-24 23:11:11
observe to: 2018


uid: 74
observe from: 2018-11-01 19:15:26
observe to: 2018-11-29 19:15:26
n_songplay: 1576
song_play_change_perc: 0.3046
n_session: 11
total_playlength: 397051
n_addtoplaylist: 56
n_addfriend: 30
n_like: 88
n_dislike: 11
n_adview: 1
n_viewdowngrade: 24
avg_length_per_session: 36095.55
tenure: 27 days
is_churn: 0

uid: 75
observe from: 2018-10-14 14:42:03
observe to: 2018-11-11 14:42:03
n_songplay: 608
song_play_change_perc: 0
n_session: 4
total_playlength: 151080
n_addtoplaylist: 21
n_addfriend: 18
n_like: 35
n_dislike: 4
n_adview: 1
n_viewdowngrade: 4
avg_length_per_session: 37770.0
tenure: 38 days
is_churn: 0

uid: 75
observe from: 2018-10-28 14:42:03
observe to: 2018-11-25 14:42:03
n_songplay: 485
song_play_change_perc: -0.2023
n_session: 4
total_playlength: 120612
n_addtoplaylist: 16
n_addfriend: 19
n_like: 30
n_dislike: 3
n_adview: 0
n_viewdowngrade: 6
avg_length_per_session: 30153.0
tenure: 52 days
is_churn: 0

uid: 77
observe from: 2018-10-03 02:27:12
observe to: 2018-10-31 02:


uid: 97
observe from: 2018-10-19 09:37:14
observe to: 2018-11-16 09:37:14
n_songplay: 1070
song_play_change_perc: 0
n_session: 15
total_playlength: 267495
n_addtoplaylist: 41
n_addfriend: 19
n_like: 59
n_dislike: 7
n_adview: 14
n_viewdowngrade: 6
avg_length_per_session: 17833.0
tenure: 18 days
is_churn: 0

uid: 97
observe from: 2018-11-02 09:37:14
observe to: 2018-11-30 09:37:14
n_songplay: 1531
song_play_change_perc: 0.4308
n_session: 14
total_playlength: 380741
n_addtoplaylist: 52
n_addfriend: 26
n_like: 81
n_dislike: 10
n_adview: 1
n_viewdowngrade: 14
avg_length_per_session: 27195.79
tenure: 32 days
is_churn: 0

uid: 98
observe from: 2018-10-05 03:23:49
observe to: 2018-11-02 03:23:49
n_songplay: 476
song_play_change_perc: 0
n_session: 10
total_playlength: 120857
n_addtoplaylist: 17
n_addfriend: 5
n_like: 27
n_dislike: 6
n_adview: 21
n_viewdowngrade: 2
avg_length_per_session: 12085.7
tenure: 8 days
is_churn: 0

uid: 98
observe from: 2018-10-19 03:23:49
observe to: 2018-11-16 03:23:


uid: 120
observe from: 2018-10-19 01:08:23
observe to: 2018-11-16 01:08:23
n_songplay: 218
song_play_change_perc: -0.8296
n_session: 5
total_playlength: 53357
n_addtoplaylist: 8
n_addfriend: 5
n_like: 10
n_dislike: 2
n_adview: 0
n_viewdowngrade: 4
avg_length_per_session: 10671.4
tenure: 46 days
is_churn: 0

uid: 120
observe from: 2018-11-02 01:08:23
observe to: 2018-11-30 01:08:23
n_songplay: 270
song_play_change_perc: 0.2385
n_session: 5
total_playlength: 67254
n_addtoplaylist: 8
n_addfriend: 4
n_like: 8
n_dislike: 4
n_adview: 1
n_viewdowngrade: 2
avg_length_per_session: 13450.8
tenure: 60 days
is_churn: 0

uid: 124
observe from: 2018-10-05 15:12:46
observe to: 2018-11-02 15:12:46
n_songplay: 1446
song_play_change_perc: 0
n_session: 11
total_playlength: 355394
n_addtoplaylist: 54
n_addfriend: 23
n_like: 56
n_dislike: 12
n_adview: 3
n_viewdowngrade: 13
avg_length_per_session: 32308.55
tenure: 32 days
is_churn: 0

uid: 124
observe from: 2018-10-19 15:12:46
observe to: 2018-11-16 15:12:

  perc = (end / start) - 1



uid: 140
observe from: 2018-11-02 19:33:12
observe to: 2018-11-30 19:33:12
n_songplay: 2504
song_play_change_perc: -0.2833
n_session: 32
total_playlength: 627511
n_addtoplaylist: 60
n_addfriend: 82
n_like: 118
n_dislike: 30
n_adview: 42
n_viewdowngrade: 27
avg_length_per_session: 19609.72
tenure: 10 days
is_churn: 0

uid: 141
observe from: 2018-10-06 01:29:31
observe to: 2018-11-03 01:29:31
n_songplay: 588
song_play_change_perc: 0
n_session: 7
total_playlength: 146544
n_addtoplaylist: 18
n_addfriend: 11
n_like: 23
n_dislike: 2
n_adview: 11
n_viewdowngrade: 3
avg_length_per_session: 20934.86
tenure: 27 days
is_churn: 0

uid: 141
observe from: 2018-10-20 01:29:31
observe to: 2018-11-17 01:29:31
n_songplay: 355
song_play_change_perc: -0.3963
n_session: 6
total_playlength: 89193
n_addtoplaylist: 13
n_addfriend: 7
n_like: 16
n_dislike: 3
n_adview: 0
n_viewdowngrade: 3
avg_length_per_session: 14865.5
tenure: 41 days
is_churn: 0

uid: 142
observe from: 2018-10-05 19:32:17
observe to: 2018-11


uid: 200004
observe from: 2018-10-30 14:04:22
observe to: 2018-11-27 14:04:22
n_songplay: 845
song_play_change_perc: -0.0474
n_session: 14
total_playlength: 208727
n_addtoplaylist: 34
n_addfriend: 11
n_like: 37
n_dislike: 22
n_adview: 0
n_viewdowngrade: 15
avg_length_per_session: 14909.07
tenure: 55 days
is_churn: 0

uid: 200007
observe from: 2018-10-21 23:55:06
observe to: 2018-11-18 23:55:06
n_songplay: 65
song_play_change_perc: 0
n_session: 2
total_playlength: 15740
n_addtoplaylist: 0
n_addfriend: 1
n_like: 2
n_dislike: 0
n_adview: 0
n_viewdowngrade: 0
avg_length_per_session: 7870.0
tenure: 30 days
is_churn: 0

uid: 200008
observe from: 2018-10-06 00:02:59
observe to: 2018-11-03 00:02:59
n_songplay: 329
song_play_change_perc: 0
n_session: 4
total_playlength: 79096
n_addtoplaylist: 9
n_addfriend: 2
n_like: 15
n_dislike: 10
n_adview: 0
n_viewdowngrade: 9
avg_length_per_session: 19774.0
tenure: 30 days
is_churn: 0

uid: 200008
observe from: 2018-10-20 00:02:59
observe to: 2018-11-17 0


uid: 300015
observe from: 2018-10-19 16:53:06
observe to: 2018-11-16 16:53:06
n_songplay: 1447
song_play_change_perc: 0.0782
n_session: 18
total_playlength: 351013
n_addtoplaylist: 42
n_addfriend: 27
n_like: 102
n_dislike: 11
n_adview: 0
n_viewdowngrade: 12
avg_length_per_session: 19500.72
tenure: 38 days
is_churn: 0

uid: 300015
observe from: 2018-11-02 16:53:06
observe to: 2018-11-30 16:53:06
n_songplay: 1039
song_play_change_perc: -0.282
n_session: 14
total_playlength: 255045
n_addtoplaylist: 27
n_addfriend: 21
n_like: 91
n_dislike: 9
n_adview: 0
n_viewdowngrade: 7
avg_length_per_session: 18217.5
tenure: 52 days
is_churn: 0

uid: 300016
observe from: 2018-10-16 15:51:32
observe to: 2018-11-13 15:51:32
n_songplay: 174
song_play_change_perc: 0
n_session: 4
total_playlength: 43292
n_addtoplaylist: 8
n_addfriend: 0
n_like: 17
n_dislike: 1
n_adview: 0
n_viewdowngrade: 1
avg_length_per_session: 10823.0
tenure: 38 days
is_churn: 0

uid: 300016
observe from: 2018-10-30 15:51:32
observe to:

In [1543]:
print(dfs[(dfs.userId==2) & (dfs.datetime > '2018-11-07 22:30:04') & (dfs.datetime <= '2018-11-21 22:30:04')].shape)

print(dfs[(dfs.userId==2) & (dfs.datetime > '2018-10-24 22:30:04') & (dfs.datetime <= '2018-11-07 22:30:04')].shape)

print(dfs[(dfs.userId==2) & (dfs.datetime > '2018-10-10 22:30:04') & (dfs.datetime <= '2018-10-24 22:30:04')].shape)

(290, 15)
(181, 15)
(0, 15)


In [1651]:
dfs[(dfs.userId==20) & (dfs.datetime > '2018-11-01 13:44:09') & (dfs.datetime <= '2018-11-29 13:44:09')]['length'].sum()



154407.65370000002

In [1607]:
u_churn

[13,
 28,
 29,
 35,
 53,
 54,
 61,
 70,
 92,
 109,
 131,
 100007,
 100008,
 100014,
 100015,
 100016,
 100018,
 200003,
 200017,
 300001,
 300004]