In [105]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

In [106]:
# The file is read with header=None, so column names are applied positionally below.
column_names = [
    'marketplace_id',
    'encrypted_customer_id',
    'unencrypted_customer_id',
    'fulfill_date_local',
    'fulfill_datetime_local',
    'revenue',
    'units',
    'content_age',
    'first_purchase',
    'is_promo',
    'is_pmd',
    'content_type',
    'units_ttm',
    'rank',
    'first_ever',
    'is_active_ttm',
]

# Tab-separated export without a header row.
data = pd.read_csv(
    "../Customer_Lifecycles_Markov_Chains/data/big_upto15Units_UK.txt",
    sep="\t",
    header=None,
)
data.columns = column_names
data.shape

(1876808, 16)

In [107]:
data.head()

Unnamed: 0,marketplace_id,encrypted_customer_id,unencrypted_customer_id,fulfill_date_local,fulfill_datetime_local,revenue,units,content_age,first_purchase,is_promo,is_pmd,content_type,units_ttm,rank,first_ever,is_active_ttm
0,3,A02031072A8PG12IH69EA,9223372034566733651,2019-02-17 00:00:00,2019-02-17 10:20:04,2.91,1,E_2y_3yr,2019-02-17 00:00:00,non_promo,non_pmd,Movie VOD,3,1,2019-02-17 00:00:00,1
1,3,A02031072A8PG12IH69EA,9223372034566733651,2019-05-31 00:00:00,2019-05-31 21:37:37,2.91,1,F_Over_3yr,2019-02-17 00:00:00,non_promo,non_pmd,Movie VOD,3,2,2019-02-17 00:00:00,1
2,3,A02031072A8PG12IH69EA,9223372034566733651,2019-06-02 00:00:00,2019-06-02 10:08:15,3.32,1,B_30days_6m,2019-02-17 00:00:00,promo,is_pmd,Movie VOD,3,3,2019-02-17 00:00:00,1
3,3,A0394092I84H9W8CLF7T,9223372034578986151,2019-01-14 00:00:00,2019-01-14 10:19:36,1.57,1,D_1y_2yr,2019-01-14 00:00:00,non_promo,non_pmd,TV Episode,3,1,2019-01-14 00:00:00,1
4,3,A0394092I84H9W8CLF7T,9223372034578986151,2019-02-13 00:00:00,2019-02-13 18:09:13,2.07,1,D_1y_2yr,2019-01-14 00:00:00,non_promo,non_pmd,TV Episode,3,2,2019-01-14 00:00:00,1


In [108]:
def make_dataset_adjustments(df):
    """Derive tenure, running-ARPU and first-deal features for each purchase row.

    The input frame is not modified; all work happens on a new frame.

    Steps
    -----
    1. Parse ``fulfill_date_local``, ``fulfill_datetime_local`` and
       ``first_ever`` as datetimes and drop every row containing a missing value.
    2. Add ``tenure`` (days from ``first_ever`` to the latest
       ``fulfill_date_local`` in the frame), ``tenure_up_to_last_purchase``
       (days from ``first_ever`` to the row's own purchase date) and
       ``tenure_months_up_to_last_purchase`` (those days divided by 30).
    3. Add ``cumsum`` (running revenue per customer),
       ``tenure_up_to_last_purchase_corrected`` (months floored at 1) and
       ``arpu_monthly_dynamic`` (running revenue / corrected months).
    4. Add ``tenure_diff`` (days between consecutive rows of the same customer,
       0 for the customer's first row).
    5. Flag ``first_deal`` = 1 on rows whose timestamp is the earliest one for
       their (customer, ``is_pmd``) pair, then remove every customer that ends
       up with more than one flagged row.
    6. Label each remaining customer with ``how_started`` (from the flagged
       row) and each row with a ``tenure_years`` bucket.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``encrypted_customer_id``, ``fulfill_date_local``,
        ``fulfill_datetime_local``, ``first_ever``, ``revenue``, ``is_promo``
        and ``is_pmd``.

    Returns
    -------
    pandas.DataFrame
        ``encrypted_customer_id`` and ``how_started`` followed by the remaining
        input columns and the derived columns listed above.

    Notes
    -----
    ``cumsum`` and ``tenure_diff`` follow the row order of ``df`` within each
    customer. NOTE(review): this presumably expects rows sorted chronologically
    per customer -- confirm against the data export.
    """
    # Months are approximated as 30 days and years as 360 days throughout.
    days_per_month = 30
    days_per_year = 360

    # assign() builds a new frame, so the caller's columns are left untouched.
    df = df.assign(
        fulfill_date_local=pd.to_datetime(df['fulfill_date_local']),
        fulfill_datetime_local=pd.to_datetime(df['fulfill_datetime_local']),
        first_ever=pd.to_datetime(df['first_ever']),
    ).dropna()

    latest_purchase_date = df['fulfill_date_local'].max()
    days_to_purchase = (df['fulfill_date_local'] - df['first_ever']).dt.days
    months_to_purchase = days_to_purchase / days_per_month
    df = df.assign(
        tenure=(latest_purchase_date - df['first_ever']).dt.days,
        tenure_up_to_last_purchase=days_to_purchase,
        tenure_months_up_to_last_purchase=months_to_purchase,
    )

    by_customer = df.groupby('encrypted_customer_id')
    running_revenue = by_customer['revenue'].cumsum()
    # Floor at one month so very early purchases do not inflate the monthly ARPU.
    months_floored = months_to_purchase.clip(lower=1)
    df = df.assign(
        cumsum=running_revenue,
        tenure_up_to_last_purchase_corrected=months_floored,
        arpu_monthly_dynamic=running_revenue / months_floored,
        # Difference between purchases. diff() leaves NaN on each customer's
        # first row; rows with missing values were dropped above, so this is
        # the only place NaN can appear and the only column that needs filling.
        tenure_diff=by_customer['tenure_up_to_last_purchase'].diff().fillna(0),
    )

    # Earliest timestamp per (customer, is_pmd) pair marks a "first deal" row.
    first_per_pmd = (
        df.groupby(['encrypted_customer_id', 'is_pmd'])['fulfill_datetime_local']
        .min()
        .reset_index()
        .assign(first_deal=1)
    )
    df = df.merge(
        first_per_pmd[['encrypted_customer_id', 'first_deal', 'fulfill_datetime_local']],
        on=['encrypted_customer_id', 'fulfill_datetime_local'],
        how='left',
    )

    # Customers with more than one flagged row are removed entirely.
    flagged_ids = df.loc[df['first_deal'] == 1, 'encrypted_customer_id']
    repeated_ids = flagged_ids[flagged_ids.duplicated()]
    # Explicit copy: the frame is extended below and must not be a view of the merge result.
    df = df[~df['encrypted_customer_id'].isin(repeated_ids)].copy()
    df['first_deal'] = np.where(df['first_deal'] == 1, 1, 0)

    first_rank = df.loc[
        df['first_deal'] == 1, ['encrypted_customer_id', 'is_promo', 'is_pmd']
    ].copy()
    # np.select takes the first matching condition; anything unmatched is 'start_non_promo'.
    started_pmd = (first_rank['is_pmd'] == 'is_pmd').to_numpy()
    started_promo_nonpmd = (
        (first_rank['is_pmd'] == 'non_pmd') & (first_rank['is_promo'] == 'promo')
    ).to_numpy()
    first_rank['how_started'] = np.select(
        [started_pmd, started_promo_nonpmd],
        ['start_pmd', 'start_promo_nonpmd'],
        default='start_non_promo',
    )

    # Bucket upper bounds are exclusive and checked in ascending order.
    tenure_days = df['tenure'].to_numpy()
    df['tenure_years'] = np.select(
        [
            tenure_days < days_per_year,
            tenure_days < days_per_year * 2,
            tenure_days < days_per_year * 3,
            tenure_days < days_per_year * 4,
            tenure_days < days_per_year * 5,
        ],
        ['under_year', 'btw_1and2yrs', 'btw_2and3yrs', 'btw_3and4yrs', 'btw_4and5yrs'],
        default='over_5years',
    )

    result = first_rank[['encrypted_customer_id', 'how_started']].merge(
        df, on='encrypted_customer_id', how='left'
    )

    return result

In [109]:
# Rebinds `data` to the adjusted frame, so re-running this cell without
# reloading the file passes the already-adjusted frame back into the function.
data = make_dataset_adjustments(data)

In [110]:
data[data['fulfill_date_local'].isnull() == True]

Unnamed: 0,encrypted_customer_id,how_started,marketplace_id,unencrypted_customer_id,fulfill_date_local,fulfill_datetime_local,revenue,units,content_age,first_purchase,...,is_active_ttm,tenure,tenure_up_to_last_purchase,tenure_months_up_to_last_purchase,cumsum,tenure_up_to_last_purchase_corrected,arpu_monthly_dynamic,tenure_diff,first_deal,tenure_years


In [111]:
data.shape

(1427697, 26)

In [112]:
data['encrypted_customer_id'].nunique()

438203

In [113]:
print(data.shape)
data.tail()

(1427697, 26)


Unnamed: 0,encrypted_customer_id,how_started,marketplace_id,unencrypted_customer_id,fulfill_date_local,fulfill_datetime_local,revenue,units,content_age,first_purchase,...,is_active_ttm,tenure,tenure_up_to_last_purchase,tenure_months_up_to_last_purchase,cumsum,tenure_up_to_last_purchase_corrected,arpu_monthly_dynamic,tenure_diff,first_deal,tenure_years
1427692,AZR5H76DDOQ5K,start_non_promo,3,6450009435,2019-11-13,2019-11-13 17:48:11,2.91,1,D_1y_2yr,2019-11-13 00:00:00,...,1,20,0,0.0,2.91,1.0,2.91,0.0,1,under_year
1427693,AZR9VY1F5E34T,start_promo_nonpmd,3,40824463612,2019-01-06,2019-01-06 21:35:43,8.32,1,C_6m_12m,2019-01-06 00:00:00,...,1,331,0,0.0,8.32,1.0,8.32,0.0,1,under_year
1427694,AZTTMAIP73S58,start_non_promo,3,414033813,2017-05-24,2017-05-24 21:37:43,5.83,1,C_6m_12m,2017-05-24 00:00:00,...,0,923,0,0.0,5.83,1.0,5.83,0.0,1,btw_2and3yrs
1427695,AZXB2KT0O1OPG,start_non_promo,3,1027594262,2017-03-11,2017-03-11 16:47:05,1.58,1,F_Over_3yr,2017-03-11 00:00:00,...,0,997,0,0.0,1.58,1.0,1.58,0.0,1,btw_2and3yrs
1427696,AZZ4J3LOEZ70V,start_non_promo,3,7789970812,2017-11-30,2017-11-30 14:53:33,8.33,1,B_30days_6m,2017-11-30 00:00:00,...,0,733,0,0.0,8.33,1.0,8.33,0.0,1,btw_2and3yrs


In [114]:
# a = data.groupby(['encrypted_customer_id'])['cumsum'].max().reset_index()
# b = data.groupby(['encrypted_customer_id'])['revenue'].sum().reset_index()
# c = pd.concat([a, b], axis=1)

In [115]:
def f(x):
    """Summarise one group of rows.

    Returns a Series indexed by ``count`` (number of distinct
    ``encrypted_customer_id`` values) and ``avg_tenure`` (mean of ``tenure``).
    """
    labels = ['count', 'avg_tenure']
    values = [x['encrypted_customer_id'].nunique(), x['tenure'].mean()]
    return pd.Series(values, index=labels)


# For each units_ttm value: distinct customer count and mean tenure, as computed by f.
summary = data.groupby('units_ttm').apply(f).reset_index()
summary

Unnamed: 0,units_ttm,count,avg_tenure
0,1,185978.0,691.229904
1,2,76643.0,805.605641
2,3,44335.0,873.829593
3,4,29789.0,937.968258
4,5,21639.0,967.683698
5,6,16483.0,1016.218272
6,7,12953.0,1040.85809
7,8,10598.0,1069.44251
8,9,8584.0,1094.085447
9,10,7392.0,1132.52668


In [116]:
summary['count'].sum()

438203.0

In [117]:
summary['count'].sum() == data['encrypted_customer_id'].nunique()

True

In [118]:
def get_diff_stats(x):
    """Aggregate one group of purchase rows into customer, revenue, unit and ARPU metrics.

    Parameters
    ----------
    x : pandas.DataFrame
        Rows of a single group; must contain ``encrypted_customer_id``,
        ``revenue``, ``units`` and ``arpu_monthly_dynamic``.

    Returns
    -------
    pandas.Series
        ``count`` (distinct customers), ``total_revenue`` (sum of revenue),
        ``total_units`` (sum of units) and ``arpu_monthly_dynamic`` (mean over
        the group's rows), in that order.
    """
    d = {}
    d['count'] = x['encrypted_customer_id'].nunique()
    d['total_revenue'] = x['revenue'].sum()
    d['total_units'] = x['units'].sum()
    d['arpu_monthly_dynamic'] = x['arpu_monthly_dynamic'].mean()

    return pd.Series(d, index=['count', 'total_revenue', 'total_units', 'arpu_monthly_dynamic'])


def make_proper_dataset(df):
    """Build the per-segment summary table with unit shares.

    Rows of ``df`` are grouped by ``units_ttm``, ``is_promo``, ``is_pmd``,
    ``rank`` and ``tenure_years`` and summarised with ``get_diff_stats``. Two
    share columns are then added:

    * ``share_per_group`` -- the segment's units divided by all units of its
      ``units_ttm`` value (``group_units``);
    * ``share_by_rank`` -- the segment's units divided by all units of its
      (``units_ttm``, ``rank``) pair (``units_per_rank``).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the five grouping columns plus ``encrypted_customer_id``,
        ``revenue``, ``units`` and ``arpu_monthly_dynamic``.

    Returns
    -------
    pandas.DataFrame
        One row per segment, sorted by ``units_ttm`` then ``rank``, with columns
        ``units_ttm, is_promo, is_pmd, rank, tenure_years, count_customers,
        total_revenue, total_units, arpu_monthly_dynamic, group_units,
        share_per_group, units_per_rank, share_by_rank``.
    """
    segment_keys = ["units_ttm", 'is_promo', 'is_pmd', 'rank', 'tenure_years']
    metric_inputs = ['encrypted_customer_id', 'revenue', 'units', 'arpu_monthly_dynamic']

    # Group the frame passed in (not a notebook-level global). Selecting the
    # metric inputs keeps the grouping columns out of what apply() operates on.
    grouped = (
        df.groupby(segment_keys)[metric_inputs]
        .apply(get_diff_stats)
        .reset_index()
        .rename(columns={'count': 'count_customers'})
    )

    # Totals are named before merging so no column is renamed by position.
    total_group = df.groupby("units_ttm")['units'].sum().rename('group_units').reset_index()
    grouped_df = grouped.merge(total_group, on='units_ttm', how='left')
    grouped_df['share_per_group'] = grouped_df['total_units'] / grouped_df['group_units']
    grouped_df = grouped_df.sort_values(by=['units_ttm', 'rank'])

    grouped_by_rank = (
        grouped.groupby(['units_ttm', 'rank'])['total_units']
        .sum()
        .rename('units_per_rank')
        .reset_index()
    )

    final = grouped_df.merge(grouped_by_rank, on=['units_ttm', 'rank'], how='left')
    final['share_by_rank'] = final['total_units'] / final['units_per_rank']

    return final

In [119]:
grouped = make_proper_dataset(data)

In [120]:
grouped[0:10]

Unnamed: 0,units_ttm,is_promo,is_pmd,rank,tenure_years,count_customers,total_revenue,total_units,arpu_monthly_dynamic,group_units,share_per_group,units_per_rank,share_by_rank
0,1,non_promo,non_pmd,1,btw_1and2yrs,34593.0,154997.75,34593.0,4.48018,185978,0.186006,185978.0,0.186006
1,1,non_promo,non_pmd,1,btw_2and3yrs,25867.0,119214.5,25867.0,4.592859,185978,0.139086,185978.0,0.139086
2,1,non_promo,non_pmd,1,btw_3and4yrs,19700.0,91737.9,19700.0,4.656746,185978,0.105927,185978.0,0.105927
3,1,non_promo,non_pmd,1,btw_4and5yrs,14492.0,61148.12,14492.0,4.21944,185978,0.077923,185978.0,0.077923
4,1,non_promo,non_pmd,1,over_5years,9625.0,42415.37,9625.0,4.406792,185978,0.051753,185978.0,0.051753
5,1,non_promo,non_pmd,1,under_year,51629.0,230706.66,51629.0,4.468548,185978,0.277608,185978.0,0.277608
6,1,promo,is_pmd,1,btw_1and2yrs,2199.0,4474.68,2199.0,2.03487,185978,0.011824,185978.0,0.011824
7,1,promo,is_pmd,1,btw_2and3yrs,1302.0,2161.79,1302.0,1.640783,185978,0.007001,185978.0,0.007001
8,1,promo,is_pmd,1,under_year,4606.0,9265.52,4606.0,2.01162,185978,0.024766,185978.0,0.024766
9,1,promo,non_pmd,1,btw_1and2yrs,5788.0,25531.89,5788.0,4.411177,185978,0.031122,185978.0,0.031122


In [121]:
grouped.to_csv("f.csv")

# How did monthly ARPU change dynamically?

In [122]:
# Frame dimensions, number of distinct customers, then the first five rows.
print(data.shape)
print("Unique customers:", data['encrypted_customer_id'].nunique())
# head() shows the first five rows with every column, whatever the column count.
data.head()

(1427697, 26)
Unique customesr: 438203


Unnamed: 0,encrypted_customer_id,how_started,marketplace_id,unencrypted_customer_id,fulfill_date_local,fulfill_datetime_local,revenue,units,content_age,first_purchase,...,is_active_ttm,tenure,tenure_up_to_last_purchase,tenure_months_up_to_last_purchase,cumsum,tenure_up_to_last_purchase_corrected,arpu_monthly_dynamic,tenure_diff,first_deal,tenure_years
0,A0394092I84H9W8CLF7T,start_non_promo,3,9223372034578986151,2019-01-14,2019-01-14 10:19:36,1.57,1,D_1y_2yr,2019-01-14 00:00:00,...,1,323,0,0.0,1.57,1.0,1.57,0.0,1,under_year
1,A0394092I84H9W8CLF7T,start_non_promo,3,9223372034578986151,2019-02-13,2019-02-13 18:09:13,2.07,1,D_1y_2yr,2019-01-14 00:00:00,...,1,323,30,1.0,3.64,1.0,3.64,30.0,0,under_year
2,A0394092I84H9W8CLF7T,start_non_promo,3,9223372034578986151,2019-05-07,2019-05-07 19:04:39,24.99,1,C_6m_12m,2019-01-14 00:00:00,...,1,323,113,3.766667,28.63,3.766667,7.600885,83.0,0,under_year
3,A04514481OGK0VXFSN6MK,start_non_promo,3,9223372034580075651,2014-10-03,2014-10-03 21:26:05,2.08,1,C_6m_12m,2014-10-03 00:00:00,...,0,1887,0,0.0,2.08,1.0,2.08,0.0,1,over_5years
4,A055171022RG3W01G2SWK,start_non_promo,3,9223372034618444251,2018-08-04,2018-08-04 20:16:48,2.91,1,E_2y_3yr,2018-08-04 00:00:00,...,0,486,0,0.0,2.91,1.0,2.91,0.0,1,btw_1and2yrs


In [123]:
#data[data['encrypted_customer_id'] == 'A00885171WD9B3CHJSTV7'].iloc[:,5:20]
data[data['encrypted_customer_id'] == 'A0394092I84H9W8CLF7T'].iloc[:,10:]

Unnamed: 0,is_promo,is_pmd,content_type,units_ttm,rank,first_ever,is_active_ttm,tenure,tenure_up_to_last_purchase,tenure_months_up_to_last_purchase,cumsum,tenure_up_to_last_purchase_corrected,arpu_monthly_dynamic,tenure_diff,first_deal,tenure_years
0,non_promo,non_pmd,TV Episode,3,1,2019-01-14,1,323,0,0.0,1.57,1.0,1.57,0.0,1,under_year
1,non_promo,non_pmd,TV Episode,3,2,2019-01-14,1,323,30,1.0,3.64,1.0,3.64,30.0,0,under_year
2,promo,non_pmd,TV Season,3,3,2019-01-14,1,323,113,3.766667,28.63,3.766667,7.600885,83.0,0,under_year


In [124]:
# Sanity check: sum the first_deal flag per customer and list any customer
# whose total exceeds 1.
check = data.groupby(['encrypted_customer_id'])['first_deal'].sum().reset_index()
check[check['first_deal'] > 1]

Unnamed: 0,encrypted_customer_id,first_deal


In [125]:
data.columns

Index(['encrypted_customer_id', 'how_started', 'marketplace_id',
       'unencrypted_customer_id', 'fulfill_date_local',
       'fulfill_datetime_local', 'revenue', 'units', 'content_age',
       'first_purchase', 'is_promo', 'is_pmd', 'content_type', 'units_ttm',
       'rank', 'first_ever', 'is_active_ttm', 'tenure',
       'tenure_up_to_last_purchase', 'tenure_months_up_to_last_purchase',
       'cumsum', 'tenure_up_to_last_purchase_corrected',
       'arpu_monthly_dynamic', 'tenure_diff', 'first_deal', 'tenure_years'],
      dtype='object')

In [126]:
# Mean arpu_monthly_dynamic for each (units_ttm, rank, how_started, tenure_years) combination.
foo = data.groupby(['units_ttm','rank','how_started','tenure_years'])['arpu_monthly_dynamic'].mean().reset_index()

In [127]:
foo.to_csv("foo.csv")