In [34]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

In [35]:
# Load the tab-separated export; the file has no header row, so the column
# names are attached by hand right after reading.
data = pd.read_csv(
    "../Customer_Lifecycles_Markov_Chains/data/big_upto15Units_DE.txt",
    sep="\t",
    header=None,
)
data.columns = [
    'marketplace_id',
    'encrypted_customer_id',
    'unencrypted_customer_id',
    'fulfill_date_local',
    'fulfill_datetime_local',
    'revenue',
    'units',
    'content_age',
    'first_purchase',
    'is_promo',
    'is_pmd',
    'content_type',
    'units_ttm',
    'rank',
    'first_ever',
    'is_active_ttm',
]
data.shape

(2216222, 16)

In [36]:
# Preview the first rows of the freshly loaded frame.
data.head()

Unnamed: 0,marketplace_id,encrypted_customer_id,unencrypted_customer_id,fulfill_date_local,fulfill_datetime_local,revenue,units,content_age,first_purchase,is_promo,is_pmd,content_type,units_ttm,rank,first_ever,is_active_ttm
0,4,A00335623Q2HKCHINC64Z,9223372034603266751,2018-09-26 00:00:00,2018-09-26 18:56:12,3.35,1,F_Over_3yr,2018-09-26 00:00:00,non_promo,non_pmd,Movie VOD,2,1,2018-09-26 00:00:00,0
1,4,A00335623Q2HKCHINC64Z,9223372034603266751,2018-10-09 00:00:00,2018-10-09 19:48:30,4.19,1,A_Under_30d,2018-09-26 00:00:00,non_promo,non_pmd,Movie VOD,2,2,2018-09-26 00:00:00,0
2,4,A00885171WD9B3CHJSTV7,9223372034560327451,2017-10-19 00:00:00,2017-10-19 19:16:00,4.19,1,A_Under_30d,2017-10-19 00:00:00,non_promo,non_pmd,Movie VOD,8,1,2017-10-19 00:00:00,0
3,4,A00885171WD9B3CHJSTV7,9223372034560327451,2017-11-05 00:00:00,2017-11-05 20:08:46,4.19,1,A_Under_30d,2017-10-19 00:00:00,non_promo,non_pmd,Movie VOD,8,2,2017-10-19 00:00:00,0
4,4,A00885171WD9B3CHJSTV7,9223372034560327451,2018-11-13 00:00:00,2018-11-13 20:30:44,4.19,1,B_30days_6m,2017-10-19 00:00:00,non_promo,non_pmd,Movie VOD,8,3,2017-10-19 00:00:00,0


In [37]:
def make_dataset_adjustments(df):
    """Derive tenure, running-ARPU and acquisition features for each purchase row.

    The input frame is not modified; all work happens on a copy.

    Steps, in order:
      1. Parse ``fulfill_date_local``, ``fulfill_datetime_local`` and
         ``first_ever`` as datetimes, then drop every row that contains a
         missing value in any column.
      2. Add ``tenure`` (days from ``first_ever`` to the latest
         ``fulfill_date_local`` in the frame), ``tenure_up_to_last_purchase``
         (days from ``first_ever`` to the row's own purchase date) and
         ``tenure_months_up_to_last_purchase`` (that value divided by 30).
      3. Add ``cumsum`` (running revenue per customer, in row order),
         ``tenure_up_to_last_purchase_corrected`` (months, floored at 1) and
         ``arpu_monthly_dynamic`` (``cumsum`` divided by the corrected months).
      4. Flag ``first_deal`` rows: the earliest ``fulfill_datetime_local`` per
         (customer, ``is_pmd``) pair. Customers that end up with more than one
         flagged row are removed entirely.
      5. Add ``tenure_years``, a bucket label built from ``tenure`` using
         360-day years.
      6. Add ``how_started``, derived from ``is_pmd`` / ``is_promo`` of the
         customer's flagged first-deal row.

    Parameters
    ----------
    df : pandas.DataFrame
        Purchase-level rows. Must contain ``encrypted_customer_id``,
        ``fulfill_date_local``, ``fulfill_datetime_local``, ``first_ever``,
        ``revenue``, ``is_promo`` and ``is_pmd``.

    Returns
    -------
    pandas.DataFrame
        One row per retained purchase, with ``encrypted_customer_id`` and
        ``how_started`` as the first two columns followed by the original and
        derived columns.
    """
    days_per_month = 30
    days_per_year = 360

    # `assign` builds a new frame, so the caller's DataFrame keeps its
    # original (unparsed) columns. Parsing happens before `dropna` so that
    # values which parse to NaT are dropped as well.
    df = df.assign(
        fulfill_date_local=pd.to_datetime(df['fulfill_date_local']),
        fulfill_datetime_local=pd.to_datetime(df['fulfill_datetime_local']),
        first_ever=pd.to_datetime(df['first_ever']),
    ).dropna().copy()

    # Tenure is measured against the latest purchase date present in the data.
    last_observed_date = df['fulfill_date_local'].max()
    df['tenure'] = (last_observed_date - df['first_ever']).dt.days
    df['tenure_up_to_last_purchase'] = (df['fulfill_date_local'] - df['first_ever']).dt.days
    df['tenure_months_up_to_last_purchase'] = df['tenure_up_to_last_purchase'] / days_per_month

    df['cumsum'] = df.groupby('encrypted_customer_id')['revenue'].cumsum()
    # Floor the denominator at one month so early purchases do not inflate ARPU.
    df['tenure_up_to_last_purchase_corrected'] = np.where(
        df['tenure_months_up_to_last_purchase'] < 1,
        1,
        df['tenure_months_up_to_last_purchase'],
    )
    df['arpu_monthly_dynamic'] = df['cumsum'] / df['tenure_up_to_last_purchase_corrected']

    # Earliest purchase timestamp per (customer, is_pmd) pair marks a first deal.
    first_purchases = (
        df.groupby(['encrypted_customer_id', 'is_pmd'])['fulfill_datetime_local']
        .min()
        .reset_index()
    )
    first_purchases['first_deal'] = 1
    df = pd.merge(
        df,
        first_purchases[['encrypted_customer_id', 'first_deal', 'fulfill_datetime_local']],
        on=['encrypted_customer_id', 'fulfill_datetime_local'],
        how='left',
    )

    # Customers with more than one flagged row have no single starting
    # purchase; they are excluded from the result altogether.
    flagged_customers = df.loc[df['first_deal'] == 1, 'encrypted_customer_id']
    ambiguous_customers = flagged_customers[flagged_customers.duplicated()]
    df = df[~df['encrypted_customer_id'].isin(ambiguous_customers)].copy()

    # Unflagged rows carry NaN after the left merge; normalise to 1 / 0.
    df['first_deal'] = np.where(df['first_deal'] == 1, 1, 0)

    first_rank = df.loc[
        df['first_deal'] == 1, ['encrypted_customer_id', 'is_promo', 'is_pmd']
    ].copy()
    started_with_pmd = first_rank['is_pmd'] == 'is_pmd'
    started_with_promo_only = (first_rank['is_pmd'] == 'non_pmd') & (first_rank['is_promo'] == 'promo')
    # np.select writes the labels in one pass; chained `frame[col][mask] = ...`
    # assignment is unreliable and is a no-op under pandas Copy-on-Write.
    first_rank['how_started'] = np.select(
        [started_with_pmd, started_with_promo_only],
        ['start_pmd', 'start_promo_nonpmd'],
        default='start_non_promo',
    )

    # The first matching condition wins, so each upper bound implies the
    # lower bound of its bucket.
    tenure = df['tenure']
    df['tenure_years'] = np.select(
        [
            tenure < days_per_year,
            tenure < days_per_year * 2,
            tenure < days_per_year * 3,
            tenure < days_per_year * 4,
            tenure < days_per_year * 5,
            tenure >= days_per_year * 5,
        ],
        [
            'under_year',
            'btw_1and2yrs',
            'btw_2and3yrs',
            'btw_3and4yrs',
            'btw_4and5yrs',
            'over_5years',
        ],
        default='default',
    )

    result = pd.merge(
        first_rank[['encrypted_customer_id', 'how_started']],
        df,
        on='encrypted_customer_id',
        how='left',
    )

    return result

In [38]:
# Rebinds `data` to the adjusted frame, so the raw load is no longer reachable
# under this name. NOTE(review): re-running this cell feeds the already
# adjusted frame back into the function; reload the data first.
data = make_dataset_adjustments(data)

In [39]:
# Rows whose purchase date is missing.
data[data['fulfill_date_local'].isnull()]

Unnamed: 0,encrypted_customer_id,how_started,marketplace_id,unencrypted_customer_id,fulfill_date_local,fulfill_datetime_local,revenue,units,content_age,first_purchase,...,first_ever,is_active_ttm,tenure,tenure_up_to_last_purchase,tenure_months_up_to_last_purchase,cumsum,tenure_up_to_last_purchase_corrected,arpu_monthly_dynamic,first_deal,tenure_years


In [40]:
# (rows, columns) of `data` at this point in the notebook.
data.shape

(1749092, 25)

In [41]:
# Number of distinct customers in `data`.
data['encrypted_customer_id'].nunique()

441216

In [42]:
# Print the frame dimensions, then display its last rows.
print(data.shape)
data.tail()

(1749092, 25)


Unnamed: 0,encrypted_customer_id,how_started,marketplace_id,unencrypted_customer_id,fulfill_date_local,fulfill_datetime_local,revenue,units,content_age,first_purchase,...,first_ever,is_active_ttm,tenure,tenure_up_to_last_purchase,tenure_months_up_to_last_purchase,cumsum,tenure_up_to_last_purchase_corrected,arpu_monthly_dynamic,first_deal,tenure_years
1749087,AZU55STA9IGWG,start_non_promo,4,18130840225,2018-04-22,2018-04-22 20:44:17,11.66,1,C_6m_12m,2017-02-26 00:00:00,...,2017-02-26,0,1011,420,14.0,48.25,14.0,3.446429,0,btw_2and3yrs
1749088,AZU55STA9IGWG,start_non_promo,4,18130840225,2018-05-04,2018-05-04 23:30:37,0.82,1,B_30days_6m,2017-02-26 00:00:00,...,2017-02-26,0,1011,432,14.4,49.07,14.4,3.407639,0,btw_2and3yrs
1749089,AZU55STA9IGWG,start_non_promo,4,18130840225,2019-02-10,2019-02-10 19:06:43,1.65,1,B_30days_6m,2017-02-26 00:00:00,...,2017-02-26,1,1011,714,23.8,50.72,23.8,2.131092,0,btw_2and3yrs
1749090,AZU55STA9IGWG,start_non_promo,4,18130840225,2019-02-22,2019-02-22 14:30:53,5.82,1,D_1y_2yr,2017-02-26 00:00:00,...,2017-02-26,1,1011,726,24.2,56.54,24.2,2.336364,0,btw_2and3yrs
1749091,AZU719R31OHJ,start_non_promo,4,1155672212,2019-11-17,2019-11-17 15:46:06,11.76,1,B_30days_6m,2019-11-17 00:00:00,...,2019-11-17,1,17,0,0.0,11.76,1.0,11.76,1,under_year


In [43]:
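# Disabled sanity check: per customer, put the largest running revenue total
# (`cumsum`) next to the summed revenue so the two can be compared.
# a = data.groupby(['encrypted_customer_id'])['cumsum'].max().reset_index()
# b = data.groupby(['encrypted_customer_id'])['revenue'].sum().reset_index()
# c = pd.concat([a, b], axis=1)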

In [44]:
def f(x):
    """Summarise one group of purchase rows.

    Parameters
    ----------
    x : pandas.DataFrame
        Rows of a single group; must contain ``encrypted_customer_id`` and
        ``tenure``.

    Returns
    -------
    pandas.Series
        ``count`` (number of distinct customers) and ``avg_tenure`` (mean of
        ``tenure`` over the group's rows), in that order.
    """
    labels = ['count', 'avg_tenure']
    values = {
        'count': x['encrypted_customer_id'].nunique(),
        'avg_tenure': x['tenure'].mean(),
    }
    return pd.Series(values, index=labels)


# Distinct customers and mean tenure per `units_ttm` value.
# Note: `avg_tenure` is a mean over purchase rows, so a customer with several
# rows in a group contributes several times to that group's average.
summary = data.groupby('units_ttm').apply(f).reset_index()
summary

Unnamed: 0,units_ttm,count,avg_tenure
0,1,151278.0,596.296196
1,2,72412.0,707.740562
2,3,46061.0,784.837234
3,4,33614.0,838.153971
4,5,25281.0,876.927973
5,6,20836.0,906.670706
6,7,16964.0,939.35291
7,8,14366.0,970.659085
8,9,12232.0,995.665749
9,10,10735.0,1028.984583


In [45]:
# Total of the per-group distinct-customer counts.
summary['count'].sum()

441216.0

In [46]:
# True only if no customer is counted in more than one `units_ttm` group.
summary['count'].sum() == data['encrypted_customer_id'].nunique()

True

In [47]:
def get_diff_stats(x):
    """Aggregate customer, revenue and unit metrics for one group of rows.

    Parameters
    ----------
    x : pandas.DataFrame
        Rows of a single group; must contain ``encrypted_customer_id``,
        ``revenue``, ``units`` and ``arpu_monthly_dynamic``.

    Returns
    -------
    pandas.Series
        ``count`` (distinct customers), ``total_revenue`` (sum of
        ``revenue``), ``total_units`` (sum of ``units``) and
        ``arpu_monthly_dynamic`` (mean over the group's rows), in that order.
    """
    metrics = {
        'count': x['encrypted_customer_id'].nunique(),
        'total_revenue': x['revenue'].sum(),
        'total_units': x['units'].sum(),
        'arpu_monthly_dynamic': x['arpu_monthly_dynamic'].mean(),
    }
    ordered_labels = ['count', 'total_revenue', 'total_units', 'arpu_monthly_dynamic']
    return pd.Series(metrics, index=ordered_labels)




def make_proper_dataset(df):
    """Aggregate purchase rows into segment-level metrics with unit shares.

    Rows of ``df`` are grouped by ``units_ttm``, ``is_promo``, ``is_pmd``,
    ``rank`` and ``tenure_years`` and summarised with ``get_diff_stats``
    (defined elsewhere in this notebook). Two share columns are then added.

    Parameters
    ----------
    df : pandas.DataFrame
        Purchase-level rows containing the five grouping columns plus the
        columns ``get_diff_stats`` reads (``encrypted_customer_id``,
        ``revenue``, ``units``, ``arpu_monthly_dynamic``).

    Returns
    -------
    pandas.DataFrame
        One row per segment, sorted by ``units_ttm`` and ``rank``, with
        columns ``units_ttm``, ``is_promo``, ``is_pmd``, ``rank``,
        ``tenure_years``, ``count_customers``, ``total_revenue``,
        ``total_units``, ``arpu_monthly_dynamic``, ``group_units`` (units
        summed over the whole ``units_ttm`` group), ``share_per_group``
        (``total_units / group_units``), ``units_per_rank`` (units summed
        over the ``units_ttm`` / ``rank`` pair) and ``share_by_rank``
        (``total_units / units_per_rank``).
    """
    segment_keys = ["units_ttm", 'is_promo', 'is_pmd', 'rank', 'tenure_years']

    # Aggregate the frame that was passed in, not the notebook-level `data`.
    grouped = df.groupby(segment_keys).apply(get_diff_stats).reset_index()
    grouped = grouped.rename(columns={'count': 'count_customers'})

    # Denominator for share_per_group: all units within a units_ttm group.
    group_units = (
        df.groupby(["units_ttm"])['units'].sum().rename('group_units').reset_index()
    )
    grouped_df = pd.merge(left=grouped, right=group_units, on='units_ttm', how='left')
    grouped_df['share_per_group'] = grouped_df['total_units'] / grouped_df['group_units']
    grouped_df = grouped_df.sort_values(by=['units_ttm', 'rank'])

    # Denominator for share_by_rank: units within each (units_ttm, rank) pair.
    # Naming the series up front avoids `_x` / `_y` merge suffixes and the
    # positional column renaming they would otherwise require.
    units_per_rank = (
        grouped.groupby(['units_ttm', 'rank'])['total_units']
        .sum()
        .rename('units_per_rank')
        .reset_index()
    )
    final = pd.merge(left=grouped_df, right=units_per_rank, on=['units_ttm', 'rank'], how='left')
    final['share_by_rank'] = final['total_units'] / final['units_per_rank']

    return final

In [48]:
# Build the segment-level table from the adjusted purchase rows.
grouped = make_proper_dataset(data)

In [49]:
# First ten rows of the segment-level table.
grouped[0:10]

Unnamed: 0,units_ttm,is_promo,is_pmd,rank,tenure_years,count_customers,total_revenue,total_units,arpu_monthly_dynamic,group_units,share_per_group,units_per_rank,share_by_rank
0,1,non_promo,non_pmd,1,btw_1and2yrs,28512.0,141925.44,28512.0,4.977744,151278,0.188474,151278.0,0.188474
1,1,non_promo,non_pmd,1,btw_2and3yrs,20581.0,109680.41,20581.0,5.329207,151278,0.136048,151278.0,0.136048
2,1,non_promo,non_pmd,1,btw_3and4yrs,14249.0,75136.48,14249.0,5.273105,151278,0.094191,151278.0,0.094191
3,1,non_promo,non_pmd,1,btw_4and5yrs,9359.0,43890.64,9359.0,4.689672,151278,0.061866,151278.0,0.061866
4,1,non_promo,non_pmd,1,over_5years,3932.0,19532.19,3932.0,4.967495,151278,0.025992,151278.0,0.025992
5,1,non_promo,non_pmd,1,under_year,54189.0,299052.51,54189.0,5.518694,151278,0.358208,151278.0,0.358208
6,1,promo,is_pmd,1,btw_1and2yrs,849.0,1617.34,849.0,1.904994,151278,0.005612,151278.0,0.005612
7,1,promo,is_pmd,1,btw_2and3yrs,194.0,161.02,194.0,0.83,151278,0.001282,151278.0,0.001282
8,1,promo,is_pmd,1,btw_3and4yrs,280.0,232.4,280.0,0.83,151278,0.001851,151278.0,0.001851
9,1,promo,is_pmd,1,btw_4and5yrs,103.0,85.49,103.0,0.83,151278,0.000681,151278.0,0.000681


In [50]:
# Write the segment table to the current working directory. With pandas'
# default `index=True`, the row index is written as the first CSV column.
grouped.to_csv("f.csv")

# How did monthly ARPU change over time?

In [51]:
# Print the frame dimensions, then show the first five rows of the first 24 columns.
print(data.shape)
data.iloc[0:5,0:24]

(1749092, 25)


Unnamed: 0,encrypted_customer_id,how_started,marketplace_id,unencrypted_customer_id,fulfill_date_local,fulfill_datetime_local,revenue,units,content_age,first_purchase,...,rank,first_ever,is_active_ttm,tenure,tenure_up_to_last_purchase,tenure_months_up_to_last_purchase,cumsum,tenure_up_to_last_purchase_corrected,arpu_monthly_dynamic,first_deal
0,A00335623Q2HKCHINC64Z,start_non_promo,4,9223372034603266751,2018-09-26,2018-09-26 18:56:12,3.35,1,F_Over_3yr,2018-09-26 00:00:00,...,1,2018-09-26,0,434,0,0.0,3.35,1.0,3.35,1
1,A00335623Q2HKCHINC64Z,start_non_promo,4,9223372034603266751,2018-10-09,2018-10-09 19:48:30,4.19,1,A_Under_30d,2018-09-26 00:00:00,...,2,2018-09-26,0,434,13,0.433333,7.54,1.0,7.54,0
2,A00885171WD9B3CHJSTV7,start_non_promo,4,9223372034560327451,2017-10-19,2017-10-19 19:16:00,4.19,1,A_Under_30d,2017-10-19 00:00:00,...,1,2017-10-19,0,776,0,0.0,4.19,1.0,4.19,1
3,A00885171WD9B3CHJSTV7,start_non_promo,4,9223372034560327451,2017-11-05,2017-11-05 20:08:46,4.19,1,A_Under_30d,2017-10-19 00:00:00,...,2,2017-10-19,0,776,17,0.566667,8.38,1.0,8.38,0
4,A00885171WD9B3CHJSTV7,start_non_promo,4,9223372034560327451,2018-11-13,2018-11-13 20:30:44,4.19,1,B_30days_6m,2017-10-19 00:00:00,...,3,2017-10-19,0,776,390,13.0,12.57,13.0,0.966923,0


In [52]:
# Sum the `first_deal` flag per customer and list any customer with more than
# one flagged row.
check = data.groupby(['encrypted_customer_id'])['first_deal'].sum().reset_index()
check[check['first_deal'] > 1]

Unnamed: 0,encrypted_customer_id,first_deal


In [53]:
# Column names available on the adjusted frame.
data.columns

Index(['encrypted_customer_id', 'how_started', 'marketplace_id',
       'unencrypted_customer_id', 'fulfill_date_local',
       'fulfill_datetime_local', 'revenue', 'units', 'content_age',
       'first_purchase', 'is_promo', 'is_pmd', 'content_type', 'units_ttm',
       'rank', 'first_ever', 'is_active_ttm', 'tenure',
       'tenure_up_to_last_purchase', 'tenure_months_up_to_last_purchase',
       'cumsum', 'tenure_up_to_last_purchase_corrected',
       'arpu_monthly_dynamic', 'first_deal', 'tenure_years'],
      dtype='object')

In [54]:
# Mean `arpu_monthly_dynamic` over purchase rows for each combination of
# units_ttm, rank, how_started and tenure_years.
foo = data.groupby(['units_ttm','rank','how_started','tenure_years'])['arpu_monthly_dynamic'].mean().reset_index()

In [55]:
# Write the ARPU table to the current working directory. With pandas'
# default `index=True`, the row index is written as the first CSV column.
foo.to_csv("foo.csv")