In [2]:
import pandas as pd
import numpy as np

In [3]:
# Open the tab-separated PMD cohort export for chunked reading. Because
# `chunksize` is set, `pd.read_csv` returns a one-shot iterator that yields
# DataFrames of up to 1,000,000 rows each instead of a single frame, so this
# cell must be re-run before the chunks can be iterated a second time.
# NOTE(review): `error_bad_lines=False` skips malformed rows instead of raising.
# The keyword was deprecated in pandas 1.3 in favour of `on_bad_lines` and
# removed in pandas 2.0 — confirm which pandas version this notebook targets.
df_chunk = pd.read_csv("//ant/dept-eu/LHR16/Dept4/AIV-TVOD/BI/PMD_Analysis/PMD_data_PMD_cohort4.txt", sep='\t', 
                       low_memory = False, encoding = 'ISO-8859-1', chunksize = 1000000, error_bad_lines = False)


In [4]:
def make_prep(df, reference_date=None):
    """Parse the date columns and derive tenure, ARPU, frequency and tenure buckets.

    ``df`` is modified in place (date columns are converted, new columns are
    added); the returned frame is a copy of it with every NaN replaced by 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Transactions containing at least ``transaction_date_local``,
        ``transaction_datetime_local``, ``first_purchase_date``,
        ``cumulative_sum_total`` and ``encrypted_customer_id``.
    reference_date : datetime-like, optional
        Date up to which ``tenure`` is measured. Defaults to the latest
        ``transaction_date_local`` in ``df``. When ``df`` is a single chunk of
        a larger file, that default is the chunk's own maximum, so pass a
        fixed date to measure every chunk against the same baseline.

    Returns
    -------
    pandas.DataFrame
        ``df`` plus the columns ``tenure``,
        ``tenure_days_up_to_last_purchase`` (and ``_corrected``),
        ``tenure_months_up_to_last_purchase``,
        ``tenure_up_to_last_purchase_corrected``, ``arpu_monthly_dynamic``,
        ``frequency`` and ``tenure_years``, with NaN filled by 0.

    Notes
    -----
    ``frequency`` is a per-customer ``diff`` computed inside ``df`` only; a
    customer whose rows span two chunks gets a fresh (NaN -> 0) first value in
    the second chunk.
    """
    # The analysis uses 30-day months and 360-day (12 x 30) years throughout.
    days_per_month = 30
    days_per_year = 360

    for date_col in ('transaction_date_local',
                     'transaction_datetime_local',
                     'first_purchase_date'):
        df[date_col] = pd.to_datetime(df[date_col])

    if reference_date is None:
        # Series.max() skips NaT and tolerates an empty frame, unlike builtin max().
        reference_date = df['transaction_date_local'].max()
    else:
        reference_date = pd.Timestamp(reference_date)

    # `.dt.days` is vectorised and yields NaN for missing dates, where the
    # previous per-row int(...) conversion raised.
    df['tenure'] = (reference_date - df['first_purchase_date']).dt.days
    df['cumulative_sum_total'] = df['cumulative_sum_total'].astype(float)

    days_to_purchase = (df['transaction_date_local'] - df['first_purchase_date']).dt.days
    df['tenure_days_up_to_last_purchase'] = days_to_purchase
    # first_purchase_date carries a time of day, so a same-day purchase comes
    # out as -1 day; floor those at 0.
    df['tenure_days_up_to_last_purchase_corrected'] = days_to_purchase.clip(lower=0)

    months_to_purchase = days_to_purchase / days_per_month
    df['tenure_months_up_to_last_purchase'] = months_to_purchase
    # Tenure is floored at one month so the ARPU denominator is never < 1.
    df['tenure_up_to_last_purchase_corrected'] = months_to_purchase.clip(lower=1).astype(float)
    df['arpu_monthly_dynamic'] = (
        df['cumulative_sum_total'] / df['tenure_up_to_last_purchase_corrected']
    )

    # Days between a customer's consecutive rows (row order is taken as given).
    df['frequency'] = (
        df.groupby('encrypted_customer_id')['tenure_days_up_to_last_purchase_corrected'].diff()
    )

    # Bucket tenure in one vectorised step. This replaces chained assignment
    # (df[col][mask] = ...), which needed a global pandas option to silence its
    # warning and does not write through under copy-on-write.
    tenure = df['tenure']
    bucket_conditions = [
        tenure < days_per_year,
        (tenure >= days_per_year) & (tenure < days_per_year * 2),
        (tenure >= days_per_year * 2) & (tenure < days_per_year * 3),
        (tenure >= days_per_year * 3) & (tenure < days_per_year * 4),
        (tenure >= days_per_year * 4) & (tenure < days_per_year * 5),
        tenure >= days_per_year * 5,
    ]
    bucket_labels = [
        'under_year',
        'btw_1and2yrs',
        'btw_2and3yrs',
        'btw_3and4yrs',
        'btw_4and5yrs',
        'over_5years',
    ]
    # Rows with a missing tenure match no condition and keep "default".
    df['tenure_years'] = np.select(bucket_conditions, bucket_labels, default='default')

    return df.fillna(0)

In [5]:
# Prepared chunks are collected here and concatenated once at the end, which
# avoids repeatedly growing a DataFrame inside the loop.
chunk_list = []  # one prepared DataFrame per chunk

# `df_chunk` is consumed by this loop; each item it yields is a DataFrame.
# make_prep runs on one chunk at a time, so whatever it derives from "the
# whole frame" (the latest transaction date, per-customer diffs) is derived
# from that chunk alone.
for chunk in df_chunk:  
    # parse dates and add the derived tenure / ARPU / frequency columns
    chunk_filter = make_prep(chunk)
    
    # keep the prepared chunk for the final concatenation
    chunk_list.append(chunk_filter)
    
# Stack all prepared chunks into a single DataFrame.
# NOTE(review): if this cell is re-run without re-creating `df_chunk`, the
# list is presumably empty and pd.concat would fail — confirm.
df_concat = pd.concat(chunk_list)

In [6]:

#df_concat.to_pickle("data_granularUK_pmdReduced.pkl")
#df_concat.to_pickle("data_granularDE_pmdReduced.pkl")

#df_concat.to_pickle("data_granularUK_pmdbig.pkl")
#df_concat.to_pickle("data_granularDE_pmdbig.pkl")

In [None]:
#df_concatenated = pd.read_pickle("data_granularDE_pmdbig.pkl")

In [7]:
# Preview the first rows of the prepared, concatenated frame.
#df_concatenated.head()
df_concat.head()

Unnamed: 0,marketplace_id,encrypted_customer_id,unencrypted_customer_id,transaction_date_local,transaction_datetime_local,week_ending,revenue,units,content_age,first_purchase_date,...,running_tenure_ever,running_arpu_monthly_ever,tenure,tenure_days_up_to_last_purchase,tenure_days_up_to_last_purchase_corrected,tenure_months_up_to_last_purchase,tenure_up_to_last_purchase_corrected,arpu_monthly_dynamic,frequency,tenure_years
0,4,A0199987XFI4BF5J7UE2,9223372034566728451,2015-10-31,2015-10-31 16:06:53,2019-12-07 00:00:00,1.67,1,D_1y_2yr,2015-10-31 16:06:53,...,0.0,1.67,1513,-1,0,-0.033333,1.0,1.67,0.0,btw_4and5yrs
1,4,A0199987XFI4BF5J7UE2,9223372034566728451,2015-11-08,2015-11-08 12:36:50,2019-12-07 00:00:00,2.51,1,D_1y_2yr,2015-10-31 16:06:53,...,1.0,4.18,1513,7,7,0.233333,1.0,4.18,7.0,btw_4and5yrs
2,4,A0199987XFI4BF5J7UE2,9223372034566728451,2016-02-27,2016-02-27 20:16:41,2019-12-07 00:00:00,4.19,1,A_Under_30d,2015-10-31 16:06:53,...,4.0,2.0925,1513,118,118,3.933333,3.933333,2.127966,111.0,btw_4and5yrs
3,4,A0199987XFI4BF5J7UE2,9223372034566728451,2019-07-04,2019-07-04 18:57:06,2019-12-07 00:00:00,4.18,1,E_2y_3yr,2015-10-31 16:06:53,...,45.0,0.2788,1513,1341,1341,44.7,44.7,0.280761,1223.0,btw_4and5yrs
4,4,A0199987XFI4BF5J7UE2,9223372034566728451,2019-10-05,2019-10-05 12:41:34,2019-12-07 00:00:00,4.19,1,A_Under_30d,2015-10-31 16:06:53,...,48.0,0.3487,1513,1434,1434,47.8,47.8,0.350209,93.0,btw_4and5yrs


In [8]:
# List the distinct promo_dist labels present in the data.
df_concat['promo_dist'].unique()

array(['non_promo', 'pmd_promo', 'vendisto_promo'], dtype=object)

# Analyse different entry points into PMD

In [9]:
def make_final_adjastments(df):
    """Label each transaction relative to the customer's first ``pmd_promo`` purchase.

    Parameters
    ----------
    df : pandas.DataFrame
        Prepared transactions. Must contain ``encrypted_customer_id``,
        ``promo_dist``, ``rank_transactions`` and the other columns listed in
        ``keep_columns`` below. ``df`` itself is not modified.

    Returns
    -------
    pandas.DataFrame
        The ``keep_columns`` subset of ``df`` (same row order) plus:

        * ``promo_dist_entry`` - the row's ``promo_dist`` when the row is the
          customer's lowest-ranked transaction of that promo type, else NaN.
        * ``rank_transactions_entry`` - ``rank_transactions`` of the customer's
          first ``pmd_promo`` transaction, or 0 when the customer has none.
        * ``which_relative_to_pmd`` - ``"before"``, ``"pmd_entry"`` or
          ``"after"`` by comparing the row's rank to the entry rank, and
          ``"no_entry"`` for customers without a ``pmd_promo`` transaction.
    """
    keep_columns = [
        "marketplace_id", "encrypted_customer_id", "transaction_date_local",
        "revenue", "units", "content_age",
        "arpu_monthly_dynamic", "frequency", "total_units", "total_revenue",
        "total_units_ttm", "total_revenue_ttm", "rank_transactions",
        "total_units_before_ttm", "total_revenue_before_ttm", "promo_dist",
        "tenure_years", "tenure_up_to_last_purchase_corrected", "tenure",
        "is_active_ttm",
    ]

    # Lowest transaction rank per (customer, promo type).
    first_rank_by_type = (
        df.groupby(['encrypted_customer_id', 'promo_dist'])['rank_transactions']
        .min()
        .reset_index()
        .rename(columns={'promo_dist': 'promo_dist_entry'})
    )

    # Tag rows that are the first transaction of some promo type.
    df_new = pd.merge(
        left=df[keep_columns],
        right=first_rank_by_type,
        on=['encrypted_customer_id', 'rank_transactions'],
        how='left',
    )

    # Rank of each customer's first pmd_promo transaction. rename() returns a
    # new frame, so no column labels are assigned on a filtered slice.
    is_pmd = first_rank_by_type['promo_dist_entry'] == 'pmd_promo'
    pmd_entry_rank = (
        first_rank_by_type.loc[is_pmd, ['encrypted_customer_id', 'rank_transactions']]
        .rename(columns={'rank_transactions': 'rank_transactions_entry'})
    )
    df_new = pd.merge(
        left=df_new,
        right=pmd_entry_rank,
        on='encrypted_customer_id',
        how='left',
    )

    # 0 is the sentinel for "customer never had a pmd_promo transaction".
    df_new['rank_transactions_entry'] = df_new['rank_transactions_entry'].fillna(0)

    # One vectorised labelling step. This replaces chained assignment
    # (df[col][mask] = ...), which needed a global pandas option to silence its
    # warning and does not write through under copy-on-write.
    rank = df_new['rank_transactions']
    entry_rank = df_new['rank_transactions_entry']
    has_entry = entry_rank != 0
    df_new['which_relative_to_pmd'] = np.select(
        [
            has_entry & (rank < entry_rank),
            has_entry & (rank > entry_rank),
            has_entry & (rank == entry_rank),
        ],
        ['before', 'after', 'pmd_entry'],
        default='no_entry',
    )

    return df_new

In [10]:
# Label every transaction relative to the customer's first pmd_promo transaction.
df_new = make_final_adjastments(df_concat)

In [16]:
# Sanity check: number of distinct customers, then (rows, columns) of the labelled frame.
print(df_new['encrypted_customer_id'].nunique())
df_new.shape

1300000


(16172186, 23)

In [12]:
# Show the columns kept and added by make_final_adjastments.
df_new.columns

Index(['marketplace_id', 'encrypted_customer_id', 'transaction_date_local',
       'revenue', 'units', 'content_age', 'arpu_monthly_dynamic', 'frequency',
       'total_units', 'total_revenue', 'total_units_ttm', 'total_revenue_ttm',
       'rank_transactions', 'total_units_before_ttm',
       'total_revenue_before_ttm', 'promo_dist', 'tenure_years',
       'tenure_up_to_last_purchase_corrected', 'tenure', 'is_active_ttm',
       'promo_dist_entry', 'rank_transactions_entry', 'which_relative_to_pmd'],
      dtype='object')

In [13]:
# Check which position labels occur; 'no_entry' is only assigned to customers
# that have no pmd_promo transaction at all.
df_new['which_relative_to_pmd'].unique()

array(['before', 'pmd_entry', 'after'], dtype=object)

In [14]:
def get_diff_stats(x):
    """Summarise one group of transactions as a five-entry Series.

    Entries, in order: ``count_customers`` (distinct ``encrypted_customer_id``),
    ``arpu_monthly_dynamic`` (mean), ``average_frequency`` (mean of
    ``frequency``), ``units`` (sum) and ``revenue`` (sum).
    """
    stats = {
        'count_customers': x['encrypted_customer_id'].nunique(),
        'arpu_monthly_dynamic': x['arpu_monthly_dynamic'].mean(),
        'average_frequency': x['frequency'].mean(),
        'units': x['units'].sum(),
        'revenue': x['revenue'].sum(),
    }
    # The explicit index pins the order of the metrics in the result.
    return pd.Series(stats, index=list(stats))

# Subset of customers who have transactions BOTH before and after their PMD entry

In [17]:
def create_subset(df):
    """Return all rows of customers that have both 'before' and 'after' rows.

    A customer qualifies when at least one of their rows is labelled
    ``'before'`` and at least one is labelled ``'after'`` in
    ``which_relative_to_pmd``. Every row of a qualifying customer is kept,
    whatever its own label.
    """
    customer = df['encrypted_customer_id']
    position = df['which_relative_to_pmd']

    ids_with_before = customer[position.isin(['before'])].unique()
    has_after_and_before = (position == 'after') & customer.isin(ids_with_before)
    ids_with_both = customer[has_after_and_before].unique()

    return df[customer.isin(ids_with_both)]

In [18]:
# Keep only customers that have rows labelled both 'before' and 'after'.
d = create_subset(df_new)

In [19]:
# (rows, columns) of the before-and-after subset.
d.shape

(13444886, 23)

In [20]:
# Consistency check: total distinct customers, customers with a 'before' row,
# and customers with an 'after' row. create_subset keeps only customers that
# have both, so the three numbers are expected to be equal.
print(d['encrypted_customer_id'].nunique())
print(d[d['which_relative_to_pmd'] == 'before']['encrypted_customer_id'].nunique())
print(d[d['which_relative_to_pmd'] == 'after']['encrypted_customer_id'].nunique())

889584
889584
889584


In [21]:
def final_output(df):
    """Aggregate metrics per segment, and per segment split by promo type.

    A segment is one combination of ``total_units``, ``tenure_years``,
    ``total_units_ttm``, ``rank_transactions_entry`` and
    ``which_relative_to_pmd``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``grouped`` - one row per segment with the ``get_diff_stats`` metrics.
        ``output`` - one row per segment and ``promo_dist`` with the same
        metrics, where ``units_x`` is the promo type's units, ``units_y`` the
        segment's total units, and ``share`` is ``units_x / units_y``.
    """
    segment_keys = [
        "total_units",
        "tenure_years",
        "total_units_ttm",
        "rank_transactions_entry",
        "which_relative_to_pmd",
    ]

    grouped = df.groupby(segment_keys).apply(get_diff_stats).reset_index()
    grouped_dist = (
        df.groupby(segment_keys + ["promo_dist"]).apply(get_diff_stats).reset_index()
    )

    # Attach each segment's total units; both frames carry a 'units' column,
    # so the merge suffixes them as units_x (left) and units_y (right).
    segment_units = grouped[segment_keys + ['units']]
    output = pd.merge(left=grouped_dist, right=segment_units, on=segment_keys, how='left')

    output['share'] = output['units_x'] / output['units_y']
    return grouped, output

In [22]:
# Segment-level metrics (grouped) and the same split by promo type with unit shares (dist).
grouped, dist = final_output(d)

In [23]:
# Size and first rows of the segment-level aggregation.
print(grouped.shape)
grouped.head()

(117180, 10)


Unnamed: 0,total_units,tenure_years,total_units_ttm,rank_transactions_entry,which_relative_to_pmd,count_customers,arpu_monthly_dynamic,average_frequency,units,revenue
0,3,btw_1and2yrs,0.0,2,after,1192.0,4.229195,41.323826,1192.0,3995.35
1,3,btw_1and2yrs,0.0,2,before,1192.0,4.717122,0.0,1192.0,5622.81
2,3,btw_1and2yrs,0.0,2,pmd_entry,1192.0,3.979368,74.390101,1192.0,1952.0
3,3,btw_1and2yrs,1.0,2,after,1675.0,1.219823,266.970149,1675.0,7563.92
4,3,btw_1and2yrs,1.0,2,before,1675.0,4.703057,0.0,1675.0,7877.62


In [24]:
# Size and first rows of the per-promo-type aggregation; units_x / units_y = share.
print(dist.shape)
dist.head()

(188032, 13)


Unnamed: 0,total_units,tenure_years,total_units_ttm,rank_transactions_entry,which_relative_to_pmd,promo_dist,count_customers,arpu_monthly_dynamic,average_frequency,units_x,revenue,units_y,share
0,3,btw_1and2yrs,0.0,2,after,non_promo,698.0,4.798913,55.961318,698.0,3405.23,1192.0,0.58557
1,3,btw_1and2yrs,0.0,2,after,pmd_promo,482.0,3.414882,20.051867,482.0,563.43,1192.0,0.404362
2,3,btw_1and2yrs,0.0,2,after,vendisto_promo,12.0,3.798853,44.333333,12.0,26.69,1192.0,0.010067
3,3,btw_1and2yrs,0.0,2,before,non_promo,1177.0,4.743942,0.0,1177.0,5583.62,1192.0,0.987416
4,3,btw_1and2yrs,0.0,2,before,vendisto_promo,15.0,2.612667,0.0,15.0,39.19,1192.0,0.012584


In [25]:
#grouped.to_csv("grouped_cleanDEbig.csv")

In [28]:
#dist.to_csv("grouped_by_distDEbig.csv")

In [30]:
# Distinct customers per tenure bucket in the before-and-after subset.
d.groupby(['tenure_years'])['encrypted_customer_id'].nunique().reset_index()

Unnamed: 0,tenure_years,encrypted_customer_id
0,btw_1and2yrs,185281
1,btw_2and3yrs,219255
2,btw_3and4yrs,193007
3,btw_4and5yrs,141633
4,over_5years,65076
5,under_year,85332


# Reengagement tool

In [49]:
def create_dist_by_promo_type(df):
    """Share of units per promo type within each tenure bucket.

    Returns one row per (``tenure_years``, ``promo_dist``) with ``units_x``
    (units for that promo type), ``units_y`` (all units in the tenure bucket)
    and ``share`` = ``units_x / units_y``.
    """
    bucket = ['tenure_years']

    units_per_promo = df.groupby(bucket + ['promo_dist'])['units'].sum().reset_index()
    units_per_bucket = df.groupby(bucket)['units'].sum().reset_index()

    # Both sides carry a 'units' column, so the merge suffixes them _x / _y.
    out = units_per_promo.merge(units_per_bucket, on=bucket, how='left')
    out['share'] = out['units_x'] / out['units_y']
    return out

In [50]:
# Unit share of each promo type per tenure bucket, for the before-and-after subset.
dd = create_dist_by_promo_type(d)

In [52]:
# Inspect the last rows of the unit-share table.
dd.tail()

Unnamed: 0,tenure_years,promo_dist,units_x,units_y,share
13,over_5years,pmd_promo,197270,1157808,0.170382
14,over_5years,vendisto_promo,8364,1157808,0.007224
15,under_year,non_promo,732585,905553,0.808992
16,under_year,pmd_promo,167077,905553,0.184503
17,under_year,vendisto_promo,5891,905553,0.006505


In [56]:
def create_freqdist_by_promo_type(df):
    """Share of customers per promo type within each tenure bucket.

    Returns one row per (``tenure_years``, ``promo_dist``) with
    ``encrypted_customer_id_x`` (distinct customers for that promo type),
    ``encrypted_customer_id_y`` (distinct customers in the tenure bucket) and
    ``share`` = x / y. A customer with rows under several promo types is
    counted once per type, so shares within a bucket can add up to more than 1.
    """
    bucket = ['tenure_years']
    customer = 'encrypted_customer_id'

    customers_per_promo = df.groupby(bucket + ['promo_dist'])[customer].nunique().reset_index()
    customers_per_bucket = df.groupby(bucket)[customer].nunique().reset_index()

    # Both sides carry the customer-count column, so the merge suffixes them _x / _y.
    out = customers_per_promo.merge(customers_per_bucket, on=bucket, how='left')
    out['share'] = out['encrypted_customer_id_x'] / out['encrypted_customer_id_y']
    return out

In [57]:
# Customer shares per promo type, restricted to rows whose `frequency` exceeds 365.
# NOTE(review): `frequency` is built in make_prep as the per-customer difference
# in days between consecutive rows, so this presumably selects purchases made
# after a gap of more than a year — confirm that 365 is the intended threshold.
reeng = create_freqdist_by_promo_type(d[d['frequency'] > 365])

In [59]:
# Inspect the first rows of the re-engagement customer-share table.
reeng.head()

Unnamed: 0,tenure_years,promo_dist,encrypted_customer_id_x,encrypted_customer_id_y,share
0,btw_1and2yrs,non_promo,5451,9463,0.576033
1,btw_1and2yrs,pmd_promo,3966,9463,0.419106
2,btw_1and2yrs,vendisto_promo,46,9463,0.004861
3,btw_2and3yrs,non_promo,30766,48774,0.630787
4,btw_2and3yrs,pmd_promo,18352,48774,0.376266
