In [1]:
import pandas as pd
import numpy as np
import datetime as dt

In [2]:
date_parser = lambda s: dt.datetime.strptime(s,'%d.%m.%Y')

In [3]:
df = pd.read_csv('drive/MyDrive/dataset/ecommerce_data_new.csv', parse_dates=['invoicedate'], date_parser=date_parser)

In [4]:
df.head()

Unnamed: 0,invoiceno,invoicedate,customerid,country,amount
0,536365,2010-12-01,17850,United Kingdom,138
1,536366,2010-12-01,17850,United Kingdom,22
2,536367,2010-12-01,13047,United Kingdom,281
3,536368,2010-12-01,13047,United Kingdom,71
4,536369,2010-12-01,13047,United Kingdom,18


In [5]:
df.dtypes

invoiceno              object
invoicedate    datetime64[ns]
customerid              int64
country                object
amount                  int64
dtype: object

In [6]:
cohorts = df.groupby(by=['customerid']).agg({'invoicedate':min}).reset_index()

In [7]:
cohorts.columns = ['customerid','first_date']

In [8]:
cohorts.head()

Unnamed: 0,customerid,first_date
0,12346,2011-01-18
1,12347,2010-12-07
2,12348,2010-12-16
3,12349,2011-11-21
4,12350,2011-02-02


In [9]:
cohorts['month_cohort'] = cohorts['first_date'].values.astype('datetime64[M]')

In [10]:
cohorts.head()

Unnamed: 0,customerid,first_date,month_cohort
0,12346,2011-01-18,2011-01-01
1,12347,2010-12-07,2010-12-01
2,12348,2010-12-16,2010-12-01
3,12349,2011-11-21,2011-11-01
4,12350,2011-02-02,2011-02-01


In [11]:
df['month_sale'] = df['invoicedate'].values.astype('datetime64[M]')

In [12]:
df.head()

Unnamed: 0,invoiceno,invoicedate,customerid,country,amount,month_sale
0,536365,2010-12-01,17850,United Kingdom,138,2010-12-01
1,536366,2010-12-01,17850,United Kingdom,22,2010-12-01
2,536367,2010-12-01,13047,United Kingdom,281,2010-12-01
3,536368,2010-12-01,13047,United Kingdom,71,2010-12-01
4,536369,2010-12-01,13047,United Kingdom,18,2010-12-01


In [13]:
df = df.merge(cohorts,how='left', on='customerid')

In [14]:
df.head()

Unnamed: 0,invoiceno,invoicedate,customerid,country,amount,month_sale,first_date,month_cohort
0,536365,2010-12-01,17850,United Kingdom,138,2010-12-01,2010-12-01,2010-12-01
1,536366,2010-12-01,17850,United Kingdom,22,2010-12-01,2010-12-01,2010-12-01
2,536367,2010-12-01,13047,United Kingdom,281,2010-12-01,2010-12-01,2010-12-01
3,536368,2010-12-01,13047,United Kingdom,71,2010-12-01,2010-12-01,2010-12-01
4,536369,2010-12-01,13047,United Kingdom,18,2010-12-01,2010-12-01,2010-12-01


In [15]:
df_result = df.drop(['invoicedate','first_date'], axis = 1)

In [16]:
df_result.head()

Unnamed: 0,invoiceno,customerid,country,amount,month_sale,month_cohort
0,536365,17850,United Kingdom,138,2010-12-01,2010-12-01
1,536366,17850,United Kingdom,22,2010-12-01,2010-12-01
2,536367,13047,United Kingdom,281,2010-12-01,2010-12-01
3,536368,13047,United Kingdom,71,2010-12-01,2010-12-01
4,536369,13047,United Kingdom,18,2010-12-01,2010-12-01


In [17]:
tbl = pd.pivot_table(df_result, values='amount',index='month_cohort',columns=['month_sale'],aggfunc=np.sum)

# Распределение суммы продаж по когортам в абсолютном выражении

In [18]:
tbl

month_sale,2010-12-01,2011-01-01,2011-02-01,2011-03-01,2011-04-01,2011-05-01,2011-06-01,2011-07-01,2011-08-01,2011-09-01,2011-10-01,2011-11-01,2011-12-01
month_cohort,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2010-12-01,749590.0,356442.0,292209.0,405629.0,267893.0,396493.0,395592.0,410445.0,376810.0,554265.0,534407.0,839300.0,273647.0
2011-01-01,,203622.0,56239.0,62301.0,41511.0,82343.0,84020.0,70205.0,72928.0,74344.0,104188.0,121980.0,27824.0
2011-02-01,,,149650.0,25280.0,37460.0,45865.0,35668.0,31072.0,47681.0,55780.0,51838.0,60531.0,9352.0
2011-03-01,,,,190082.0,26414.0,53684.0,40331.0,46767.0,38330.0,60567.0,61654.0,65265.0,11244.0
2011-04-01,,,,,119851.0,28936.0,24930.0,23866.0,25991.0,29623.0,28150.0,33849.0,6280.0
2011-05-01,,,,,,115927.0,17666.0,18857.0,17872.0,26561.0,32940.0,31381.0,10623.0
2011-06-01,,,,,,,92535.0,13613.0,13908.0,29895.0,25823.0,39712.0,7917.0
2011-07-01,,,,,,,,65895.0,11150.0,15382.0,17222.0,19317.0,6059.0
2011-08-01,,,,,,,,,77629.0,19093.0,32945.0,39849.0,14124.0
2011-09-01,,,,,,,,,,153160.0,25780.0,35774.0,12263.0


In [19]:
df_result_group = df_result.groupby(by=['month_cohort','month_sale']).agg({'amount':np.sum,'customerid':lambda x: len(x.unique())}).reset_index()

In [20]:
df_result_group = df_result_group.rename(columns={'customerid': 'count_customer'})

In [21]:
df_result_group

Unnamed: 0,month_cohort,month_sale,amount,count_customer
0,2010-12-01,2010-12-01,749590,949
1,2010-12-01,2011-01-01,356442,363
2,2010-12-01,2011-02-01,292209,318
3,2010-12-01,2011-03-01,405629,368
4,2010-12-01,2011-04-01,267893,342
...,...,...,...,...
86,2011-10-01,2011-11-01,39140,93
87,2011-10-01,2011-12-01,12274,46
88,2011-11-01,2011-11-01,134119,321
89,2011-11-01,2011-12-01,14851,43


In [22]:
df_result_group_start = df_result_group.groupby(by=['month_cohort']).first().reset_index()

In [23]:
df_result_group_start

Unnamed: 0,month_cohort,month_sale,amount,count_customer
0,2010-12-01,2010-12-01,749590,949
1,2011-01-01,2011-01-01,203622,421
2,2011-02-01,2011-02-01,149650,380
3,2011-03-01,2011-03-01,190082,440
4,2011-04-01,2011-04-01,119851,299
5,2011-05-01,2011-05-01,115927,279
6,2011-06-01,2011-06-01,92535,235
7,2011-07-01,2011-07-01,65895,191
8,2011-08-01,2011-08-01,77629,167
9,2011-09-01,2011-09-01,153160,298


In [24]:
df_result_group_final = df_result_group.merge(df_result_group_start,
                                              how='left',
                                              left_on=['month_cohort'],
                                              right_on = ['month_cohort'], suffixes =('','_start'))

In [25]:
df_result_group_final = df_result_group_final.drop(['month_sale_start'], axis = 1)

In [26]:
df_result_group_final

Unnamed: 0,month_cohort,month_sale,amount,count_customer,amount_start,count_customer_start
0,2010-12-01,2010-12-01,749590,949,749590,949
1,2010-12-01,2011-01-01,356442,363,749590,949
2,2010-12-01,2011-02-01,292209,318,749590,949
3,2010-12-01,2011-03-01,405629,368,749590,949
4,2010-12-01,2011-04-01,267893,342,749590,949
...,...,...,...,...,...,...
86,2011-10-01,2011-11-01,39140,93,154739,352
87,2011-10-01,2011-12-01,12274,46,154739,352
88,2011-11-01,2011-11-01,134119,321,134119,321
89,2011-11-01,2011-12-01,14851,43,134119,321


In [27]:
df_result_group_final['percent_amount_of_first_month'] = df_result_group_final['amount']/df_result_group_final['amount_start']

In [28]:
df_result_group_final['percent_amount_of_first_month'] = df_result_group_final['percent_amount_of_first_month'].apply(lambda x: round(x,2))

In [29]:
df_result_group_final['percent_count_customer_of_first_month'] = df_result_group_final['count_customer']/df_result_group_final['count_customer_start']

In [30]:
df_result_group_final['percent_count_customer_of_first_month'] = df_result_group_final['percent_count_customer_of_first_month'].apply(lambda x:round(x,2))

In [31]:
df_result_group_final = df_result_group_final.drop(['amount_start','count_customer_start'], axis = 1)

In [32]:
df_result_group_final

Unnamed: 0,month_cohort,month_sale,amount,count_customer,percent_amount_of_first_month,percent_count_customer_of_first_month
0,2010-12-01,2010-12-01,749590,949,1.00,1.00
1,2010-12-01,2011-01-01,356442,363,0.48,0.38
2,2010-12-01,2011-02-01,292209,318,0.39,0.34
3,2010-12-01,2011-03-01,405629,368,0.54,0.39
4,2010-12-01,2011-04-01,267893,342,0.36,0.36
...,...,...,...,...,...,...
86,2011-10-01,2011-11-01,39140,93,0.25,0.26
87,2011-10-01,2011-12-01,12274,46,0.08,0.13
88,2011-11-01,2011-11-01,134119,321,1.00,1.00
89,2011-11-01,2011-12-01,14851,43,0.11,0.13


In [33]:
tbl2 = pd.pivot_table(df_result_group_final, values='percent_amount_of_first_month',index='month_cohort',columns=['month_sale'],aggfunc=np.sum)

# Распределение суммы продаж по когортам в относительном выражении

In [34]:
tbl2

month_sale,2010-12-01,2011-01-01,2011-02-01,2011-03-01,2011-04-01,2011-05-01,2011-06-01,2011-07-01,2011-08-01,2011-09-01,2011-10-01,2011-11-01,2011-12-01
month_cohort,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2010-12-01,1.0,0.48,0.39,0.54,0.36,0.53,0.53,0.55,0.5,0.74,0.71,1.12,0.37
2011-01-01,,1.0,0.28,0.31,0.2,0.4,0.41,0.34,0.36,0.37,0.51,0.6,0.14
2011-02-01,,,1.0,0.17,0.25,0.31,0.24,0.21,0.32,0.37,0.35,0.4,0.06
2011-03-01,,,,1.0,0.14,0.28,0.21,0.25,0.2,0.32,0.32,0.34,0.06
2011-04-01,,,,,1.0,0.24,0.21,0.2,0.22,0.25,0.23,0.28,0.05
2011-05-01,,,,,,1.0,0.15,0.16,0.15,0.23,0.28,0.27,0.09
2011-06-01,,,,,,,1.0,0.15,0.15,0.32,0.28,0.43,0.09
2011-07-01,,,,,,,,1.0,0.17,0.23,0.26,0.29,0.09
2011-08-01,,,,,,,,,1.0,0.25,0.42,0.51,0.18
2011-09-01,,,,,,,,,,1.0,0.17,0.23,0.08


In [35]:
tbl3 = pd.pivot_table(df_result, values='customerid',index='month_cohort',columns=['month_sale'],aggfunc=lambda x: len(x.unique()))

# Распределение уникальных пользователей по когортам

In [36]:
tbl3

month_sale,2010-12-01,2011-01-01,2011-02-01,2011-03-01,2011-04-01,2011-05-01,2011-06-01,2011-07-01,2011-08-01,2011-09-01,2011-10-01,2011-11-01,2011-12-01
month_cohort,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2010-12-01,949.0,363.0,318.0,368.0,342.0,377.0,361.0,337.0,337.0,375.0,355.0,475.0,261.0
2011-01-01,,421.0,101.0,119.0,102.0,138.0,126.0,110.0,108.0,131.0,146.0,155.0,63.0
2011-02-01,,,380.0,94.0,73.0,106.0,102.0,94.0,97.0,107.0,98.0,119.0,35.0
2011-03-01,,,,440.0,84.0,112.0,96.0,102.0,78.0,116.0,105.0,127.0,39.0
2011-04-01,,,,,299.0,68.0,66.0,63.0,62.0,71.0,69.0,78.0,25.0
2011-05-01,,,,,,279.0,66.0,48.0,48.0,60.0,68.0,74.0,29.0
2011-06-01,,,,,,,235.0,49.0,44.0,64.0,58.0,79.0,24.0
2011-07-01,,,,,,,,191.0,40.0,39.0,44.0,52.0,22.0
2011-08-01,,,,,,,,,167.0,42.0,42.0,42.0,23.0
2011-09-01,,,,,,,,,,298.0,89.0,97.0,36.0


In [37]:
tbl4 = pd.pivot_table(df_result_group_final, values='percent_count_customer_of_first_month',index='month_cohort',columns=['month_sale'],aggfunc=np.sum)

# Распределение уникальных пользователей по когортам в относительном выражении

In [38]:
tbl4

month_sale,2010-12-01,2011-01-01,2011-02-01,2011-03-01,2011-04-01,2011-05-01,2011-06-01,2011-07-01,2011-08-01,2011-09-01,2011-10-01,2011-11-01,2011-12-01
month_cohort,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2010-12-01,1.0,0.38,0.34,0.39,0.36,0.4,0.38,0.36,0.36,0.4,0.37,0.5,0.28
2011-01-01,,1.0,0.24,0.28,0.24,0.33,0.3,0.26,0.26,0.31,0.35,0.37,0.15
2011-02-01,,,1.0,0.25,0.19,0.28,0.27,0.25,0.26,0.28,0.26,0.31,0.09
2011-03-01,,,,1.0,0.19,0.25,0.22,0.23,0.18,0.26,0.24,0.29,0.09
2011-04-01,,,,,1.0,0.23,0.22,0.21,0.21,0.24,0.23,0.26,0.08
2011-05-01,,,,,,1.0,0.24,0.17,0.17,0.22,0.24,0.27,0.1
2011-06-01,,,,,,,1.0,0.21,0.19,0.27,0.25,0.34,0.1
2011-07-01,,,,,,,,1.0,0.21,0.2,0.23,0.27,0.12
2011-08-01,,,,,,,,,1.0,0.25,0.25,0.25,0.14
2011-09-01,,,,,,,,,,1.0,0.3,0.33,0.12


# Расчета Customer Retention Rate (уровень удержания клиентов)

In [61]:
df_result_group.head(10)

Unnamed: 0,month_cohort,month_sale,amount,count_customer
0,2010-12-01,2010-12-01,749590,949
1,2010-12-01,2011-01-01,356442,363
2,2010-12-01,2011-02-01,292209,318
3,2010-12-01,2011-03-01,405629,368
4,2010-12-01,2011-04-01,267893,342
5,2010-12-01,2011-05-01,396493,377
6,2010-12-01,2011-06-01,395592,361
7,2010-12-01,2011-07-01,410445,337
8,2010-12-01,2011-08-01,376810,337
9,2010-12-01,2011-09-01,554265,375


In [40]:
df_result_group = df_result_group.sort_values(by=['month_cohort','month_sale'])

In [41]:
df_result_group_crr = df_result_group.drop(['month_cohort','amount'],axis=1)

In [42]:
df_result_group_crr.head(10)

Unnamed: 0,month_sale,count_customer
0,2010-12-01,949
1,2011-01-01,363
2,2011-02-01,318
3,2011-03-01,368
4,2011-04-01,342
5,2011-05-01,377
6,2011-06-01,361
7,2011-07-01,337
8,2011-08-01,337
9,2011-09-01,375


In [43]:
df_result_group_crr_group = df_result_group_crr.groupby(by=['month_sale']).agg({'count_customer':[np.sum,'last']}).reset_index()

In [44]:
df_result_group_crr_group.columns = ['month_sale', 'count_customer_end_month','count_customer_new']

In [45]:
df_result_group_crr_group['count_cstomer_start_month'] = df_result_group_crr_group['count_customer_end_month'].shift(periods=1,fill_value=0)

In [46]:
df_result_group_crr_group

Unnamed: 0,month_sale,count_customer_end_month,count_customer_new,count_cstomer_start_month
0,2010-12-01,949,949,0
1,2011-01-01,784,421,949
2,2011-02-01,799,380,784
3,2011-03-01,1021,440,799
4,2011-04-01,900,299,1021
5,2011-05-01,1080,279,900
6,2011-06-01,1052,235,1080
7,2011-07-01,994,191,1052
8,2011-08-01,981,167,994
9,2011-09-01,1303,298,981


In [47]:
df_result_group_crr_group['crr'] = (df_result_group_crr_group['count_customer_end_month'] - df_result_group_crr_group['count_customer_new'])/df_result_group_crr_group['count_cstomer_start_month']

In [48]:
df_result_group_crr_group = df_result_group_crr_group.fillna(value=0)

In [49]:
df_result_group_crr_group

Unnamed: 0,month_sale,count_customer_end_month,count_customer_new,count_cstomer_start_month,crr
0,2010-12-01,949,949,0,0.0
1,2011-01-01,784,421,949,0.382508
2,2011-02-01,799,380,784,0.534439
3,2011-03-01,1021,440,799,0.727159
4,2011-04-01,900,299,1021,0.588639
5,2011-05-01,1080,279,900,0.89
6,2011-06-01,1052,235,1080,0.756481
7,2011-07-01,994,191,1052,0.763308
8,2011-08-01,981,167,994,0.818913
9,2011-09-01,1303,298,981,1.024465


# Формула Churn rate (показатель оттока)

In [50]:
df_result_group_crr_group['churn_rate'] = (df_result_group_crr_group['count_cstomer_start_month']-(df_result_group_crr_group['count_customer_end_month']-df_result_group_crr_group['count_customer_new']))/df_result_group_crr_group['count_cstomer_start_month']

In [51]:
df_result_group_crr_group = df_result_group_crr_group.fillna(value=0)

In [52]:
df_result_group_crr_group

Unnamed: 0,month_sale,count_customer_end_month,count_customer_new,count_cstomer_start_month,crr,churn_rate
0,2010-12-01,949,949,0,0.0,0.0
1,2011-01-01,784,421,949,0.382508,0.617492
2,2011-02-01,799,380,784,0.534439,0.465561
3,2011-03-01,1021,440,799,0.727159,0.272841
4,2011-04-01,900,299,1021,0.588639,0.411361
5,2011-05-01,1080,279,900,0.89,0.11
6,2011-06-01,1052,235,1080,0.756481,0.243519
7,2011-07-01,994,191,1052,0.763308,0.236692
8,2011-08-01,981,167,994,0.818913,0.181087
9,2011-09-01,1303,298,981,1.024465,-0.024465


# Формула Monthly Recurring Revenue (регулярный месячный доход)

## Revenue за период

In [53]:
df_result_group_revenue = df_result.groupby(by=['month_sale']).agg({'amount':np.sum,'customerid':lambda x: len(x.unique()),'invoiceno':np.size}).reset_index()

In [54]:
df_result_group_revenue

Unnamed: 0,month_sale,amount,customerid,invoiceno
0,2010-12-01,749590,949,2025
1,2011-01-01,560064,784,1476
2,2011-02-01,498098,799,1393
3,2011-03-01,683292,1021,1983
4,2011-04-01,493129,900,1744
5,2011-05-01,723248,1080,2162
6,2011-06-01,690742,1052,2012
7,2011-07-01,680720,994,1927
8,2011-08-01,682299,981,1737
9,2011-09-01,1018670,1303,2327


# Средний чек (Average Invoice / Avg Receipt)

In [55]:
df_result_group_revenue['avg_receipt'] = (df_result_group_revenue['amount']/df_result_group_revenue['invoiceno']).apply(lambda x:round(x,0))

In [56]:
df_result_group_revenue

Unnamed: 0,month_sale,amount,customerid,invoiceno,avg_receipt
0,2010-12-01,749590,949,2025,370.0
1,2011-01-01,560064,784,1476,379.0
2,2011-02-01,498098,799,1393,358.0
3,2011-03-01,683292,1021,1983,345.0
4,2011-04-01,493129,900,1744,283.0
5,2011-05-01,723248,1080,2162,335.0
6,2011-06-01,690742,1052,2012,343.0
7,2011-07-01,680720,994,1927,353.0
8,2011-08-01,682299,981,1737,393.0
9,2011-09-01,1018670,1303,2327,438.0


# Формула Purchase Frequency (частота покупок)

In [57]:
df_result_group_revenue['purchase_frequency'] = (df_result_group_revenue['invoiceno']/df_result_group_revenue['customerid']).apply(lambda x:round(x,0))

In [58]:
df_result_group_revenue

Unnamed: 0,month_sale,amount,customerid,invoiceno,avg_receipt,purchase_frequency
0,2010-12-01,749590,949,2025,370.0,2.0
1,2011-01-01,560064,784,1476,379.0,2.0
2,2011-02-01,498098,799,1393,358.0,2.0
3,2011-03-01,683292,1021,1983,345.0,2.0
4,2011-04-01,493129,900,1744,283.0,2.0
5,2011-05-01,723248,1080,2162,335.0,2.0
6,2011-06-01,690742,1052,2012,343.0,2.0
7,2011-07-01,680720,994,1927,353.0,2.0
8,2011-08-01,682299,981,1737,393.0,2.0
9,2011-09-01,1018670,1303,2327,438.0,2.0


# Cредняя выручка на одного пользователя (ARPU)

In [59]:
df_result_group_revenue['ARPU'] = df_result_group_revenue['amount']/df_result_group_revenue['customerid']
df_result_group_revenue['ARPU'] = df_result_group_revenue['ARPU'].round()

In [60]:
df_result_group_revenue

Unnamed: 0,month_sale,amount,customerid,invoiceno,avg_receipt,purchase_frequency,ARPU
0,2010-12-01,749590,949,2025,370.0,2.0,790.0
1,2011-01-01,560064,784,1476,379.0,2.0,714.0
2,2011-02-01,498098,799,1393,358.0,2.0,623.0
3,2011-03-01,683292,1021,1983,345.0,2.0,669.0
4,2011-04-01,493129,900,1744,283.0,2.0,548.0
5,2011-05-01,723248,1080,2162,335.0,2.0,670.0
6,2011-06-01,690742,1052,2012,343.0,2.0,657.0
7,2011-07-01,680720,994,1927,353.0,2.0,685.0
8,2011-08-01,682299,981,1737,393.0,2.0,696.0
9,2011-09-01,1018670,1303,2327,438.0,2.0,782.0
