In [29]:
from pathlib import Path
import pandas as pd
import gc
from sklearn.preprocessing import LabelEncoder
import pickle
import bz2
from scipy.stats import binom_test

In [2]:
# Download the two input files from Google Drive: the purchases archive
# (saved as inno_stats.tar.gz) and the spreadsheet bad_ids.xlsx.
# NOTE(review): newer gdown releases take the file ID positionally and have
# deprecated `--id` — confirm against the installed gdown version.
!gdown --id 13Qm6ztAmVyBHvo_mch6gk-2VYKUHeRuu
!gdown --id 1UU4ZecYoz16CBXDEVPi80jDIoy_9ntTp

Downloading...
From: https://drive.google.com/uc?id=13Qm6ztAmVyBHvo_mch6gk-2VYKUHeRuu
To: /content/inno_stats.tar.gz
1.91GB [00:08, 218MB/s]
Downloading...
From: https://drive.google.com/uc?id=1UU4ZecYoz16CBXDEVPi80jDIoy_9ntTp
To: /content/bad_ids.xlsx
100% 363k/363k [00:00<00:00, 47.3MB/s]


In [3]:
!mkdir data
!tar -xf inno_stats.tar.gz -C ./data

In [2]:
# Absolute path of the directory the archive was extracted into.
data_path = Path('./data/').resolve()
# List the extracted files; IPython substitutes {data_path} into the shell command.
!ls {data_path}

09_groups.csv  10_groups.csv  11_groups.csv  product_groups.csv


In [2]:
# Spreadsheet of cards to exclude; the monthly loading cells drop every purchase
# whose `id_card` matches an entry of this frame.
bad_ids_df = pd.read_excel("bad_ids.xlsx")

In [3]:
# Load the September purchases; `date` is parsed into datetimes on read.
g_9 = pd.read_csv('./data/09_groups.csv', parse_dates=['date'])
# Exclude every purchase made with a card listed in bad_ids_df.
# A boolean mask replaces the former merge(on=..., right_index=True) + drop:
# current pandas rejects `on` combined with `right_index`, and the old form
# relied on the merge result carrying the left frame's row labels.
is_bad_card = g_9['id_card'].isin(bad_ids_df['id_card'])
g_9 = g_9[~is_bad_card].reset_index(drop=True)
# NOTE(review): the preview rows in this notebook suggest `sum` may already be
# the line total (same product: quantity 2 -> 88.0, quantity 4 -> 176.0) —
# confirm that multiplying by `quantity` is intended.
g_9['total_sum'] = g_9['sum'] * g_9['quantity']
g_9.head()

Unnamed: 0,date,id_doc,id_order,id_card,id_tov,id_kontr,quantity,sum,is_green,id_group
0,2020-09-01 17:01:25,1283228,0.0,1538855,52,271,1.0,108.0,0,10004
1,2020-09-01 17:35:36,8873113,0.0,267307,52,271,1.0,108.0,0,10004
2,2020-09-01 19:19:21,12712899,0.0,610220,52,271,1.0,107.95,0,10004
3,2020-09-01 21:53:34,21535283,0.0,441497,52,271,1.0,108.0,0,10004
4,2020-09-01 18:42:31,642341,0.0,1065358,61,97,2.0,88.0,0,10091


In [38]:
# Load the October purchases; `date` is parsed into datetimes on read.
g_10 = pd.read_csv('./data/10_groups.csv', parse_dates=['date'])
# Exclude every purchase made with a card listed in bad_ids_df.
# A boolean mask replaces the former merge(on=..., right_index=True) + drop:
# current pandas rejects `on` combined with `right_index`, and the old form
# relied on the merge result carrying the left frame's row labels.
is_bad_card = g_10['id_card'].isin(bad_ids_df['id_card'])
g_10 = g_10[~is_bad_card].reset_index(drop=True)
# NOTE(review): the preview rows in this notebook suggest `sum` may already be
# the line total (same product: quantity 2 -> 88.0, quantity 4 -> 176.0) —
# confirm that multiplying by `quantity` is intended.
g_10['total_sum'] = g_10['sum'] * g_10['quantity']
g_10.head()

Unnamed: 0,date,id_doc,id_order,id_card,id_tov,id_kontr,quantity,sum,is_green,id_group
0,2020-10-01 20:22:55,2035558,0.0,1636749,52,339,1.0,107.9,0,10004
1,2020-10-01 20:54:12,13037327,0.0,267261,52,339,1.0,80.04,0,10004
2,2020-10-01 21:25:59,1430590,0.0,1331395,61,379,1.0,31.92,1,10091
3,2020-10-01 13:31:01,3504660,0.0,641525,61,97,1.0,55.98,0,10091
4,2020-10-01 16:32:06,11024438,0.0,1325852,61,97,4.0,176.0,0,10091


In [54]:
# Load the November purchases; `date` is parsed into datetimes on read.
g_11 = pd.read_csv('./data/11_groups.csv', parse_dates=['date'])
# Exclude every purchase made with a card listed in bad_ids_df.
# A boolean mask replaces the former merge(on=..., right_index=True) + drop:
# current pandas rejects `on` combined with `right_index`, and the old form
# relied on the merge result carrying the left frame's row labels.
is_bad_card = g_11['id_card'].isin(bad_ids_df['id_card'])
g_11 = g_11[~is_bad_card].reset_index(drop=True)
# NOTE(review): the preview rows in this notebook suggest `sum` may already be
# the line total (same product: quantity 2 -> 88.0, quantity 4 -> 176.0) —
# confirm that multiplying by `quantity` is intended.
g_11['total_sum'] = g_11['sum'] * g_11['quantity']
g_11.head()

KeyboardInterrupt: ignored

In [4]:
# Product -> product-group lookup table. The file is semicolon-separated and
# stored in the windows-1251 encoding (the group names are Cyrillic).
p_g = pd.read_csv(
    './data/product_groups.csv',
    sep=';',
    encoding='windows-1251',
)
p_g.head()

Unnamed: 0,id_tov,id_group,name_group
0,52,10004,Кисломолочные продукты
1,75,10004,Кисломолочные продукты
2,77,10004,Кисломолочные продукты
3,143,10004,Кисломолочные продукты
4,151,10004,Кисломолочные продукты


## Task 1

Prove that buying discounted products depends on the average order sum. (A discounted product is more likely to be bought when a large amount of money is going to be spent.)

$H_0$: the share of discounted goods in a receipt does not depend on the receipt's total sum.

First, let's calculate the mean share of discounted goods per receipt.

In [55]:
# Per-receipt aggregates: total money spent and the share of discounted
# (`is_green`) lines, ordered from the cheapest receipt to the most expensive.
# The column subset is a list (double brackets): selecting several groupby
# columns with a bare tuple of names is deprecated and fails on pandas >= 2.0.
sum_discount_df = (
    g_9.groupby("id_doc")[['total_sum', 'is_green']]
    .agg({'total_sum': 'sum', 'is_green': 'mean'})
    .sort_values('total_sum')
)
# Mean over receipts of the per-receipt discount share.
global_discount_mean = sum_discount_df.is_green.mean()
print("Mean discount goods part:", global_discount_mean)

  """Entry point for launching an IPython kernel.


Mean discount goods part: 0.07669907979471557


Whether a given good is discounted can be modeled by a Bernoulli distribution $B_0$. The receipts are sorted by their total sum, so we can split the dataframe into two equal halves: cheap receipts and expensive receipts. The share of discounted goods in each half has its own distribution, $B_1$ and $B_2$ respectively. Now we can check whether each of them comes from the same distribution as $B_0$.

In [56]:
# Split the receipts (already sorted by total sum) into a cheap half and an
# expensive half, then test each half's mean discount share against the
# global mean with a binomial test.
delimiter = len(sum_discount_df) // 2
cheap_half = sum_discount_df[:delimiter]
expensive_half = sum_discount_df[delimiter:]

# Cheap half.
left_len = len(cheap_half)
left_discount_mean = cheap_half.is_green.mean()
left_p_value = binom_test(left_discount_mean * left_len, left_len, p=global_discount_mean)
print("Mean discount goods part of cheap checks", left_discount_mean)
print("P-value that it come from they come from the same global distribution", left_p_value)

# Expensive half.
right_len = len(expensive_half)
right_discount_mean = expensive_half.is_green.mean()
right_p_value = binom_test(right_discount_mean * right_len, right_len, p=global_discount_mean)
print("Mean discount goods part of expensive checks", right_discount_mean)
print("P-value that it come from they come from the same global distribution", right_p_value)

Mean discount goods part of cheap checks 0.10466899424439563
P-value that it come from they come from the same global distribution 3e-323
Mean discount goods part of expensive checks 0.04872917344668426
P-value that it come from they come from the same global distribution 3.5e-323


So we have two strong pieces of evidence against our null hypothesis. This means that the share of discounted goods depends on the amount of money a customer is ready to spend. However, contrary to the task statement, the percentage of discounted goods in a receipt is higher when the receipt total is smaller.

## Task 2
Prove that the frequency of shopping grows around the days when wages are paid.

$H_0$: the amount of goods that a customer buys is the same on usual days and on wage days.

Let's use the following test. We pick two random receipts, one from a usual day and one from a wage day, and compare the quantities of goods in them. Repeating this many times gives a sample from a Bernoulli distribution $B_0$. If the quantity of goods bought by customers does not depend on the day, $B_0$ will be $B(0.5)$.

In [57]:
# Days of the month treated as "wage days".
# NOTE(review): the 1-6 and 20-26 windows are hard-coded — confirm they match
# the intended pay schedule.
wage_days = [1, 2, 3, 4, 5, 6, 20, 21, 22, 23, 24, 25, 26]

# Vectorised day-of-month test; replaces a per-row Python lambda over `date`.
wage_day_mask = g_9.date.dt.day.isin(wage_days)
wage_day_df = g_9[wage_day_mask]
usual_day_df = g_9[~wage_day_mask]

In [58]:
def _goods_per_receipt(purchases):
    """Return the summed `quantity` of every receipt (`id_doc`) in `purchases`."""
    return purchases.groupby("id_doc")['quantity'].sum()


wage_day_quantities = _goods_per_receipt(wage_day_df)
usual_day_quantities = _goods_per_receipt(usual_day_df)

In [59]:
sample_num = 1_000_000
# Fixed seeds make the random pairing reproducible across kernel restarts; the
# two draws use different seeds so the paired samples are drawn independently.
SAMPLING_SEED = 42
wage_sample = wage_day_quantities.sample(
    sample_num, replace=True, random_state=SAMPLING_SEED
).to_numpy()
usual_sample = usual_day_quantities.sample(
    sample_num, replace=True, random_state=SAMPLING_SEED + 1
).to_numpy()

# True where the wage-day receipt holds at least as many goods as the
# usual-day receipt it was paired with.
# NOTE(review): `>=` counts ties as wage-day wins, which pushes the share above
# 0.5 even when both distributions are identical — consider treating ties
# separately.
distribution = wage_sample >= usual_sample

print(distribution.mean())

0.514257


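In slightly more than half of the sampled pairs the wage-day receipt contains at least as many goods (by quantity) as the usual-day receipt, with ties counted in favour of the wage day. So customers seem to buy slightly more goods on wage days, not fewer. The p-value of this result under $H_0$ is: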

In [36]:
# Binomial test of the number of "wage-day >= usual-day" pairs out of
# sample_num trials against a success probability of 0.5.
# NOTE(review): newer SciPy releases replace `binom_test` with `binomtest` —
# confirm against the installed SciPy version.
binom_test(distribution.sum(), sample_num, p=0.5)

3e-323

## Task 3
Determine whether the following cohorts exist:
1. Frequent buyers - The users that shop frequently (daily, weekly, monthly)
2. Average purchase sum - The customers that usually spend the same amount of money 
3. Usual cart - The customers grouped by the product groups

References
- https://towardsdatascience.com/a-step-by-step-introduction-to-cohort-analysis-in-python-a2cbbd8460ea