# [View on Colab](https://colab.research.google.com/drive/1hcJRM1Z0jzOQH0vpcivTh-T_J1namgMn?usp=sharing)

# Installation
Now Colab does not support python 3.9, so it necessary to connect to it from local runtime.

1. Follow [these](https://github.com/Prometheus3375/inno_stats_2020/blob/master/README.md) instructions to install project requirements.
2. Run `jupyter serverextension enable --py jupyter_http_over_ws` to enable necessary extension.

# Connecting
1. Run the command below to start jupyter server.
```
jupyter notebook --NotebookApp.allow_origin='https://colab.research.google.com' --port=8888 --NotebookApp.port_retries=0
```
2. Follow step 4 from [here](https://research.google.com/colaboratory/local-runtimes.html).

# Preparation

In [1]:
import os
import gc

import gspread
import plotly.graph_objects as go
import pandas as pd

from plotly.subplots import make_subplots

from hw2 import io

In [2]:
from collections.abc import Iterable

In [3]:
import plotly.io as pio
pio.renderers.default = 'colab'

# Data

## Download and unpack

In [4]:
if not os.path.exists('./data'):
    os.mkdir('./data')
    # https://drive.google.com/file/d/13Qm6ztAmVyBHvo_mch6gk-2VYKUHeRuu
    !gdown --id 13Qm6ztAmVyBHvo_mch6gk-2VYKUHeRuu -O './data/data.tar.gz'
    !tar -xvzf './data/data.tar.gz' -C 'data'
    !rm './data/data.tar.gz'

## Get user ouliers

In [5]:
sa = gspread.service_account('./reader.json')
sheet = sa.open_by_url('https://docs.google.com/spreadsheets/d/1SIf2vawr2VWwme_6v_VZyHivJ502bWFN70SQLf-iBkM').sheet1
values = sheet.get_all_values()
values = list(zip(*values))[1]
outliers = {int(values[i]) for i in range(1, len(values))}

In [6]:
sorted(outliers)[:10]

[19, 32, 35, 36, 41, 42, 49, 50, 51, 56]

In [7]:
len(outliers)

24707

## Prepare data

In [8]:
io.clean_data('./data/09_groups.csv', outliers)

In [9]:
io.clean_data('./data/10_groups.csv', outliers)

In [10]:
io.clean_data('./data/11_groups.csv', outliers)

In [11]:
del sa, sheet, values, outliers
_ = gc.collect()

In [12]:
product_groups = product_groups = io.read_groups('./data/product_groups.csv')
data_paths = './data/09_groups.csv', './data/10_groups.csv', './data/11_groups.csv'
data_names = 'september', 'october', 'november'

# Task 1

Prove that buying discounted products depends on the average order sum. It is more probable that the discounted product will be bought if bunch of money is going to be spend.


# Task 2
Prove that the frequency of shopping grows around the wage taking days, i.e. days 1-6 and 20-26 of each month.

I am going to measure frequence of shopping by number of receipts a day.

In [13]:
def get_data(path: str):
    df = io.read_data(
        path,
        usecols=['date', 'id_doc'],
        parse_dates=['date'],
        infer_datetime_format=True,
    )
    df.drop_duplicates(inplace=True)
    df['day'] = df.date.dt.day
    df.drop(columns=['date'], inplace=True)
    return df

In [14]:
wage_days = {1, 2, 3, 4, 5, 6, 20, 21, 22, 23, 24, 25, 26}

In [15]:
data = [get_data(path) for path in data_paths]

In [16]:
sum(len(d) for d in data)

20842511

In [17]:
data[0].head()

Unnamed: 0,id_doc,day
0,1283228,1
1,8873113,1
2,12712899,1
3,21535283,1
4,642341,1


In [18]:
receipts_per_day = [d.groupby('day').count() for d in data]

In [19]:
receipts_per_day[0].head()

Unnamed: 0_level_0,id_doc
day,Unnamed: 1_level_1
1,240615
2,243419
3,245866
4,235497
5,178832


In [20]:
receipts_per_day[0].index

Int64Index([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
            18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30],
           dtype='int64', name='day')

In [21]:
fig = make_subplots(
    rows=1,
    cols=len(data),
    subplot_titles=data_names,
)

for i, (name, d) in enumerate(zip(data_names, receipts_per_day), 1):
    colors = ['LightGreen' if day in wage_days else 'SkyBlue' for day in d.index]

    fig.add_trace(
        go.Bar(
            name=name,
            x=d.index,
            y=d.id_doc,
            marker=dict(
                color=colors,
            ),
        ),
        row=1,
        col=i,
    )

fig.update_xaxes(
    title='Days',
    tickmode='linear',
)
fig.update_yaxes(
    title='Number of receips',
)
# fig.update_layout(barmode='stack')
fig.show()

There is no significant increase in number of receipts in wage days. Thus, frequency of shopping is not growing in wage days.

In [22]:
del wage_days, data, receipts_per_day, fig
_ = gc.collect()

# Task 3
Define such cohort existence:
1. Frequent buyers - The users that are shopping frequently (daily, weekly, monthly)
2. Average purchase sum - The customers that usually spend the same amount of money
3. Usual cart - The customers grouped by the product groups
<!-- 4. Orders amount - The customers grouped by the total number of orders -->

[Tutorial](https://towardsdatascience.com/a-step-by-step-introduction-to-cohort-analysis-in-python-a2cbbd8460ea)

## Task 3.1

In [23]:
def get_buyers(paths: Iterable[str]):
    data = []
    for path in paths:
        df = io.read_data(
            path,
            usecols=['date', 'id_card'],
            parse_dates=['date'],
            infer_datetime_format=True,
        )
        df.drop_duplicates(inplace=True)
        df['day'] = df.date.dt.day
        df['month'] = df.date.dt.month
        df['week'] = df.date.dt.isocalendar().week
        df.drop(columns=['date'], inplace=True)

        data.append(df)

    return pd.concat(data)

In [24]:
data = get_buyers(data_paths)

In [25]:
len(data)

20842337

In [26]:
data.head()

Unnamed: 0,id_card,day,month,week
0,1538855,1,9,36
1,267307,1,9,36
2,610220,1,9,36
3,441497,1,9,36
4,1065358,1,9,36


### Daily users

In [27]:
daily = data[['id_card', 'day', 'month']].drop_duplicates().groupby('id_card').day.count().reset_index()
daily

Unnamed: 0,id_card,day
0,0,10
1,1,2
2,2,1
3,3,15
4,4,13
...,...,...
1919364,1944071,1
1919365,1944072,1
1919366,1944073,1
1919367,1944074,10


In [28]:
daily[daily.day == 30 + 31 + 30]

Unnamed: 0,id_card,day


No users buy daily.

### Weekly users

In [29]:
weekly = data[['id_card', 'week']].drop_duplicates().groupby('id_card').week.count().reset_index()
weekly

Unnamed: 0,id_card,week
0,0,4
1,1,2
2,2,1
3,3,9
4,4,8
...,...,...
1919364,1944071,1
1919365,1944072,1
1919366,1944073,1
1919367,1944074,8


In [30]:
weekly[weekly.week == data.week.nunique()]

Unnamed: 0,id_card,week
21,22,14
24,25,14
46,55,14
53,63,14
73,88,14
...,...,...
1902885,1927570,14
1903038,1927724,14
1903072,1927758,14
1903633,1928319,14


There are 62,955 users that buy weekly.

### Monthly users

In [31]:
monthly = data[['id_card', 'month']].drop_duplicates().groupby('id_card').month.count().reset_index()
monthly

Unnamed: 0,id_card,month
0,0,1
1,1,2
2,2,1
3,3,3
4,4,3
...,...,...
1919364,1944071,1
1919365,1944072,1
1919366,1944073,1
1919367,1944074,3


In [32]:
monthly[monthly.month == data.month.nunique()]

Unnamed: 0,id_card,month
3,3,3
4,4,3
9,9,3
10,10,3
11,11,3
...,...,...
1908026,1932721,3
1908028,1932723,3
1908688,1933385,3
1914713,1939417,3


There are 815,595 users that buy monthly.

In [33]:
del data, daily, weekly, monthly
_ = gc.collect()

## Task 3.2

In [34]:
def get_ave(paths: Iterable[str]):
    data = []
    for path in paths:
        df = io.read_data(
            path,
            usecols=['id_card', 'id_doc', 'sum'],
        )
        df = df.groupby(['id_card', 'id_doc']).sum().reset_index()
        df.drop(columns=['id_doc'], inplace=True)

        data.append(df)

    return pd.concat(data)

In [35]:
data = get_ave(data_paths)
data.rename(columns={'sum': 'total'}, inplace=True)
data

Unnamed: 0,id_card,total
0,0,251.0
1,0,260.0
2,0,329.0
3,0,209.0
4,0,924.2
...,...,...
6872850,1944074,80.0
6872851,1944074,239.0
6872852,1944074,203.0
6872853,1944074,50.0


In [36]:
necessary_purchase_times = 10
allowed_shift = 100
requred_count_percent = 0.9

In [37]:
def is_in_cohort(series):
    if len(series) < necessary_purchase_times:
        False

    median = series.median()
    required = len(series) * requred_count_percent
    satisfies = series.between(median - allowed_shift, median + allowed_shift)

    return len(satisfies[satisfies]) >= required

In [38]:
result = data.groupby(['id_card']).total.agg(in_cohort=is_in_cohort)
result

Unnamed: 0_level_0,in_cohort
id_card,Unnamed: 1_level_1
0,False
1,True
2,True
3,False
4,False
...,...
1944071,True
1944072,True
1944073,True
1944074,False


In [39]:
result[result.in_cohort]

Unnamed: 0_level_0,in_cohort
id_card,Unnamed: 1_level_1
1,True
2,True
7,True
13,True
20,True
...,...
1944070,True
1944071,True
1944072,True
1944073,True


There are 630,144 users with at least 10 orders, at least 90% of which have common sum within 100 rubles from median.

In [40]:
del data, result
_ = gc.collect()