# HW2 by Danil Ginzburg

In [None]:
import pandas as pd
import random
import matplotlib.pyplot as plt
import numpy as np
import pickle
import bz2
from datetime import datetime
from pathlib import Path

In [None]:
# Absolute location of the directory that holds the input CSV files.
data_path = Path('./data/').resolve()
# The three monthly "groups" files that the sample is drawn from.
file_paths = [
    data_path / file_name
    for file_name in ('09_groups.csv', '10_groups.csv', '11_groups.csv')
]

In [None]:
# Column names applied to every monthly file, in file order.
# NOTE: the identifier is misspelled but is referenced under this name by the loader.
colunms = 'date;id_doc;id_order;id_card;id_tov;id_kontr;quantity;sum;is_green'.split(';')

In [None]:
# Number of randomly chosen rows kept from each monthly file.
MONTH_SAMPLE_SIZE = 10 ** 6

In [None]:
# Build `df` from a uniform random sample of at most MONTH_SAMPLE_SIZE rows
# taken from each monthly file.
monthly_samples = []
for file_path in file_paths:
    # Count the data records (header excluded). The context manager closes the
    # file handle, which the bare open() call never did.
    with open(file_path) as source:
        n = sum(1 for _ in source) - 1
    # Line 0 is the header, so only lines 1..n are candidates for skipping.
    # The count is clamped at zero so that a file with fewer than
    # MONTH_SAMPLE_SIZE rows is read whole instead of making random.sample()
    # raise ValueError on a negative sample size.
    skip = sorted(random.sample(range(1, n + 1), max(0, n - MONTH_SAMPLE_SIZE)))
    month_df = pd.read_csv(
        filepath_or_buffer=file_path,
        header=0,
        sep=',',
        names=colunms,
        usecols=colunms,
        skiprows=skip,
    )
    # Parse the timestamps in one vectorised call with an explicit format; the
    # per-row strptime lambda passed through `date_parser` is deprecated in
    # recent pandas and much slower.
    month_df['date'] = pd.to_datetime(month_df['date'], format='%Y-%m-%d %H:%M:%S')
    monthly_samples.append(month_df)

# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated
# frame on every iteration, so concatenate once instead. ignore_index=True
# gives every row a unique label; without it each month restarts at 0 and
# label-based drops later on would hit rows from all three months.
df = pd.concat(monthly_samples, ignore_index=True) if monthly_samples else pd.DataFrame()

In [None]:
# Preview the first rows of the combined sample.
df.head()

## Bad ids

In [None]:
# Load the card ids that are removed from the sample below; the first CSV
# column becomes the frame's index.
bad_ids_file = data_path / 'bad_ids.csv'
bad_ids = pd.read_csv(bad_ids_file, sep=',', header=0, index_col=0)

In [None]:
# Preview the first rows of the bad-id list.
bad_ids.head()

In [None]:
def remove_bad_ids(df, bad_ids):
    """Remove, in place, every row of ``df`` whose ``id_card`` is in ``bad_ids``.

    Rows are removed by position rather than by index label, so a frame with
    repeated index labels (for example several files appended without
    re-indexing) loses only the rows that actually match.

    Args:
        df: frame with an ``id_card`` column; modified in place.
        bad_ids: frame whose ``id_card`` column lists the ids to remove.

    Returns:
        None.
    """
    is_bad = df['id_card'].isin(bad_ids['id_card']).values
    if not is_bad.any():
        return
    surviving_labels = df.index[~is_bad]
    # Dropping by the original labels would also delete unrelated rows that
    # share a label with a bad row. Drop against temporary, unique positional
    # labels instead, then restore the original labels of the survivors.
    df.index = pd.RangeIndex(len(df))
    df.drop(df.index[is_bad], inplace=True)
    df.index = surviving_labels

In [None]:
# Filter the sample in place: rows whose id_card appears in bad_ids are removed.
remove_bad_ids(df, bad_ids)

In [None]:
# Preview the sample after the bad ids have been removed.
df.head()

## Task 1

In [None]:
def preprocess_task_1(df):
    """Aggregate purchases per (date, hour, card) and split them by discount.

    Rows are grouped by calendar date, hour of day and ``id_card``; ``sum`` and
    ``is_green`` are summed inside each group. A group whose ``is_green``
    total is non-zero goes into the first result, every other group into the
    second.

    Args:
        df: frame with at least the columns ``date``, ``id_doc``, ``id_order``,
            ``id_card``, ``id_tov``, ``id_kontr``, ``quantity``, ``sum`` and
            ``is_green``. It is not modified.

    Returns:
        Tuple ``(with_discount, without_discount)`` of independent frames
        indexed by (date, hour, id_card) with columns ``sum`` and ``is_green``.
    """
    df = set_timestamp_index(df)
    df = df.drop(['date', 'id_doc', 'id_order', 'id_tov', 'id_kontr', 'quantity'], axis=1)

    # Grouping by the column *name* makes pandas treat id_card as a key (and
    # leave it out of the summed columns) without relying on it recognising a
    # passed Series as one of the frame's own columns.
    grouped = df.groupby(by=[df.index.date, df.index.hour, 'id_card']).sum()

    has_discount = grouped['is_green'] != 0
    # Explicit copies so callers can add columns to either part without
    # triggering chained-assignment warnings.
    df_with_dis = grouped.loc[has_discount].copy()
    df_wo_dis = grouped.loc[~has_discount].copy()
    return df_with_dis, df_wo_dis

def set_timestamp_index(df):
    """Return a copy of ``df`` indexed by the parsed ``date`` column.

    The index is named ``ts``. The input frame is left untouched: the earlier
    version added a ``ts`` column to the caller's frame as a side effect.
    """
    # assign() works on a copy; set_index() then consumes the temporary column,
    # overriding any pre-existing 'ts' column exactly as before.
    return df.assign(ts=pd.to_datetime(df['date'])).set_index('ts')

In [None]:
# Aggregate the sample per (date, hour, card) and split it into groups with and
# without a non-zero is_green total, then preview the first of the two.
task1_df_discount, task1_df_no_discount = preprocess_task_1(df)
task1_df_discount.head()

In [None]:
# Box plot of the aggregated sums for the groups with a discount.
task1_df_discount[['sum']].boxplot()

In [None]:
# Summary statistics of the groups with a discount.
task1_df_discount.describe()

In [None]:
# Box plot of the aggregated sums for the groups without a discount.
task1_df_no_discount[['sum']].boxplot()

In [None]:
# Summary statistics of the groups without a discount.
task1_df_no_discount.describe()

In [None]:
# Histogram (100 bins) of the aggregated sums for the groups with a discount.
fig, ax = plt.subplots()
n, bins, rectangles = ax.hist(task1_df_discount['sum'], bins=100)
fig.canvas.draw()
plt.show()

In [None]:
# Histogram (100 bins) of the aggregated sums for the groups without a discount.
fig, ax = plt.subplots()
n, bins, rectangles = ax.hist(task1_df_no_discount['sum'], bins=100)
fig.canvas.draw()
plt.show()

In [None]:
# Bin edges for the order sum: 0 to 1000 in steps of 50.
sum_bins = list(range(0, 1001, 50))
# One integer label (1..20) per interval between consecutive edges.
labels = list(range(1, 21))

In [None]:
# Assign every aggregated sum to its bin and count the groups per bin.
# Sums outside the outermost bin edges get no bin and are therefore not counted.
# observed=False is passed explicitly so that empty bins are kept as zero-count
# rows on every pandas version; the two count tables must stay aligned on the
# full set of bins for the division and the bin-edge assignment that follow.
task1_df_discount['bin'] = pd.cut(task1_df_discount['sum'], bins=sum_bins, labels=labels)
task1_df_discount_binsizes = task1_df_discount.groupby(by='bin', observed=False).count()

task1_df_no_discount['bin'] = pd.cut(task1_df_no_discount['sum'], bins=sum_bins, labels=labels)
task1_df_no_discount_binsizes = task1_df_no_discount.groupby(by='bin', observed=False).count()

In [None]:
# Preview the per-bin counts for the groups with a discount.
task1_df_discount_binsizes.head()

In [None]:
# Preview the per-bin counts for the groups without a discount.
task1_df_no_discount_binsizes.head()

In [None]:
# Per-bin ratio: number of groups with a discount divided by the number of
# groups without one.
task1_percentages_with_discount = task1_df_discount_binsizes / task1_df_no_discount_binsizes
# Attach the upper edge of every bin as a 'bin' column for plotting.
np_bins = np.array(sum_bins[1:])
task1_percentages_with_discount['bin'] = np_bins

In [None]:
# Preview the per-bin ratios.
task1_percentages_with_discount.head()


In [None]:
# Bar chart of the per-bin ratios, with the bin's upper edge on the x axis.
task1_percentages_with_discount.plot.bar(x='bin')

The last plot shows the percentage of orders with a discount relative to orders without a discount, binned by order sum.
<br/><br/>The first bin is for orders below 500 rubles, followed by bins for 550, 600, etc.
<br/><br/>From the plot you can see that there is
no evidence that "It is more probable that the discounted product will be bought if a bunch of money is going to spend".
The plot shows that, on the contrary, the probability decreases as the order sum increases.
<br/><br/>In this analysis I assumed that 500 is "a bunch" and took 3 million random samples from the whole dataset (3 months).

## Task 2

In [None]:
def preprocess_task_2(df):
    """Count the distinct (hour, card) purchase groups per day of the month.

    Rows are first collapsed into one record per (day of month, hour,
    ``id_card``); the records are then counted per day of month.

    Args:
        df: frame with at least the columns ``date``, ``id_doc``, ``id_order``,
            ``id_card``, ``id_tov``, ``id_kontr``, ``quantity``, ``sum`` and
            ``is_green``. It is not modified.

    Returns:
        Frame indexed by day of month (index name ``ts``) whose ``sum`` column
        holds the number of (hour, card) groups seen on that day number.
    """
    timestamps = pd.to_datetime(df['date'])
    # Work on a slimmed-down copy so the caller's frame does not gain a 'ts'
    # column. The timestamp is used only as the index: keeping it as a regular
    # column as well (drop=False) puts a datetime column into the sum below,
    # which raises TypeError on pandas >= 2.0.
    slim = df.drop(
        ['date', 'id_doc', 'id_order', 'id_tov', 'id_kontr', 'quantity', 'is_green'],
        axis=1,
    )
    slim = slim.assign(ts=timestamps).set_index('ts')

    # NOTE(review): index.day is the day of the *month*, so equal day numbers
    # from different months land in the same group - confirm this is intended.
    visits = slim.groupby(by=[slim.index.day, slim.index.hour, 'id_card']).sum()

    return visits.groupby(level=0).count()

In [None]:
# Count the (hour, card) purchase groups per day number and preview the result.
task2_df = preprocess_task_2(df)
task2_df.head()


In [None]:
# Bar chart of the per-day counts.
task2_df.plot.bar()