## Setting up

### Import pandas and read in the csv file and set it to a dataframe called baskets

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import seaborn as sns
import matplotlib.pyplot as plt


###  unanswered questions from last notebook
 - should we remove the duplicates? 
 - what should we do about nulls in the data?

### the remaining questions may need business answer that we are not able to answer by data alone
 - do statistics on the ID columns make sense beyond count, min, and max? Since they are supposed to be identifiers, should we treat them as categorical?
 - why would some items have price of 0?
 - the number of unique placed_at values is one greater than the number of unique order_id values; is it possible that two orders are made on exactly the same millisecond? In theory it is possible, but might there be potential fraud?

### plan for this notebook to work on:
 - should we remove the duplicates? 
 - what should we do about nulls in the data?
 - answer some questions regarding merchants:
   - distribution of number of merchants by months
   - distribution of number of merchants by weekday
   - distribution of number of merchants by the number of orders they made
   - distribution of number of merchants by the number of distinct SKUs they buy

### read in data, make columns for date, year, month, day, hour, weekday, from the "placed_at" column 

In [None]:
# Input CSV of basket rows; the relative path is resolved against the
# notebook's working directory.
filename = '../../dslc_prep/baskets_sample_random_10_20220820.csv'
# Alternative input file (uncomment to switch):
#filename = '../../dslc_prep/baskets_sample_top_250_20220902.csv'
# Load the file into the `baskets` DataFrame that the cells below work on.
baskets = pd.read_csv(filepath_or_buffer=filename)

FileNotFoundError: [Errno 2] No such file or directory: 'baskets_sample_random_10_20220820.csv'

### Simple summaries of one dimension

In [None]:
# Summary statistics of the `price` column.
baskets['price'].describe()

In [None]:
# Box-and-whisker plot of the `price` column only.
baskets.boxplot(column=['price'])


 - we can immediately see that the price has outliers
 - question: where should we cut to separate outliers from the regular items?

In [None]:
# Order prices from highest to lowest and renumber them 0..n-1 so the x-axis
# is the rank; the ten highest prices are left out of the line plot.
prices_ranked = baskets['price'].sort_values(ascending=False).reset_index(drop=True)
prices_ranked.iloc[10:].plot()

In [None]:
# Density estimate of prices after leaving out the ten highest values.
prices_ranked = baskets['price'].sort_values(ascending=False).reset_index(drop=True)
prices_ranked.iloc[10:].plot.density()

In [None]:
# Wide, short figure for the histogram below.
plt.figure(figsize=(15, 3))
# Histogram (100 bins) of prices after leaving out the 1000 highest values.
prices_ranked = baskets['price'].sort_values(ascending=False).reset_index(drop=True)
prices_ranked.iloc[1000:].hist(bins=100)

In [None]:
# Parse each `placed_at` string with datetime.fromisoformat.
baskets['datetime'] = baskets['placed_at'].map(datetime.fromisoformat)

# Series.dt gives access to the calendar components of a datetime column;
# copy each component we need into a column of the same name.
placed = baskets['datetime'].dt
for part in ('date', 'year', 'month', 'day', 'hour', 'weekday'):
    baskets[part] = getattr(placed, part)
baskets.head(3)

### make a column of "spent" from columns "qty" (quantity) and "price"

In [None]:
# Per-row spend: qty multiplied element-wise by price.
baskets["spent"] = baskets["qty"].mul(baskets["price"])
baskets.head(100)

### which merchants have the most orders

In [None]:
# Number of distinct orders per merchant, largest first.
baskets.groupby('merchant_id')['order_id'].nunique().sort_values(ascending=False)


In [None]:
# Total of the `spent` column per merchant, largest first.
baskets.groupby('merchant_id')['spent'].sum().sort_values(ascending=False)


In [None]:
# Number of distinct dates on which each merchant has rows, largest first.
baskets.groupby('merchant_id')['date'].nunique().sort_values(ascending=False)

In [None]:
# One row per merchant: total spend plus distinct counts of orders, dates,
# SKUs, top-level categories and sub-categories.
merchant_metrics = {
    'spent': 'sum',
    'order_id': 'nunique',
    'date': 'nunique',
    'sku_id': 'nunique',
    'top_cat_id': 'nunique',
    'sub_cat_id': 'nunique',
}
merchants = baskets.groupby('merchant_id').agg(merchant_metrics).reset_index()
# Spot-check a single merchant with: merchants[merchants.merchant_id == 488]

In [None]:
# Correlation plot: pairwise correlations between the columns of `merchants`,
# drawn as an annotated heatmap.
# NOTE(review): `merchants` carries merchant_id as a regular column, so the
# identifier is part of this matrix too; confirm that is intended.
plt.figure(figsize=(12, 12))
merchant_corr = merchants.corr()
sns.heatmap(merchant_corr, annot=True, cmap='coolwarm')


### for merchant 488, what are the top SKUs in terms of quantity

In [None]:
# Total quantity per SKU for merchant 488, largest first, as a two-column
# frame (sku_id, qty).
# `qty` is selected before aggregating so the sum only touches that column.
# Summing the whole frame first would also try to sum the datetime/date
# columns, which raises TypeError on pandas >= 2.0.
df = (
    baskets[baskets['merchant_id'] == 488]
    .groupby('sku_id')['qty']
    .sum()
    .sort_values(ascending=False)
    .reset_index()
)
df

### study merchant 488 on its behavior on top SKUs

In [None]:
# All basket rows of merchant 488 for the first three SKUs listed in `df`.
top_skus = df['sku_id'].head(3)
baskets[baskets['merchant_id'].eq(488) & baskets['sku_id'].isin(top_skus)]

### this merchant's top SKUs purchased are
 -  vegetable oils - top cat 4 is "ingredients" (bahan masakan) and sub cat 57 is "grilled fish cake"
 -  beer - top cat 3 is "food material" (bahan makanan) and sub cat 10 is "beer" (bir)

In [None]:
# For merchant 488, per SKU: number of distinct months with purchases and the
# first and last month seen. This rebinds `df`.
# as_index=False already returns sku_id as a regular column on a fresh
# 0..n-1 index, so no reset_index() is needed; calling it would only add a
# meaningless `index` column.
df = (
    baskets[baskets.merchant_id == 488]
    .groupby(['sku_id'], as_index=False)
    .agg({'month': ['nunique', 'min', 'max']})
)

In [None]:
# Total spend per (year, month).
# `spent` is selected before summing so the non-numeric columns (datetime,
# date, ...) are never aggregated; summing them raises TypeError on
# pandas >= 2.0.
baskets.groupby(['year', 'month'])['spent'].sum()

In [None]:
# Daily total spend as a line plot on a wide, short figure.
plt.figure(figsize=(15, 3))
# `spent` is selected before summing so only that column is aggregated;
# summing the whole frame would also hit the datetime column, which raises
# TypeError on pandas >= 2.0.
baskets.groupby('date')['spent'].sum().plot()

In [None]:
# Number of distinct orders per day as a line plot.
plt.figure(figsize=(15, 3))
# Count distinct values of order_id only, instead of computing nunique for
# every column and discarding all but one.
baskets.groupby('date')['order_id'].nunique().plot()

In [None]:
# Number of distinct merchants per day as a line plot.
plt.figure(figsize=(15, 3))
# Count distinct values of merchant_id only, instead of computing nunique for
# every column and discarding all but one.
baskets.groupby('date')['merchant_id'].nunique().plot()

In [None]:
# Number of distinct SKUs per day as a line plot.
plt.figure(figsize=(15, 3))
# Count distinct values of sku_id only, instead of computing nunique for
# every column and discarding all but one.
baskets.groupby('date')['sku_id'].nunique().plot()

In [None]:
# Grid of pairwise scatter plots (with per-column distributions on the
# diagonal) for the columns of `merchants`.
sns.pairplot(data=merchants)

 - for each merchant, how many items did they buy on December 31, 2021?
 - what did each merchant_id buy on December 31, 2021?

We can see that there are 2208 null values in 'top_cat_id' and 'sub_cat_id'.

In [None]:
# Removing null values: keep only rows with no missing value.
# Note that this drops a row if ANY of its columns is null, not only the
# category columns.
baskets = baskets.dropna(axis=0, how='any')

In [None]:
# Count of missing values in each column.
baskets.isna().sum()

In [None]:
# Counting the frequency: number of basket rows per merchant_id.
freq = baskets.merchant_id.value_counts()
print(freq)

In [None]:
# Rows per merchant as a two-column frame: count the non-null `date` values in
# each merchant's group and label that count "Frequency".
frequency_df = (
    baskets.groupby('merchant_id', as_index=False)['date']
    .count()
    .rename(columns={'date': 'Frequency'})
)
frequency_df.head()

In [None]:
# Non-null count of every remaining column for each
# (merchant_id, sku_id, month) combination; the group keys form the index.
group_keys = ['merchant_id', 'sku_id', 'month']
merchant_sku = baskets.groupby(group_keys).agg(['count'])
merchant_sku

In [None]:
# Non-null count of every remaining column for each (merchant_id, sku_id)
# pair; the group keys form the index. This rebinds `merchant_sku`.
group_keys = ['merchant_id', 'sku_id']
merchant_sku = baskets.groupby(group_keys).agg(['count'])
merchant_sku

In [None]:
# Median of the `price` column.
baskets['price'].median()

In [None]:
# Price histogram with dashed vertical markers at the mean (green) and the
# median (red); only the mean gets a text label.
plt.figure(figsize=(15, 3))
mean_price = baskets.price.mean()
median_price = baskets.price.median()

result = plt.hist(baskets.price, bins=100, color='c', edgecolor='k', alpha=0.65)
dashed = dict(linestyle='dashed', linewidth=2)
plt.axvline(mean_price, color='g', **dashed)
plt.axvline(median_price, color='r', **dashed)

# Place the label slightly right of the mean line, at 90% of the y-axis top.
min_ylim, max_ylim = plt.ylim()
plt.text(mean_price * 1.1, max_ylim * 0.9, 'Mean: {:.2f}'.format(mean_price))

In [None]:
# Histogram (100 bins) of all prices on a wide, short figure.
plt.figure(figsize=(15, 3))
# A histogram depends only on the values, not on their order or index, so the
# column is plotted directly without sorting or re-indexing it first.
baskets.price.hist(bins=100)

In [None]:
# One bar per top_cat_id summarising `price` within that category
# (seaborn's barplot aggregates with the mean unless told otherwise).
plt.figure(figsize=(15, 3))
ax = sns.barplot(data=baskets, x="top_cat_id", y="price")


### top cat 7 is "baby food", hmmm, is this right?