## Setting up

In [None]:
# Colab-only: mount Google Drive into the runtime's filesystem at /content/drive.
# NOTE(review): the CSV below is read from a relative path, not from
# /content/drive — confirm whether this mount is still required.
from google.colab import drive
drive.mount('/content/drive')

### Import pandas and read in the csv file and set it to a dataframe called baskets

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Path to the baskets CSV, relative to the notebook's working directory.
BASKETS_CSV_PATH = '../../dslc_prep/baskets_sample_random_10.csv'

# Load the file with pandas' default parsing into the `baskets` dataframe.
baskets = pd.read_csv(BASKETS_CSV_PATH)

## Conduct basic data inspection

 - take a look at the first three rows, and last few rows

In [None]:
# Preview the first three rows to get a feel for the columns and their values.
baskets.head(n=3)

Unnamed: 0,id,order_id,placed_at,merchant_id,sku_id,top_cat_id,sub_cat_id,qty,price
0,126,23,2021-05-05 11:04:46.579,10,341,3.0,47.0,100,0.0
1,166,41,2021-05-06 10:45:02.448,196,341,3.0,47.0,2,0.0
2,167,42,2021-05-06 10:45:04.850,196,341,3.0,47.0,2,0.0


In [None]:
# Preview the last rows; n=5 is the pandas default, spelled out here for clarity.
baskets.tail(n=5)

Unnamed: 0,id,order_id,placed_at,merchant_id,sku_id,top_cat_id,sub_cat_id,qty,price
29300,338870,61206,2022-07-31 16:52:33.731,428,1896,4.0,57.0,80,12350.0
29301,338871,61206,2022-07-31 16:52:33.731,428,747,12.0,36.0,1,86000.0
29302,338872,61206,2022-07-31 16:52:33.731,428,850,9.0,48.0,1,101500.0
29303,338873,61206,2022-07-31 16:52:33.731,428,853,9.0,48.0,1,68500.0
29304,338874,61206,2022-07-31 16:52:33.731,428,852,9.0,48.0,1,68500.0


### dataframe dimensions, column names, column data types, ranges of column values

In [None]:
# Dimensions of the dataframe as a (number of rows, number of columns) tuple.
baskets.shape

(29305, 9)

In [None]:
# Column labels of the dataframe, returned as a pandas Index.
baskets.columns

Index(['id', 'order_id', 'placed_at', 'merchant_id', 'sku_id', 'top_cat_id',
       'sub_cat_id', 'qty', 'price'],
      dtype='object')

In [None]:
# Data type pandas inferred for each column when the CSV was read.
baskets.dtypes

id               int64
order_id         int64
placed_at       object
merchant_id      int64
sku_id           int64
top_cat_id     float64
sub_cat_id     float64
qty              int64
price          float64
dtype: object

 - noticed that column "placed_at" is not numeric (dtype object) and the rest are numeric columns

In [None]:
# Print a summary of the frame: index range, per-column non-null counts,
# dtypes and approximate memory usage. Useful for spotting missing values.
baskets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29305 entries, 0 to 29304
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           29305 non-null  int64  
 1   order_id     29305 non-null  int64  
 2   placed_at    29305 non-null  object 
 3   merchant_id  29305 non-null  int64  
 4   sku_id       29305 non-null  int64  
 5   top_cat_id   29298 non-null  float64
 6   sub_cat_id   29298 non-null  float64
 7   qty          29305 non-null  int64  
 8   price        29305 non-null  float64
dtypes: float64(3), int64(5), object(1)
memory usage: 2.0+ MB


 - question: what can you observe from the above result?

 - why are the non-null counts for top_cat_id and sub_cat_id (29298) lower than for the other columns (29305)?

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
# With default arguments on a mixed-dtype frame, only numeric columns are
# summarised, so non-numeric columns do not appear in the result.
baskets.describe()

Unnamed: 0,id,order_id,merchant_id,sku_id,top_cat_id,sub_cat_id,qty,price
count,29305.0,29305.0,29305.0,29305.0,29298.0,29298.0,29305.0,29305.0
mean,162948.006825,27337.267838,1061.053574,977.588671,10.860946,50.808485,6921.692,137913.3
std,95745.525,18450.495064,761.056051,348.157681,8.353926,31.280838,1168396.0,135536.1
min,126.0,23.0,10.0,5.0,1.0,1.0,1.0,0.0
25%,80106.0,10296.0,352.0,755.0,4.0,30.0,1.0,45500.0
50%,158748.0,25824.0,934.0,875.0,9.0,48.0,2.0,107000.0
75%,242651.0,43776.0,1717.0,1055.0,15.0,79.0,5.0,184500.0
max,338874.0,61439.0,3160.0,2383.0,35.0,108.0,200000000.0,2175000.0


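 - noticed that the "placed_at" column is not shown in the above result, most likely because of its type: describe() summarises only numeric columns by default, and "placed_at" has dtype object
 - the statistics of the ID columns (mean, std, quartiles) may not be meaningful beyond count, min and max, since these columns are supposed to be identifiers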

## Conduct some more data inspection

 - take a look at 3 random rows

In [None]:
# Fix NumPy's global seed so the same three positions are drawn every time
# this cell is re-run.
np.random.seed(17)

# Draw three integer row positions from [0, number of rows) and show those rows.
# The positions are drawn independently, so a repeated row is possible.
sampled_positions = np.random.randint(low=0, high=len(baskets), size=3)
baskets.iloc[sampled_positions]

Unnamed: 0,id,order_id,placed_at,merchant_id,sku_id,top_cat_id,sub_cat_id,qty,price
10863,121089,18116,2021-11-29 13:41:14.231,1304,768,14.0,37.0,3,85000.0
2191,24793,1959,2021-09-10 19:48:46.384,349,771,15.0,37.0,5,19200.0
13702,150296,24073,2021-12-21 14:06:17.136,1004,809,9.0,48.0,100,12000.0


 - take a look at transactions for a specific merchant_id

In [None]:
# Keep only the rows whose merchant_id equals 1004 (boolean-mask selection).
baskets.loc[baskets['merchant_id'].eq(1004)]


Unnamed: 0,id,order_id,placed_at,merchant_id,sku_id,top_cat_id,sub_cat_id,qty,price
3063,33551,3158,2021-09-27 16:39:56.665,1004,1289,4.0,31.0,1,292000.0
3064,33552,3158,2021-09-27 16:39:56.665,1004,875,3.0,80.0,10,168000.0
3065,33553,3158,2021-09-27 16:39:56.665,1004,778,15.0,53.0,5,137000.0
3066,33554,3158,2021-09-27 16:39:56.665,1004,889,35.0,98.0,5,224000.0
3067,33555,3158,2021-09-27 16:39:56.665,1004,724,12.0,30.0,75,16000.0
...,...,...,...,...,...,...,...,...,...
20053,220679,39142,2022-02-15 17:03:14.776,1004,1758,12.0,12.0,210,24000.0
20978,233205,42032,2022-02-24 09:31:28.591,1004,1839,27.0,57.0,5,124000.0
20979,233206,42032,2022-02-24 09:31:28.591,1004,1838,15.0,57.0,5,125000.0
20980,233207,42032,2022-02-24 09:31:28.591,1004,1820,4.0,31.0,1,53000.0


 - how much did it cost in total for merchant 1004?

In [None]:
# Sum of the `price` column over all rows belonging to merchant 1004.
# NOTE(review): this adds up `price` only and does not multiply by `qty` —
# confirm whether `price` is a per-unit or a per-line amount.
baskets.loc[baskets['merchant_id'].eq(1004), 'price'].sum()

24238650.0

 - what is the average price for order 3158?

In [None]:
# Mean of the `price` column over the rows that belong to order 3158.
baskets.loc[baskets['order_id'].eq(3158), 'price'].mean()

145583.33333333334

 - what are the average price, min and max prices for all rows in this dataset?

In [None]:
# Mean, minimum and maximum of `price` across every row, shown as one tuple.
(
    baskets['price'].mean(),
    baskets['price'].min(),
    baskets['price'].max(),
)

(137913.33466782197, 0.0, 2175000.0)

 - how many rows have price of 0
 - question: Why would some items have price of 0? 

*** TODO: find out why some items have a price of 0
  

In [None]:
# Select the rows whose price is exactly 0, then count non-null values per column.
# count() skips missing values, so a column containing NaN reports a smaller
# number than the number of selected rows.
baskets.loc[baskets['price'].eq(0)].count()

id             520
order_id       520
placed_at      520
merchant_id    520
sku_id         520
top_cat_id     519
sub_cat_id     519
qty            520
price          520
dtype: int64

 - check columns' number of unique values

In [None]:
# Number of distinct values in each column (missing values are not counted
# by default).
baskets.nunique()

id             29303
order_id        5542
placed_at       5541
merchant_id      317
sku_id          1353
top_cat_id        32
sub_cat_id        90
qty              174
price           1114
dtype: int64

- question: what can you observe from the above result? what might seem to be peculiar? 

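 - notice that the number of unique placed_at values (5541) is one less than the number of unique order_id values (5542), so two different orders must share the same timestamp
 - question: is it possible that two orders are made in exactly the same millisecond? In theory it is possible, but might there be potential fraud?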

  *** TODO: how can we find out which two orders happened on the exact same millisecond? 


 - can we check the min and max of the "placed_at" (date) column?

In [None]:
# `placed_at` holds strings here, so min/max compare them as text rather than
# as datetimes.
placed_at = baskets['placed_at']
(placed_at.min(), placed_at.max())

('2021-05-05 11:04:46.579', '2022-07-31 16:52:33.731')

 - how many merchants transacted on a particular day, say December 31, 2021?
 - what is the type "object" anyways?

In [None]:
# Look at one raw `placed_at` value (the row labelled 1) together with its
# Python type, to see what the "object" dtype actually contains.
placed_at_value = baskets.loc[1, 'placed_at']
(placed_at_value, type(placed_at_value))

('2021-05-06 10:45:02.448', str)

 - how do we work with a string object and get the date, hour, min, second, millisecond?

### save some data to a file

In [None]:
# Write merchant 1004's rows to a comma-separated file in the current working
# directory; index=False leaves the dataframe's row index out of the file.
merchant_1004_rows = baskets.loc[baskets['merchant_id'].eq(1004)]
merchant_1004_rows.to_csv("test_dave.csv", index=False)

### gather all observations, questions, and TODOs

 - column "placed_at" is not numeric (dtype object) and the rest are numeric columns
 - why are the non-null counts for top_cat_id and sub_cat_id lower than for the other columns?
 - ID columns' statistics may not make sense other than count, min, max, since they are supposed to be identifiers; should we treat them as categorical?
 - why would some items have price of 0?
 - the number of unique placed_at values is one less than the number of unique order_id values
 - is it possible that two orders are made in exactly the same millisecond? In theory it is possible, but might there be potential fraud?
 - how can we find out which two orders happened on the exact same millisecond? 
 - how many merchants transacted on a particular day, say December 31, 2021?
 - how do we work with a string object and get the date, hour, min, second, millisecond? 

## Testing