In [1]:
import glob
import os
from zipfile import ZipFile

import matplotlib.pyplot as plt
import pandas as pd
import pyarrow.feather as feather
import seaborn as sns

%matplotlib inline

In [2]:
# Locations of the extracted CSV and of the Feather file written from it.
CSV_PATH = "data/2019-Oct.csv"
FEATHER_PATH = "data/2019-Oct.feather"

In [3]:
# Extract the October CSV from the archive unless it is already on disk.
# The existence check uses os.path.isfile instead of membership in
# glob.glob("data/*"): glob returns paths with the platform's separator
# (e.g. "data\\2019-Oct.csv" on Windows), so comparing those strings against
# CSV_PATH is not portable and would trigger a re-extraction on every run.
if not os.path.isfile(CSV_PATH):
    # Bound to `archive` rather than `zip` so the built-in zip() is not
    # shadowed in the kernel namespace for later cells.
    with ZipFile("data/archive.zip", 'r') as archive:
        # List the archive contents for reference.
        archive.printdir()

        # Extract only the target CSV into the data directory.
        print('Extracting target file now...')
        archive.extract("2019-Oct.csv", "data")
        print('Done!')
else:
    print("File already downloaded.")

File already downloaded.


In [4]:
# Convert the CSV to a Feather file once; the data is later loaded from
# FEATHER_PATH. os.path.isfile replaces the glob.glob("data/*") membership
# test, whose result strings depend on the platform's path separator.
if not os.path.isfile(FEATHER_PATH):
    temp_data = pd.read_csv(CSV_PATH)
    # write_feather takes the destination path directly, so no manual
    # open(..., 'wb') handle is needed.
    feather.write_feather(temp_data, FEATHER_PATH)
    # Drop the reference so this frame does not stay alive in the kernel.
    del temp_data
else:
    print("File already converted.")

File already converted.


In [5]:
%%time
# Load the Feather file into raw_data; the %%time cell magic reports how long the load takes.
raw_data = feather.read_feather(FEATHER_PATH)

CPU times: user 12 s, sys: 3.81 s, total: 15.8 s
Wall time: 23.3 s


In [6]:
# Dimensions of the loaded data as (rows, columns).
raw_data.shape

(42448764, 9)

In [7]:
# Preview the first rows to see the available columns and example values.
raw_data.head()

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
0,2019-10-01 00:00:00 UTC,view,44600062,2103807459595387724,,shiseido,35.79,541312140,72d76fde-8bb3-4e00-8c23-a032dfed738c
1,2019-10-01 00:00:00 UTC,view,3900821,2053013552326770905,appliances.environment.water_heater,aqua,33.2,554748717,9333dfbd-b87a-4708-9857-6336556b0fcc
2,2019-10-01 00:00:01 UTC,view,17200506,2053013559792632471,furniture.living_room.sofa,,543.1,519107250,566511c2-e2e3-422b-b695-cf8e6e792ca8
3,2019-10-01 00:00:01 UTC,view,1307067,2053013558920217191,computers.notebook,lenovo,251.74,550050854,7c90fc70-0e80-4590-96f3-13c02c18c713
4,2019-10-01 00:00:04 UTC,view,1004237,2053013555631882655,electronics.smartphone,apple,1081.98,535871217,c6bd7419-2748-4c56-95b4-8cec9ff8b80d


### Types of events

In [8]:
# List the distinct values that occur in the event_type column.
raw_data['event_type'].unique()

array(['view', 'purchase', 'cart'], dtype=object)

In [9]:
# Choose an example shopper: the user_id found on the purchase row at
# position 1000 among all purchase events.
sample_user_id = (
    raw_data
    .loc[raw_data['event_type'].eq('purchase')]
    .iloc[1000]
    .loc['user_id']
)
sample_user_id

547107643

In [10]:
# First ten events recorded for the sampled user.
raw_data.loc[raw_data['user_id'].eq(sample_user_id)].head(10)

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
62114,2019-10-01 03:47:10 UTC,view,36200039,2071489994601529806,,,5.02,547107643,b63b37e9-cc00-4641-971d-e20ce8a544b7
63801,2019-10-01 03:49:11 UTC,purchase,36200039,2071489994601529806,,,5.02,547107643,b63b37e9-cc00-4641-971d-e20ce8a544b7
12979539,2019-10-11 04:18:20 UTC,view,1004858,2053013555631882655,electronics.smartphone,samsung,131.64,547107643,e154fb82-dc7d-47f4-bf9b-94ca8b779f9f
12983352,2019-10-11 04:21:57 UTC,purchase,1004858,2053013555631882655,electronics.smartphone,samsung,131.64,547107643,e154fb82-dc7d-47f4-bf9b-94ca8b779f9f
13810289,2019-10-11 14:32:12 UTC,view,3700926,2053013565983425517,appliances.environment.vacuum,samsung,64.02,547107643,99d9a1dd-858a-46b1-9f58-9097cbec2127
13812803,2019-10-11 14:33:42 UTC,purchase,3700926,2053013565983425517,appliances.environment.vacuum,samsung,64.02,547107643,99d9a1dd-858a-46b1-9f58-9097cbec2127
13813571,2019-10-11 14:34:09 UTC,view,3700926,2053013565983425517,appliances.environment.vacuum,samsung,64.02,547107643,99d9a1dd-858a-46b1-9f58-9097cbec2127
13814048,2019-10-11 14:34:26 UTC,view,3700926,2053013565983425517,appliances.environment.vacuum,samsung,64.02,547107643,99d9a1dd-858a-46b1-9f58-9097cbec2127
13814170,2019-10-11 14:34:31 UTC,cart,3700926,2053013565983425517,appliances.environment.vacuum,samsung,64.02,547107643,99d9a1dd-858a-46b1-9f58-9097cbec2127
13815160,2019-10-11 14:35:05 UTC,purchase,3700926,2053013565983425517,appliances.environment.vacuum,samsung,64.02,547107643,99d9a1dd-858a-46b1-9f58-9097cbec2127
