In [33]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [34]:
# The item-properties table is shipped as two CSV parts; read each part and
# stack them row-wise into a single frame with a fresh 0..n-1 index.
df_item_properties_part1 = pd.read_csv('item_properties_part1.1.csv')
df_item_properties_part2 = pd.read_csv('item_properties_part2.csv')
df_item_properties = pd.concat(
    (df_item_properties_part1, df_item_properties_part2),
    ignore_index=True,
)
df_item_properties.head()

Unnamed: 0,timestamp,itemid,property,value
0,1435460000000.0,460429,categoryid,1338
1,1441510000000.0,206783,888,1116713 960601 n277.200
2,1439090000000.0,395014,400,n552.000 639502 n720.000 424566
3,1431230000000.0,59481,790,n15360.000
4,1431830000000.0,156781,917,828513


In [35]:
# Read the visitor behaviour log: one row per (timestamp, visitorid, event, itemid),
# with transactionid populated only on some rows.
df_events = pd.read_csv('events.csv')
df_events.head()

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,1433220000000.0,257597,view,355908,
1,1433220000000.0,992329,view,248676,
2,1433220000000.0,111016,view,318965,
3,1433220000000.0,483717,view,253185,
4,1433220000000.0,951259,view,367447,


In [36]:
# Read the category hierarchy: each row maps a categoryid to its parentid.
# parentid is float64 because some rows have no parent (NaN) -- presumably the
# root categories; TODO confirm.
df_category_tree = pd.read_csv('category_tree.csv')
df_category_tree.head()

Unnamed: 0,categoryid,parentid
0,1016,213.0
1,809,169.0
2,570,9.0
3,1691,885.0
4,536,1691.0


In [37]:
# (rows, columns) of the concatenated item-properties frame.
df_item_properties.shape

(2097150, 4)

In [38]:
# (rows, columns) of the events frame.
df_events.shape

(1048575, 5)

In [39]:
# (rows, columns) of the category-tree frame.
df_category_tree.shape

(1669, 2)

In [40]:
# Column dtypes and memory footprint of the item-properties frame.
# Note that `property` and `value` are both stored as generic object columns.
df_item_properties.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2097150 entries, 0 to 2097149
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   timestamp  float64
 1   itemid     int64  
 2   property   object 
 3   value      object 
dtypes: float64(1), int64(1), object(2)
memory usage: 64.0+ MB


In [41]:
# Column dtypes and non-null counts of the events frame
# (transactionid is the only column with missing values).
df_events.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 5 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   timestamp      1048575 non-null  float64
 1   visitorid      1048575 non-null  int64  
 2   event          1048575 non-null  object 
 3   itemid         1048575 non-null  int64  
 4   transactionid  8654 non-null     float64
dtypes: float64(2), int64(2), object(1)
memory usage: 40.0+ MB


In [42]:
# Column dtypes and non-null counts of the category-tree frame
# (parentid is missing for a small number of categories).
df_category_tree.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1669 entries, 0 to 1668
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   categoryid  1669 non-null   int64  
 1   parentid    1644 non-null   float64
dtypes: float64(1), int64(1)
memory usage: 26.2 KB


### Merge the 3 datasets

In [43]:
# Merging df_events with df_item_properties

In [50]:
# Attach item properties to every event via a left join on itemid, so events
# whose item has no property rows are still kept.
# The properties table is in long format (one row per item/property/timestamp),
# so each event is repeated once per matching property row. Both inputs carry a
# `timestamp` column, which pandas disambiguates as timestamp_x (event) and
# timestamp_y (property).
df_events_props = df_events.merge(df_item_properties, how='left', on='itemid')

df_events_props.head()

Unnamed: 0,timestamp_x,visitorid,event,itemid,transactionid,timestamp_y,property,value
0,1439920000000.0,370720,view,3,,1431230000000.0,available,0
1,1439920000000.0,370720,view,3,,1435460000000.0,159,519769
2,1435650000000.0,1042455,view,4,,1431830000000.0,available,0
3,1435650000000.0,1042455,view,4,,1432440000000.0,888,371058 71429
4,1435650000000.0,1042455,view,4,,1433040000000.0,888,371058 71429


In [51]:
# Merging the category tree with the first merge

In [52]:
# Build an itemid -> categoryid lookup from the long-format properties table.
#
# Property rows are snapshots over time, so the same item can carry the same
# property on several timestamps. Keeping all of them would make the later
# left join on itemid multiply every event row by the number of category
# snapshots. We therefore keep only the most recent 'categoryid' row per item.
# NOTE(review): "latest snapshot wins" is an assumption -- switch to a
# time-aware join if the category at event time is what you need.
df_categories = (
    df_item_properties
    .loc[
        df_item_properties['property'] == 'categoryid',
        ['timestamp', 'itemid', 'value'],
    ]
    # Stable sort so that rows with equal timestamps keep their file order.
    .sort_values('timestamp', kind='mergesort')
    .drop_duplicates(subset='itemid', keep='last')
    .loc[:, ['itemid', 'value']]
    .rename(columns={'value': 'categoryid'})
)

In [53]:
# `value` is read as text, so cast the category ids to numbers before joining
# against the category tree; anything unparseable becomes NaN rather than raising.
df_categories = df_categories.assign(
    categoryid=pd.to_numeric(df_categories['categoryid'], errors='coerce')
)

In [54]:
# Add each item's categoryid to the events+properties frame. A left join keeps
# rows whose item has no known category (categoryid is NaN for those).
category_lookup = df_categories[['itemid', 'categoryid']]
df_events_props_categories = df_events_props.merge(
    category_lookup, how='left', on='itemid'
)

In [55]:
# Bring in the parent category for every row by joining on categoryid.
# Left join again: rows without a category, or whose category is absent from
# the tree, keep NaN in parentid.
df_events_props_tree = df_events_props_categories.merge(
    df_category_tree, how='left', on='categoryid'
)

df_events_props_tree.head()

Unnamed: 0,timestamp_x,visitorid,event,itemid,transactionid,timestamp_y,property,value,categoryid,parentid
0,1439920000000.0,370720,view,3,,1431230000000.0,available,0,,
1,1439920000000.0,370720,view,3,,1435460000000.0,159,519769,,
2,1435650000000.0,1042455,view,4,,1431830000000.0,available,0,,
3,1435650000000.0,1042455,view,4,,1432440000000.0,888,371058 71429,,
4,1435650000000.0,1042455,view,4,,1433040000000.0,888,371058 71429,,


In [None]:
# Bind the fully merged frame to the short name `df`.
# This is an alias, not a copy: `df` and `df_events_props_tree` refer to the
# same object, so in-place changes through one name are visible through the other.
df = df_events_props_tree
df.head()