## libs 

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
# from math import sqrt
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

#### load files

In [2]:


# Read the four raw CSV tables from the local archive/ directory.
# The paths are listed in the same order as the names they are unpacked into.
events, item_properties1, item_properties2, category_tree = (
    pd.read_csv(csv_path)
    for csv_path in (
        'archive/events.csv',
        'archive/item_properties_part1.csv',
        'archive/item_properties_part2.csv',
        'archive/category_tree.csv',
    )
)


### light EDA

In [3]:
# Quick look at the events table: sample rows, schema, event mix, missing values.
print(events.head())
print("--------------------------------------")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
events.info()
print("--------------------------------------")
# Number of rows per event type.
print(events['event'].value_counts())
print("--------------------------------------")
# Number of missing values per column.
print(events.isnull().sum())


       timestamp  visitorid event  itemid  transactionid
0  1433221332117     257597  view  355908            NaN
1  1433224214164     992329  view  248676            NaN
2  1433221999827     111016  view  318965            NaN
3  1433221955914     483717  view  253185            NaN
4  1433221337106     951259  view  367447            NaN
--------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2756101 entries, 0 to 2756100
Data columns (total 5 columns):
 #   Column         Dtype  
---  ------         -----  
 0   timestamp      int64  
 1   visitorid      int64  
 2   event          object 
 3   itemid         int64  
 4   transactionid  float64
dtypes: float64(1), int64(3), object(1)
memory usage: 105.1+ MB
None
--------------------------------------
event
view           2664312
addtocart        69332
transaction      22457
Name: count, dtype: int64
--------------------------------------
timestamp              0
visitorid              0
event        

#### Chuyển timestamp sang datetime

In [4]:
# Add a 'datetime' column to each table by interpreting its integer
# 'timestamp' column as milliseconds since the Unix epoch.
for _frame in (events, item_properties1, item_properties2):
    _frame['datetime'] = pd.to_datetime(_frame['timestamp'], unit='ms')
del _frame  # keep the notebook namespace free of the loop variable


#### Tạo sessions cho user (theo mốc thời gian)

In [5]:
# Sort by visitor and then by time so that consecutive rows of one visitor
# are consecutive events.
events = events.sort_values(['visitorid', 'datetime'])

# Timestamp of the same visitor's previous event (NaT for a visitor's first event).
events['prev_time'] = events.groupby('visitorid')['datetime'].shift(1)

# Gap to the previous event, in minutes (NaN for a visitor's first event).
# total_seconds() covers days, seconds and the sub-second remainder in one
# call; the .dt.seconds attribute only exposes whole seconds, so a gap such as
# 30 min + 500 ms would otherwise be reported as exactly 30.0 minutes.
events['time_diff'] = (events['datetime'] - events['prev_time']).dt.total_seconds() / 60

# Session timeout, in minutes: a longer gap starts a new session.
session_timeout = 30

# 1 on the first event of every session (first event of the visitor, or a gap
# above the timeout), 0 otherwise.
events['new_session'] = ((events['time_diff'] > session_timeout) | events['time_diff'].isnull()).astype(int)

# Running count of session starts within each visitor. The id is therefore
# numbered per visitor (1, 2, ...) and is only unique together with visitorid.
events['session_id'] = events.groupby('visitorid')['new_session'].cumsum()


#### Chia train / test theo timestamp để tránh rò rỉ dữ liệu

In [6]:
# Time-based split: the cutoff is the 0.8 quantile of the event datetimes.
# Events at or before the cutoff go to train, events after it go to test.
cutoff_time = events['datetime'].quantile(0.8)
train = events.loc[events['datetime'].le(cutoff_time)]
test = events.loc[events['datetime'].gt(cutoff_time)]

print("Train shape:", train.shape)
print("Test shape:", test.shape)


Train shape: (2204881, 10)
Test shape: (551220, 10)


#### Đếm thống kê căn bản — conversion funnel

In [7]:
# Number of train rows per event type.
funnel = train['event'].value_counts()
print(funnel)

# Per visitor, the list of event types they produced in the train period.
users = train.groupby('visitorid')['event'].agg(list)

# For each funnel stage, count the visitors with at least one such event.
# `kind=kind` binds the current stage name into the lambda at creation time.
num_view, num_addtocart, num_transaction = (
    users.apply(lambda seen, kind=kind: kind in seen).sum()
    for kind in ('view', 'addtocart', 'transaction')
)
print('Users có view:', num_view)
print('Users có addtocart:', num_addtocart)
print('Users có transaction:', num_transaction)


event
view           2132032
addtocart        54985
transaction      17864
Name: count, dtype: int64
Users có view: 1121251
Users có addtocart: 29891
Users có transaction: 9343


#### Save dữ liệu đã xử lý để dùng cho bước Feature Engineering

In [None]:
# train.to_csv('train_processed.csv', index=False)
# test.to_csv('test_processed.csv', index=False) 