In [None]:
import os
import re
import numpy as np
import pandas as pd
from pathlib import Path
from collections import defaultdict
import matplotlib.pyplot as plt
%matplotlib inline

# Default size (width, height) for every figure drawn in this notebook.
plt.rcParams.update({"figure.figsize": (10, 8)})

In [None]:
# Location of the YooChoose "buys" file, relative to the notebook's working
# directory. The same path (with a different suffix) is reused for the output.
FNAME = Path('./yoochoose/yoochoose-buys.dat')

In [None]:
# Load the buys file. Column names are supplied explicitly, so no header row
# is read from the file; `timestamp` is parsed into datetimes while loading.
# Tip: add nrows=100 to the call for a quick partial load while experimenting.
df = pd.read_csv(
    FNAME,
    names=['session_id', 'timestamp', 'item_id', 'price', 'quantity'],
    sep=',',
    parse_dates=['timestamp'],
)

df.head()

In [None]:
# Total number of rows loaded, shown with thousands separators.
format(len(df), ',d')

In [None]:
# Summary statistics of the loaded frame (pandas' default describe() output).
df.describe()

In [None]:
# Quick look at the parsed timestamp column (values and dtype).
df['timestamp']

In [None]:
# Time range covered by the data: (earliest, latest) timestamp.
(df['timestamp'].min(), df['timestamp'].max())

### Count distinct values

In [None]:
# Number of rows per session, most frequent sessions first.
df['session_id'].value_counts()

In [None]:
# Number of distinct items in the raw data.
df['item_id'].nunique()

# Filtering

We keep the purchase sequences, which gives a moderately sized dataset.
Items with fewer than 5 interactions are removed to avoid the cold-start
issue. Sequences shorter than 3 are also removed. Each filter is applied
once, in this order.

In [None]:
# Work only with the session, time and item columns from here on;
# price and quantity stay behind in `df`.
filtered_df = df.loc[:, ['session_id', 'timestamp', 'item_id']]

In [None]:
# Count how many rows each item has, then keep only the rows whose item
# appears at least 5 times.
rows_per_item = filtered_df.groupby('item_id')['item_id'].transform('size')
mask = rows_per_item.ge(5)

filtered_df = filtered_df.loc[mask]

len(filtered_df)

In [None]:
# Count how many rows each session has, then keep only the rows that belong
# to sessions with at least 3 rows.
rows_per_session = filtered_df.groupby('session_id')['session_id'].transform('size')
mask = rows_per_session.ge(3)

filtered_df = filtered_df.loc[mask]

len(filtered_df)

In [None]:
# Distinct sessions (m) and distinct items (n) left after filtering.
# Each session plays the role of a "user" in the summary string below.
m = filtered_df['session_id'].nunique()
n = filtered_df['item_id'].nunique()

f'users = {m}, items = {n}'

# Stats per session

In [None]:
# Per-session summary: number of rows (i.e. the sequence length, whose
# maximum is of interest) plus the first and last timestamp of the session.
by_session = filtered_df.groupby('session_id')
stats_session = by_session.agg(**{
    'NumRows': ('session_id', 'size'),
    'MinTimestamp': ('timestamp', 'min'),
    'MaxTimestamp': ('timestamp', 'max'),
}).reset_index()

len(stats_session)

In [None]:
# Time elapsed between the first and the last row of each session.
session_span = stats_session['MaxTimestamp'] - stats_session['MinTimestamp']
stats_session['delta_timestamp'] = session_span

# Longest session duration.
session_span.max()

In [None]:
# Session lengths (rows per session) sorted in ascending order, so the curve
# shows how quickly the lengths grow towards the longest sessions.
sessions_length = stats_session.NumRows.sort_values().to_numpy()

# Explicit figure/axes with a title and axis labels, so the plot can be read
# on its own when the notebook is skimmed.
fig, ax = plt.subplots()
ax.plot(sessions_length)
ax.set(
    title='Session lengths (sorted)',
    xlabel='Session rank (ascending by length)',
    ylabel='Rows per session',
);

# Stats per item

In [None]:

# Per-item summary: number of rows the item appears in, plus the first and
# last timestamp at which it appears.
by_item = filtered_df.groupby('item_id')
stats_item = by_item.agg(**{
    'NumRows': ('item_id', 'size'),
    'MinTimestamp': ('timestamp', 'min'),
    'MaxTimestamp': ('timestamp', 'max'),
}).reset_index()

len(stats_item)

In [None]:
# Time elapsed between the first and the last row of each item.
item_span = stats_item['MaxTimestamp'] - stats_item['MinTimestamp']
stats_item['delta_timestamp'] = item_span

# Longest span between an item's first and last appearance.
item_span.max()

In [None]:
# Rows per item sorted in ascending order, so the curve shows how the
# interaction counts are spread from the rarest to the most frequent item.
items_count = stats_item.NumRows.sort_values().to_numpy()

# Explicit figure/axes with a title and axis labels, so the plot can be read
# on its own when the notebook is skimmed.
fig, ax = plt.subplots()
ax.plot(items_count)
ax.set(
    title='Item interaction counts (sorted)',
    xlabel='Item rank (ascending by count)',
    ylabel='Rows per item',
);

# Save to file

In [None]:
# Preview the output path: same directory and stem as the input file,
# with the extension replaced by .parquet.
FNAME.with_suffix('.parquet')

In [None]:
# Rename to generic column names and re-encode the raw ids as dense, 1-based
# integers. pd.factorize numbers values in order of first appearance, so the
# first session/item encountered gets 1, the next new one gets 2, and so on.
# The whole step is one chained expression built from `filtered_df`, so the
# cell can be re-run safely and leaves no temporary lookup tables behind.
# NOTE: a missing id would be coded 0 (factorize's -1 sentinel plus 1).
filtered_df2 = (
    filtered_df
    .rename(columns={'session_id': 'User', 'item_id': 'Item', 'timestamp': 'Timestamp'})
    .assign(
        User=lambda frame: pd.factorize(frame['User'])[0] + 1,
        Item=lambda frame: pd.factorize(frame['Item'])[0] + 1,
    )
)

filtered_df2

In [None]:
# Sanity check on the remapped ids: for each column, print the number of
# distinct values together with the smallest and largest id.
for colName in ('User', 'Item'):
    column = filtered_df2[colName]
    n_unique, a, b = column.nunique(), column.min(), column.max()
    print(f'{colName} -> unique {n_unique}, min {a}, max {b}')

In [None]:
# Write the remapped frame next to the input file, with a .parquet extension.
# NOTE(review): the frame was row-filtered without resetting its index, so it
# presumably still carries the original row labels; to_parquet stores the
# index by default. Confirm whether index=False is wanted here.
filtered_df2.to_parquet(FNAME.with_suffix('.parquet'))