In [1]:
import gc
import os

import numpy as np
import pandas as pd

import nvtabular as nvt
from nvtabular.ops import *
from merlin.schema.tags import Tags
from transformers4rec.utils.data_utils import save_time_based_splits

  warn(f"Triton dtype mappings did not load successfully due to an error: {exc.msg}")


### Define Input/Output Path

In [2]:
# Folder that holds the input `all.parquet` file and, by default, the artifacts this
# notebook writes. It can be overridden through the INPUT_DATA_DIR environment
# variable, the same way OUTPUT_DIR is configurable further down.
INPUT_DATA_DIR = os.environ.get("INPUT_DATA_DIR", "../../../data/ebnerd_demo_modified/")

In [3]:
# Read the input parquet file into a pandas DataFrame. os.path.join is used (as in the
# rest of the notebook) so a trailing "/" in INPUT_DATA_DIR does not yield a doubled slash.
df = pd.read_parquet(os.path.join(INPUT_DATA_DIR, "all.parquet"))

In [26]:
# Interpret impression_ts as seconds since the Unix epoch, convert it to a timestamp and
# floor it so that it has day granularity.
# "D" is the canonical day alias; the lowercase "d" spelling is deprecated in newer pandas.
df["impression_ts"] = pd.to_datetime(df["impression_ts"], unit="s").dt.floor("D")

In [27]:
# Maximum number of interactions kept per sequence (the tail of each list is kept)
SESSIONS_MAX_LENGTH = 10
# Sequences with a single interaction are not valid for next-item prediction
# training and evaluation, so at least two are required
MINIMUM_SESSION_LENGTH = 2

# Input column groups, declared once and reused by every step below
ITEM_ID_COL = "article_id"
CATEGORICAL_COLS = ["article_is_premium", "article_type", "article_category"]
CONTINUOUS_COLS = ["article_read_time", "article_total_read_time", "article_sentiment", "article_ctr"]
EMBEDDING_COLS = ["article_emb"]


def _list_cols(columns):
    """Return the output names of the "list" aggregation for `columns` (name_sep is "-")."""
    return [f"{column}-list" for column in columns]


ITEM_ID_LIST_COL = f"{ITEM_ID_COL}-list"
CATEGORICAL_LIST_COLS = _list_cols(CATEGORICAL_COLS)
CONTINUOUS_LIST_COLS = _list_cols(CONTINUOUS_COLS)
EMBEDDING_LIST_COLS = _list_cols(EMBEDDING_COLS)

# Categorify categorical features (the item id included)
categ_feats = ([ITEM_ID_COL] + CATEGORICAL_COLS) >> nvt.ops.Categorify()

# Define Groupby Workflow
groupby_feats = categ_feats + (["user_id", "impression_ts"] + CONTINUOUS_COLS + EMBEDDING_COLS)

# Every feature column is aggregated into a list; the item id is additionally counted
# and the first impression_ts of each group is kept.
groupby_aggs = {ITEM_ID_COL: ["list", "count"]}
groupby_aggs.update({column: ["list"] for column in CATEGORICAL_COLS + CONTINUOUS_COLS + EMBEDDING_COLS})
groupby_aggs["impression_ts"] = ["first"]

# Group interaction features by user (the grouping key is user_id, so one sequence per user).
# NOTE(review): no sort_cols is passed, so the order inside each list presumably follows
# the row order of the input; ListSlice below keeps the tail of that order. Confirm the
# input rows are already in chronological order.
groupby_features = groupby_feats >> nvt.ops.Groupby(
    groupby_cols=["user_id"],
    aggs=groupby_aggs,
    name_sep="-",
)

# Truncate every sequence to its last SESSIONS_MAX_LENGTH elements and tag it by feature type
sequence_features_truncated_item = (
    groupby_features[ITEM_ID_LIST_COL]
    >> nvt.ops.ListSlice(-SESSIONS_MAX_LENGTH)
    >> nvt.ops.TagAsItemID()
)

sequence_features_truncated_cat = (
    groupby_features[CATEGORICAL_LIST_COLS]
    >> nvt.ops.ListSlice(-SESSIONS_MAX_LENGTH)
    >> nvt.ops.AddMetadata(tags=[Tags.CATEGORICAL])
)
sequence_features_truncated_cont = (
    groupby_features[CONTINUOUS_LIST_COLS]
    >> nvt.ops.ListSlice(-SESSIONS_MAX_LENGTH)
    >> nvt.ops.AddMetadata(tags=[Tags.CONTINUOUS])
)
sequence_features_truncated_emb = (
    groupby_features[EMBEDDING_LIST_COLS]
    >> nvt.ops.ListSlice(-SESSIONS_MAX_LENGTH)
    >> nvt.ops.AddMetadata(tags=[Tags.EMBEDDING])
)

selected_features = (
    groupby_features['article_id-count', 'impression_ts-first', 'user_id'] +
    sequence_features_truncated_item +
    sequence_features_truncated_cat +
    sequence_features_truncated_cont +
    sequence_features_truncated_emb
)

# Drop the sequences that are shorter than MINIMUM_SESSION_LENGTH.
# The lambda parameter is deliberately not called `df`, to avoid shadowing the input frame.
filtered_sessions = selected_features >> nvt.ops.Filter(
    f=lambda sessions: sessions["article_id-count"] >= MINIMUM_SESSION_LENGTH
)

# Record value-count statistics for every list column
seq_feats_list = (
    filtered_sessions[[ITEM_ID_LIST_COL] + CATEGORICAL_LIST_COLS + CONTINUOUS_LIST_COLS + EMBEDDING_LIST_COLS]
    >> nvt.ops.ValueCount()
)

workflow = nvt.Workflow(filtered_sessions['user_id', 'impression_ts-first'] + seq_feats_list)

dataset = nvt.Dataset(df)

# Generate statistics for the features and export parquet files
# this step will generate the schema file
workflow.fit_transform(dataset).to_parquet(os.path.join(INPUT_DATA_DIR, "processed_nvt"))



It is possible to save the preprocessing workflow. This is useful for applying the same preprocessing to other data (with the same schema) and for deploying the session-based recommendation pipeline to Triton Inference Server.

In [28]:
# Show the schema of the workflow's output columns as the cell result
workflow.output_schema

Unnamed: 0,name,tags,dtype,is_list,is_ragged,properties.num_buckets,properties.freq_threshold,properties.max_size,properties.cat_path,properties.domain.min,properties.domain.max,properties.domain.name,properties.embedding_sizes.cardinality,properties.embedding_sizes.dimension,properties.value_count.min,properties.value_count.max
0,user_id,(),"DType(name='uint32', element_type=<ElementType...",False,False,,,,,,,,,,,
1,impression_ts-first,(),"DType(name='datetime64[ns]', element_type=<Ele...",False,False,,,,,,,,,,,
2,article_id-list,"(Tags.ITEM, Tags.CATEGORICAL, Tags.LIST, Tags.ID)","DType(name='int64', element_type=<ElementType....",True,True,,0.0,0.0,.//categories/unique.article_id.parquet,0.0,5835.0,article_id,5836.0,206.0,2.0,10.0
3,article_is_premium-list,"(Tags.CATEGORICAL, Tags.LIST)","DType(name='int64', element_type=<ElementType....",True,True,,0.0,0.0,.//categories/unique.article_is_premium.parquet,0.0,4.0,article_is_premium,5.0,16.0,2.0,10.0
4,article_type-list,"(Tags.CATEGORICAL, Tags.LIST)","DType(name='int64', element_type=<ElementType....",True,True,,0.0,0.0,.//categories/unique.article_type.parquet,0.0,13.0,article_type,14.0,16.0,2.0,10.0
5,article_category-list,"(Tags.CATEGORICAL, Tags.LIST)","DType(name='int64', element_type=<ElementType....",True,True,,0.0,0.0,.//categories/unique.article_category.parquet,0.0,23.0,article_category,24.0,16.0,2.0,10.0
6,article_read_time-list,"(Tags.CONTINUOUS, Tags.LIST)","DType(name='float64', element_type=<ElementTyp...",True,True,,,,,,,,,,2.0,10.0
7,article_total_read_time-list,"(Tags.CONTINUOUS, Tags.LIST)","DType(name='float32', element_type=<ElementTyp...",True,True,,,,,,,,,,2.0,10.0
8,article_sentiment-list,"(Tags.CONTINUOUS, Tags.LIST)","DType(name='float32', element_type=<ElementTyp...",True,True,,,,,,,,,,2.0,10.0
9,article_ctr-list,"(Tags.CONTINUOUS, Tags.LIST)","DType(name='float64', element_type=<ElementTyp...",True,True,,,,,,,,,,2.0,10.0


Save NVTabular workflow.

In [29]:
# Persist the workflow in the "workflow_etl" folder inside the input data directory
workflow_dir = os.path.join(INPUT_DATA_DIR, "workflow_etl")
workflow.save(workflow_dir)

## Export pre-processed data by day

In this example we are going to split the preprocessed parquet files by days, to allow for temporal training and evaluation. There will be a folder for each day and three parquet files within each day folder: `train.parquet`, `validation.parquet` and `test.parquet`.

In [3]:
# Destination of the exported splits: the OUTPUT_DIR environment variable when it is set,
# otherwise the "sessions_by_ts" folder inside the input data directory.
OUTPUT_DIR = os.environ.get(
    "OUTPUT_DIR",
    os.path.join(INPUT_DATA_DIR, "sessions_by_ts"),
)

In [4]:
# Load part_0.parquet from the "processed_nvt" folder into a pandas DataFrame.
# NOTE(review): only part_0 is read; if the export ever produces more part files they
# are ignored here — confirm a single part is expected.
processed_part_path = os.path.join(INPUT_DATA_DIR, "processed_nvt/part_0.parquet")
sessions_gdf = pd.read_parquet(processed_part_path)

In [5]:
# Preview the first three rows. Ending the cell with the expression (instead of wrapping
# it in print) lets Jupyter render the DataFrame as a table.
sessions_gdf.head(3)

   user_id impression_ts-first  \
0    11313          2023-04-27   
1    13538          2023-04-27   
2    15430          2023-05-02   

                                     article_id-list  \
0  [2177, 220, 708, 642, 1111, 384, 1485, 1083, 1...   
1  [1448, 1914, 577, 114, 329, 679, 935, 303, 713...   
2   [1469, 64, 2068, 64, 2068, 793, 1693, 2172, 514]   

          article_is_premium-list               article_type-list  \
0  [3, 3, 3, 3, 3, 3, 3, 3, 3, 3]  [3, 3, 3, 3, 3, 3, 3, 3, 3, 3]   
1  [3, 4, 3, 3, 3, 4, 3, 3, 3, 3]  [3, 3, 3, 3, 3, 3, 3, 3, 3, 3]   
2     [3, 3, 3, 3, 3, 3, 3, 4, 4]     [3, 3, 3, 3, 3, 3, 3, 3, 3]   

            article_category-list  \
0  [6, 3, 7, 8, 5, 8, 8, 3, 3, 3]   
1  [6, 3, 6, 6, 3, 3, 9, 5, 5, 4]   
2     [3, 3, 3, 3, 3, 3, 3, 6, 6]   

                              article_read_time-list  \
0  [8.0, 25.0, 36.0, 18.0, 16.0, 15.0, 15.0, 526....   
1  [9.0, 109.0, 9.0, 9.0, 16.0, 14.0, 26.0, 26.0,...   
2  [101.0, 32.0, 70.0, 12.0, 136.0, 75.0, 16

In [6]:
# Split the preprocessed sequences on 'impression_ts-first' and write the result under
# OUTPUT_DIR (one folder per day, per the section introduction).
# NOTE(review): timestamp_col is set to 'user_id', which is not a timestamp — presumably
# it only orders the rows inside each partition; confirm against the
# save_time_based_splits documentation.
# NOTE(review): the output recorded for this cell is
# "ValueError: object too deep for desired array". Its cause is not visible in this
# notebook, so the call must be re-verified before the cells below can run.
save_time_based_splits(
    data=nvt.Dataset(sessions_gdf),
    output_dir=OUTPUT_DIR,
    partition_col='impression_ts-first',
    timestamp_col='user_id',
    cpu=True,
)



ValueError: object too deep for desired array

## Check out the preprocessed outputs

In [14]:
# Path of the training file inside the split folder named "1".
# NOTE(review): the splits are partitioned on 'impression_ts-first', whose values are
# dates, so a folder literally named "1" probably does not exist — verify the folder
# names actually created under OUTPUT_DIR.
TRAIN_PATHS = os.path.join(OUTPUT_DIR, "1", "train.parquet")

In [15]:
# Load the training split and preview its first rows.
# Note: this rebinds `df`, which earlier in the notebook held the raw input frame.
df = pd.read_parquet(TRAIN_PATHS)
df.head()

Unnamed: 0,session_id,item_id-list,category-list,age_days-list,weekday_sin-list
0,70000,"[306, 5, 40, 17]","[104, 3, 12, 6]","[0.044022594, 0.34956282, 0.7326993, 0.09403495]","[0.7417527, 0.60325843, 0.07417604, 0.28911334]"
1,70001,"[43, 20, 69, 8, 57]","[13, 6, 21, 3, 16]","[0.8072543, 0.28916782, 0.04966254, 0.08417622...","[0.7995051, 0.86722755, 0.84298295, 0.15793765..."
2,70002,"[137, 35, 37, 85, 65, 5]","[37, 10, 11, 22, 18, 3]","[0.04696693, 0.94499177, 0.2922437, 0.83047426...","[0.72519076, 0.92308444, 0.40120387, 0.3821016..."
4,70007,"[28, 9, 153, 74, 53, 15, 173]","[9, 4, 39, 20, 15, 5, 46]","[0.4730765, 0.69885534, 0.034774363, 0.7225920...","[0.33613566, 0.660022, 0.72897774, 0.66087157,..."
5,70021,"[59, 32, 11, 21, 23, 23, 9, 15]","[17, 10, 7, 7, 8, 8, 4, 5]","[0.07898139, 0.27463168, 0.1885847, 0.5203435,...","[0.39734098, 0.74895114, 0.43540764, 0.8372503..."


In [16]:
# Drop the preview DataFrame and run a garbage-collection pass to release its memory
del df
gc.collect()

512

You have just created session-level features to train a session-based recommendation model using NVTabular. Now you can move to the next notebook, `02-session-based-XLNet-with-PyT.ipynb`, to train a session-based recommendation model using [XLNet](https://arxiv.org/abs/1906.08237), one of the state-of-the-art NLP models. Please shut down this kernel to free the GPU memory before you start the next one.