In [21]:
import os

import numpy as np
import pandas as pd

import nvtabular as nvt
from nvtabular.ops import *
from merlin.schema.tags import Tags
import cudf
import cupy
import datetime

### Define Input/Output Path

In [16]:
INPUT_DATA_DIR = "../../../data/ebnerd_demo_modified/"

In [25]:
df = pd.read_parquet(f"{INPUT_DATA_DIR}/all.parquet")

In [28]:
df["impression_ts"] = (datetime.datetime.now() - df['impression_ts']).dt.days

In [29]:
df["impression_ts"] = df["impression_ts"] - df["impression_ts"].min()

In [30]:
df

Unnamed: 0,user_id,impression_ts,article_read_time,article_id,article_is_premium,article_type,article_topics,article_category,article_total_read_time,article_sentiment,article_ctr,article_emb
0,11313,35,32.0,9738292,False,article_default,"[Politik, International politik, Sundhed, Sygd...",nyheder,6637769.0,0.9655,0.213338,"[-0.13421957, 0.036403418, -0.3091001, 0.08155..."
1,11313,35,7.0,9733713,True,article_standard_feature,"[Samfund, Konflikt og krig, Væbnet konflikt]",nyheder,5907158.0,0.8101,0.134374,"[-0.18189247, 0.16488591, -0.23126304, 0.11608..."
2,11313,34,34.0,9740174,False,article_default,"[Erhverv, Privat virksomhed, Økonomi]",nyheder,3139645.0,0.8366,0.160523,"[-0.11109238, -0.045725856, -0.19442482, 0.083..."
3,11313,34,9.0,9740356,False,article_default,"[Kriminalitet, Personfarlig kriminalitet]",krimi,6991537.0,0.9936,0.215200,"[-0.10872016, 0.06527501, -0.26191005, 0.10563..."
4,11313,31,323.0,9730301,False,article_default,"[Videnskab, Naturvidenskab]",nyheder,5516137.0,0.6914,0.172012,"[-0.012549153, 0.089903, -0.1078802, 0.0717587..."
...,...,...,...,...,...,...,...,...,...,...,...,...
379847,2585449,1,216.0,9771579,False,article_default,"[Kendt, Begivenhed, Sport, Sundhed, Sygdom og ...",sport,1227700.0,0.9285,0.119506,"[-0.039634164, 0.11201517, 0.020189472, 0.0090..."
379848,2585449,1,77.0,9789417,False,article_default,"[Kendt, Livsstil, Familieliv, Underholdning, F...",underholdning,5148529.0,0.9210,0.168332,"[-0.070519224, 0.09460071, -0.22725518, 0.0866..."
379849,2585449,1,9.0,9673564,True,article_default,"[Økonomi, Mikro]",forbrug,5696063.0,0.7402,0.040712,"[0.062252264, 0.035441652, -0.0682976, 0.02613..."
379850,2585449,1,62.0,9787846,False,article_default,"[Kriminalitet, Personfarlig kriminalitet]",krimi,2336959.0,0.9938,0.161884,"[-0.016353942, 0.08321255, -0.13511518, 0.1513..."


In [31]:
SESSIONS_MAX_LENGTH = 10

# Categorify categorical features
categ_feats = ['article_id', 'article_is_premium', 'article_type', 'article_category'] >> nvt.ops.Categorify()

# Define Groupby Workflow
groupby_feats = categ_feats + ['user_id', 'impression_ts', 'article_read_time', 'article_total_read_time', 'article_sentiment', 'article_ctr', 'article_emb']

# Group interaction features by session
groupby_features = groupby_feats >> nvt.ops.Groupby(
    groupby_cols=["user_id"],
    aggs={
        "article_id": ["list", "count"],
        "article_is_premium": ["list"],
        "article_type": ["list"],
        "article_category": ["list"],
        "article_read_time": ["list"],
        "article_total_read_time": ["list"],
        "article_sentiment": ["list"],
        "article_ctr": ["list"],
        #"article_emb": ["list"],
        "impression_ts": ["first"],
        },
    name_sep="-")


sequence_features_truncated_item = (
    groupby_features['article_id-list']
    >> nvt.ops.ListSlice(-SESSIONS_MAX_LENGTH)
    >> TagAsItemID()
)

sequence_features_truncated_cat = (
    groupby_features['article_is_premium-list', 'article_type-list', 'article_category-list']
    >> nvt.ops.ListSlice(-SESSIONS_MAX_LENGTH)
    >> nvt.ops.AddMetadata(tags=[Tags.CATEGORICAL])
)
sequence_features_truncated_cont = (
    groupby_features['article_read_time-list', 'article_total_read_time-list', 'article_sentiment-list', 'article_ctr-list']
    >> nvt.ops.ListSlice(-SESSIONS_MAX_LENGTH)
    >> nvt.ops.AddMetadata(tags=[Tags.CONTINUOUS])
)
#sequence_features_truncated_emb = (
  #  groupby_features['article_emb-list']
  #  >> nvt.ops.ListSlice(-SESSIONS_MAX_LENGTH)
 #   >> nvt.ops.AddMetadata(tags=[Tags.EMBEDDING])
#)


# Filter out sessions with length 1 (not valid for next-item prediction training and evaluation)
MINIMUM_SESSION_LENGTH = 2
selected_features = (
    groupby_features['article_id-count', 'impression_ts-first', 'user_id'] +
    sequence_features_truncated_item +
    sequence_features_truncated_cat +
    sequence_features_truncated_cont 
    #sequence_features_truncated_emb
)

filtered_sessions = selected_features >> nvt.ops.Filter(f=lambda df: df["article_id-count"] >= MINIMUM_SESSION_LENGTH)

seq_feats_list = filtered_sessions['article_id-list', 'article_is_premium-list', 'article_type-list', 'article_category-list', 'article_read_time-list', 'article_total_read_time-list', 'article_sentiment-list', 'article_ctr-list'] >>  nvt.ops.ValueCount()

workflow = nvt.Workflow(filtered_sessions['user_id', 'impression_ts-first'] + seq_feats_list)

dataset = nvt.Dataset(df)

# Generate statistics for the features and export parquet files
# this step will generate the schema file
workflow.fit_transform(dataset).to_parquet(os.path.join(INPUT_DATA_DIR, "processed_nvt"))

  df = _general_concat(dfs, ignore_index=True)
  df = _general_concat(dfs, ignore_index=True)
  df = _general_concat(dfs, ignore_index=True)
  df = _general_concat(dfs, ignore_index=True)
  df = _general_concat(dfs, ignore_index=True)
  df = _general_concat(dfs, ignore_index=True)
  df = _general_concat(dfs, ignore_index=True)
  df = _general_concat(dfs, ignore_index=True)


It is possible to save the preprocessing workflow. That is useful to apply the same preprocessing to other data (with the same schema) and also to deploy the session-based recommendation pipeline to Triton Inference Server.

In [32]:
workflow.output_schema

Unnamed: 0,name,tags,dtype,is_list,is_ragged,properties.num_buckets,properties.freq_threshold,properties.max_size,properties.cat_path,properties.domain.min,properties.domain.max,properties.domain.name,properties.embedding_sizes.cardinality,properties.embedding_sizes.dimension,properties.value_count.min,properties.value_count.max
0,user_id,(),"DType(name='uint32', element_type=<ElementType...",False,False,,,,,,,,,,,
1,impression_ts-first,(),"DType(name='int64', element_type=<ElementType....",False,False,,,,,,,,,,,
2,article_id-list,"(Tags.CATEGORICAL, Tags.ITEM, Tags.ID, Tags.LIST)","DType(name='int64', element_type=<ElementType....",True,True,,0.0,0.0,.//categories/unique.article_id.parquet,0.0,5835.0,article_id,5836.0,206.0,2.0,10.0
3,article_is_premium-list,"(Tags.CATEGORICAL, Tags.LIST)","DType(name='int64', element_type=<ElementType....",True,True,,0.0,0.0,.//categories/unique.article_is_premium.parquet,0.0,4.0,article_is_premium,5.0,16.0,2.0,10.0
4,article_type-list,"(Tags.CATEGORICAL, Tags.LIST)","DType(name='int64', element_type=<ElementType....",True,True,,0.0,0.0,.//categories/unique.article_type.parquet,0.0,13.0,article_type,14.0,16.0,2.0,10.0
5,article_category-list,"(Tags.CATEGORICAL, Tags.LIST)","DType(name='int64', element_type=<ElementType....",True,True,,0.0,0.0,.//categories/unique.article_category.parquet,0.0,23.0,article_category,24.0,16.0,2.0,10.0
6,article_read_time-list,"(Tags.CONTINUOUS, Tags.LIST)","DType(name='float64', element_type=<ElementTyp...",True,True,,,,,,,,,,2.0,10.0
7,article_total_read_time-list,"(Tags.CONTINUOUS, Tags.LIST)","DType(name='float32', element_type=<ElementTyp...",True,True,,,,,,,,,,2.0,10.0
8,article_sentiment-list,"(Tags.CONTINUOUS, Tags.LIST)","DType(name='float32', element_type=<ElementTyp...",True,True,,,,,,,,,,2.0,10.0
9,article_ctr-list,"(Tags.CONTINUOUS, Tags.LIST)","DType(name='float64', element_type=<ElementTyp...",True,True,,,,,,,,,,2.0,10.0


Save NVTabular workflow.

In [33]:
workflow.save(os.path.join(INPUT_DATA_DIR, "workflow_etl"))

## Export pre-processed data by day

In this example we are going to split the preprocessed parquet files by days, to allow for temporal training and evaluation. There will be a folder for each day and three parquet files within each day folder: `train.parquet`, `validation.parquet` and `test.parquet`.

In [34]:
OUTPUT_DIR = os.environ.get("OUTPUT_DIR",os.path.join(INPUT_DATA_DIR, "sessions_by_ts"))

In [35]:
# Read in the processed parquet file
sessions_gdf = pd.read_parquet(os.path.join(INPUT_DATA_DIR, "processed_nvt/part_0.parquet"))

In [36]:
print(sessions_gdf.head(3))

   user_id  impression_ts-first  \
0    11313                   35   
1    13538                   35   
2    15430                   30   

                                     article_id-list  \
0  [2180, 220, 716, 646, 1115, 384, 1497, 1091, 1...   
1  [1442, 1910, 582, 114, 333, 677, 937, 306, 718...   
2   [1470, 64, 2062, 64, 2062, 790, 1700, 2181, 514]   

          article_is_premium-list               article_type-list  \
0  [3, 3, 3, 3, 3, 3, 3, 3, 3, 3]  [3, 3, 3, 3, 3, 3, 3, 3, 3, 3]   
1  [3, 4, 3, 3, 3, 4, 3, 3, 3, 3]  [3, 3, 3, 3, 3, 3, 3, 3, 3, 3]   
2     [3, 3, 3, 3, 3, 3, 3, 4, 4]     [3, 3, 3, 3, 3, 3, 3, 3, 3]   

            article_category-list  \
0  [6, 3, 7, 8, 5, 8, 8, 3, 3, 3]   
1  [6, 3, 6, 6, 3, 3, 9, 5, 5, 4]   
2     [3, 3, 3, 3, 3, 3, 3, 6, 6]   

                              article_read_time-list  \
0  [8.0, 25.0, 36.0, 18.0, 16.0, 15.0, 15.0, 526....   
1  [9.0, 109.0, 9.0, 9.0, 16.0, 14.0, 26.0, 26.0,...   
2  [101.0, 32.0, 70.0, 12.0, 136.0, 75.0

In [37]:
from transformers4rec.utils.data_utils import save_time_based_splits



save_time_based_splits(data=nvt.Dataset(sessions_gdf),
                       output_dir= OUTPUT_DIR,
                       partition_col='impression_ts-first',
                       timestamp_col='user_id',
                       cpu=False,
                      )

Creating time-based splits: 100%|██████████| 29/29 [00:00<00:00, 29.05it/s]


## Check out the preprocessed outputs

In [38]:
TRAIN_PATHS = os.path.join(OUTPUT_DIR, "1", "train.parquet")

In [39]:
df = pd.read_parquet(TRAIN_PATHS)
df.head()

FileNotFoundError: [Errno 2] No such file or directory: '../../../data/ebnerd_demo_modified/sessions_by_ts/1/train.parquet'

In [None]:
import gc
del df
gc.collect()

You have  just created session-level features to train a session-based recommendation model using NVTabular. Now you can move to the the next notebook,`02-session-based-XLNet-with-PyT.ipynb` to train a session-based recommendation model using [XLNet](https://arxiv.org/abs/1906.08237), one of the state-of-the-art NLP model. Please shut down this kernel to free the GPU memory before you start the next one.