In [21]:

SESSIONS_MAX_LENGTH = 100
MINIMUM_SESSION_LENGTH = 2


In [22]:
import os
import glob
import numpy as np
import gc
import pandas as pd
import cudf
import cupy
import nvtabular as nvt
from merlin.dag import ColumnSelector
from merlin.schema import Schema, Tags
from datetime import datetime
from merlin.core.dispatch import convert_data
import pyarrow
from numba import config
config.CUDA_LOW_OCCUPANCY_WARNINGS = 0


#### Define Data Input and Output Paths

In [23]:
DATA_FOLDER = "/cta/users/eboran/Tez/Workspace - movielens25m/general/movielens25m"
FILENAME_PATTERN = 'manipulated_rating.csv'
DATA_PATH = os.path.join(DATA_FOLDER, FILENAME_PATTERN)

OUTPUT_FOLDER = "./movielens_transformed"
OVERWRITE = False

## Load and clean raw data

In [24]:
interactions_df = cudf.read_csv(DATA_PATH, sep=',', 
                           #     names=['userId','timestamp', 'movieId', 'rating'], 
                           #     dtype=['int', 'date', 'int', 'float64']
                               )

interactions_df["timestamp"] = cudf.to_datetime(interactions_df["timestamp"])

interactions_df = interactions_df.sort_values(['userId', 'timestamp']).reset_index(drop=True)

interactions_df = interactions_df.rename(columns ={"movieId":"item_id", "userId":"session_id"})

interactions_df.timestamp = interactions_df.timestamp.astype(int)

items_first_ts_df = interactions_df.groupby('item_id').agg({'timestamp': 'min'}).reset_index().rename(columns={'timestamp': 'itemid_ts_first'})
interactions_merged_df = interactions_df.merge(items_first_ts_df, on=['item_id'], how='left')

interactions_merged_df.to_parquet(os.path.join(DATA_FOLDER, 'interactions_merged_df.parquet'))
# free gpu memory
del interactions_df, items_first_ts_df
gc.collect()


4445

In [25]:
interactions_merged_df

Unnamed: 0,session_id,item_id,rating,timestamp,_genres,tag,genome_relevance,genome_tag,itemid_ts_first
0,134,2324,4.5,1120105510000000000,Comedy,,0.99750,holocaust,909253688000000000
1,134,379,2.0,1120105531000000000,Action,,0.98675,time travel,825645600000000000
2,134,805,5.0,1120105552000000000,Drama,,0.99100,vigilante,837174815000000000
3,135,2641,3.0,1183626015000000000,Action,,0.99100,superhero,926193082000000000
4,135,2427,4.5,1183626018000000000,Action,,0.95425,war movie,915115923000000000
...,...,...,...,...,...,...,...,...,...
25000090,162534,184471,2.5,1527530568000000000,Action,,0.83775,action,1521069012000000000
25000091,162534,183611,3.0,1527530732000000000,Action,,0.84950,original,1519402882000000000
25000092,162534,173007,3.0,1528559027000000000,Action,,0.98375,futuristic,1495255732000000000
25000093,162534,187593,2.5,1529342275000000000,Action,,0.88000,absurd,1526361753000000000


In [26]:



main_cat_feats = ColumnSelector(['item_id', 'session_id','timestamp',]
                          ) >> nvt.ops.Categorify(start_index=1)


cat_feats = ColumnSelector([
                            '_genres', 'tag','genome_tag']
                          ) >> nvt.ops.Categorify(start_index=1)

con_feats = ColumnSelector(['rating','genome_relevance'])


session_ts = ColumnSelector(['timestamp'])
session_time = (
    session_ts >> 
    nvt.ops.LambdaOp(lambda col: cudf.to_datetime(col, unit='ns')) >> 
    nvt.ops.Rename(name = 'event_time_dt')
)

sessiontime_day = (
    session_time >> 
    nvt.ops.LambdaOp(lambda col: col.dt.day) >> 
    nvt.ops.Rename(name ='et_dayofday')
)

sessiontime_month = (
    session_time >> 
    nvt.ops.LambdaOp(lambda col: col.dt.month) >> 
    nvt.ops.Rename(name ='event_time_M')
)


sessiontime_weekday = (
    session_time >> 
    nvt.ops.LambdaOp(lambda col: col.dt.weekday) >> 
    nvt.ops.Rename(name ='et_dayofweek')
)

sessiontime_year = (
    session_time >> 
    nvt.ops.LambdaOp(lambda col: col.dt.year) >> 
    nvt.ops.Rename(name ='et_year')
)


def get_cycled_feature_value_sin(col, max_value):
    value_scaled = (col + 0.000001) / max_value
    value_sin = np.sin(2*np.pi*value_scaled)
    return value_sin

weekday_sin = sessiontime_weekday >> (lambda col: get_cycled_feature_value_sin(col+1, 7)) >> nvt.ops.Rename(name = 'et_dayofweek_sin')
dayofday_sin = sessiontime_day >> (lambda col: get_cycled_feature_value_sin(col+1, 30)) >> nvt.ops.Rename(name = 'et_dayofday_sin')

class ItemRecency(nvt.ops.Operator):
    def transform(self, columns, gdf):
        for column in columns.names:
            col = gdf[column]
            item_first_timestamp = gdf['itemid_ts_first']
            delta_days = (col - item_first_timestamp) / (60*60*24)
            gdf[column + "_age_days"] = delta_days * (delta_days >=0)
        return gdf

    def compute_selector(
        self,
        input_schema: Schema,
        selector: ColumnSelector,
        parents_selector: ColumnSelector,
        dependencies_selector: ColumnSelector,
    ) -> ColumnSelector:
        self._validate_matching_cols(input_schema, parents_selector, "computing input selector")
        return parents_selector

    def column_mapping(self, col_selector):
        column_mapping = {}
        for col_name in col_selector.names:
            column_mapping[col_name + "_age_days"] = [col_name]
        return column_mapping

    @property
    def dependencies(self):
        return ["itemid_ts_first"]

    @property
    def output_dtype(self):
        return np.float64
    
recency_features = session_ts >> ItemRecency() 

recency_features_norm = recency_features >> nvt.ops.LogOp() >> nvt.ops.Normalize() >> nvt.ops.Rename(name='product_recency_days_log_norm')

time_features = (
    session_time +
    sessiontime_day + 
    sessiontime_month + 
    sessiontime_weekday +
    sessiontime_year +    
    weekday_sin + 
    dayofday_sin + 
    recency_features_norm 
)



In [27]:

features = main_cat_feats + cat_feats + con_feats + time_features


In [28]:


groupby_features = features >> nvt.ops.Groupby(
    groupby_cols=["session_id"], 
    aggs={
        'item_id': ["list", "count"],
        '_genres': ["list"], 
        'tag': ["list"],
        'genome_tag': ["list"],        
        'genome_relevance' : ['list'],
        
        'rating': ["list"],
        'timestamp': ["first"],
        'event_time_dt': ["first"],
        'event_time_M':['first','list'],
        'et_dayofweek_sin': ["list"],
        'et_dayofday_sin': ["list"],
        
        'et_dayofweek':['list','first'],
        'et_dayofday': ['list'],
        'product_recency_days_log_norm': ["list"],
        'et_year': ["list",'first','last'],
    },
    name_sep="-") >> nvt.ops.AddMetadata(tags=[Tags.CATEGORICAL])



In [29]:

groupby_features_list = groupby_features['item_id-list', 
                                         
                                         '_genres-list',
                                         'tag-list',
                                         'genome_tag-list',
                                         'genome_relevance-list',
                                         
                                         'rating-list', 

                                         'et_dayofweek_sin-list',                        
                                         'et_dayofday_sin-list', 
                                         'product_recency_days_log_norm-list',
                                         'et_year-list',
                                         'et_dayofweek-list',
                                         'et_dayofday-list',
                                         'event_time_M-list',
                                                                                                                                                                                                                                                                                                                         ]
groupby_features_list

<Node [('item_id-list', '_genres-list', 'tag-list', 'genome_tag-list', 'genome_relevance-list', 'rating-list', 'et_dayofweek_sin-list', 'et_dayofday_sin-list', 'product_recency_days_log_norm-list', 'et_year-list', 'et_dayofweek-list', 'et_dayofday-list', 'event_time_M-list')] output>

In [30]:

day_index_month = ((groupby_features['event_time_M-first'])  >> 
    nvt.ops.LambdaOp(
                     lambda col: col - col.min()
                    ) >> 
    nvt.ops.Rename(f = lambda col: "day_index_M")
)



In [31]:

day_index_year = ((groupby_features['et_year-first'])  >> 
    nvt.ops.LambdaOp(
                     lambda col: col - col.min()
                    ) >> 
    nvt.ops.Rename(f = lambda col: "day_index_year")
)



In [32]:
def list_files(startpath):
    """
    Util function to print the nested structure of a directory
    """
    for root, dirs, files in os.walk(startpath):
        level = root.replace(startpath, "").count(os.sep)
        indent = " " * 4 * (level)
        print("{}{}/".format(indent, os.path.basename(root)))
        subindent = " " * 4 * (level + 1)
        for f in files:
            print("{}{}".format(subindent, f))

**run all**

In [33]:

for SESSIONS_MAX_LENGTH in [SESSIONS_MAX_LENGTH ]:

    print("SESSIONS_MAX_LENGTH:", SESSIONS_MAX_LENGTH)
    
    groupby_features_truncated = groupby_features_list >> nvt.ops.ListSlice(-SESSIONS_MAX_LENGTH, pad=True) >> nvt.ops.Rename(postfix = '_seq')
    selected_features = groupby_features['session_id', 'item_id-count']   + day_index_year + day_index_month + groupby_features_truncated
    filtered_sessions = selected_features >> nvt.ops.Filter(f=lambda df: df["item_id-count"] >= MINIMUM_SESSION_LENGTH) 
    
    convert_data(interactions_merged_df)

    dataset = nvt.Dataset(interactions_merged_df)

    workflow = nvt.Workflow(filtered_sessions)



    workflow.fit(dataset)
    
    sessions_gdf = workflow.transform(dataset).compute()
    workflow.save('workflow_etl')

    sessions_gdf["day_index"] = sessions_gdf["day_index_year"] * 12 + sessions_gdf["day_index_M"]  + 1


    output_dir = f"./datasets/preproc_sessions_by_day{SESSIONS_MAX_LENGTH}"


    from transformers4rec.data.preprocessing import save_time_based_splits
    
    save_time_based_splits(data=nvt.Dataset(sessions_gdf),
                           output_dir= output_dir,
                           partition_col='day_index',
                           timestamp_col='session_id', 
                          )


    
    dir_list = os.listdir(output_dir)

    
    dir_list_int = list()

    for _dir in dir_list:
        dir_list_int.append(int(_dir))

    dir_list_int = sorted(dir_list_int)

    for i, _dir in enumerate(dir_list_int):
        os.rename(f"{output_dir}/{_dir}",f"{output_dir}/{i+1}")


SESSIONS_MAX_LENGTH: 100
item_id-list
_genres-list
tag-list
genome_tag-list
genome_relevance-list
rating-list
et_dayofweek_sin-list
et_dayofday_sin-list
product_recency_days_log_norm-list
et_year-list
et_dayofweek-list
et_dayofday-list
event_time_M-list
item_id-list
_genres-list
tag-list
genome_tag-list
genome_relevance-list
rating-list
et_dayofweek_sin-list
et_dayofday_sin-list
product_recency_days_log_norm-list
et_year-list
et_dayofweek-list
et_dayofday-list
event_time_M-list


Creating time-based splits: 100%|██████████| 286/286 [00:52<00:00,  5.40it/s]


In [34]:
sessions_gdf

Unnamed: 0,session_id,item_id-count,day_index_year,day_index_M,item_id-list_seq,_genres-list_seq,tag-list_seq,genome_tag-list_seq,genome_relevance-list_seq,rating-list_seq,et_dayofweek_sin-list_seq,et_dayofday_sin-list_seq,product_recency_days_log_norm-list_seq,et_year-list_seq,et_dayofweek-list_seq,et_dayofday-list_seq,event_time_M-list_seq,day_index
0,2,32202,19,11,"[40555, 40757, 41024, 58407, 51608, 31371, 440...","[5, 4, 3, 4, 2, 12, 12, 5, 13, 3, 5, 2, 8, 14,...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[2.0, 3.0, 2.5, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, ...","[0.43388295, 0.43388295, 0.43388295, 0.4338829...","[-0.74314505, -0.74314505, -0.74314505, -0.743...","[-0.7243521213531494, -0.9043965339660645, -1....","[2019, 2019, 2019, 2019, 2019, 2019, 2019, 201...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[18, 18, 18, 18, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,...","[9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, ...",240
1,3,9178,5,7,"[17514, 21211, 21850, 1195, 22661, 2287, 30406...","[3, 4, 4, 4, 3, 4, 4, 4, 4, 6, 4, 3, 10, 3, 3,...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[8, 8, 8, 335, 8, 392, 8, 8, 8, 8, 8, 80, 8, 6...","[0.0, 0.0, 0.0, 0.9794999999999999, 0.0, 0.954...","[3.0, 3.0, 3.0, 4.0, 3.0, 4.0, 3.0, 3.0, 3.0, ...","[-0.781831, -0.781831, -0.43388462, 0.9749277,...","[-0.40673605, -0.40673605, -0.74314505, -0.951...","[-3.4826812744140625, -3.5111465454101562, -3....","[2010, 2010, 2010, 2010, 2011, 2011, 2011, 201...","[5, 5, 3, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 6, 6, ...","[27, 27, 18, 23, 12, 12, 13, 13, 13, 13, 14, 1...","[3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",68
2,4,8913,13,4,"[8992, 3692, 27201, 7699, 26719, 604, 1527, 20...","[5, 4, 10, 4, 10, 14, 2, 2, 2, 3, 2, 3, 10, 3,...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[653, 93, 8, 258, 8, 33, 4, 93, 8, 8, 4, 8, 8,...","[0.75175, 0.8387500000000002, 0.0, 0.99925, 0....","[2.0, 3.5, 2.5, 3.5, 3.5, 3.0, 2.5, 2.0, 3.0, ...","[1.1285199e-06, 1.1285199e-06, 0.9749277, -0.4...","[-0.5877857, -0.5877857, 0.40673634, -5.642599...","[-1.0276482105255127, -1.03083336353302, 0.104...","[2019, 2019, 2019, 2019, 2019, 2019, 2019, 201...","[6, 6, 1, 3, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, ...","[17, 17, 12, 14, 17, 18, 18, 18, 18, 19, 19, 1...","[2, 2, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ...",161
3,5,7919,19,6,"[8809, 39389, 7206, 33590, 8600, 6286, 7552, 3...","[6, 4, 4, 4, 3, 4, 9, 6, 3, 2, 4, 6, 3, 10, 7,...","[7739, 2, 17, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,...","[294, 8, 77, 8, 545, 247, 107, 195, 8, 4, 341,...","[0.9874999999999999, 0.0, 0.99025, 0.0, 0.9575...","[3.5, 2.5, 1.0, 3.0, 3.0, 3.5, 1.0, 3.5, 2.5, ...","[-0.43388462, 0.9749277, 1.1285199e-06, 1.1285...","[-0.5877857, 0.9510564, 0.20791137, 0.9945219,...","[-0.8711152076721191, -14.183878898620605, -0....","[2017, 2017, 2017, 2017, 2015, 2015, 2015, 201...","[3, 1, 6, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[17, 8, 13, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6...","[8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, ...",235
4,6,7488,10,9,"[17905, 21870, 16622, 16570, 10352, 9005, 1700...","[14, 3, 3, 4, 4, 4, 2, 3, 10, 10, 2, 3, 2, 4, ...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[8, 8, 8, 8, 806, 190, 125, 2, 8, 8, 94, 204, ...","[0.0, 0.0, 0.0, 0.0, 0.8895, 0.9515, 0.9884999...","[4.0, 3.5, 2.5, 4.0, 2.5, 3.5, 3.0, 2.0, 3.0, ...","[0.7818321, 0.7818321, 0.7818321, 1.1285199e-0...","[0.99452186, 0.99452186, 0.99452186, 0.9945219...","[-0.1764855533838272, -0.1463281512260437, -0....","[2012, 2012, 2012, 2012, 2008, 2008, 2008, 200...","[0, 0, 0, 6, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, ...","[7, 7, 7, 6, 21, 21, 27, 27, 27, 27, 27, 27, 2...","[5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...",130
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162536,162538,20,11,11,"[927, 849, 655, 868, 824, 839, 661, 712, 629, ...","[3, 2, 2, 3, 4, 2, 4, 4, 3, 4, 2, 6, 3, 3, 2, ...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[279, 23, 4, 362, 115, 271, 379, 60, 280, 203,...","[0.8574999999999999, 0.9640000000000001, 0.987...","[3.0, 4.5, 3.5, 2.0, 2.5, 2.0, 1.0, 1.0, 0.5, ...","[-0.781831, -0.781831, -0.781831, -0.781831, -...","[0.9510564, 0.9510564, 0.9510564, 0.9510564, 0...","[0.6215946674346924, 0.6193531155586243, 0.508...","[2007, 2007, 2007, 2007, 2007, 2007, 2007, 200...","[5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 1...",144
162537,162539,20,1,4,"[447, 564, 683, 433, 17, 602, 1277, 1079, 14, ...","[3, 2, 3, 4, 2, 3, 2, 3, 5, 4, 11, 4, 2, 2, 2,...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[72, 396, 404, 377, 9, 205, 469, 279, 57, 238,...","[0.98125, 0.8767499999999999, 0.993, 0.86875, ...","[4.0, 4.0, 3.0, 5.0, 5.0, 4.0, 4.0, 3.0, 3.0, ...","[0.9749277, 0.9749277, 0.9749277, 0.9749277, 0...","[0.20791137, 0.20791137, 0.20791137, 0.2079113...","[-0.5039454102516174, -0.539773166179657, -0.6...","[1997, 1997, 1997, 1997, 1997, 1997, 1997, 199...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 1...","[5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ...",17
162538,162540,20,0,5,"[170, 157, 112, 24, 23, 10, 149, 33, 27, 4, 38...","[4, 2, 4, 12, 11, 2, 2, 2, 5, 3, 5, 2, 2, 2, 2...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[47, 249, 41, 5, 32, 60, 64, 31, 113, 21, 118,...","[0.995, 0.98125, 0.999, 0.991, 0.99375, 0.9894...","[5.0, 4.0, 5.0, 5.0, 5.0, 5.0, 1.0, 3.0, 4.0, ...","[-0.9749281, -0.9749281, -0.9749281, -0.974928...","[-0.20791082, -0.20791082, -0.20791082, -0.207...","[-1.2076829671859741, -1.2076810598373413, -1....","[1996, 1996, 1996, 1996, 1996, 1996, 1996, 199...","[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ...","[28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 2...","[6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...",6
162539,162541,20,17,6,"[777, 824, 826, 782, 719, 744, 681, 603, 606, ...","[4, 4, 3, 2, 3, 5, 2, 2, 2, 3, 2, 2, 2, 4, 5, ...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[30, 115, 83, 4, 6, 338, 25, 13, 259, 312, 292...","[0.99475, 0.981, 0.989, 0.9824999999999999, 0....","[1.5, 3.5, 4.0, 2.0, 2.5, 3.0, 2.5, 2.0, 1.5, ...","[0.43388295, 0.43388295, 0.43388295, 0.4338829...","[0.74314505, 0.74314505, 0.74314505, 0.7431450...","[0.799306333065033, 0.7975705862045288, 0.8070...","[2013, 2013, 2013, 2013, 2013, 2013, 2013, 201...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...","[7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, ...",211


Let's print the head of our preprocessed dataset. You can notice that now each example (row) is a session and the sequential features with respect to user interactions were converted to lists with matching length.

**create the schema**

In [35]:
def get_min_max_of_the_feture(feature):
    min_ = 9999999999999999
    max_ = -9999999999999999
    
    for values in sessions_gdf[feature].to_arrow():
        if isinstance(values, pyarrow.lib.ListScalar):
            for element in values:
                e = element.as_py()
                if e > max_:
                    max_ = e
                if e <= min_:
                    min_ = e
        else:
            e = values.as_py()
            if e > max_:
                max_ = e
            if e <= min_:
                min_ = e
            
    return min_, max_
    
# get_min_max_of_the_feture("et_year-list_seq")

In [36]:
sessions_gdf.info()

<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 162541 entries, 0 to 162540
Data columns (total 18 columns):
 #   Column                                  Non-Null Count   Dtype
---  ------                                  --------------   -----
 0   session_id                              162541 non-null  int64
 1   item_id-count                           162541 non-null  int32
 2   day_index_year                          162541 non-null  int32
 3   day_index_M                             162541 non-null  int16
 4   item_id-list_seq                        162541 non-null  list
 5   _genres-list_seq                        162541 non-null  list
 6   tag-list_seq                            162541 non-null  list
 7   genome_tag-list_seq                     162541 non-null  list
 8   genome_relevance-list_seq               162541 non-null  list
 9   rating-list_seq                         162541 non-null  list
 10  et_dayofweek_sin-list_seq               162541 non-null  list
 11  et_da

In [37]:
for column in sessions_gdf.columns:
    #if column not in ["session_id","item_id-count" , "day_index",
    #                 "day_index_year","day_index_M","day_index"]:
    min_, max_ = get_min_max_of_the_feture(column)
    print(f"{column}: min: {min_} max: {max_}")

session_id: min: 2 max: 162542
item_id-count: min: 20 max: 32202
day_index_year: min: 0 max: 23
day_index_M: min: 0 max: 11
item_id-list_seq: min: 0 max: 59048
_genres-list_seq: min: 0 max: 21
tag-list_seq: min: 0 max: 24939
genome_tag-list_seq: min: 0 max: 842
genome_relevance-list_seq: min: 0.0 max: 1.0
rating-list_seq: min: 0.0 max: 5.0
et_dayofweek_sin-list_seq: min: -0.974928081035614 max: 0.9749277234077454
et_dayofday_sin-list_seq: min: -0.994521975517273 max: 0.9945219159126282
product_recency_days_log_norm-list_seq: min: -14.183878898620605 max: 1.002375841140747
et_year-list_seq: min: 0 max: 2019
et_dayofweek-list_seq: min: 0 max: 6
et_dayofday-list_seq: min: 0 max: 31
event_time_M-list_seq: min: 0 max: 12
day_index: min: 1 max: 287


In [38]:
# feature types
print("categoricals", cat_feats)
print("con_feats", con_feats)
print("time_features", time_features)


categoricals <Node Categorify>
con_feats <merlin.dag.selector.ColumnSelector object at 0x151395075280>
time_features <Node + output>
