In [1]:
import os

# Expose only GPU 0 to CUDA; set here, ahead of the `import torch` below.
os.environ["CUDA_VISIBLE_DEVICES"]="0"

import glob
import torch 

from transformers4rec import torch as tr
from transformers4rec.torch.ranking_metric import NDCGAt, AvgPrecisionAt, RecallAt
from transformers4rec.torch.utils.examples_utils import wipe_memory

  warn(f"Tensorflow dtype mappings did not load successfully due to an error: {exc.msg}")
  warn(f"Triton dtype mappings did not load successfully due to an error: {exc.msg}")
  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from merlin.schema import Schema
from merlin.io import Dataset


# Root folder of the prepared data; the INPUT_DATA_DIR environment variable
# overrides the default location.
INPUT_DATA_DIR = os.environ.get(
    "INPUT_DATA_DIR",
    "/home/ec2-user/SageMaker/token_recommender/data/",
)

# Open the first parquet part of the 202201-202203 window as a Merlin Dataset
# and keep its schema for building the model inputs and the data loader.
train_parquet = os.path.join(INPUT_DATA_DIR, "202201-202203/data.parquet/part_0.parquet")
train = Dataset(train_parquet)
schema = train.schema

In [4]:
import nvtabular as nvt
import cudf


In [5]:
# Options for the sequential input block, gathered in one place so they are
# easy to read and tune.
sequence_feature_options = dict(
    max_sequence_length=20,
    continuous_projection=64,
    masking="mlm",
    d_output=100,
)

# Build the input block from the dataset schema.
inputs = tr.TabularSequenceFeatures.from_schema(schema, **sequence_feature_options)

In [6]:
# Build the XLNet configuration with the sizes used for this model.
transformer_config = tr.XLNetConfig.build(
    d_model=64,
    n_head=4,
    n_layer=2,
    total_seq_length=20,
)

# Model body: input block -> MLP projection -> transformer block
# (the transformer reuses the masking defined on the inputs).
projection = tr.MLPBlock([64])
transformer = tr.TransformerBlock(transformer_config, masking=inputs.masking)
body = tr.SequentialBlock(inputs, projection, transformer)

# Top-N ranking metrics, all evaluated at the same cut-offs.
metrics = [
    metric_cls(top_ks=[5, 10, 20], labels_onehot=True)
    for metric_cls in (NDCGAt, RecallAt, AvgPrecisionAt)
]

# Head for the next-item prediction task, with weight tying enabled.
next_item_task = tr.NextItemPredictionTask(weight_tying=True, metrics=metrics)
head = tr.Head(body, next_item_task, inputs=inputs)

# End-to-end model wrapping the head.
model = tr.Model(head)

In [7]:
# Directory holding the saved model; the OUTPUT_DIR environment variable
# overrides the default "saved_model" folder under INPUT_DATA_DIR.
# os.path.join avoids the doubled slash the f-string produced when
# INPUT_DATA_DIR already ends with "/".
model_path = os.environ.get("OUTPUT_DIR", os.path.join(INPUT_DATA_DIR, "saved_model"))

# Replace the freshly built model with the saved one and switch it to
# evaluation mode, so dropout is disabled no matter which mode the model was in
# when it was saved. nn.Module.eval() returns the module itself.
model = model.load(model_path).eval()

In [None]:
# workflow = nvt.Workflow.load('./preprocessWorkflow')


In [8]:
# month = "202202"
# mydata = cudf.read_parquet(f"s3a://zarklab-token-recommender/buy-sell-updated-{month}/")
# dataset = nvt.Dataset(mydata)

In [9]:
# procesed = workflow.transform(dataset)

In [8]:
# Data root (same environment variable and default as the training-data cell).
INPUT_DATA_DIR = os.environ.get(
    "INPUT_DATA_DIR",
    "/home/ec2-user/SageMaker/token_recommender/data/",
)

# First parquet part of the 202202-202204 window, used as the test set.
test_parquet = os.path.join(INPUT_DATA_DIR, "202202-202204/data.parquet/part_0.parquet")
TEST_data = Dataset(test_parquet)

# Preview the first rows.
TEST_data.head()

Unnamed: 0,recipient,timestamp-first,buyAsset-list,et_dayofweek_sin-list,txFee_eth_log_norm-list,buyQty1_log_norm-list,buyPrice_log_norm-list,token_category-list,token_rank_category-list,risky_flags-list
0,0x000000000004d7463d0f9c77383600bc82d612f5,2022-02-04 16:02:07,"[1476, 1017, 714, 503, 1414, 540, 24, 57, 388,...","[0.7818320421108522, -0.43388454782514785, 0.7...","[-0.9711685, 4.0381575, -0.0927555, -0.5455516...","[-0.037812326, 0.41854876, 1.6840619, 0.373512...","[3.1767557, 0.5084427, -0.32517, 0.9167288, -1...","[13, 23, 13, 15, 4, 6, 4, 10, 13, 6, 13, 5, 4,...","[3, 5, 5, 6, 3, 5, 3, 3, 5, 5, 5, 3, 3, 5, 5, ...","[3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3, ..."
1,0x000000000008e4db6a6194c6957df47e30970dc2,2022-02-23 18:48:34,"[4, 5, 113, 4, 5, 4, 3, 4, 5, 4, 2, 4]","[0.43388293040961884, 0.43388293040961884, -0....","[1.1193919, -0.6704848, -0.7880669, -0.2486222...","[-1.1768427, -0.7273375, -0.25471848, -1.17684...","[-0.79159683, -1.9954994, -0.9924365, -0.72346...","[5, 3, 5, 5, 3, 5, 3, 5, 3, 5, 4, 5]","[3, 4, 5, 3, 4, 3, 4, 3, 4, 3, 3, 3]","[3, 4, 3, 3, 4, 3, 3, 3, 4, 3, 3, 3]"
2,0x00000000005dbcb0d0513fcda746382fe8a53468,2022-03-17 22:05:10,"[7, 3]","[-0.43388454782514785, 0.43388293040961884]","[-0.56394494, -0.75527036]","[-0.18475509, -0.09766247]","[-0.0028407222, 0.3170109]","[3, 3]","[4, 4]","[3, 3]"
3,0x000000000088e0120f9e6652cc058aec07564f69,2022-02-27 22:21:30,"[3, 252]","[8.975979006501142e-07, 0.9749277124471076]","[-0.5149825, -0.6488472]","[-0.9772498, -1.1934248]","[-2.913314, -3.7022686]","[3, 18]","[4, 5]","[3, 3]"
4,0x0000000000e002b4e3009de1c7614ae5e1facf7a,2022-04-21 20:00:25,"[4, 7]","[-0.43388454782514785, -0.43388454782514785]","[-0.46488264, -0.24414156]","[-1.1934248, -1.0993807]","[-3.3618457, -3.3618457]","[5, 3]","[3, 4]","[3, 3]"


In [9]:
from transformers4rec.torch.utils.data_utils import T4RecDataLoader

In [10]:
# Resolve the 'merlin' loader class first, then build the loader over the test
# data using the training schema. shuffle=False and drop_last=False are passed
# so that no batch is shuffled or dropped.
# NOTE(review): the third positional argument (128) is presumably the batch
# size; confirm against the from_schema signature.
merlin_loader_cls = T4RecDataLoader.parse('merlin')
dataloader = merlin_loader_cls.from_schema(
    train.schema,
    TEST_data,
    128,
    max_sequence_length=20,
    drop_last=False,
    shuffle=False,
)

In [11]:

# Softmax over dim=1; applied to the model predictions in the inference loop.
# NOTE(review): assumes predictions are (batch, n_items) - confirm.
softmax = torch.nn.Softmax(dim=1)

In [12]:
# Frame view of the test Dataset; it is materialised later with .compute().
# NOTE(review): presumably a lazy dask dataframe - confirm.
procesed_df = TEST_data.to_ddf()

In [13]:
# Keep only the recipient id and the item sequence; these are the columns
# joined to the predictions at the end of the notebook.
columns_to_keep = ["recipient", "buyAsset-list"]
procesed_df = procesed_df[columns_to_keep]

# Preview the reduced frame.
procesed_df.head()

Unnamed: 0,recipient,buyAsset-list
0,0x000000000004d7463d0f9c77383600bc82d612f5,"[1476, 1017, 714, 503, 1414, 540, 24, 57, 388,..."
1,0x000000000008e4db6a6194c6957df47e30970dc2,"[4, 5, 113, 4, 5, 4, 3, 4, 5, 4, 2, 4]"
2,0x00000000005dbcb0d0513fcda746382fe8a53468,"[7, 3]"
3,0x000000000088e0120f9e6652cc058aec07564f69,"[3, 252]"
4,0x0000000000e002b4e3009de1c7614ae5e1facf7a,"[4, 7]"


In [33]:
# Number of top-scoring items kept per prediction row.
K = 10

# Per-batch results, moved to CPU so GPU memory is released between batches.
all_labels = []
all_indices = []
all_scores = []  # not filled; kept so the name stays defined as before

# Inference only: disable autograd so no graph is built for each batch.
with torch.no_grad():
    for data in dataloader:
        # data[0] holds the model inputs of the batch.
        out = model(data[0], testing=True)
        prediction = out['predictions']
        labels = out['labels']

        # Turn predictions into probabilities and keep the K best items,
        # best first.
        prob = softmax(prediction)
        topk = torch.topk(prob, K, dim=1, sorted=True)

        # Kept from the original loop; stored under its own name so the
        # `metrics` list from the model-definition cell is not overwritten.
        # NOTE(review): the returned value is unused here - confirm whether
        # the call is still needed for its effect on the metric objects.
        batch_metrics = model.calculate_metrics(prediction, labels)

        all_labels.append(labels.cpu())
        all_indices.append(topk.indices.cpu())

In [34]:
# Stack the per-batch top-K index tensors along the first dimension.
all_indices_cat = torch.cat(all_indices, dim=0)


In [35]:
import numpy as np

# Convert the concatenated index tensor into a NumPy array.
all_indices_array = np.array(all_indices_cat)

In [36]:
# Inspect the top-K index array.
all_indices_array

array([[ 5,  4,  3, ..., 18, 30, 24],
       [ 4,  3,  5, ...,  8, 13, 34],
       [ 3,  5,  7, ..., 21, 28, 11],
       ...,
       [ 3,  5,  4, ..., 11, 19, 12],
       [ 4, 13, 16, ..., 20, 70, 34],
       [ 3,  5,  4, ..., 37,  7, 57]])

In [37]:
# Render each row of top-K indices as its NumPy string form, one string per row.
all_indices_list = list(map(str, all_indices_array))

In [38]:
# Join the per-batch label tensors, then convert the result to a NumPy array.
labels_tensor = torch.cat(all_labels)
all_labels_cat = np.array(labels_tensor)

In [39]:
# One list entry per element of the label array.
all_labels_list = [label for label in all_labels_cat]

In [40]:
import pandas as pd

In [41]:
# Single-column frame holding the stringified top-K predictions.
df_predict = pd.DataFrame({'predict': all_indices_list})

In [42]:
# Attach the labels next to the predictions.
df_predict = df_predict.assign(label=all_labels_list)

In [43]:
# Preview predictions alongside their labels.
df_predict.head()

Unnamed: 0,predict,label
0,[ 5 4 3 37 14 182 25 18 30 24],4
1,[ 4 3 5 15 31 53 16 8 13 34],4
2,[ 3 5 7 4 9 12 8 21 28 11],3
3,[ 3 4 5 8 7 9 11 6 12 21],252
4,[ 4 3 5 8 16 13 11 14 15 31],7


In [45]:
# Row and column count of the prediction frame.
df_predict.shape

(118599, 2)

In [46]:
# Copy the pandas prediction frame into a cuDF DataFrame.
df_predict_cudf = cudf.from_pandas(df_predict)

In [47]:
# Preview the cuDF copy of the prediction frame.
df_predict_cudf.head()

Unnamed: 0,predict,label
0,[ 5 4 3 37 14 182 25 18 30 24],4
1,[ 4 3 5 15 31 53 16 8 13 34],4
2,[ 3 5 7 4 9 12 8 21 28 11],3
3,[ 3 4 5 8 7 9 11 6 12 21],252
4,[ 4 3 5 8 16 13 11 14 15 31],7


In [48]:
# Materialise the reduced test frame; the result is passed to cudf.concat below.
# NOTE(review): presumably a cuDF DataFrame - confirm.
procesed_cudf = procesed_df.compute()

In [49]:
# Row i of the predictions must describe row i of the source data. A
# column-wise concat aligns on the index, so a length mismatch or a
# non-positional index would silently pair predictions with the wrong recipient.
assert len(df_predict_cudf) == len(procesed_cudf), (
    f"prediction rows ({len(df_predict_cudf)}) != source rows ({len(procesed_cudf)}); "
    "cannot align predictions with recipients"
)

# Use a fresh positional index on both sides so the join is strictly row-by-row.
combined_df = cudf.concat(
    [
        df_predict_cudf.reset_index(drop=True),
        procesed_cudf.reset_index(drop=True),
    ],
    axis=1,
)


In [50]:
# Preview predictions next to the recipient and its item sequence.
combined_df.head(20)

Unnamed: 0,predict,label,recipient,buyAsset-list
0,[ 5 4 3 37 14 182 25 18 30 24],4,0x000000000004d7463d0f9c77383600bc82d612f5,"[1476, 1017, 714, 503, 1414, 540, 24, 57, 388,..."
1,[ 4 3 5 15 31 53 16 8 13 34],4,0x000000000008e4db6a6194c6957df47e30970dc2,"[4, 5, 113, 4, 5, 4, 3, 4, 5, 4, 2, 4]"
2,[ 3 5 7 4 9 12 8 21 28 11],3,0x00000000005dbcb0d0513fcda746382fe8a53468,"[7, 3]"
3,[ 3 4 5 8 7 9 11 6 12 21],252,0x000000000088e0120f9e6652cc058aec07564f69,"[3, 252]"
4,[ 4 3 5 8 16 13 11 14 15 31],7,0x0000000000e002b4e3009de1c7614ae5e1facf7a,"[4, 7]"
5,[ 3 4 5 7 8 11 10 9 12 14],3,0x0000000002ce79aacd54227d2163ff3791338975,"[8, 48, 3]"
6,[ 3 5 4 25 6 21 80 7 74 23],1270,0x000000000e3952882af02e7db0f3157bac7c6b51,"[81, 1208, 5, 1270]"
7,[ 3 5 4 18 7 32 24 47 69 12],66,0x000000000ecef000e9d22b7e0eb78ef2a8e698d5,"[325, 50, 325, 66]"
8,[ 3 5 4 11 8 9 25 23 6 10],5,0x0000000050da40b5728bfd0dabc2956fcb3b044f,"[83, 5]"
9,[ 4 15 3 5 68 26 46 16 8 6],1108,0x000000009343fc9c6199b541386d74921f00ef8d,"[165, 4, 41, 1108]"


In [51]:
# Write the combined frame to ./data, creating the folder first so the export
# does not fail on a fresh checkout. K is part of the file name.
output_path = f'./data/predicttop{K}.parquet'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
combined_df.to_parquet(output_path)
