# Import

In [None]:
import pandas as pd
import torch 
import torch.nn as nn
from torch.utils.data import DataLoader,Dataset
import numpy as np
from tqdm import tqdm
import pandas as pd
import slugify
import os
# import dask.dataframe as dd

FULL_DATA_DIR = '/content'

In [None]:
def GetMaxFeature(x):
    """Count the distinct investment and time identifiers in a frame.

    Args:
        x: DataFrame with ``investment_id`` and ``time_id`` columns.

    Returns:
        Tuple ``(id_size, time_id_size)``: the number of unique values in
        ``investment_id`` and in ``time_id``, in that order.

    Raises:
        KeyError: If either column is missing from ``x``.
    """
    # Columns are looked up in this order, so a frame missing both fails on
    # "investment_id" first.
    return tuple(x[column].nunique() for column in ("investment_id", "time_id"))

In [None]:
class Ubiquant_test(Dataset):
    """Sliding-window dataset over the rows of one test frame.

    Item ``i`` is the window of ``context`` consecutive rows that ends at row
    ``i``. The first ``context - 1`` windows are left-padded with all-zero
    rows (id 0, features 0.0) so that every row gets a full-length window.

    Args:
        test_df: DataFrame with ``row_id``, ``time_id``, ``investment_id`` and
            the feature columns. It is not modified.
        context: Window length in rows; must be at least 1.
        get_time_data: If True, rows keep their incoming order and ``time_id``
            stays in the features, shifted so the first row has ``time_id`` 0.
            If False, rows are sorted by ``investment_id`` and ``time_id`` is
            dropped. NOTE(review): after that sort, item ``i`` no longer
            corresponds to row ``i`` of ``test_df`` - confirm callers expect it.

    Raises:
        ValueError: If ``context`` is smaller than 1.
        KeyError: If a required column is missing.
    """

    def __init__(self,test_df,context = 100,get_time_data = True):
        super().__init__()
        if context < 1:
            raise ValueError(f"context must be >= 1, got {context}")

        self.get_time = get_time_data
        self.data = test_df
        self.id_size,self.time_size = GetMaxFeature(self.data)

        if get_time_data:
            # Windows follow the incoming row order; time_id is kept as a feature.
            frame = self.data.drop(columns=["row_id"])
            if len(frame):
                # Use the positional first row: `.loc[0, ...]` looks up the row
                # *labelled* 0 and fails on any frame whose index lacks it.
                frame["time_id"] = frame["time_id"] - frame["time_id"].iloc[0]
        else:
            # Windows follow investment_id order; time_id is not a feature.
            self.data = self.data.sort_values(by=["investment_id"])
            frame = self.data.drop(columns=["row_id","time_id"])

        self.id = frame[["investment_id"]]
        self.x = frame.drop(columns=["investment_id"])

        # Left-pad with context-1 zero rows so window i covers padded rows
        # [i, i + context), i.e. it ends on original row i.
        self.context = context
        left_pad = ((context-1,0),(0,0))
        self.pad_id = torch.as_tensor(
            np.pad(self.id.to_numpy(),left_pad,constant_values = 0),dtype=torch.long)
        self.pad_x = torch.as_tensor(
            np.pad(self.x.to_numpy(),left_pad,constant_values = 0),dtype=torch.float32)

    def __getitem__(self,index):
        """Return ``(ids, features)`` for the window ending at row ``index``.

        ``ids`` is a long tensor of shape ``(context, 1)`` and ``features`` a
        float tensor of shape ``(context, n_features)``. Negative indices
        count from the end.

        Raises:
            IndexError: If ``index`` is out of range. Raising here is what
                lets plain iteration over the dataset terminate.
        """
        size = len(self)
        if index < 0:
            index += size
        if not 0 <= index < size:
            raise IndexError(f"index out of range for dataset of length {size}")
        stop = index + self.context
        return self.pad_id[index:stop],self.pad_x[index:stop]

    def __len__(self):
        """Number of windows, one per row of the frame."""
        return self.x.shape[0]

    def GetSize(self):
        """Return the unique ``time_id`` count in time mode, else the unique ``investment_id`` count."""
        return self.time_size if self.get_time else self.id_size

In [None]:
class SimpleAttention(nn.Module):
    """Self-attention over a window of rows, followed by an MLP regression head.

    Each row's investment id is embedded and concatenated with that row's
    features; the sequence is passed through one multi-head self-attention
    layer, flattened, and reduced to a single value per sample.

    Args:
        seq_len: Number of rows per window.
        input_feature: Width of one row *after* concatenation, i.e.
            ``id_dim`` plus the number of raw feature columns. It must be
            divisible by ``num_heads``; the defaults (311 and 8) are not, so
            ``nn.MultiheadAttention`` rejects them - pass e.g. 312.
        num_ids: Size of the id embedding table; ids must be below this value.
        id_dim: Width of the id embedding.
        num_heads: Number of attention heads.
        dropout: Dropout probability used after every hidden layer of the head.

    Submodule names and the layer order inside ``MLP`` are the checkpoint
    layout (``MLP.0`` ... ``MLP.28``); changing them breaks saved state dicts.
    """

    # Output widths of the hidden Linear layers of the head, in order.
    _HIDDEN_SIZES = (64, 128, 256, 512, 256, 128, 8)

    def __init__(self,seq_len = 32, input_feature =311, num_ids = 10000,
                 id_dim = 11, num_heads = 8, dropout = 0.4):
        super().__init__()

        self.id_embedding = nn.Embedding(num_ids,id_dim)
        self.att = nn.MultiheadAttention(input_feature,num_heads,batch_first=True)
        self.flat = nn.Flatten()
        self.MLP = self._build_head(input_feature*seq_len,dropout)

        self.criterion = nn.MSELoss()

    @classmethod
    def _build_head(cls,in_features,dropout):
        """Stack Linear -> BatchNorm1d -> SiLU -> Dropout per hidden width, then Linear(…, 1)."""
        layers = []
        width = in_features
        for hidden in cls._HIDDEN_SIZES:
            layers += [
                nn.Linear(width,hidden),
                nn.BatchNorm1d(hidden),
                nn.SiLU(),
                nn.Dropout(dropout),
            ]
            width = hidden
        layers.append(nn.Linear(width,1))
        return nn.Sequential(*layers)

    def forward(self,_id,f_features):
        """Predict one value per window.

        Args:
            _id: Long tensor of ids, ``(batch, seq_len, 1)`` as produced by
                the dataset in this notebook.
            f_features: Float tensor ``(batch, seq_len, input_feature - id_dim)``.

        Returns:
            Tensor of shape ``(batch, 1)``.
        """
        # The embedding adds a trailing axis; drop the singleton id axis so the
        # result is (batch, seq_len, id_dim).
        invest_embedding = self.id_embedding(_id).squeeze(dim=2)
        _input = torch.cat((invest_embedding,f_features),dim=-1)
        # Self-attention: the window attends to itself; weights are discarded.
        output,_ = self.att(_input,_input,_input)
        return self.MLP(self.flat(output))

    def cal_Loss(self,y_hat,y):
        """Mean squared error between predictions ``y_hat`` and targets ``y``."""
        return self.criterion(y_hat,y)

In [None]:
class SimpleMLP(nn.Module):
    """MLP regression baseline over a flattened window of rows.

    Each row's investment id is embedded and concatenated with that row's
    features; the whole window is flattened and reduced to a single value per
    sample by a stack of Linear/BatchNorm/SiLU/Dropout layers.

    Args:
        seq_len: Number of rows per window.
        input_feature: Width of one row *after* concatenation, i.e.
            ``id_dim`` plus the number of raw feature columns.
        num_ids: Size of the id embedding table; ids must be below this value.
        id_dim: Width of the id embedding.
        dropout: Dropout probability used after every hidden layer.

    Submodule names and the layer order inside ``MLP`` are the checkpoint
    layout (``MLP.0`` ... ``MLP.28``); changing them breaks saved state dicts.
    """

    # Output widths of the hidden Linear layers, in order.
    _HIDDEN_SIZES = (64, 128, 256, 512, 256, 128, 8)

    def __init__(self,seq_len = 32,input_feature = 311, num_ids = 10000,
                 id_dim = 11, dropout = 0.4):
        super().__init__()

        self.id_embedding = nn.Embedding(num_ids,id_dim)
        self.flat = nn.Flatten()
        self.MLP = self._build_head(input_feature*seq_len,dropout)

        self.criterion = nn.MSELoss()

    @classmethod
    def _build_head(cls,in_features,dropout):
        """Stack Linear -> BatchNorm1d -> SiLU -> Dropout per hidden width, then Linear(…, 1)."""
        layers = []
        width = in_features
        for hidden in cls._HIDDEN_SIZES:
            layers += [
                nn.Linear(width,hidden),
                nn.BatchNorm1d(hidden),
                nn.SiLU(),
                nn.Dropout(dropout),
            ]
            width = hidden
        layers.append(nn.Linear(width,1))
        return nn.Sequential(*layers)

    def forward(self,_id,f_features):
        """Predict one value per window.

        Args:
            _id: Long tensor of ids, ``(batch, seq_len, 1)`` as produced by
                the dataset in this notebook.
            f_features: Float tensor ``(batch, seq_len, input_feature - id_dim)``.

        Returns:
            Tensor of shape ``(batch, 1)``.
        """
        # The embedding adds a trailing axis; drop the singleton id axis so the
        # result is (batch, seq_len, id_dim).
        invest_embedding = self.id_embedding(_id).squeeze(dim=2)
        _input = torch.cat((invest_embedding,f_features),dim=-1)
        return self.MLP(self.flat(_input))

    def cal_Loss(self,y_hat,y):
        """Mean squared error between predictions ``y_hat`` and targets ``y``."""
        return self.criterion(y_hat,y)

In [None]:
def test(model,testLoader,config):
    target = []
    for i,(x1,x2) in enumerate(testLoader):
        with torch.no_grad():
            x1,x2 = x1.cuda(),x2.cuda()
            y_hat = model(x1,x2)
            y_hat = y_hat.reshape(-1,1)
            target += y_hat
    return target

In [None]:
# Settings for the attention model; "store_path" is the checkpoint file.
config = {
    "epoch": 50,
    "batch_size": 12800,
    "learning_rate": 0.1,
    "optimz": "Adam",
    "store_path": "./Att.pth",
    "context": 32,
}
# The MLP baseline uses the same settings and differs only in its checkpoint file.
base_config = {**config, "store_path": "./MLP.pth"}

In [None]:
import ubiquant

# `get_time_data` was used below without being defined anywhere in this
# notebook, so a fresh kernel failed with NameError. True is the only value
# consistent with the 312-wide model: 11 embedding dims + 301 columns, i.e.
# every column except row_id and investment_id, which includes time_id.
# NOTE(review): the recorded output further down lists a test frame without a
# time_id column; confirm the frames served by `ubiquant` include it.
get_time_data = True

# Load the checkpoint onto whatever device is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = SimpleAttention(config["context"],312).to(device)
# torch.load unpickles the file - only load checkpoints you produced yourself.
model.load_state_dict(torch.load(config["store_path"],map_location=device))
model.eval()

env = ubiquant.make_env()
iter_test = env.iter_test()
for (test_df, sample_prediction_df) in iter_test:
    testSet = Ubiquant_test(test_df, context = config["context"], get_time_data = get_time_data)
    # Keep row order and keep every row so predictions line up with sample_prediction_df.
    testLoader = DataLoader(testSet,batch_size = config["batch_size"],shuffle =False,drop_last=False)
    sample_prediction_df['target'] = test(model, testLoader,config)
    env.predict(sample_prediction_df)

In [None]:
# import test1
# env = test1.make_env()
# iter_test = env.iter_test() 
# for (test_df, sample_prediction_df) in iter_test:
#     # ds = make_test_dataset(test_df[features], test_df["investment_id"])
#     # sample_prediction_df['target'] = test(models, ds)
#     # env.predict(sample_prediction_df) 
#     print(test_df.columns)
#     print(sample_prediction_df.columns)


This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
Index(['row_id', 'investment_id', 'f_0', 'f_1', 'f_2', 'f_3', 'f_4', 'f_5',
       'f_6', 'f_7',
       ...
       'f_290', 'f_291', 'f_292', 'f_293', 'f_294', 'f_295', 'f_296', 'f_297',
       'f_298', 'f_299'],
      dtype='object', length=302)
Index(['row_id', 'target'], dtype='object')
You must call `predict()` successfully before you can continue with `iter_test()`


TypeError: ignored

Index(['row_id', 'investment_id', 'f_0', 'f_1', 'f_2', 'f_3', 'f_4', 'f_5',
       'f_6', 'f_7',
       ...
       'f_290', 'f_291', 'f_292', 'f_293', 'f_294', 'f_295', 'f_296', 'f_297',
       'f_298', 'f_299'],
      dtype='object', length=302)
Index(['row_id', 'target'], dtype='object')
You must call `predict()` successfully before you can continue with `iter_test()`