In [1]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [2]:
import gzip
import pickle
from utils.util import *
from train_test import *
import sys
import os
import argparse
from parse_config import ConfigParser

In [3]:
def load_compressed_pickle(filename):
    with gzip.open(filename, 'rb') as f:
        obj = pickle.load(f)
    return obj

In [4]:
restored_data = load_compressed_pickle('/datasets/mind_data/data_dict_compressed.pickle')

In [5]:
user_history=restored_data["user_history"]
entity_embedding=restored_data["entity_embedding"]
relation_embedding=restored_data["relation_embedding"]
entity_adj=restored_data["entity_adj"]
relation_adj=restored_data["relation_adj"]
news_feature=restored_data["news_feature"]
max_entity_freq=restored_data["max_entity_freq"]
max_entity_pos=restored_data["max_entity_pos"]
max_entity_type=restored_data["max_entity_type"]
train_data=restored_data["train_data"]
dev_data=restored_data["dev_data"]
vert_train=restored_data["vert_train"]
vert_test=restored_data["vert_test"]
pop_train=restored_data["pop_train"]
pop_test=restored_data["pop_test"]
item2item_train=restored_data["item2item_train"]
item2item_test=restored_data["item2item_test"]

In [6]:
parser = argparse.ArgumentParser(description='KRED')
parser.add_argument('-f')
parser.add_argument('-c', '--config', default="./config.json", type=str,
                    help='config file path (default: None)')
parser.add_argument('-r', '--resume', default=None, type=str,
                    help='path to latest checkpoint (default: None)')
parser.add_argument('-d', '--device', default=None, type=str,
                    help='indices of GPUs to enable (default: all)')
config = ConfigParser.from_args(parser)

In [7]:
epochs = 5
batch_size = 64
train_type = "single_task"
task = "user2item"

config['trainer']['epochs'] = epochs
config['data_loader']['batch_size'] = batch_size
config['trainer']['training_type'] = train_type
config['trainer']['task'] = task

In [8]:
if config['trainer']['training_type']  == "multi-task":
    data = user_history, entity_embedding, relation_embedding, entity_adj, relation_adj, news_feature, max_entity_freq, max_entity_pos, max_entity_type, train_data, dev_data, vert_train, vert_test, pop_train, pop_test, item2item_train, item2item_test
elif config['trainer']['task'] == "user2item":
    data = user_history, entity_embedding, relation_embedding, entity_adj, relation_adj, news_feature, max_entity_freq, max_entity_pos, max_entity_type, train_data, dev_data
elif config['trainer']['task'] == "item2item":
    data =  user_history, entity_embedding, relation_embedding, entity_adj, relation_adj, news_feature, max_entity_freq, max_entity_pos, max_entity_type, item2item_train, item2item_test
elif config['trainer']['task'] == "vert_classify":
    data = user_history, entity_embedding, relation_embedding, entity_adj, relation_adj, news_feature, max_entity_freq, max_entity_pos, max_entity_type, vert_train, vert_test
elif config['trainer']['task'] == "pop_predict":
    data = user_history, entity_embedding, relation_embedding, entity_adj, relation_adj, news_feature, max_entity_freq, max_entity_pos, max_entity_type, pop_train, pop_test

In [9]:
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"


In [10]:
#Single Task Training

In [11]:
user_history_dict, entity_embedding, relation_embedding, entity_adj, relation_adj, doc_feature_dict, entity_num, position_num, type_num, train_data, test_data = data

In [12]:
train_data_u2i = NewsDataset(train_data)

In [13]:
train_sampler_u2i = RandomSampler(train_data_u2i)

In [15]:
train_dataloader_u2i = DataLoader(train_data_u2i,
                                  sampler=train_sampler_u2i,batch_size=config['data_loader']['batch_size'],collate_fn=my_collate_fn, pin_memory=False)

In [16]:
criterion = Softmax_BCELoss(config)

In [17]:
train_data_loader = train_dataloader_u2i

In [18]:
device, deviceids = prepare_device(config['n_gpu'])

In [19]:
model = KREDModel(config, user_history_dict, doc_feature_dict, entity_embedding, relation_embedding, entity_adj,relation_adj, entity_num, position_num, type_num).cuda()

In [20]:
optimizer = optim.Adam(model.parameters(), lr=config['optimizer']['lr'], weight_decay=0)

In [21]:
trainer = Trainer(config, model, criterion, optimizer, device, train_data_loader, data[-1])

In [22]:
trainer.train()

model training
Training epoch 0/6 - 0.0
######
 Step: 0, 0.0 
######
######
 Step: 1000, 0.27078256160303277 
######
######
 Step: 2000, 0.5415651232060655 
######
######
 Step: 3000, 0.8123476848090982 
######
all loss: tensor(1707.1224, device='cuda:0', grad_fn=<AddBackward0>)
_train_epoch


KeyboardInterrupt: 

In [23]:
model323 = trainer.model

In [24]:
model323.eval()

KREDModel(
  (news_embedding): News_embedding(
    (kgat): KGAT(
      (attention_layer1): Linear(in_features=300, out_features=128, bias=True)
      (attention_layer2): Linear(in_features=128, out_features=1, bias=True)
      (softmax): Softmax(dim=-1)
      (relu): ReLU(inplace=True)
      (convolve_layer): Linear(in_features=200, out_features=100, bias=True)
    )
    (final_embedding1): Linear(in_features=868, out_features=128, bias=True)
    (final_embedding2): Linear(in_features=128, out_features=100, bias=True)
    (relu): ReLU(inplace=True)
    (sigmoid): Sigmoid()
    (tanh): Tanh()
    (title_embeddings): Embedding(1000, 100)
    (type_embeddings): Embedding(100, 100)
    (entity_num_embeddings): Embedding(100, 100)
    (attention_embedding_layer1): Linear(in_features=868, out_features=128, bias=True)
    (attention_embedding_layer2): Linear(in_features=128, out_features=1, bias=True)
    (softmax): Softmax(dim=-2)
  )
  (user_modeling): User_modeling(
    (news_embedding): N

In [25]:
test_data_u2i = NewsDataset(test_data)

In [44]:
test_dataloader_u2i = DataLoader(test_data_u2i,batch_size=config['data_loader']['batch_size'],
                                          collate_fn=my_collate_fn, pin_memory=True, num_workers=4)

In [48]:
# torch.no_grad() as a context manager:
%timeit model323.user_modeling.news_embedding(model323.user_modeling.get_user_history(real_batch_out['item1']))


299 ms ± 7.31 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [49]:
%timeit model323(real_batch_out['item1'], real_batch_out['item2'],config['trainer']['task'])

447 ms ± 5.96 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [42]:
from time import time

y_pred = []
time_start = time()
for step, batch in enumerate(test_dataloader_u2i):

    if (step%100==0):
        print(step)
        print(step/len(test_dataloader_u2i))
        print(int(time_start-time()))
    batch = real_batch(batch)
    #out = model323(batch['item1'], batch['item2'],config['trainer']['task'])[0].detach().cpu().data.tolist()
    #y_pred.extend(out)
    y_pred.extend(model323(batch['item1'], batch['item2'],config['trainer']['task'])[0].detach().cpu().data.tolist())



0
0.0
0
100
0.00233486656237596
-54
200
0.00466973312475192
-109
300
0.007004599687127881
-164
400
0.00933946624950384
-220
500
0.0116743328118798
-274
600
0.014009199374255762
-330
700
0.01634406593663172
-384
800
0.01867893249900768
-438
900
0.02101379906138364
-494
1000
0.0233486656237596
-549
1100
0.025683532186135564
-604
1200
0.028018398748511524
-659
1300
0.030353265310887483
-714
1400
0.03268813187326344
-768
1500
0.035022998435639406
-823
1600
0.03735786499801536
-878
1700
0.039692731560391326
-933
1800
0.04202759812276728
-988
1900
0.044362464685143245
-1043
2000
0.0466973312475192
-1098
2100
0.049032197809895164
-1152
2200
0.05136706437227113
-1207
2300
0.053701930934647084
-1262
2400
0.05603679749702305
-1316
2500
0.058371664059399
-1371
2600
0.060706530621774966
-1426
2700
0.06304139718415093
-1481
2800
0.06537626374652689
-1535
2900
0.06771113030890284
-1590
3000
0.07004599687127881
-1646
3100
0.07238086343365477
-1701
3200
0.07471572999603072
-1756
3300
0.077050596558406

KeyboardInterrupt: 

In [93]:
len(restored_data["dev_data"]['item1'])

2740998

In [64]:
start_list = list(range(0, len(trainer.test_data['label']), int(trainer.config['data_loader']['batch_size'])))
len(start_list)

42829

In [66]:
fp_dev = open(config['data']['valid_behavior'], 'r', encoding='utf-8')
count_lines = 0
for lien in fp_dev:
    count_lines += 1
print(count_lines)

73152


In [61]:
y_pred = []
for start in start_list:
    print(start)
    if start + int(trainer.config['data_loader']['batch_size']) <= len(trainer.test_data['label']):
        end = start + int(trainer.config['data_loader']['batch_size'])
    else:
        end = len(trainer.test_data['label'])
    out = trainer.model(trainer.test_data['item1'][start:end], trainer.test_data['item2'][start:end],
                         trainer.config['trainer']['task'])[0].cpu().data.numpy()

truth = trainer.test_data['label']
auc_score = cal_auc(truth, y_pred)
print("auc socre: " + str(auc_score))

0
64
128
192
256
320
384
448
512
576
640
704
768
832
896
960
1024
1088
1152
1216
1280
1344
1408
1472
1536
1600
1664
1728
1792
1856
1920
1984
2048
2112
2176
2240
2304
2368
2432
2496
2560
2624
2688
2752
2816
2880
2944
3008
3072
3136
3200
3264
3328
3392
3456
3520
3584
3648
3712
3776
3840
3904
3968
4032
4096
4160
4224
4288
4352
4416
4480
4544
4608
4672
4736
4800
4864
4928
4992
5056
5120
5184
5248
5312
5376
5440
5504
5568
5632
5696
5760
5824
5888
5952
6016
6080
6144
6208
6272
6336
6400
6464
6528
6592
6656
6720
6784
6848
6912
6976
7040
7104
7168
7232
7296
7360
7424
7488
7552
7616
7680
7744
7808
7872
7936
8000
8064
8128
8192
8256
8320
8384
8448
8512
8576
8640
8704
8768
8832
8896
8960
9024
9088
9152
9216
9280
9344
9408
9472
9536
9600
9664
9728
9792
9856
9920
9984
10048
10112
10176
10240
10304
10368
10432
10496
10560
10624
10688
10752
10816
10880
10944
11008
11072
11136
11200
11264
11328
11392
11456
11520
11584
11648
11712
11776
11840
11904
11968
12032
12096
12160
12224
12288
12352
12416
12480


KeyboardInterrupt: 

In [33]:
def _valid_epoch(epoch):
    """
    Validate after training an epoch
    :param epoch: Integer, current training epoch.
    :return: A log that contains information about validation
    """
    self.model.eval()
    y_pred = []
    start_list = list(range(0, len(self.test_data['label']), int(self.config['data_loader']['batch_size'])))
    for start in start_list:
        if start + int(self.config['data_loader']['batch_size']) <= len(self.test_data['label']):
            end = start + int(self.config['data_loader']['batch_size'])
        else:
            end = len(self.test_data['label'])
        out = self.model(self.test_data['item1'][start:end], self.test_data['item2'][start:end],
                         self.config['trainer']['task'])[
            0].cpu().data.numpy()

        y_pred.extend(out)
    truth = self.test_data['label']
    auc_score = cal_auc(truth, y_pred)  # had to switch input parameters for it to work
    print("auc socre: " + str(auc_score))
    return auc_score

In [70]:
!ls

DNLP_project.ipynb		    embedding_part.ipynb  parse_config.py
KG_part.ipynb			    framework.PNG	  req.txt
README.md			    kred_example.ipynb	  requirements.txt
__init__.py			    logger		  train_test.py
__pycache__			    main.py		  trainer
base				    model		  utils
config.yaml			    model_run.ipynb
data_after_training_singletask.pkl  out
