In [1]:
import parser
import numpy as np
import gzip, msgpack
from torch.nn.modules import PairwiseDistance
import random
import torch 
import torch.nn as nn
from torch.autograd import Variable
import scipy.sparse as sp
import time

In [2]:
from sklearn.feature_extraction.text import HashingVectorizer
N_FEATURES = 100
hv = HashingVectorizer(n_features=N_FEATURES,non_negative=True, analyzer='word',
                       ngram_range=(2,3), norm='l2', stop_words='english')
# try default n-gram 1-gram,2-gram,3-gram

In [3]:
class SmallReference:
    def __init__(self, identifiers):
        self.identifiers = identifiers

class SmallPaper:
    def __init__(self, title, identifiers, authors, abstract, refs):
        self.title = title
        self.authors = authors
        self.abstract = hv.transform([abstract])
        self.references = [SmallReference(ref.identifiers) for ref in refs]
        self.identifiers = identifiers

In [4]:
def get_doi(paper):
    _ref_dict = dict()
    for x in paper.identifiers:
#        @todo lowecase
        _ref_dict[x.key_type] = x.key.lower()
    if 'doi' in _ref_dict:
        return _ref_dict['doi']
    elif 'ieee' in _ref_dict:
        return _ref_dict['ieee']
    elif 'semanticscholar' in _ref_dict:
        return _ref_dict['semanticscholar']
    elif 'adsa' in _ref_dict:
        return _ref_dict['adsa']
    else:
        return False

    
def update_authors(ident_to_auth, authors_to_papers, paper):
    for author in paper.authors:
        author_name = author.name.name
        
        if not author_name:
            continue
        for identifier in author.identifiers:
            result = ident_to_auth.get(identifier.key.lower(), False) 
            if result:
                author_name = str(result)
                break
        for identifier in author.identifiers:
            ident_to_auth[identifier.key.lower()] = author_name
        if not authors_to_papers.get(author_name, False):
            authors_to_papers[author_name] = list()
        authors_to_papers[author_name].append(get_doi(paper))

In [5]:
MAX_ELEMENTS = 2000
train_test_val_share = [0.8*MAX_ELEMENTS, 0.9*MAX_ELEMENTS, MAX_ELEMENTS]

identies_set = dict()

_paper_dict = dict()
myiter = 0
count=0
_ref_dict = dict()
_training_set = list()
_validation_set = list()
_test_set = list()
_ident_to_auth_dict = dict()
_authors_to_papers_dict = dict()
with gzip.open("NN_Papers.msgpack.gz", "rb") as nn_papers_out:
    unpacker = msgpack.Unpacker(nn_papers_out, encoding='utf-8')
    for _paper in unpacker:
        count+=1
        paper = parser.Paper.deserialize(_paper)
        
        paper_doi = get_doi(paper)
        if not (paper_doi and paper.abstract):
            continue
        else:
            myiter+=1
        paper = SmallPaper(paper.title, paper.identifiers, paper.authors, paper.abstract, paper.references)
        if myiter>=MAX_ELEMENTS:
            break
        #calculate statistics
        for iden in paper.identifiers:
            if identies_set.get(iden.key_type, False):
                identies_set[iden.key_type]+=1
            else:
                identies_set[iden.key_type]=1
        #end
        
        _ref_dict[paper_doi] = list()
 
        update_authors(_ident_to_auth_dict, _authors_to_papers_dict, paper)
        _paper_dict[paper_doi] = paper
        if myiter<=train_test_val_share[0]:
            _training_set.append(paper_doi)
        elif myiter<=train_test_val_share[1]: 
            _test_set.append(paper_doi)
        else:
            _validation_set.append(paper_doi)

print('total count: {0}'.format(count))
print(identies_set)



total count: 2880
{'acm': 397, 'doi': 1518, 'springer': 180, 'onepetro': 8, 'adsa': 451, 'oxford': 53, 'science': 2, 'ieee': 666, 'nature': 26, 'semanticscholar': 544, 'iop': 35, 'pmid': 257, 'mid': 5, 'pmc': 73, 'sage': 11, 'arxiv': 103, 'manuscript': 8}


In [6]:
def update_references(paper_dict, ref_dict):
    count = 0
    for paper_doi,paper in paper_dict.items():
        for ref in paper.references:
            ref_doi = get_doi(ref)
            if not ref_doi:
                continue
            result = ref_dict.get(ref_doi, False)
            if result!=False:
                count+=1
                ref_dict[paper_doi].append(ref_doi)
    print(count)
update_references(_paper_dict, _ref_dict)

54


In [None]:
# count = 0
# for key, value in _authors_to_papers_dict.items():
#     if len(value)>20:
#         count+=1
# count

In [None]:
# len(_authors_to_papers_dict)

In [8]:

# print(list(_paper_dict.values())[11].abstract)



In [9]:
# abstract_training_set = [paper.abstract for paper in training_set]
# abstract_training_set_hashed = hv.transform(abstract_training_set)

In [10]:
dtype = torch.FloatTensor

def make_var(abstract):
    dense = abstract.todense()
    torch_tensor = Variable(torch.from_numpy(dense).type(dtype))
    return torch_tensor

In [11]:

# random.randint(0,100)

In [12]:
# identies_set = dict()
# for paper in paper_list.values():
#     for iden in paper.identifiers:
#         if identies_set.get(iden.key_type, False):
#             identies_set[iden.key_type]+=1
#         else:
#             identies_set[iden.key_type]=1
# print(identies_set)
# len(paper_list)

In [13]:



# def get_triple(val_set):
#     count = 0
#     t0 = 0
#     t1 =0 b
#     for item in val_set:
#         count += 1
#         _ref_doi = get_doi(item)
#         if _ref_doi not in ref_dict.keys():
#             continue
#         _ref_list = ref_dict[_ref_doi]
        
#         for _ref in _ref_list:
#             if paper_list.get(_ref, False) in val_set:
#                 _neg_example = None
#                 while(not _neg_example):
#                     _neg_id = random.randint(0,len(val_set)-1)
#                     _neg_doi = get_doi(val_set[_neg_id])
#                     if (_neg_doi != _ref_doi) and (_neg_doi not in _ref_list):
#                         neg_ref_dict = ref_dict[_neg_doi]
#                         if not (_ref_doi in neg_ref_dict or _ref in neg_ref_dict):
#                             _neg_example = val_set[_neg_id]
# #                 t1 = time.time()
# #                 print('time to sample triple: {0}'.format(t1-t0))
#                 yield (item.abstract, paper_list[_ref].abstract, _neg_example.abstract)
# #                 t0 = time.time()

In [14]:
# appr = _paper_dict['10.1109/tnnls.2015.2392563']
def get_papers_per_authors(paper):
    set_to_merge = set()
    for author in paper.authors:
        identifier = list(author.identifiers)[0].key
#         print("name {0}. papers: {1}".format(_ident_to_auth_dict[identifier],
#                                              _authors_to_papers_dict[_ident_to_auth_dict[identifier]]))
        set_to_merge.update(set(_authors_to_papers_dict[_ident_to_auth_dict[identifier.lower()]]))
    set_to_merge.discard(get_doi(paper))
    return set_to_merge

In [15]:
# get_papers_per_authors(_paper_dict['10.1109/tnnls.2015.2392563'])
# [x for x in (1,2,3,4)]
# {1,3,4,5}.difference(_validation_set)

In [16]:
BATCH_SIZE = 50

def prepare_triples(val_set):
    count = 0
    result_list = list()
    batch_list_1 = None
    batch_list_2 = None
    batch_list_3 = None
#     print(len(val_set))
    for paper_doi in val_set:
        paper = _paper_dict[paper_doi]
        
        
        pos_ref_set = get_papers_per_authors(paper)
        
        pos_ref_set.update(_ref_dict[paper_doi])
        pos_ref_set = (pos_ref_set.difference(_validation_set)).difference(_test_set)

        #cut down articles that are not used in val_set
        
#         pos_ref_set.intersection(val_set)
        
        if len(pos_ref_set) == 0:
#             print(paper_doi)
            continue
#         print(len(pos_ref_set))
        neg_ref_set = set()
        for n in range(len(pos_ref_set)):
            new_id = None
            while((new_id in pos_ref_set) or new_id==None):
                new_id = val_set[random.randrange(0,len(val_set)-1)]
            neg_ref_set.add(new_id)
            
            
        for pos,neg in zip(pos_ref_set, neg_ref_set):

            if batch_list_1 == None:
                batch_list_1 = paper.abstract
                batch_list_2 = _paper_dict[pos].abstract
                batch_list_3 = _paper_dict[neg].abstract
            else:
                batch_list_1 = sp.vstack((batch_list_1, paper.abstract))
                batch_list_2 = sp.vstack((batch_list_2, _paper_dict[pos].abstract))
                batch_list_3 = sp.vstack((batch_list_3, _paper_dict[neg].abstract))

            if batch_list_1.shape[0]==BATCH_SIZE:
                yield (batch_list_1, batch_list_2, batch_list_3)
                batch_list_1 = None
#     print(count)
    return result_list


    

In [17]:

# get_papers_per_authors(_paper_dict[_training_set[2]]).intersection(_training_set[:50])

In [18]:
# a1,a2,a3 = next(triples_iterator)

In [19]:
# a1[0]

In [20]:



# dtype = torch.FloatTensor
# def get_batch(iterator, batch_size=BATCH_SIZE):
#     a1 = None
#     a2 = None
#     a3 = None
#     for x1,x2,x3 in iterator:
#         if a1 == None:
#             a1=x1
#             a2=x2
#             a3=x3
#         else:
#             a1 = sp.vstack((a1,x1))
#             a2 = sp.vstack((a2,x2))
#             a3 = sp.vstack((a3,x3))            
#         if a1.shape[0]==batch_size:
#             dense_a1 = a1.todense()
#             dense_a2 = a2.todense()
#             dense_a3 = a3.todense()
#             tor_a1 = torch.from_numpy(dense_a1).type(dtype)
#             tor_a2 = torch.from_numpy(dense_a2).type(dtype)
#             tor_a3 = torch.from_numpy(dense_a3).type(dtype)
#             yield (Variable(tor_a1),
#                    Variable(tor_a2),
#                    Variable(tor_a3))
#             a1 = None
#             a2 = None
#             a3 = None


# input data sparse matrix


In [22]:
# hv.transform(['avasd', 'asdfwe sdfasdfwe werwe adsfsd']).todense()

In [23]:
class DistRes2(nn.Module):

    def __init__(self):
        super(DistRes, self).__init__()
        self.layer1 = nn.Linear(N_FEATURES, 50)
        self.layer2 = nn.PairwiseDistance(2)

    def forward(self, x1,x2):
#         print('typex: {0}'.format(type(x1)))
#         print('typey: {0}'.format(type(x2)))
        res1 = self.layer1(x1)
        res2 = self.layer1(x2)
        # normalize vector L2-normalization
        return self.layer2(res1,res2)
    

In [24]:
class DistRes(nn.Module):

    def __init__(self):
        super(DistRes, self).__init__()
        self.layer1 = nn.Linear(N_FEATURES, 50)
        self.layer2 = nn.PairwiseDistance(2)

    def forward(self, x1,x2):
#         print('typex: {0}'.format(type(x1)))
#         print('typey: {0}'.format(type(x2)))
        res1 = torch.nn.functional.tanh(self.layer1(x1))
        res2 = torch.nn.functional.tanh(self.layer1(x2))

#         res3 = self.layer1(x3)
        # normalize vector L2-normalization
        return torch.nn.functional.tanh(self.layer2(res1,res2))
    
    def calculate_res(self, x):
        return torch.nn.functional.tanh(self.layer1(x)).data.numpy()
    
# linear/RelU/linear
# dropout input layer
# help as they authored by same person

In [25]:

dist_model = DistRes()

In [26]:
#check the distance Loss, just to see the difference
# maybe pos and pos close

crit = nn.HingeEmbeddingLoss(1)
optim = torch.optim.RMSprop(dist_model.parameters(), momentum=0.5, lr=0.01)
# local_dist(z)

In [27]:
ones_var = Variable(torch.ones(BATCH_SIZE))
neg_var = Variable(torch.ones(BATCH_SIZE)*(-1))

def train_networks(model, criteria, optimizer, paper, p_pos, p_neg):

    loss_pos = criteria.forward( model(paper, p_pos), ones_var)
    optimizer.zero_grad()
    gradCrit = loss_pos.backward()
    optimizer.step()
    
    loss_neg = criteria.forward( model(paper,p_neg), neg_var)
    optimizer.zero_grad()
    gradCrit = loss_neg.backward()
    optimizer.step()

    return model(paper, p_pos), model(paper,p_neg)




In [43]:

RECALL_N = 20

def calculate_recall(model, val_set, train_set):
#     if len(val_set)<1800:
#         raise ValueError('The set is too small to test.')
    a=0
    recall_at_n = list()
    
    #calculate all vectors for the set
    doi_to_vecs = dict()
    count_1 = 0
    
    
    
    for paper_doi in val_set[:500]:
        paper = _paper_dict[paper_doi]
        pos_ref_set = get_papers_per_authors(paper)
        pos_ref_set.update(_ref_dict[paper_doi])


        
        if len(pos_ref_set)==0 or len(pos_ref_set)>RECALL_N:
            continue
        if len(pos_ref_set)>1:
            count_1+=1
        neg_examples_amount = 200
        neg_ref_set = set()
        for n in range(neg_examples_amount):
            new_id = None
            while((new_id in pos_ref_set) or new_id==None or (new_id in neg_ref_set)):
                new_id = train_set[random.randrange(0,len(train_set))]
            neg_ref_set.add(new_id)
        
        all_refs = set()
        all_refs.update(pos_ref_set)
        all_refs.update(neg_ref_set)

        
        for paper_doi in all_refs:
            doi_to_vecs[paper_doi] = model.calculate_res(make_var(_paper_dict[paper_doi].abstract))
        
        paper_vector = doi_to_vecs[paper_doi]
        
        
        #calculate
        distance_array = dict()
        for ref in all_refs: 
            ref_vec = doi_to_vecs[ref]
            ref_dist = np.linalg.norm(paper_vector - ref_vec)
            distance_array[ref] = ref_dist

        ordered_predicted_list = list(map(lambda x: x[0], 
                                          sorted(distance_array.items(), 
                                                 key=lambda val: val[1])))[:RECALL_N]
        recall = len(pos_ref_set.intersection(ordered_predicted_list)) / len(pos_ref_set)
        if a!=0:
            print(ordered_predicted_list)
            print(pos_ref_set)
            a-=1
        #print(recall)
        recall_at_n.append(recall)
#     print(count_1)
    return np.mean(recall_at_n)


In [37]:
# t = time.time()
# print(calculate_recall(model=dist_model, val_set=valid_set))
# print(time.time()-t)


In [38]:
'10.1023/a:1007989231205' in valid_set

False

In [42]:
train_set = _training_set
valid_set = _training_set
# t0=time.time()

# print(time.time()-t0)
# print(len(triples_iterator))

In [40]:
# my_time = time.time()

def write_to_file(file, text_to_write):
    file.write(text_to_write)
    file.write('\n')
    file.flush()



In [41]:
epochs = 20
epoch_auc = dict()
my_time = time.time()



f = open('/home/jupyter/output_{0}.txt'.format(my_time),'w')


recall = calculate_recall(model=dist_model, val_set=valid_set, train_set=train_set)
text = '\nBeginning recall: {0}\n'.format(recall)
print(text)
write_to_file(f, text)

for epoch in range(epochs):
    aver_pos = list()
    aver_neg = list()
    triples_iterator = prepare_triples(train_set)    
    batch=0
    t0 = time.time()
    for arg1,arg2,arg3 in triples_iterator:

        ap, an = train_networks(model=dist_model, criteria=crit, optimizer=optim, 
                                paper=make_var(arg1), p_pos=make_var(arg2), 
                                p_neg=make_var(arg3))
        
        aver_pos.append(ap.data.mean())
        aver_neg.append(an.data.mean())
        if (batch % 100 == 0 and batch != 0 ):
            t1 = time.time()
            total= t1-t0
            text = 'Epoch: {0}. Batch: {3}. Time {4} av_pos: {1}. av_neg: {2}'.format(epoch, np.mean(aver_pos), 
                                                                               np.mean(aver_neg), 
                                                                                 batch, total)
            print(text)
            write_to_file(f, text)
            t0 = time.time()
        batch+=1
    
    #calc auc
    t0 = time.time()
    recall = calculate_recall(model=dist_model, val_set=valid_set, train_set=train_set)
    t1 = time.time()
#     print('recall calculated in {0} s.'.format(t1-t0))
    epoch_auc[epoch] = recall
    text = '\n\nEpoch: {0}. av_pos: {1}. av_neg: {2}. recall: {3}\n\n'.format(epoch, np.mean(aver_pos), np.mean(aver_neg), recall)
    write_to_file(f, text)
    print(text)

f.close




Epoch: 0. av_pos: 0.8776989781931261. av_neg: 0.9496426434516906. recall: 0.09738955823293172




Epoch: 1. av_pos: 0.8476370779946665. av_neg: 0.9648127841949463. recall: 0.06626506024096386




<function TextIOWrapper.close>

In [None]:
triple_iterator1 = get_triple(training_set)
batch_iterator1 = get_batch(triple_iterator1)
arg1,arg2,arg3 = next(batch_iterator1)
k = dist_model(arg1,arg2)

In [None]:
a = {'a':1,'b':5, 'c':3,'d':0}
b = list(map(lambda x: x[0], sorted(a.items(), key=lambda val: val[1])[:3]))
c = {'d', 'a'}
d = len(c.intersection(b))/len(c)
np.mean([d,0])
int(10 / 3)

In [None]:
stacked_matrix.shape