In [211]:
import parser
import numpy as np
import gzip, msgpack
from torch.nn.modules import PairwiseDistance
import random
import torch 
import torch.nn as nn
from torch.autograd import Variable
import scipy.sparse as sp
import time

In [434]:
from sklearn.feature_extraction.text import HashingVectorizer
N_FEATURES = 2*18
hv = HashingVectorizer(n_features=N_FEATURES,non_negative=True, analyzer='word',
                       ngram_range=(3,3), norm='l2', stop_words='english')
# try default n-gram 1-gram,2-gram,3-gram
dtype = torch.FloatTensor

def convert_to_variable(abstract):
#     pos_x = abstract.nonzero()[0].tolist()
#     pos_y = abstract.nonzero()[1].tolist()

#     i = torch.LongTensor([pos_x, pos_y])
#     v = torch.FloatTensor([a[1].abstract[x,y] for x,y in zip(pos_x,pos_y)])
#     sparse = torch.sparse.FloatTensor(i, v, torch.Size([1,N_FEATURES]))
    sparse = torch.from_numpy(abstract.todense()).type(dtype)
    tensor = Variable(sparse)
    return tensor


In [435]:

class SmallReference:
    def __init__(self, identifiers):
        self.identifiers = identifiers

class SmallPaper:
    def __init__(self, title, identifiers, authors, abstract, refs):
        self.title = title
        self.authors = authors
#         transformed = 
        self.abstract = hv.transform([abstract])
        self.references = [SmallReference(ref.identifiers) for ref in refs]
        self.identifiers = identifiers

In [436]:
def get_doi(paper):
    _ref_dict = dict()
    for x in paper.identifiers:
#        @todo lowecase
        _ref_dict[x.key_type] = x.key.lower()
    if 'doi' in _ref_dict:
        return _ref_dict['doi']
    elif 'ieee' in _ref_dict:
        return _ref_dict['ieee']
    elif 'semanticscholar' in _ref_dict:
        return _ref_dict['semanticscholar']
    elif 'adsa' in _ref_dict:
        return _ref_dict['adsa']
    else:
        return False

    
def update_authors(ident_to_auth, authors_to_papers, paper):
    for author in paper.authors:
        author_name = author.name.name
        
        if not author_name:
            continue
        for identifier in author.identifiers:
            result = ident_to_auth.get(identifier.key.lower(), False) 
            if result:
                author_name = str(result)
                break
        for identifier in author.identifiers:
            ident_to_auth[identifier.key.lower()] = author_name
        if not authors_to_papers.get(author_name, False):
            authors_to_papers[author_name] = list()
        authors_to_papers[author_name].append(get_doi(paper))

In [455]:
MAX_ELEMENTS = 30000
train_test_val_share = [0.5*MAX_ELEMENTS, 0.4*MAX_ELEMENTS, MAX_ELEMENTS]

identies_set = dict()

_paper_dict = dict()
myiter = 0
count=0
_ref_dict = dict()
_training_set = list()
_validation_set = list()
_test_set = list()
_ident_to_auth_dict = dict()
_authors_to_papers_dict = dict()
with gzip.open("NN_Papers.msgpack.gz", "rb") as nn_papers_out:
    unpacker = msgpack.Unpacker(nn_papers_out, encoding='utf-8')
    for _paper in unpacker:
        count+=1
        paper = parser.Paper.deserialize(_paper)
        
        paper_doi = get_doi(paper)
        if not (paper_doi and paper.abstract and paper.abstract.strip()):
            continue
        else:
            myiter+=1
        paper = SmallPaper(paper.title, paper.identifiers, paper.authors, paper.abstract.strip(), paper.references)
        if myiter>=MAX_ELEMENTS:
            break
        #calculate statistics
        for iden in paper.identifiers:
            if identies_set.get(iden.key_type, False):
                identies_set[iden.key_type]+=1
            else:
                identies_set[iden.key_type]=1
        #end
        
        _ref_dict[paper_doi] = list()
 
        update_authors(_ident_to_auth_dict, _authors_to_papers_dict, paper)
        _paper_dict[paper_doi] = paper
        if myiter<=train_test_val_share[0]:
            _training_set.append(paper_doi)
        elif myiter<=train_test_val_share[1]: 
            _test_set.append(paper_doi)
        else:
            _validation_set.append(paper_doi)

print('total count: {0}'.format(count))
print(identies_set)



total count: 43293
{'acm': 5768, 'doi': 22937, 'springer': 2507, 'onepetro': 122, 'adsa': 6875, 'oxford': 697, 'science': 14, 'ieee': 10322, 'nature': 348, 'semanticscholar': 8002, 'iop': 530, 'pmid': 3932, 'mid': 66, 'pmc': 1132, 'sage': 133, 'pmcid': 1, 'arxiv': 1543, 'manuscript': 124}


In [456]:
def update_references(paper_dict, ref_dict):
    count = 0
    for paper_doi,paper in paper_dict.items():
        for ref in paper.references:
            ref_doi = get_doi(ref)
            if not ref_doi:
                continue
            result = ref_dict.get(ref_doi, False)
            if result!=False:
                count+=1
                ref_dict[paper_doi].append(ref_doi)
    print(count)
update_references(_paper_dict, _ref_dict)

8218


In [457]:


def make_var(abstract):
    dense = abstract.todense()
    torch_tensor = Variable(torch.from_numpy(dense).type(dtype))
    return torch_tensor

In [458]:
# appr = _paper_dict['10.1109/tnnls.2015.2392563']
def get_papers_per_authors(paper):
    set_to_merge = set()
    for author in paper.authors:
        identifier = list(author.identifiers)[0].key
#         print("name {0}. papers: {1}".format(_ident_to_auth_dict[identifier],
#                                              _authors_to_papers_dict[_ident_to_auth_dict[identifier]]))
        set_to_merge.update(set(_authors_to_papers_dict[_ident_to_auth_dict[identifier.lower()]]))
    set_to_merge.discard(get_doi(paper))
    return set_to_merge

In [469]:
BATCH_SIZE = 50

def prepare_triples(val_set):
    count = 0
    result_list = list()
    batch_list_1 = None
    batch_list_2 = None
    batch_list_3 = None
#     print(len(val_set))
    for paper_doi in val_set:
        paper = _paper_dict[paper_doi]
        
        
        pos_ref_set = get_papers_per_authors(paper)
        
        pos_ref_set.update(_ref_dict[paper_doi])
        pos_ref_set = (pos_ref_set.difference(_validation_set)).difference(_test_set)

        #cut down articles that are not used in val_set
        
#         pos_ref_set.intersection(val_set)
        
        if len(pos_ref_set) < 4:
#             print(paper_doi)
            continue
#         print(len(pos_ref_set))
        neg_ref_set = set()
        for n in range(len(pos_ref_set)):
            new_id = None
            while((new_id in pos_ref_set) or new_id==None):
                new_id = val_set[random.randrange(0,len(val_set)-1)]
            neg_ref_set.add(new_id)
            
            
        for pos,neg in zip(pos_ref_set, neg_ref_set):

            if batch_list_1 == None:
                batch_list_1 = paper.abstract
                batch_list_2 = _paper_dict[pos].abstract
                batch_list_3 = _paper_dict[neg].abstract
            else:
                batch_list_1 = sp.vstack((batch_list_1, paper.abstract))
                batch_list_2 = sp.vstack((batch_list_2, _paper_dict[pos].abstract))
                batch_list_3 = sp.vstack((batch_list_3, _paper_dict[neg].abstract))

            if batch_list_1.shape[0]==BATCH_SIZE:
                yield (make_var(batch_list_1), make_var(batch_list_2), make_var(batch_list_3))
                batch_list_1 = None
#     print(count)
    return result_list


    

In [470]:
class DistRes(nn.Module):

    def __init__(self):
        super(DistRes, self).__init__()
        self.layer1 = nn.Linear(N_FEATURES, 50)
        self.layer2 = nn.PairwiseDistance(2)

    def forward(self, x1,x2):
#         print('typex: {0}'.format(type(x1)))
#         print('typey: {0}'.format(type(x2)))
        res1 = torch.nn.functional.tanh(self.layer1(x1))
        res2 = torch.nn.functional.tanh(self.layer1(x2))

#         res3 = self.layer1(x3)
        # normalize vector L2-normalization
        return torch.nn.functional.tanh(self.layer2(res1,res2))
    
    def calculate_res(self, x):
        return torch.nn.functional.tanh(self.layer1(x)).data.numpy()
    
# linear/RelU/linear
# dropout input layer
# help as they authored by same person

In [471]:

dist_model = DistRes()

In [472]:
#check the distance Loss, just to see the difference
# maybe pos and pos close

crit = nn.HingeEmbeddingLoss(1)
optim = torch.optim.RMSprop(dist_model.parameters(), momentum=0.5, lr=0.01)
# local_dist(z)

In [473]:
ones_var = Variable(torch.ones(BATCH_SIZE))
neg_var = Variable(torch.ones(BATCH_SIZE)*(-1))

def train_networks(model, criteria, optimizer, paper, p_pos, p_neg):

    loss_pos = criteria.forward( model(paper, p_pos), ones_var)
    optimizer.zero_grad()
    gradCrit = loss_pos.backward()
    optimizer.step()
    
    loss_neg = criteria.forward( model(paper,p_neg), neg_var)
    optimizer.zero_grad()
    gradCrit = loss_neg.backward()
    optimizer.step()

    return model(paper, p_pos), model(paper,p_neg)




In [474]:
RECALL_N = 5

In [475]:
def validation_for_recall(val_set, items_to_validate=100, neg_examples_amount=200):

    recall_list = list()
    
    total_ref_dict = dict()
    rand_res_list = list()
    paper_index = 0
    recall_iter = items_to_validate
    while(recall_iter!=0):
        paper_doi = valid_set[paper_index]
        paper_index +=1
        paper = _paper_dict[paper_doi]
        pos_ref_set = get_papers_per_authors(paper)
        pos_ref_set.update(_ref_dict[paper_doi])

        pos_ref_set.intersection(val_set)

        
        if len(pos_ref_set)<4:
            continue
        recall_iter -= 1

        neg_examples_amount = 200
        neg_ref_set = set()
        for n in range(neg_examples_amount):
            new_id = None
            while((new_id in pos_ref_set) or new_id==None or (new_id in neg_ref_set)):
                new_id = train_set[random.randrange(0,len(train_set))]
            neg_ref_set.add(new_id)
        
        all_refs = set()
        all_refs.update(pos_ref_set)
        all_refs.update(neg_ref_set)
        rand_res_list.append((len(pos_ref_set)/len(all_refs))*RECALL_N/len(pos_ref_set))
        
        for ref_doi in all_refs:
            total_ref_dict[ref_doi] = make_var(_paper_dict[ref_doi].abstract)
        total_ref_dict[paper_doi] = make_var(_paper_dict[paper_doi].abstract)
        
        recall_list.append((paper_doi, 
                            pos_ref_set, 
                            all_refs))

 
    rand_res = 'random result: {0}'.format(np.mean(rand_res_list))

    return recall_list, total_ref_dict, rand_res

 

In [476]:

def calculate_recall(model, recall_list, total_ref_dict):

    a=0
    recall_at_n = list()
    
    #calculate all vectors for the set
    doi_to_vecs = dict()
    count_1 = 0
#     t0 = time.time()
    for key,value in total_ref_dict.items():
        doi_to_vecs[key] = model.calculate_res(value)
#     print(time.time()-t0)
    
#     t0 = time.time()
    
    for paper_doi, pos_ref_set, all_refs in recall_list:

        paper_vector = doi_to_vecs[paper_doi]

        distance_array = dict()
        for ref_doi in all_refs: 
            ref_vec = doi_to_vecs[ref_doi]
            ref_dist = np.linalg.norm(paper_vector - ref_vec)
            distance_array[ref_doi] = ref_dist

        ordered_predicted_list = list(map(lambda x: x[0], 
                                          sorted(distance_array.items(), 
                                                 key=lambda val: val[1])))[:RECALL_N]
        recall = len(pos_ref_set.intersection(ordered_predicted_list)) / len(pos_ref_set)
        if a!=0 and len(pos_ref_set)!=1:
            print('recall: {2}. expected: {0}, got {1}'.format(pos_ref_set, ordered_predicted_list, recall))

            a-=1
        #print(recall)
        recall_at_n.append(recall)
#     print(count_1)
#     print(time.time()-t0)
    return np.mean(recall_at_n)


In [477]:
train_set = _training_set
valid_set = _validation_set
# t0=time.time()

# print(time.time()-t0)
# print(len(triples_iterator))

In [478]:
# my_time = time.time()

def write_to_file(file, text_to_write):
    print(text_to_write)
    file.write(text_to_write)
    file.write('\n')
    file.flush()



In [None]:
epochs = 100
epoch_auc = dict()
my_time = time.time()



f = open('/home/jupyter/output_{0}.txt'.format(my_time),'w')

recall_list, recall_ref_dict, text = validation_for_recall(train_set)
write_to_file(f, text)

recall = calculate_recall(dist_model, recall_list, recall_ref_dict)
text = 'Beginning recall: {0}'.format(recall)
write_to_file(f, text)


for epoch in range(epochs):
    aver_pos = list()
    aver_neg = list()
    triples_iterator = prepare_triples(train_set)    
    batch=0
    t0 = time.time()
    for arg1,arg2,arg3 in triples_iterator:

        ap, an = train_networks(model=dist_model, criteria=crit, optimizer=optim, 
                                paper=arg1, p_pos=arg2, 
                                p_neg=arg3)
        
        aver_pos.append(ap.data.mean())
        aver_neg.append(an.data.mean())
        if (batch % 100 == 0 and batch != 0 ):
            t1 = time.time()
            total= t1-t0
            text = 'Epoch: {0}. Batch: {3}. Time {4} av_pos: {1}. av_neg: {2}'.format(epoch, np.mean(aver_pos), 
                                                                               np.mean(aver_neg), 
                                                                                 batch, total)
            write_to_file(f, text)
            t0 = time.time()
        batch+=1
    
    #calc auc
    t0 = time.time()
    recall = calculate_recall(dist_model, recall_list, recall_ref_dict)
    t1 = time.time()
#     print('recall calculated in {0} s.'.format(t1-t0))
    epoch_auc[epoch] = recall
    text = 'Epoch: {0}. av_pos: {1}. av_neg: {2}. recall: {3}'.format(epoch, np.mean(aver_pos), np.mean(aver_neg), recall)
    write_to_file(f, text)

f.close


random result: 0.02392248943273766
Beginning recall: 0.02227366152366152
Epoch: 0. Batch: 100. Time 9.827518701553345 av_pos: 0.9458517710575945. av_neg: 0.9572097988884047
Epoch: 0. Batch: 200. Time 10.235700845718384 av_pos: 0.9563150741658036. av_neg: 0.9664805687660009


In [None]:
recall_list, recall_ref_dict, text = validation_for_recall(train_set)
print(text)
text = calculate_recall(dist_model, recall_list, recall_ref_dict)
print(text)

In [None]:
a = {'a':1,'b':5, 'c':3,'d':0}
b = list(map(lambda x: x[0], sorted(a.items(), key=lambda val: val[1])[:3]))
c = {'d', 'a'}
d = len(c.intersection(b))/len(c)
np.mean([d,0])
int(10 / 3)

In [None]:
stacked_matrix.shape