In [1]:
import xml.etree.ElementTree as ET
import json
import re
from tqdm import tqdm
import pandas as pd
import torch
import numpy as np
import random
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cdist

In [2]:
def same_seeds(seed):
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.set_device(3)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
same_seeds(42)

In [3]:
train_ans = pd.read_csv('dataset/train_ans.csv')
train_ans.head()

Unnamed: 0,topic,doc
0,2,2740164 2759595 2759599 2767143 2911928 291756...
1,4,1459158 2529301 2700561 2769425 2835690 283699...
2,6,1533809 1910597 2503992 2577108 2703746 272215...
3,9,1090616 1679810 2627448 2629012 2645689 271457...
4,11,1892007 2275746 2503962 2516438 2688349 276296...


In [4]:
train_ans['doc'] = train_ans['doc'].str.strip(',').str.split(' ')
train_ans.head()

Unnamed: 0,topic,doc
0,2,"[2740164, 2759595, 2759599, 2767143, 2911928, ..."
1,4,"[1459158, 2529301, 2700561, 2769425, 2835690, ..."
2,6,"[1533809, 1910597, 2503992, 2577108, 2703746, ..."
3,9,"[1090616, 1679810, 2627448, 2629012, 2645689, ..."
4,11,"[1892007, 2275746, 2503962, 2516438, 2688349, ..."


In [5]:
train_ans.to_csv('train_answer.csv', index=False)

In [3]:
with open('dataset/train_query.txt') as json_file:
    tr_data = json.load(json_file)
    idx = []
    cont = []
    for p in tr_data['train_query']:
        idx.append(p['index'])
        txt = p['note'].strip()
        cont.append(' '.join(' '.join([line.strip() for line in i.strip().splitlines()]).split()))

In [4]:
train_query = pd.DataFrame({'topic':idx, 'train_query':cont})
train_query["topic"] = pd.to_numeric(train_query["topic"])
train_query = train_query.sort_values(by=['topic']).reset_index(drop=True)
train_query.head()

Unnamed: 0,topic,train_query
0,2,Ms [**Known patient lastname 241**] is a [**Ag...
1,4,The patient is an 87 yo woman with h/o osteopo...
2,6,This is a [**Age over 90 **] year old female w...
3,9,"Infant is a 24 [**1-31**] week, 678 gm male tr..."
4,11,Mr. [**Name13 (STitle) 5827**] is an 80yo M wi...


In [12]:
train_query.to_csv('train_query.csv', index=False)

In [3]:
with open('dataset/test_query.txt') as json_file:
    tr_data = json.load(json_file)
    idx = []
    cont = []
    for p in tr_data['test_query']:
        idx.append(p['index'])
        cont.append(p['summary'].strip())

In [4]:
test_query = pd.DataFrame({'topic':idx, 'test_query':cont})
test_query["topic"] = pd.to_numeric(test_query["topic"])
test_query = test_query.sort_values(by=['topic']).reset_index(drop=True)
test_query.head()

Unnamed: 0,topic,test_query
0,1,A 78 year old male presents with frequent stoo...
1,3,A 75F found to be hypoglycemic with hypotensio...
2,5,An 82 man with multiple chronic conditions and...
3,7,A 41-year-old male patient with medical histor...
4,8,"A 26 year-old diabetic woman, estimated to 10 ..."


In [6]:
test_query.to_csv('test_query.csv', index=False)

In [5]:
with open('dataset/document.txt') as json_file:
    tr_data = json.load(json_file)
    idx = []
    cont = []
    for p in tr_data['document']:
        idx.append(p['index'])
        cont.append(p['content'][p['content'].rfind('>')+2:].strip())

In [10]:
document = pd.DataFrame({'doc':idx, 'document':cont})
document["doc"] = pd.to_numeric(document["doc"])
document = document.sort_values(by=['doc']).reset_index(drop=True)
document.head()

Unnamed: 0,doc,document
0,13915,Background: Resistance to mammary tumorigenes...
1,15027,Background: Standard archival sequence databa...
2,17824,Our results show that cytokines derived from m...
3,28992,Background: This study evaluated the feasibil...
4,28996,Background: In order to test the hypothesis t...


In [11]:
document.shape

(100000, 2)

In [12]:
new_document = []
for i in tqdm(document['document']):
    new_document.append(' '.join(' '.join([line.strip() for line in i.strip().splitlines()]).split()))

100%|██████████| 100000/100000 [00:50<00:00, 1991.01it/s]


In [15]:
new_document = pd.DataFrame({'doc':document['doc'], 'document':new_document})
new_document.head()

Unnamed: 0,doc,document
0,13915,Background: Resistance to mammary tumorigenesi...
1,15027,Background: Standard archival sequence databas...
2,17824,Our results show that cytokines derived from m...
3,28992,Background: This study evaluated the feasibili...
4,28996,Background: In order to test the hypothesis th...


In [16]:
new_document.to_csv('document.csv', index=False)

In [14]:
def chunking(max_len, sent):
    tokenized_text = sent.lower().split(" ")
    # using list comprehension
    final = [tokenized_text[i * max_len:(i + 1) *max_len] for i in range((len(tokenized_text) + max_len - 1) // max_len)] 
    
    # join back to sentences for each of the chunks
    sent_chunk = []
    for item in final:
        sent_chunk.append(' '.join(item))
    return sent_chunk

chunk = []
for i in tqdm(new_document):
    chunk.append(chunking(512, i))

100%|██████████| 100000/100000 [00:50<00:00, 1993.77it/s]


In [15]:
# import torch
# from transformers import AutoTokenizer, AutoModelWithLMHead

# tokenizer = AutoTokenizer.from_pretrained('t5-small')
# model = AutoModelWithLMHead.from_pretrained('t5-small', return_dict=True).cuda()

# doc_summary = []
# for text in tqdm(document['document']):
#     inputs = tokenizer.encode("summarize: " + text,
#                               return_tensors='pt',
#                               max_length=512,
#                               truncation=True).cuda()
#     summary_ids = model.generate(inputs, max_length=150, min_length=80, length_penalty=0, num_beams=2)
#     summary = tokenizer.decode(summary_ids[0])
#     doc_summary.append(summary[summary.rfind('<pad>')+5:summary.rfind('</s>')].strip())

# new_document = pd.DataFrame({'doc':document['doc'], 'document':document['document'], 'summary':doc_summary})
# new_document.to_csv('doc_summary.csv')

In [16]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2').cuda()

In [17]:
sentence_embeddings = []

for chk in tqdm(chunk):
    chk_embeddings = model.encode(chk)
    sentence_embeddings.append(torch.mean(torch.tensor(chk_embeddings), 0).tolist())

100%|██████████| 100000/100000 [50:45<00:00, 32.84it/s] 


In [38]:
train_query_document = train_query['train_query']
train_query_embeddings = model.encode(train_query_document)

In [39]:
test_query_document = test_query['test_query']
test_query_embeddings = model.encode(test_query_document)

In [40]:
train_cosine_similarity = cosine_similarity(train_query_embeddings, sentence_embeddings)

In [41]:
indices = (torch.topk(torch.tensor(train_cosine_similarity), 51).values>0.3)*torch.topk(torch.tensor(train_cosine_similarity), 51).indices

In [42]:
topic = []
doc = []
for i in range(len(indices)):
    for j in range(len(indices[i])):
        if (indices[i][j] != 0) and j!=0:
            topic.append(train_query['topic'].values[i])
            doc.append(document['doc'].values[indices[i][j]])
            print('i = {}, query_i = {}, doc_j = {}'.format(i, train_query['topic'].values[i],document['doc'].values[indices[i][j]]))

i = 0, query_i = 2, doc_j = 2762554
i = 0, query_i = 2, doc_j = 3528042
i = 0, query_i = 2, doc_j = 2767143
i = 0, query_i = 2, doc_j = 3338223
i = 0, query_i = 2, doc_j = 4097904
i = 0, query_i = 2, doc_j = 4241333
i = 0, query_i = 2, doc_j = 2759595
i = 0, query_i = 2, doc_j = 3955657
i = 0, query_i = 2, doc_j = 2823235
i = 0, query_i = 2, doc_j = 4574333
i = 0, query_i = 2, doc_j = 2911928
i = 0, query_i = 2, doc_j = 2981898
i = 0, query_i = 2, doc_j = 2759587
i = 0, query_i = 2, doc_j = 3419665
i = 0, query_i = 2, doc_j = 2850897
i = 0, query_i = 2, doc_j = 3358686
i = 0, query_i = 2, doc_j = 3651383
i = 0, query_i = 2, doc_j = 3087225
i = 0, query_i = 2, doc_j = 3360183
i = 0, query_i = 2, doc_j = 3339098
i = 0, query_i = 2, doc_j = 2621231
i = 0, query_i = 2, doc_j = 4045352
i = 0, query_i = 2, doc_j = 3445102
i = 0, query_i = 2, doc_j = 4308016
i = 0, query_i = 2, doc_j = 3377139
i = 0, query_i = 2, doc_j = 3143992
i = 0, query_i = 2, doc_j = 3104464
i = 0, query_i = 2, doc_j = 

In [43]:
train_topic = train_query['topic'].values
result = []
for i in range(len(train_topic)):
    inputt = []
    for j in range(len(topic)):
        if topic[j] == train_topic[i]:
            inputt.append(doc[j])
    result.append(inputt)

In [44]:
def intersection(lst1, lst2):
    lst3 = [value for value in lst1 if value in lst2]
    return lst3

In [45]:
golden_total = []
predict_total = []
total = []
intersect_total = []
intersect = []

for i in range(len(train_ans['doc'])):
    golden_total.append(len(train_ans['doc'][i]))
    predict_total.append(len(result[i]))
    total.append(len(train_ans['doc'][i])+len(result[i]))
    intersect.append(intersection(result[i], list(map(int, (train_ans['doc'][i])))))
    intersect_total.append(len(intersection(result[i], list(map(int, (train_ans['doc'][i]))))))

In [46]:
final = pd.DataFrame({'topic':train_ans['topic'], 'intersect':intersect, 'intersect_total':intersect_total, 'predict_total':predict_total, 'golden_total':golden_total, 'total':total, 'score':np.array(intersect_total)/np.array(total)})
final

Unnamed: 0,topic,intersect,intersect_total,predict_total,golden_total,total,score
0,2,"[3528042, 2767143, 3338223, 2759595, 2911928, ...",14,50,34,84,0.166667
1,4,[],0,50,18,68,0.0
2,6,"[3227160, 3352193, 2976516, 2703746, 4531919, ...",14,50,64,114,0.122807
3,9,"[3672599, 3492090, 3446120, 2994134, 2714572, ...",15,50,51,101,0.148515
4,11,"[3809224, 4612484, 2503962, 2799444, 4295862, ...",6,50,56,106,0.056604
5,13,"[2691928, 3310784]",2,50,50,100,0.02
6,14,"[1824744, 3996170, 3701496, 2707150, 3389947, ...",9,50,54,104,0.086538
7,15,"[4718127, 3620928, 3277036]",3,50,50,100,0.03
8,19,"[4778408, 3748420, 3992766, 3337512, 3410308]",5,50,62,112,0.044643
9,22,[4310357],1,36,8,44,0.022727


In [47]:
np.mean(final['score'])

0.08189171010884361

In [49]:
test_cosine_similarity = cosine_similarity(test_query_embeddings, sentence_embeddings)

In [50]:
indices = (torch.topk(torch.tensor(test_cosine_similarity), 51).values>0.3)*torch.topk(torch.tensor(test_cosine_similarity), 51).indices

In [51]:
topic = []
doc = []
for i in range(len(indices)):
    for j in range(len(indices[i])):
        if (indices[i][j] != 0) and j!=0:
            topic.append(test_query['topic'].values[i])
            doc.append(document['doc'].values[indices[i][j]])
            print('i = {}, query_i = {}, doc_j = {}'.format(i, test_query['topic'].values[i],document['doc'].values[indices[i][j]]))

i = 0, query_i = 1, doc_j = 3767862
i = 0, query_i = 1, doc_j = 4321082
i = 0, query_i = 1, doc_j = 4154917
i = 0, query_i = 1, doc_j = 4271459
i = 0, query_i = 1, doc_j = 4292062
i = 0, query_i = 1, doc_j = 4480181
i = 0, query_i = 1, doc_j = 4534937
i = 0, query_i = 1, doc_j = 3954265
i = 0, query_i = 1, doc_j = 4030609
i = 0, query_i = 1, doc_j = 4004840
i = 0, query_i = 1, doc_j = 4737844
i = 0, query_i = 1, doc_j = 4302625
i = 0, query_i = 1, doc_j = 4309677
i = 0, query_i = 1, doc_j = 4326288
i = 0, query_i = 1, doc_j = 4298112
i = 0, query_i = 1, doc_j = 3193822
i = 0, query_i = 1, doc_j = 3894017
i = 0, query_i = 1, doc_j = 3305541
i = 0, query_i = 1, doc_j = 2963747
i = 0, query_i = 1, doc_j = 3640153
i = 0, query_i = 1, doc_j = 3471405
i = 0, query_i = 1, doc_j = 3075185
i = 0, query_i = 1, doc_j = 2726551
i = 0, query_i = 1, doc_j = 3999026
i = 0, query_i = 1, doc_j = 3841668
i = 0, query_i = 1, doc_j = 3542894
i = 0, query_i = 1, doc_j = 3439954
i = 0, query_i = 1, doc_j = 

In [52]:
predict_total = []
predict = []

for i in range(len(test_query['topic'])):
    predict_total.append(len(result[i]))
    predict.append(' '.join(list(map(str, sorted(result[i])))))

In [59]:
final_result = pd.DataFrame({'topic':test_query['topic'], 'doc':predict})
final_result

Unnamed: 0,topic,doc
0,1,2553427 2584651 2621231 2759587 2759595 276255...
1,3,1388220 1766477 2413407 2685234 2686344 290672...
2,5,1533809 2672232 2672256 2703746 2722154 279435...
3,7,2525846 2629012 2668557 2700561 2714572 284297...
4,8,2503962 2700560 2726557 2737796 2766584 279938...
5,10,2588582 2612686 2691928 2702896 2721944 277929...
6,12,60654 1277011 1824744 1847685 2650615 2650997 ...
7,16,2223623 2612685 2919999 2962298 3033825 305672...
8,17,1196471 1800858 2386356 2596799 2684430 271195...
9,18,130966 183850 2735159 2835690 2914738 3150839 ...


In [60]:
final_result.to_csv('dataset/result1.csv', index=False)