In [1]:
from transformers import AutoTokenizer
# Load the tokenizer published under the id 'meta-llama/Llama-2-7b-hf' and
# write its files into the local directory 'Llama-2-7b-hf'; the tuple of saved
# paths returned by save_pretrained is what the cell displays.
tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-2-7b-hf')
tokenizer.save_pretrained('Llama-2-7b-hf')

('Llama-2-7b-hf/tokenizer_config.json',
 'Llama-2-7b-hf/special_tokens_map.json',
 'Llama-2-7b-hf/tokenizer.model',
 'Llama-2-7b-hf/added_tokens.json',
 'Llama-2-7b-hf/tokenizer.json')

In [None]:
from collections import defaultdict
import pandas as pd
import json
import re

# Count the words of the BioASQ input text that the current `tokenizer` breaks
# into more than one piece, and export them to 'BioASQ_OOV.csv'.
#
# dict_OOV_freq: word -> number of times it was seen split into several pieces
# dict_splits:   word -> number of pieces of its most recently seen split
dict_OOV_freq = defaultdict(int)
dict_splits = defaultdict(int)

with open('../../../../TxtInputFiles/BioASQ_input.txt','r') as f:
    for line in f:
        # Strip everything that is neither a word character nor whitespace
        # before tokenizing.
        target_text = re.sub(r'[^\w\s]', '', line.strip())
        sws = tokenizer.tokenize(target_text)
        print(sws)
        i = 0
        while i < len(sws):
            # A word is the piece at `i` plus every following piece that does
            # not start with the '▁' marker. Grouping this way always advances
            # `i`, so a first piece without the marker (e.g. when `tokenizer`
            # has been rebound to a tokenizer that uses another marker) can no
            # longer stall the loop forever.
            j = i + 1
            while j < len(sws) and not sws[j].startswith('▁'):
                j += 1
            if j - i > 1:
                sw = sws[i:j]
                print(sw)
                word = ''.join(sw).replace('▁','')
                dict_OOV_freq[word] += 1
                dict_splits[word] = len(sw)
            i = j

# One row per split word, in first-seen order.
list_token = list(dict_OOV_freq)
list_freq = [dict_OOV_freq[token] for token in list_token]
list_split = [dict_splits[token] for token in list_token]

df = pd.DataFrame({'token': list_token, 'freq': list_freq, 'split': list_split})
df.to_csv('BioASQ_OOV.csv', index=False)

In [None]:
from transformers import AutoTokenizer
# Rebind `tokenizer` to the one stored in the local directory
# './CHQ_Vocabs/25K_0.5_/'. Every later cell that reads `tokenizer` sees this
# object from now on.
tokenizer = AutoTokenizer.from_pretrained('./CHQ_Vocabs/25K_0.5_/')
# tokenizer.save_pretrained('./EBM-20K-1/')

In [None]:
# Count vocabulary entries with id >= 32000 whose own text, re-tokenized with
# '▁' turned into a space, comes back as more than one piece (and whose second
# piece is not the entry itself).
# NOTE(review): 32000 is presumably the size of the base vocabulary, so ids
# from there on are the added tokens - confirm.
COUNT_OOV = 0
vocab = tokenizer.get_vocab()
for v, token_id in vocab.items():
    # Entries with an id below 258 are never inspected.
    if token_id < 258:
        continue
    sws = tokenizer.tokenize(v.replace('▁', ' '))
    is_split = len(sws) > 1 and sws[1] != v
    # Split entries below id 32000 are ignored; only the rest are reported.
    if is_split and token_id >= 32000:
        print(v, 'ADDED', token_id, tokenizer.tokenize(v.replace('▁', ' ')))
        COUNT_OOV += 1

In [None]:
# Display the current value of COUNT_OOV.
COUNT_OOV

In [None]:
# Arithmetic mean of four ratios.
# NOTE(review): the numerators and denominators are hard-coded; presumably they
# are split-token counts over vocabulary sizes copied from earlier runs -
# confirm, and prefer computing them in code.
(3156/12212 + 984/3610 + 2579/9330 + 715/2358)/4

In [None]:
from transformers import BartTokenizer
import glob

# For every BART vocabulary found under the directory below, compute the share
# of entries with id >= 50265 that the tokenizer itself splits into several
# pieces, then display the mean of those shares.
# NOTE(review): absolute, user-specific path - consider a configurable base directory.
vocab_dirs = glob.glob('/Users/gunjanbalde/Documents/SR-NG-MedVoc/SR-NG-MedVoc/Expert-Domain/Unfiltered_Test/BART-Vocabs/*')

oov_frac = 0.
for fname in vocab_dirs:
    tokenizer = BartTokenizer.from_pretrained(fname)
    pre_vocab = tokenizer.get_vocab()
    a_vocab = 0      # entries with id >= 50265
    count_oov = 0    # of those, how many are split again

    for v, token_id in pre_vocab.items():
        if token_id < 50265: continue
        a_vocab += 1
        sws = tokenizer.tokenize(v.replace('Ġ',' '))
        if len(sws) > 1 and not sws[1]==v:
            count_oov += 1
            # Print the tokenization that was actually tested; the previous
            # code re-tokenized after replacing '▁', a marker this loop does
            # not use, so the printed pieces did not match the check.
            print(v, token_id, sws)
    # A vocabulary without any id >= 50265 contributes 0 instead of raising
    # ZeroDivisionError.
    if a_vocab:
        oov_frac += count_oov/a_vocab

# Average over the vocabularies actually found rather than a fixed 4.
oov_frac/len(vocab_dirs) if vocab_dirs else 0.

In [None]:
import json
import glob
# Collect the document ids referenced by the snippets of every 'summary'
# question in the BioASQ training JSON files. The id is the last path segment
# of each snippet's 'document' value.
list_abstracts = []
for fname in glob.glob('./BioASQ-training9b/*.json'):
    # Context manager so the file handle is closed after parsing.
    with open(fname, 'r') as json_file:
        data = json.load(json_file)
    for item in data['questions']:
        if item['type'] == 'summary':
            for doc in item['snippets']:
                print(fname, doc['document'])
                list_abstracts.append(doc['document'].split('/')[-1])
# De-duplicate; sorting makes the order identical from run to run, which a
# plain list(set(...)) of strings does not guarantee.
list_abstracts = sorted(set(list_abstracts))

In [None]:
# Spot check: is the id '7270517' among the collected entries?
'7270517' in list_abstracts

In [None]:
from Bio import Entrez
import sys
import csv
from time import sleep
import random
# Set the `email` attribute of the Entrez module before any request is made.
# NOTE(review): a personal address is hard-coded here - consider reading it
# from an environment variable before sharing the notebook.
Entrez.email = 'balde.gunjan0812@kgpian.iitkgp.ac.in'
 
def fetch_abstracts(pub_ids, retmax=1000, output_file='abstracts.csv'):
    """Download titles and abstracts from PubMed and append them to a TSV file.

    The ids are requested in batches of at most `retmax`. For every returned
    article that has both a title and an abstract, one row is appended to
    `output_file` with the columns 'pub_id' and 'abstract' (title, newline,
    abstract sections joined by spaces), tab-delimited. Articles with a missing
    field are reported on stdout and skipped.

    Args:
        pub_ids: list of PubMed ids as strings.
        retmax: maximum number of ids per request.
        output_file: path of the file opened in append mode; a header row is
            written together with the first batch.

    Returns:
        None.
    """
    total = len(pub_ids)
    # Make sure requests to NCBI are not too big
    for i in range(0, total, retmax):
        j = min(i + retmax, total)

        print(f"Fetching abstracts from {i} to {j}.")
        handle = Entrez.efetch(db="pubmed", id=','.join(pub_ids[i:j]),
                        rettype="xml", retmode="text", retmax=retmax)
        try:
            records = Entrez.read(handle)
        finally:
            # Release the connection even if parsing fails.
            handle.close()

        rows = []
        for idx, pubmed_article in enumerate(records['PubmedArticle']):
            pub_id = None
            try:
                citation = pubmed_article['MedlineCitation']
                # Take the id from the record itself. Pairing the requested ids
                # with the collected abstracts by position shifts every later
                # abstract onto the wrong id as soon as one record is skipped.
                pub_id = str(citation['PMID'])
                article = citation['Article']
                abstract = (article['ArticleTitle'] + '\n' +
                            ' '.join(article['Abstract']['AbstractText']))
            except (KeyError, TypeError):
                label = pub_id if pub_id is not None else f"record {i + idx}"
                print(f"Error in fetching abstract for {label}")
                continue
            rows.append({'pub_id': pub_id, 'abstract': abstract})

        with open(output_file, 'a', newline='') as csvfile:
            fieldnames = ['pub_id', 'abstract']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames, delimiter='\t')
            if i == 0:
                writer.writeheader()
            writer.writerows(rows)

        # Pause between requests; nothing follows the last batch, so no wait there.
        if j < total:
            sleep(random.randint(10, 30))

# Run the download for the ids collected in `list_abstracts`; rows are written
# to 'bioasq_train_abstracts.csv'.
if __name__ == '__main__':
  fetch_abstracts(list_abstracts, output_file='bioasq_train_abstracts.csv')


In [None]:
# Peek at the first collected id.
list_abstracts[0]

In [2]:
import json
# Load the PubMedQA train split (fold 0) and the test split. Context managers
# close the files once parsed instead of leaving the handles open.
with open('./pubmedqa/data/pqal_fold0/train_set.json','r') as train_file:
    json_train = json.load(train_file)
with open('./pubmedqa/data/test_set.json','r') as test_file:
    json_test = json.load(test_file)

In [None]:
# Dump every training record, one per line of output.
for entry, record in json_train.items():
    print(record)

In [6]:
def _build_pairs(dataset):
    """Turn a PubMedQA mapping into parallel input/target lists.

    Args:
        dataset: mapping whose values are records holding 'QUESTION' (str),
            'CONTEXTS' (list of str) and 'LONG_ANSWER' (str).

    Returns:
        (inputs, targets): each input is the question, a newline, then the
        contexts joined by single spaces; each target is the long answer.
        Both lists follow the iteration order of `dataset`.
    """
    inputs, targets = [], []
    for record in dataset.values():
        inputs.append(record['QUESTION'] + '\n' + ' '.join(record['CONTEXTS']))
        targets.append(record['LONG_ANSWER'])
    return inputs, targets


# The same construction serves both splits; previously the loop was duplicated.
list_train_sd, list_train_rs = _build_pairs(json_train)
list_test_sd, list_test_rs = _build_pairs(json_test)

In [7]:
import pandas as pd
# Wrap each split in a two-column frame ('inputs', 'target') and write it to
# its CSV file without the index column.
df_train = pd.DataFrame(dict(inputs=list_train_sd, target=list_train_rs))
df_test = pd.DataFrame(dict(inputs=list_test_sd, target=list_test_rs))

for frame, csv_path in ((df_train, 'PubMedQA_train.csv'), (df_test, 'PubMedQA_test.csv')):
    frame.to_csv(csv_path, index=False)

In [8]:
# Alternative (disabled): load the splits from JSON-lines files instead.
# import pandas as pd
# df_train = pd.read_json('',orient='records',lines=True)
# df_test = pd.read_json('',orient='records',lines=True)

# For every test input, find the training inputs whose sentence embeddings are
# closest in Euclidean distance and save their row indices.
import pandas as pd
model_path = "pritamdeka/PubMedBERT-mnli-snli-scinli-scitail-mednli-stsb"

from sentence_transformers import SentenceTransformer
model = SentenceTransformer(model_path)

train_encodings = model.encode(df_train['inputs'].tolist(), convert_to_tensor=True)
test_encodings = model.encode(df_test['inputs'].tolist(), convert_to_tensor=True)

train_encodings = train_encodings.detach().cpu().numpy()
test_encodings = test_encodings.detach().cpu().numpy()

import numpy as np
# dist_rep[t, r] = Euclidean distance between test row t and train row r.
dist_rep = np.zeros((test_encodings.shape[0],train_encodings.shape[0]))

for idx1,entry in enumerate(test_encodings):
    # One vectorised norm per test row replaces the inner Python loop over all
    # training rows; values agree up to floating-point rounding.
    dist_rep[idx1] = np.linalg.norm(train_encodings - entry, axis=1)

# Indices of the (up to) 16 nearest training rows per test row. Taking them
# straight from argsort keeps them as integers (they were previously stored in
# a float array) and no longer fails when there are fewer than 16 training rows.
closest_neighbors = np.argsort(dist_rep, axis=1)[:, :16]

np.save('closest_neighbors_PubMedQA.npy', closest_neighbors)

In [None]:
import pandas as pd

# Load the sentence encoder and the two text collections to compare: the lines
# of the PAC input file and the 'inputs' column of the BioASQ training set.
model_path = "pritamdeka/PubMedBERT-mnli-snli-scinli-scitail-mednli-stsb"

from sentence_transformers import SentenceTransformer
model = SentenceTransformer(model_path)

df_BioASQ = pd.read_json('./bioasq_train.json',orient='records',lines=True)
# Context manager so the file handle is closed after reading; each element of
# text_PAC keeps its line ending, as readlines() returns it.
with open('../../../../TxtInputFiles/PAC_input.txt','r') as pac_file:
    text_PAC = pac_file.readlines()
text_BioASQ = df_BioASQ['inputs'].tolist()

In [None]:
import numpy as np
# Pre-allocate one zero-filled row of width 768 per line of text_PAC.
# NOTE(review): 768 is presumably the embedding width of the sentence encoder - confirm.
pac_encodings = np.zeros((len(text_PAC),768))

In [None]:
import tqdm
# Encode text_PAC in consecutive chunks of 1024 lines and store each chunk in
# the matching rows of pac_encodings. Slices are clipped at the end of the
# sequence, so the final (possibly shorter) chunk needs no special case.
for start in tqdm.tqdm(range(0, len(text_PAC), 1024)):
    stop = start + 1024
    chunk_vectors = model.encode(text_PAC[start:stop], convert_to_tensor=True)
    pac_encodings[start:stop] = chunk_vectors.detach().cpu().numpy()

In [None]:
# Encode every BioASQ input and keep the result as a NumPy array on the host.
BioASQ_Encodings = (
    model.encode(text_BioASQ, convert_to_tensor=True)
    .detach()
    .cpu()
    .numpy()
)

In [None]:
import numpy as np
# Replace pac_encodings with the array stored in './PAC_Encodings.npy'.
# NOTE(review): nothing in the visible cells writes this file; presumably it
# was saved from an earlier run - confirm its rows line up with text_PAC.
pac_encodings = np.load('./PAC_Encodings.npy')

In [None]:
import numpy as np
import tqdm
# dist_rep[p, b] = Euclidean distance between PAC row p and BioASQ row b.
# Fail early if the encodings do not line up with the text lists: a shorter
# encoding array would otherwise leave all-zero rows that later rank as the
# "closest" entries.
assert len(pac_encodings) == len(text_PAC), "pac_encodings rows must match text_PAC"
assert len(BioASQ_Encodings) == len(text_BioASQ), "BioASQ_Encodings rows must match text_BioASQ"
dist_rep = np.zeros((len(text_PAC),len(text_BioASQ)))

for idx1,entry in enumerate(pac_encodings):
    if (idx1+1)%1000 == 0:
        print(f'Completed {idx1+1} entries')
    # One vectorised norm per PAC row replaces the inner Python loop over all
    # BioASQ rows; values agree up to floating-point rounding.
    dist_rep[idx1] = np.linalg.norm(BioASQ_Encodings - entry, axis=1)


In [None]:
# Rank PAC lines by their mean distance to all BioASQ inputs and keep the
# (up to) 50000 closest ones.
dist_rep_mean = dist_rep.mean(axis=1)

sorted_indices = dist_rep_mean.argsort()[:50000]

PAC_BioASQ = [text_PAC[idx] for idx in sorted_indices]
with open('./BioASQ_PAC.txt','w') as f:
    # Lines come from readlines(), so normally each already ends in '\n'. The
    # last line of the source file may not; terminate it explicitly so it
    # cannot merge with the line written after it.
    f.writelines(line if line.endswith('\n') else line + '\n' for line in PAC_BioASQ)

In [None]:
# Display the selected row indices.
sorted_indices

In [None]:
from random import Random, shuffle

# Shuffle text_PAC in place with a fixed seed so the random subset taken from
# it can be reproduced; a private Random instance leaves the global generator
# untouched.
# NOTE(review): after this cell the order of text_PAC no longer matches the
# rows of pac_encodings / dist_rep, so the distance-based cells must not be
# re-run without reloading text_PAC.
Random(42).shuffle(text_PAC)

In [None]:
# Write the first (up to) 50000 lines of the current text_PAC order to
# './Random_PAC.txt'.
random_PAC = text_PAC[:50000]
with open('./Random_PAC.txt','w') as f:
    # A line without a trailing '\n' (the last line of the source file) would
    # merge with the following one; terminate every line explicitly.
    f.writelines(line if line.endswith('\n') else line + '\n' for line in random_PAC)

In [None]:
import re

def count_overlapping(string, substring):
    """Count occurrences of `substring` in `string`, overlapping ones included.

    A zero-width lookahead matches at every position where `substring` starts.
    The substring is escaped so that characters such as '.' or '(' are matched
    literally instead of being read as regex syntax.
    """
    return len(re.findall(f"(?={re.escape(substring)})", string))


string = "_hello_hello_hello"
substring = "_hello"

count = count_overlapping(string, substring)
print("Number of overlapping occurrences:", count)
# For comparison: str.count only counts non-overlapping occurrences.
string.count(substring)