# DeepSense: a Deep Learning Method for Full-sentence Search of Biomedical Literature

## Abstract

## Introduction

## Dataset Development

In [1]:
import os
import json
import pandas as pd

In [2]:
# Root directory that the data/result paths built with f"{base_path}/..." in later cells hang off.
# The commented-out line is the absolute server location used previously.
# base_path = '/jupiter/cl17d/Project/src_new'
base_path = './project/src_new'

### Training and Validation Datasets

In [3]:
def get_train_val_stats(data_file):
    """Print sentence and instance counts for a tab-separated dataset file.

    The first line of the file is treated as a header and skipped. For every
    other non-blank row, the first two columns are joined as ``col0|col1`` to
    form the sentence key, and the last column is read as the label:
    ``'1'`` is counted as a positive instance and ``'0'`` as a negative one.

    Args:
        data_file: Path of the TSV file to summarise.

    Returns:
        None. The four counts are printed to stdout.
    """
    sents = set()
    p = 0
    n = 0
    with open(data_file, encoding='utf-8') as f:
        # Default of None keeps an empty file from raising StopIteration.
        next(f, None)
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines carry no instance and would break the row[1] lookup.
                continue
            row = line.split('\t')
            sents.add(f"{row[0]}|{row[1]}")
            if row[-1] == '1': p += 1
            if row[-1] == '0': n += 1
    print(f"{'Total Sentences':30}{len(sents)}\n{'Positive Instances':30}{p}")
    print(f"{'Negative Instances':30}{n}\n{'Total Instances':30}{p+n}")

Training Dataset: input/train_sentences.tsv

Some sentences have more than one citation and therefore more than one positive instance, resulting in 936,591 positive instances. For each positive instance, two negative instances are sampled; because some sampled negatives are duplicates, 1,870,387 negative instances remain after the duplicates are removed.

In [4]:
# Print sentence / instance counts for the training split.
data_file = f"{base_path}/input/train_sentences.tsv"
get_train_val_stats(data_file)

Total Sentences               854101
Positive Instances            936591
Negative Instances            1870387
Total Instances               2806978


Validation Dataset: input/valid_sentences.tsv

In [5]:
# Print sentence / instance counts for the validation split (rebinds data_file).
data_file = f"{base_path}/input/valid_sentences.tsv"
get_train_val_stats(data_file)

Total Sentences               145455
Positive Instances            148269
Negative Instances            296128
Total Instances               444397


### Test Datasets

- SQL_BM25: case_SQL_testing_final_complete
- PubMed_TFIDF: case_PubMed_testing_final_complete
- PubMed_BM: case_PubMed_BM_testing_final_complete
- Google_Scholar: case_Google_scholar_testing_final_complete

Test datasets development process:

- Develop citing sentence and cited paper pairs from PMC articles (code not available, original data not available)
  - Output data format: citing_sentence, citing_sentence_pmid, cited_paper_pmid, (citing_sentence_original_text)
- Acquire search returns for each citing sentence query
  - Codes: create_test_search_PubMed.py <- PM_function.py/create_data_PubMed
  - Output data format: citing_sentence, citing_sentence_pmid, search_returned_paper_pmid, cited_paper_pmid
- Create final test datasets
  - Code: create_PM_final_test.py

Issues:

- Issues arise when creating the final test datasets (Code: create_PM_final_test.py)
  - The information dictionary of all PubMed papers is incomplete
    - When the PMID of a citing sentence's paper is not in the dictionary, its publishing year is obtained by a forward search
    - Search-returned papers whose PMIDs are not in the dictionary are skipped

In [6]:
# Test sentences record
# Load inside a context manager so the file handle is closed deterministically.
with open(f"{base_path}/test_sentences_records.json", encoding='utf-8') as f:
    test_sentences_records_all = json.load(f)
len(test_sentences_records_all)

96950

In [7]:
# Check publishing year of test sentences and keep only sentences with 1 citation
# m / n count records whose v['year'] has length > 1 / == 1;
# p counts records with more than one citation, which are left out of test_sentences_records.
m, n, p = 0, 0, 0
test_sentences_records = {}
for k, v in test_sentences_records_all.items():
    if len(v['year']) > 1: m += 1
    if len(v['year']) == 1: n += 1
    if len(v['citations']) > 1:
        p += 1
    else:
        # Anything with len(citations) <= 1 is retained here, including an empty list if one exists.
        test_sentences_records[k] = v
print(f"{'Sentences with 1 publishing year':40}{n}\n{'Sentences with >1 publishing year':40}{m}")
# NOTE(review): p is the number of records with MORE than one citation, so the
# '1+' wording in the label below reads as a misnomer for '>1' - confirm and relabel on the next re-run.
print(f"{'Sentences with 1+ citations':40}{p}")
print(f"{'Sentences with only 1 citation':40}{len(test_sentences_records)}")

Sentences with 1 publishing year        96950
Sentences with >1 publishing year       0
Sentences with 1+ citations             4703
Sentences with only 1 citation          92247


In [8]:
# check citations of test sentences
# n sums len(v['citations']) over the retained records, m counts records that
# still have more than one citation, and cites collects the distinct cited ids.
m, n = 0, 0
cites = set()
for v in test_sentences_records.values():
    n += len(v['citations'])
    if len(v['citations']) > 1: m += 1
    for cite in v['citations']:
        cites.add(cite)
print(f"{'Total citations':30}{n}\n{'Total unique cited papers':30}{len(cites)}\n{'Sentences with >1 citations':30}{m}")

Total citations               92247
Total unique cited papers     82639
Sentences with >1 citations   0


In [11]:
# retain only sentences with 5+ and 50- tokens for final test
# Token count = number of whitespace-separated words in the part of the key before the first '|'.
# m counts keys with fewer than 5 tokens, n counts keys with more than 50; both groups are dropped.
test_sentences = {}
m, n = 0, 0
# for k, v in test_sentences_records_all.items():
for k, v in test_sentences_records.items():
    if len(k.split('|')[0].split()) < 5:
        m += 1
    elif len(k.split('|')[0].split()) > 50:
        n += 1
    else:
        test_sentences[k] = v
print(f"{'Sentences with <5 tokens':35}{m}\n{'Sentences with >50 tokens':35}{n}")
print(f"{'Sentences retained for test':35}{len(test_sentences)}")

Sentences with <5 tokens           217
Sentences with >50 tokens          1273
Sentences retained for test        90757


In [12]:
# check citations of final test sentences
# Same tally as the earlier citation check, run on test_sentences; records with
# more than one citation are additionally collected in more_cites_sentences.
m, n = 0, 0
cites = set()
more_cites_sentences = {}
for k, v in test_sentences.items():
    n += len(v['citations'])
    if len(v['citations']) > 1:
        more_cites_sentences[k] = v
        m += 1
    for cite in v['citations']:
        cites.add(cite)
print(f"{'Total citations':30}{n}\n{'Total unique cited papers':30}{len(cites)}\n{'Sentences with >1 citations':30}{m}")

Total citations               90757
Total unique cited papers     81411
Sentences with >1 citations   0


Functions for query

In [2]:
import requests
from bs4 import BeautifulSoup

In [8]:
# function for getting web pages
def get_page(url, timeout=30):
    """Fetch ``url`` with browser-like headers and return the response body.

    Args:
        url: Address to request with HTTP GET.
        timeout: Seconds to wait for the server before giving up, so that one
            stalled connection cannot block a long crawl indefinitely.

    Returns:
        The parsed JSON payload when the response declares the
        ``application/json`` media type (with or without parameters such as
        ``charset``), the body text otherwise, or ``None`` when the request
        fails or the body is empty.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) Chrome/39.0.2171.95 Safari/537.36',
               'Accept': 'text/html,application/xhtml+xml,application/xml,application/json;q=0.9,image/webp,*/*;q=0.8'}
    try:
        # The context manager releases the session's connections when done.
        with requests.Session() as session:
            req = session.get(url, headers=headers, timeout=timeout)
    except requests.exceptions.RequestException:
        return None
    req.encoding = 'utf-8'
    if req.text == '':
        return None
    # The header may be absent or carry parameters ("application/json; charset=utf-8"),
    # so compare only the media type and never index the header directly.
    media_type = req.headers.get('Content-Type', '').split(';')[0].strip().lower()
    if media_type == 'application/json':
        return req.json()
    return req.text

In [None]:
# # function for query biokde
# def biokde_search(query, year='2020'):
#     biokde_url = 'https://biokde.com/search/'
#     biokde_url += f'?term={query}&filter=years.1977-{year}' # sort=date,pubdate &sort_order=asc
#     soup = BeautifulSoup(get_page(biokde_url), 'xml')
#     counts = int(soup.find('div', attrs={'class':'reference_info'}).text.strip().split()[0].replace(',', ''))
#     idlist = []
#     for doc in soup.find('body').find_all('span', attrs={'class':'article_info_title'}):
#         idlist.append(doc.a['href'].split('/')[-1])
#     for page in range(2, int(1000/20)+1):
#         page_url = f"{biokde_url}&page={page}"
#         soup = BeautifulSoup(get_page(page_url), 'xml')
#         for doc in soup.find('body').find_all('span', attrs={'class':'article_info_title'}):
#             idlist.append(doc.a['href'].split('/')[-1])
#     return counts, idlist

In [None]:
# # function for query pubmed by entrez (tf-idf)
# def eutils_search(query, year='2020'):
#     eutils_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
#     eutils_url += f"esearch.fcgi?db=pubmed&term={query}"
#     eutils_url += f"&retmax=1000&sort=relevance&mindate=1977&maxdate={year}&datetype=pdat"
#     soup = BeautifulSoup(get_page(eutils_url), 'xml')
#     counts = int(soup.find('Count').text)
#     idlist = list(soup.find('IdList').stripped_strings)
#     return counts, idlist

In [41]:
# function for query pubmed by web (best match)
def pubmedweb_search(query, year='2020', page_size=200, max_results=1000):
    """Query the PubMed web interface and collect the displayed PMIDs.

    Result pages are requested with ``size=page_size`` and the year filter
    ``1977-year``. Pages 1 through ``int(max_results / page_size)`` are fetched
    and the ids found in each page's ``log_displayeduids`` meta tag are
    concatenated in page order. The defaults reproduce the previous
    hard-coded behaviour (5 pages of 200).

    Args:
        query: Search expression; it is percent-encoded before being placed
            in the URL so characters such as ``&`` or ``#`` stay in the term.
        year: Upper bound of the publication-year filter.
        page_size: Number of results requested per page.
        max_results: Upper bound on the number of results to page through.

    Returns:
        ``(counts, idlist)`` where ``counts`` is the integer taken from the
        ``log_resultcount`` meta tag of the first page and ``idlist`` is the
        list of id strings (entries may be empty strings when a page lists no ids).

    Raises:
        RuntimeError: If a page cannot be downloaded as text or lacks an
            expected meta tag.
    """
    from urllib.parse import quote

    def _load(url):
        # Download one page; fail loudly instead of handing None to the parser.
        page = get_page(url)
        if not isinstance(page, str):
            raise RuntimeError(f"No HTML returned for {url}")
        return BeautifulSoup(page, 'xml')

    def _meta(soup, name, url):
        # Read the content attribute of a named <meta> tag.
        tag = soup.find('meta', attrs={'name': name})
        if tag is None:
            raise RuntimeError(f"Meta tag '{name}' not found in {url}")
        return tag['content']

    pubmed_url = 'https://pubmed.ncbi.nlm.nih.gov/'
    pubmed_url += f'?term={quote(query)}&size={page_size}&filter=years.1977-{year}' # sort=date,pubdate &sort_order=asc
    soup = _load(pubmed_url)
    counts = int(_meta(soup, 'log_resultcount', pubmed_url))
    idlist = _meta(soup, 'log_displayeduids', pubmed_url).split(',')
    for page in range(2, int(max_results/page_size)+1):
        page_url = f"{pubmed_url}&page={page}"
        soup = _load(page_url)
        idlist.extend(_meta(soup, 'log_displayeduids', page_url).split(','))
    return counts, idlist

In [None]:
# # function for query google scholar
# def googlescholar_search(query, year='2020'):
#     ggs_url = 'https://scholar.google.com/scholar'
#     ggs_url += f'?q=site:pubmed.ncbi.nlm.nih.gov {query}' # sort=date,pubdate &sort_order=asc
#     ggs_url += f'&hl=en&as_sdt=0,10&as_ylo=1977&as_yhi={year}'
#     soup = BeautifulSoup(get_page(ggs_url), 'html')
#     counts = int(soup.find('div', attrs={'id':'gs_ab_md'}).text.split()[1].replace(',', ''))
#     idlist = []
#     for h3 in soup.find('body').find_all('h3'):
#         idlist.append(h3.a['href'].split('/')[-2])
#     for page in range(10, 1000, 10):
#         page_url = f"{ggs_url}&start={page}"
#         soup = BeautifulSoup(get_page(page_url), 'html')
#         for h3 in soup.find('body').find_all('h3'):
#             idlist.append(h3.a['href'].split('/')[-2])
#     return counts, idlist

In [12]:
# function for query litsense by web (best match)
def litsense_search(query):
    """Query the LitSense API and return the ``pmid`` of every returned item.

    Args:
        query: Free-text query; it is percent-encoded before being placed in
            the URL so characters such as ``&`` or ``#`` stay in the query.

    Returns:
        A list with one ``pmid`` value per returned item, in response order
        (the same id may occur more than once).

    Raises:
        RuntimeError: If the request fails or the response was not decoded as
            JSON (``get_page`` returned ``None`` or plain text).
    """
    from urllib.parse import quote

    litsense_url = 'https://www.ncbi.nlm.nih.gov/research/litsense-api/api/'
    litsense_url += f'?query={quote(query)}&rerank=true' # sort=date,pubdate &sort_order=asc
    results = get_page(litsense_url)
    # None means the request failed; a str means the body was not parsed as
    # JSON. Iterating either would only produce an obscure TypeError.
    if results is None or isinstance(results, str):
        raise RuntimeError(f"No JSON results returned for {litsense_url}")
    return [sent['pmid'] for sent in results]

In [5]:
# Example query used by the ad-hoc LitSense cells that follow.
query = "Breast cancers with HER2 amplification"

In [6]:
# Build the same request URL that litsense_search() constructs, for inspecting the raw response.
litsense_url = 'https://www.ncbi.nlm.nih.gov/research/litsense-api/api/'
litsense_url += f'?query={query}&rerank=true'

In [None]:
# Display the raw response for the example URL.
get_page(litsense_url)

In [16]:
# Count how many times each pmid occurs in the LitSense result list.
# Note: idlist is a dict (pmid -> count) here, unlike the lists named idlist inside the search functions.
idlist = {}
for i in litsense_search(query):
    idlist[i] = idlist.get(i, 0)
    idlist[i] += 1

In [None]:
# Display the pmid -> occurrence-count mapping.
idlist

**Search in SQL_BM25**

(skip, using existing test data)

In [None]:
# biokde_counts, biokde_pmids = biokde_search(query, year)
# biokde_counts, len(biokde_pmids), biokde_pmids[:20]

**Search in PubMed_TF-IDF**

(skip, using existing data)

In [None]:
# eutils_counts, eutils_pmids = eutils_search(query, year)
# eutils_counts, len(eutils_pmids), eutils_pmids[:20]

**Search on PubMed_BM**

In [None]:
# pubmedweb_counts, pubmedweb_pmids = pubmedweb_search(query, year)
# pubmedweb_counts, len(pubmedweb_pmids), pubmedweb_pmids[:20]

In [13]:
# n counts newly collected sentences; remaining_set collects keys whose search failed.
n = 0
# Load the cached search results inside a context manager so the handle is closed.
with open('test_sents_pubmed_bm.json', 'r', encoding='utf-8') as f:
    test_pubmed_bm = json.load(f)
remaining_set = set()

In [14]:
# Number of sentences already present in the PubMed best-match cache.
len(test_pubmed_bm)#, len(sents)

45115

In [None]:
# Query PubMed (best match) for each test sentence that is not cached yet and keep
# the result only when at least one true citation is among the returned pmids.
# NOTE(review): `sents` is not assigned by any cell visible above this one, so on a
# fresh kernel this loop raises NameError - confirm where it should come from.
for k, v in test_sentences.items():
    if k not in sents: continue
    if k in test_pubmed_bm: continue
    # OR-join the sentence tokens (text before the first '|') to form the query.
    query = ' OR '.join(k.split('|')[0].strip().split())
    year = v['year'][0]
#     print(query, year)
    try:
        counts, pmids = pubmedweb_search(query, year)
    except Exception:
        # Record the failure for later; unlike a bare except, this still lets
        # KeyboardInterrupt stop a long-running crawl.
        remaining_set.add(k)
        continue
#     print(counts, len(pmids))
    if any(i in pmids for i in v['citations']):
        test_pubmed_bm[k] = {'year':year, 'citations':v['citations'], 'counts':counts, 'pmids':pmids}
        n += 1
        if n % 1000 == 0:
            print(n)
            # Checkpoint; the with-block guarantees the file is flushed and closed.
            with open('test_sents_pubmed_bm.json', 'w', encoding='utf-8') as fw:
                json.dump(test_pubmed_bm, fw)
        if n >= 6500: break
with open('test_sents_pubmed_bm.json', 'w', encoding='utf-8') as fw:
    json.dump(test_pubmed_bm, fw)
len(test_pubmed_bm)

Prepare the test dataset

In [16]:
# Reload the cached PubMed best-match results; the with-block closes the file handle.
with open('test_sents_pubmed_bm.json', 'r', encoding='utf-8') as f:
    test_pubmed_bm = json.load(f)
len(test_pubmed_bm)

45115

In [17]:
# pubmed_bm dataset stat
# Only cached sentences that are also in test_sentences are counted.
top1000_sentences = {} # sentences with top 1000 search returns
num_sentences = 0
num_citations = 0 # total number of citations
top1000_num_citations = 0 # total number of citations in sentences with top 1000 search returns
for k, v in test_pubmed_bm.items():
    if k in test_sentences:
        num_sentences += 1
        num_citations += len(v['citations'])
        # NOTE(review): the "top 1000" group is selected with a threshold of more than 600
        # returned pmids - confirm that 600 (not 1000) is the intended cut-off.
        if len(v['pmids']) > 600:
            top1000_sentences[k] = v
            top1000_num_citations += len(v['citations'])
print(f"{'Total Sentences':60}{num_sentences}")
print(f"{'Total Citations':60}{num_citations}")
print(f"{'Sentences with Top 1000 Search Returns':60}{len(top1000_sentences)}")
print(f"{'Total Citations for Sentences with Top 1000 Search Returns':60}{top1000_num_citations}")

Total Sentences                                             44408
Total Citations                                             48054
Sentences with Top 1000 Search Returns                      18192
Total Citations for Sentences with Top 1000 Search Returns  19718


In [22]:
# organize query results into format for final test data development by create_PM_final_test.py
# output format: citing_sentence, citing_sentence_pmid, search_returned_paper_pmid, cited_paper_pmid
# output folder: test_pubmed_bm_search_returns
# output file: pubmed_bm_search_returns_.csv
# NOTE(review): out_path is an absolute server path while the cell that reads these files
# back uses './project/test_pubmed_bm_search_returns' - confirm both point at the same folder.
out_path = '/jupiter/cl17d/Project/test_pubmed_bm_search_returns'
num_sent = len(test_pubmed_bm)
# Materialise the items once so every output file is a plain slice, instead of
# re-scanning the dict from its first entry for each file.
search_items = list(test_pubmed_bm.items())
# Stepping by 1000 writes one file per 1000 sentences and no empty trailing file
# when num_sent is an exact multiple of 1000.
for start in range(0, num_sent, 1000):
    i = start // 1000
    out_file = f"pubmed_bm_search_returns_{i}.csv"
    with open(f"{out_path}/{out_file}", "w", encoding="utf-8") as fw:
        for k, v in search_items[start:start+1000]:
            sent_text = k.split('|')[0]
            sent_pmid = k.split('|')[1]
            # One row per (citation, returned pmid) pair; empty pmid strings are skipped.
            for cite in v['citations']:
                for pmid in v['pmids']:
                    if pmid != '':
                        fw.write(f"{sent_text}\t{sent_pmid}\t{pmid}\t{cite}\n")

In [27]:
# Read the written search-return files back and compare them with test_pubmed_bm.
# n counts rows whose sentence key is missing from test_pubmed_bm;
# m counts rows whose citation is not among that sentence's citations.
# in_path = '/jupiter/cl17d/Project/test_pubmed_bm_search_returns'
in_path = './project/test_pubmed_bm_search_returns'
n, m = 0, 0
for file in os.listdir(in_path):
    with open(f"{in_path}/{file}", "r", encoding="utf-8") as f:
        for row in f:
            row = row.strip().split("\t")
            sentence = f"{row[0]}|{row[1]}"
            pmid = row[2]
            citation = row[3]
            if sentence not in test_pubmed_bm:
                n += 1
            else:
                # Side effect: a pmid found on disk but not in memory is appended to the in-memory list.
                if pmid not in test_pubmed_bm[sentence]["pmids"]:
                    test_pubmed_bm[sentence]["pmids"].append(pmid)
                if citation not in test_pubmed_bm[sentence]["citations"]:
                    m += 1
print(n, m)

0 0


In [None]:
# # prepare the final pubmed bm test dataset
# cd /jupiter/cl17d/Project/src_new
# !python create_pubmed_search_test_dataset.py

**Search on Google Scholar**

Because Google restricts automatic crawling, Google Scholar data is not included in the study.

In [None]:
# ggs_counts, ggs_pmids = googlescholar_search(query_or, year)
# ggs_counts, len(ggs_pmids), ggs_pmids[:20]

In [None]:
# n = 0
# test_google_scholar = {}
# remaining_set = set()

In [None]:
# for k, v in test_sentences.items():
#     if k in test_google_scholar: continue
#     n += 1
#     if n % 1000 == 0:
#         print(n)
#         json.dump(test_google_scholar, open('test_sents_google_scholar.json', 'w', encoding='utf-8'))
#     query = ' OR '.join(k.split('|')[0].strip().split())
#     year = v['year'][0]
# #     print(query, year)
#     try:
#         counts, pmids = googlescholar_search(query, year)
#     except:
#         remaining_set.add(k)
#         continue
# #     print(counts, len(pmids))
#     test_google_scholar[k] = {'year':year, 'citations':v['citations'], 'counts':counts, 'pmids':pmids}
# json.dump(test_google_scholar, open('test_sents_google_scholar.json', 'w', encoding='utf-8'))

In the existing SQL_BM25 test dataset, the ranks of search returns for each sentence-citation pair are the same. Also the search returns include the top 1000 search results. We will only use the top 500 search results in the study.

Because the SQL_BM25 dataset cannot be processed to obtain the top 500 search returns, we use the top 1000 search returns in the study.

In [None]:
# # function for processing the test datasets to retain the top 500 search returns
# def top500_search_returns(input_path, output_path):
#     citation_ranks = {}
#     for file in os.listdir(input_path):
#         search_results = {}
#         with open(f"{output_path}/{file}", "w", encoding="utf-8") as fw:
#             with open(f"{input_path}/{file}", "r", encoding="utf-8") as fr:
#                 fw.write(fr.readline())
#                 for line in  fr:
#                     line = line.strip().split('\t')
#                     sentence = f"{line[0]}|{line[1]}|{line[3]}"
#                     search_return = line[2]
#                     citation = line[3]
#                     citation_rank = int(line[-1])
#                     search_results[sentence] = search_results.get(sentence, 0)
#                     search_results[sentence] += 1
#                     if search_return == citation:
#                         citation_ranks[sentence] = {'position':search_results[sentence], 'citation':'\t'.join(line)}
#                     if citation_rank <= 500 and search_results[sentence] <= 500:
#                         fw.write('\t'.join(line)+'\n')
# #     return citation_ranks

In [None]:
# # process the SQL_BM25 test dataset
# input_path = f"{base_path}/case_SQL_testing_final_complete"
# output_path = f"{base_path}/test_dataset_sql_bm25"
# top500_search_returns(input_path, output_path)

In [None]:
# # process the PubMed_TFIDF test dataset
# input_path = f"{base_path}/case_PubMed_testing_final_complete"
# output_path = f"{base_path}/test_dataset_pubmed_tfidf"
# top500_search_returns(input_path, output_path)

In [None]:
# # process the PubMed_BM test dataset
# input_path = f"{base_path}/test_pubmed_bm_dataset"
# output_path = f"{base_path}/test_dataset_pubmed_bm"
# top500_search_returns(input_path, output_path)

## Training

In [None]:
# !CUDA_VISIBLE_DEVICES=2 python train_sentences_new.py

## Testing

In [None]:
# # test dataset of SQL_BM25
# !CUDA_VISIBLE_DEVICES=2 python test_sentences_sql_bm25.py

In [None]:
# # test dataset of PubMed_TFIDF
# !CUDA_VISIBLE_DEVICES=2 python test_sentences_pubmed_tfidf.py

In [None]:
# # test dataset of PubMed_BM
# !CUDA_VISIBLE_DEVICES=2 python test_sentences_pubmed_bm.py

Functions for stats calculation

In [18]:
def get_test_stats(test_path, test_sentences_records):
    """Print how much of a test dataset folder belongs to the given sentence set.

    Every regular file in ``test_path`` is read as tab-separated text whose
    first line is a header. For each non-blank row the sentence key is
    ``col0|col1`` and the citation is column 3. Rows whose key is in
    ``test_sentences_records`` are tallied as test data, all others as extra.

    Args:
        test_path: Folder containing the test dataset files.
        test_sentences_records: Container of sentence keys (only membership
            is checked).

    Returns:
        None. Three summary lines (sentences, citations, instances) are printed.
    """
    sents = set()
    extra_sents = set()
    cites = set()
    extra_cites = set()
    n, m = 0, 0
    for file in os.listdir(test_path):
        file_path = os.path.join(test_path, file)
        if not os.path.isfile(file_path):
            # Sub-folders (e.g. .ipynb_checkpoints) cannot be opened as data files.
            continue
        with open(file_path, encoding="utf-8") as ifile:
            # Skip the header; the default keeps an empty file from raising StopIteration.
            next(ifile, None)
            for line in ifile:
                line = line.strip()
                if not line:
                    continue
                row = line.split("\t")
                sentence = f"{row[0]}|{row[1]}"
                citation = row[3]
                if sentence in test_sentences_records:
                    sents.add(sentence)
                    cites.add(citation)
                    n += 1
                else:
                    extra_sents.add(sentence)
                    extra_cites.add(citation)
                    m += 1

    # Extra citations are reported net of those also seen with a test sentence.
    print(f"Test Sentences: {len(sents):12}\tExtra Sentences: {len(extra_sents)}")
    print(f"Test Citations: {len(cites):12}\tExtra Citations: {len(extra_cites.difference(cites))}")
    print(f"Test Instances: {n:12}\tExtra Instances: {m}")

In [19]:
def get_pred_stats(pred_path, test_sentences_records):
    """Print how much of a prediction-result folder belongs to the given sentence set.

    Every regular file in ``pred_path`` is read as tab-separated text whose
    first line is a header. For each non-blank row the sentence key is the
    first two ``|``-separated fields of column 0 and the citation is column 1.
    Rows whose key is in ``test_sentences_records`` are tallied as test data,
    all others as extra.

    Args:
        pred_path: Folder containing the prediction result files.
        test_sentences_records: Container of sentence keys (only membership
            is checked).

    Returns:
        None. Three summary lines (sentences, citations, instances) are printed.
    """
    sents = set()
    extra_sents = set()
    cites = set()
    extra_cites = set()
    n, m = 0, 0
    for file in os.listdir(pred_path):
        file_path = os.path.join(pred_path, file)
        if not os.path.isfile(file_path):
            # Sub-folders (e.g. .ipynb_checkpoints) cannot be opened as data files.
            continue
        with open(file_path, encoding="utf-8") as ifile:
            # Skip the header; the default keeps an empty file from raising StopIteration.
            next(ifile, None)
            for line in ifile:
                line = line.strip()
                if not line:
                    continue
                row = line.split("\t")
                sentence = '|'.join(row[0].split('|')[:2])
                citation = row[1]
                if sentence in test_sentences_records:
                    sents.add(sentence)
                    cites.add(citation)
                    n += 1
                else:
                    extra_sents.add(sentence)
                    extra_cites.add(citation)
                    m += 1
    # Extra citations are reported net of those also seen with a test sentence.
    print(f"Test Sentences: {len(sents):12}\tExtra Sentences: {len(extra_sents)}")
    print(f"Test Citations: {len(cites):12}\tExtra Citations: {len(extra_cites.difference(cites))}")
    print(f"Test Instances: {n:12}\tExtra Instances: {m}")

In [28]:
def get_pred_results(pred_path, test_sentences_records):
    """Collect the search rank and rerank of every true citation and print a summary.

    Every regular file in ``pred_path`` is read as tab-separated text whose
    first line is a header. For each non-blank row the sentence key is the
    first two ``|``-separated fields of column 0, column 1 is the citation,
    and columns 2 and 3 are parsed as the integer search rank and rerank.
    A row is used only when its sentence is in ``test_sentences_records`` and
    its citation is listed in that record's ``'citations'``. The first row seen
    for a (sentence, citation) pair is recorded and counted; later rows for
    the same pair only increase the duplicate counter.

    Args:
        pred_path: Folder containing the prediction result files.
        test_sentences_records: Mapping of sentence key to a record holding a
            ``'citations'`` collection.

    Returns:
        ``{sentence: {citation: [(search_rank, rerank)]}}`` for the recorded
        pairs. A table of top-1/10/20/100 counts for both rankings, the number
        of duplicates and the win/tie/improvement counts is printed.
    """
    pred_results = {}
    serank_top1, serank_top10, serank_top20, serank_top100 = 0, 0, 0, 0
    rerank_top1, rerank_top10, rerank_top20, rerank_top100 = 0, 0, 0, 0
    better_serank, tie_rank, better_rerank = 0, 0, 0
    num_dup_citations = 0
    for file in os.listdir(pred_path):
        file_path = os.path.join(pred_path, file)
        if not os.path.isfile(file_path):
            # Sub-folders (e.g. .ipynb_checkpoints) cannot be opened as data files.
            continue
        with open(file_path, encoding="utf-8") as ifile:
            # Skip the header; the default keeps an empty file from raising StopIteration.
            next(ifile, None)
            for line in ifile:
                line = line.strip()
                if not line:
                    continue
                row = line.split("\t")
                sentence = '|'.join(row[0].split('|')[:2])
                citation = row[1]
                serank = int(row[2])
                rerank = int(row[3])
                if sentence not in test_sentences_records:
                    continue
                if citation not in test_sentences_records[sentence]['citations']:
                    continue
                sentence_results = pred_results.setdefault(sentence, {})
                if citation in sentence_results:
                    # Same pair seen again (e.g. in another file): count it, keep the first ranks.
                    num_dup_citations += 1
                    continue
                sentence_results[citation] = [(serank, rerank)]

                if serank == 1: serank_top1 += 1
                if serank <= 10: serank_top10 += 1
                if serank <= 20: serank_top20 += 1
                if serank <= 100: serank_top100 += 1

                if rerank == 1: rerank_top1 += 1
                if rerank <= 10: rerank_top10 += 1
                if rerank <= 20: rerank_top20 += 1
                if rerank <= 100: rerank_top100 += 1

                # A smaller rank is the better position.
                if serank < rerank: better_serank += 1
                if serank == rerank: tie_rank += 1
                if rerank < serank: better_rerank += 1

    print(f"{'Rank':20}\t{'Top1':8}\t{'Top10':8}\t{'Top20':8}\t{'Top100'}")
    print(f"{80*'-'}")
    print(f"{'Search':20}\t{serank_top1:<8}\t{serank_top10:<8}\t{serank_top20:<8}\t{serank_top100}")
    print(f"{'Rerank':20}\t{rerank_top1:<8}\t{rerank_top10:<8}\t{rerank_top20:<8}\t{rerank_top100}")
    print(f"{'Duplicated citations':20}\t{num_dup_citations}")
    print(f"{'Search Win':20}\t{better_serank}")
    print(f"{'Tie':20}\t{tie_rank}")
    print(f"{'Rerank Improvement':20}\t{better_rerank}")

    return pred_results

Test results: SQL_BM25

In [21]:
# Coverage of the SQL_BM25 test dataset folder with respect to the final test sentences.
test_path = f"{base_path}/test_dataset_sql_bm25"
get_test_stats(test_path, test_sentences)

Test Sentences:        95230	Extra Sentences: 1984
Test Citations:        89041	Extra Citations: 1884
Test Instances:    100287085	Extra Instances: 2265000


In [29]:
# test results of sentences with SQL_BM25 top 500 search returns (New)
# Prints the coverage summary, then the rank table; the per-citation ranks are kept in results_sql.
pred_path = f"{base_path}/src_model_full_sentence/test_results_sql_bm25"
get_pred_stats(pred_path, test_sentences)
print('\n\n')
results_sql = get_pred_results(pred_path, test_sentences)

Test Sentences:        95230	Extra Sentences: 1984
Test Citations:        89041	Extra Citations: 1884
Test Instances:       100111	Extra Instances: 2282



Rank                	Top1    	Top10   	Top20   	Top100
--------------------------------------------------------------------------------
Search              	18737   	41234   	48979   	68804
Rerank              	24747   	56260   	66507   	87751
Duplicated citations	9
Search Win          	28002
Tie                 	13067
Rerank Improvement  	59033


In [29]:
# Number of citations search returns broken into 2 files
# Prints every sentence that has a citation with more than one recorded (search rank, rerank) pair.
# NOTE(review): get_pred_results as defined in this notebook stores at most one pair per
# citation, so with that definition nothing is printed here; the saved output presumably
# comes from an earlier version of the function - confirm.
n = 0
sentences_in_2_files = {}
for k, v in results_sql.items():
    for ck, cv in v.items():
        if len(cv) > 1:
            sentences_in_2_files[k] = v
            print(k, v)
            n += 1
print(n)

recently new genome editing methods zinc finger nucleases zfns transcription activator like nucleases talens clustered regularly interspaced short palindromic repeats cas9 endonuclease crispr cas9 shown powerful tools directly mutating genome targeted gene deletions|28224990 {'21700836': [(70, 30), (70, 30)]}
large scale mining assay data revealed different facets promiscuity|25339989 {'24358872': [(1, 1), (1, 70)]}
pae found patients switched aripiprazole sudden discontinuation previous antipsychotic medication showed increase symptom severity first week switching|25792838 {'19442491': [(1, 1), (1, 1)]}
psychiatric problems euphoria hypomania suicidal ideation apathy new cases impulse control disorders icds documented stn dbs hälbig lim volkmann moum castrioto hack|26892884 {'19236471': [(12, 3), (12, 3)], '19553125': [(3, 37)]}
finally several studies demonstrated decreased frequency impaired function tregs ssc|28890870 {'20105169': [(19, 35), (19, 35)], '21419712': [(6, 7)]}
whether

In [25]:
# Number of sentences with at least one recorded citation in the SQL_BM25 results.
len(results_sql)

95230

In [26]:
# Collect the first recorded search rank of every citation and count the cases
# where the search rank is smaller (better) than the rerank.
# Displays: sentences, search wins, citations, min and max search rank.
serank = []
better_serank = 0
for k, v in results_sql.items():
    for ck, cv in v.items():
        serank.append(cv[0][0])
        if cv[0][0] < cv[0][1]: better_serank += 1
len(results_sql), better_serank, len(serank), min(serank), max(serank)

(95230, 28002, 100102, 1, 1000)

Test results: PubMed_TFIDF

In [30]:
# Coverage of the PubMed_TFIDF test dataset folder with respect to the final test sentences.
test_path = f"{base_path}/test_dataset_pubmed_tfidf"
get_test_stats(test_path, test_sentences)

Test Sentences:        95230	Extra Sentences: 1720
Test Citations:        89013	Extra Citations: 1656
Test Instances:     98177227	Extra Instances: 1950842


In [31]:
# test results of sentences with PubMed TFIDF top 500 search returns
# Prints the coverage summary, then the rank table; the per-citation ranks are kept in results_tfidf.
pred_path = f"{base_path}/src_model_full_sentence/test_results_pubmed_tfidf"
get_pred_stats(pred_path, test_sentences)
print('\n\n')
results_tfidf = get_pred_results(pred_path, test_sentences)

Test Sentences:        60600	Extra Sentences: 1012
Test Citations:        56778	Extra Citations: 958
Test Instances:        62821	Extra Instances: 1133



Rank                	Top1    	Top10   	Top20   	Top100
--------------------------------------------------------------------------------
Search              	9325    	22489   	27253   	40411
Rerank              	17546   	37816   	43681   	55232
Duplicated citations	3
Search Win          	15550
Tie                 	6419
Rerank Improvement  	40849


In [32]:
# Number of citations search returns broken into 2 files
# Prints every sentence that has a citation with more than one recorded (search rank, rerank) pair.
n = 0
for k, v in results_tfidf.items():
    for ck, cv in v.items():
        if len(cv) > 1:
            print(k, v)
            n += 1
print(n)

0


In [33]:
# Same summary as for SQL_BM25, on the TFIDF results:
# sentences, search wins, citations, min and max search rank.
serank = []
better_serank = 0
for k, v in results_tfidf.items():
    for ck, cv in v.items():
        serank.append(cv[0][0])
        if cv[0][0] < cv[0][1]: better_serank += 1
len(results_tfidf), better_serank, len(serank), min(serank), max(serank)

(60600, 15550, 62818, 1, 998)

In [34]:
# Compare SQL_BM25 and TFIDF on the (sentence, citation) pairs present in both result sets.
# m counts TFIDF sentences that also appear in results_sql, n_sql counts shared pairs and
# sents collects the sentences with at least one shared pair.
# Counter names read n_<engine>_<se|re>_<K>: pairs whose search rank (se) or rerank (re)
# is within the top K for that engine; index [0][0] is the search rank, [0][1] the rerank.
n_sql, m = 0, 0
sents = set()
n_sql_se_1, n_sql_se_10, n_sql_se_20, n_sql_se_100 = 0, 0, 0, 0
n_sql_re_1, n_sql_re_10, n_sql_re_20, n_sql_re_100 = 0, 0, 0, 0
n_tfidf_se_1, n_tfidf_se_10, n_tfidf_se_20, n_tfidf_se_100 = 0, 0, 0, 0
n_tfidf_re_1, n_tfidf_re_10, n_tfidf_re_20, n_tfidf_re_100 = 0, 0, 0, 0
for k,v in results_tfidf.items():
    if k in results_sql:
        m += 1
        for i in v:
            if i in results_sql[k]:
                n_sql += 1
                sents.add(k)
                if results_sql[k][i][0][0] == 1: n_sql_se_1 += 1
                if results_sql[k][i][0][0] <= 10: n_sql_se_10 += 1
                if results_sql[k][i][0][0] <= 20: n_sql_se_20 += 1
                if results_sql[k][i][0][0] <= 100: n_sql_se_100 += 1
                
                if results_sql[k][i][0][1] == 1: n_sql_re_1 += 1
                if results_sql[k][i][0][1] <= 10: n_sql_re_10 += 1
                if results_sql[k][i][0][1] <= 20: n_sql_re_20 += 1
                if results_sql[k][i][0][1] <= 100: n_sql_re_100 += 1
                
                if results_tfidf[k][i][0][0] == 1: n_tfidf_se_1 += 1
                if results_tfidf[k][i][0][0] <= 10: n_tfidf_se_10 += 1
                if results_tfidf[k][i][0][0] <= 20: n_tfidf_se_20 += 1
                if results_tfidf[k][i][0][0] <= 100: n_tfidf_se_100 += 1
                
                if results_tfidf[k][i][0][1] == 1: n_tfidf_re_1 += 1
                if results_tfidf[k][i][0][1] <= 10: n_tfidf_re_10 += 1
                if results_tfidf[k][i][0][1] <= 20: n_tfidf_re_20 += 1
                if results_tfidf[k][i][0][1] <= 100: n_tfidf_re_100 += 1

print(f"{' ':30}\t{'Top1':8}\t{'Top10':8}\t{'Top20':8}\t{'Top100':8}")
print(f"{90*'-'}")
# This first row holds overlap counts (shared sentences, sentences with a shared pair,
# shared pairs); it is not a set of Top-K values despite sitting under those headers.
print(f"{'Sentences in SQL/TFIDF':30}\t{m:<8}\t{len(sents):<8}\t{n_sql:<8}")
print(f"{'SQL Search Results':30}\t{n_sql_se_1:<8}\t{n_sql_se_10:<8}\t{n_sql_se_20:<8}\t{n_sql_se_100:<8}")
print(f"{'SQL Rerank Results':30}\t{n_sql_re_1:<8}\t{n_sql_re_10:<8}\t{n_sql_re_20:<8}\t{n_sql_re_100:<8}")
print(f"{'TFIDF Search Results':30}\t{n_tfidf_se_1:<8}\t{n_tfidf_se_10:<8}\t{n_tfidf_se_20:<8}\t{n_tfidf_se_100:<8}")
print(f"{'TFIDF Rerank Results':30}\t{n_tfidf_re_1:<8}\t{n_tfidf_re_10:<8}\t{n_tfidf_re_20:<8}\t{n_tfidf_re_100:<8}")

                              	Top1    	Top10   	Top20   	Top100  
------------------------------------------------------------------------------------------
Sentences in SQL/TFIDF        	60600   	60600   	62818   
SQL Search Results            	16486   	33586   	38663   	49872   
SQL Rerank Results            	20073   	40878   	46591   	57297   
TFIDF Search Results          	9325    	22489   	27253   	40411   
TFIDF Rerank Results          	17546   	37816   	43681   	55232   


In [44]:
# Inspect one (search rank, rerank) pair.
# NOTE(review): k and i are whatever values earlier loops happened to leave behind, so this
# cell depends on execution order and may raise KeyError after a different run sequence.
results_sql[k][i][0]

(3, 2)

Test results: PubMed_BM

In [23]:
# Coverage of the PubMed_BM test dataset folder with respect to the final test sentences.
# test_path = f"{base_path}/case_PubMed_BM_testing_final_complete"
test_path = f"{base_path}/test_dataset_pubmed_bm"
get_test_stats(test_path, test_sentences) # top1000_sentences, test_sentences

Test Sentences:        41120	Extra Sentences: 3995
Test Citations:        38835	Extra Citations: 6999
Test Instances:     27026803	Extra Instances: 5207044


In [26]:
# Same coverage check, restricted to the sentences collected in top1000_sentences.
# test_path = f"{base_path}/case_PubMed_BM_testing_final_complete"
test_path = f"{base_path}/test_dataset_pubmed_bm"
get_test_stats(test_path, top1000_sentences) # top1000_sentences, test_sentences

Test Sentences:        16824	Extra Sentences: 28291
Test Citations:        16390	Extra Citations: 29444
Test Instances:     14501168	Extra Instances: 17732679


In [35]:
# test results of sentences with PubMed BM top 500 search returns
# Evaluated against top1000_sentences; the per-citation ranks are kept in results_bm.
pred_path = f"{base_path}/src_model_full_sentence/test_results_pubmed_bm"
get_pred_stats(pred_path, top1000_sentences) # top1000_sentences, test_sentences
print('\n\n')
results_bm = get_pred_results(pred_path, top1000_sentences)

Test Sentences:        11739	Extra Sentences: 10511
Test Citations:        11965	Extra Citations: 10294
Test Instances:        12256	Extra Instances: 11020



Rank                	Top1    	Top20   	Top100
-----------------------------------------------------------------
Search              	456     	4329    	7130
Rerank              	4644    	9472    	11337
Duplicated citations	0
Search Win          	1625
Tie                 	427
Rerank Improvement  	10204


In [15]:
# Prints every sentence in results_bm that has a citation with more than one recorded
# (search rank, rerank) pair, followed by the number of such citations.
n = 0
for k, v in results_bm.items():
    for ck, cv in v.items():
        if len(cv) > 1:
            print(k, v)
            n += 1
print(n)

0


In [28]:
# Same summary as for the other engines, on the PubMed_BM results; additionally prints
# any entry whose search rank exceeds 1000.
# Displays: sentences, search wins, citations, min and max search rank.
serank = []
better_serank = 0
for k, v in results_bm.items():
    for ck, cv in v.items():
        serank.append(cv[0][0])
        if cv[0][0] > 1000: print(k, v)
        if cv[0][0] < cv[0][1]: better_serank += 1
len(results_bm), better_serank, len(serank), min(serank), max(serank)

(10782, 1400, 10782, 1, 979)

In [40]:
# Compare the three search engines (SQL, TFIDF, BM) on the citations they all
# returned: for each engine count how often the cited paper lands in the
# top 1 / top 20 / top 100, before (search) and after (rerank) reranking.
n_sql, n_tfidf = 0, 0  # sentences shared by all engines, citations shared by all engines
n_sql_se_1, n_sql_se_20, n_sql_se_100 = 0, 0, 0
n_sql_re_1, n_sql_re_20, n_sql_re_100 = 0, 0, 0
n_tfidf_se_1, n_tfidf_se_20, n_tfidf_se_100 = 0, 0, 0
n_tfidf_re_1, n_tfidf_re_20, n_tfidf_re_100 = 0, 0, 0
n_bm_se_1, n_bm_se_20, n_bm_se_100 = 0, 0, 0
n_bm_re_1, n_bm_re_20, n_bm_re_100 = 0, 0, 0
for k, v in results_bm.items():
    # Only sentences present in every result set are comparable.
    if k not in results_sql or k not in results_tfidf:
        continue
    n_sql += 1
    for i in v:
        # Likewise, only citations found by every engine are counted.
        if i not in results_sql[k] or i not in results_tfidf[k]:
            continue
        n_tfidf += 1

        # Each entry is a list whose first item is (search rank, rerank rank).
        sql_entry = results_sql[k][i][0]
        tfidf_entry = results_tfidf[k][i][0]
        bm_entry = results_bm[k][i][0]

        if sql_entry[0] == 1:
            n_sql_se_1 += 1
        if sql_entry[0] <= 20:
            n_sql_se_20 += 1
        if sql_entry[0] <= 100:
            n_sql_se_100 += 1

        if sql_entry[1] == 1:
            n_sql_re_1 += 1
        if sql_entry[1] <= 20:
            n_sql_re_20 += 1
        if sql_entry[1] <= 100:
            n_sql_re_100 += 1

        if tfidf_entry[0] == 1:
            n_tfidf_se_1 += 1
        if tfidf_entry[0] <= 20:
            n_tfidf_se_20 += 1
        if tfidf_entry[0] <= 100:
            n_tfidf_se_100 += 1

        if tfidf_entry[1] == 1:
            n_tfidf_re_1 += 1
        if tfidf_entry[1] <= 20:
            n_tfidf_re_20 += 1
        if tfidf_entry[1] <= 100:
            n_tfidf_re_100 += 1

        if bm_entry[0] == 1:
            n_bm_se_1 += 1
        if bm_entry[0] <= 20:
            n_bm_se_20 += 1
        if bm_entry[0] <= 100:
            n_bm_se_100 += 1

        if bm_entry[1] == 1:
            n_bm_re_1 += 1
        if bm_entry[1] <= 20:
            n_bm_re_20 += 1
        if bm_entry[1] <= 100:
            n_bm_re_100 += 1

print(f"{' ':30}\t{'Top1':8}\t{'Top20':8}\t{'Top100':8}")
print(75 * '-')
# NOTE: this first row does not hold Top1/Top20/Top100 values; its three cells
# are len(results_bm), shared sentences (n_sql) and shared citations (n_tfidf).
print(f"{'Test Citations in SQL/TFIDF':30}\t{len(results_bm):<8}\t{n_sql:<8}\t{n_tfidf:<8}")
print(f"{'SQL Search Results':30}\t{n_sql_se_1:<8}\t{n_sql_se_20:<8}\t{n_sql_se_100:<8}")
print(f"{'SQL Rerank Results':30}\t{n_sql_re_1:<8}\t{n_sql_re_20:<8}\t{n_sql_re_100:<8}")
print(f"{'TFIDF Search Results':30}\t{n_tfidf_se_1:<8}\t{n_tfidf_se_20:<8}\t{n_tfidf_se_100:<8}")
print(f"{'TFIDF Rerank Results':30}\t{n_tfidf_re_1:<8}\t{n_tfidf_re_20:<8}\t{n_tfidf_re_100:<8}")
print(f"{'BM Search Results':30}\t{n_bm_se_1:<8}\t{n_bm_se_20:<8}\t{n_bm_se_100:<8}")
print(f"{'BM Rerank Results':30}\t{n_bm_re_1:<8}\t{n_bm_re_20:<8}\t{n_bm_re_100:<8}")

                              	Top1    	Top20   	Top100  
---------------------------------------------------------------------------
Test Citations in SQL/TFIDF   	11739   	10790   	11145   
SQL Search Results            	3613    	7642    	9394    
SQL Rerank Results            	4204    	8871    	10433   
TFIDF Search Results          	2249    	5956    	8156    
TFIDF Rerank Results          	3700    	8356    	10154   
BM Search Results             	448     	4249    	6841    
BM Rerank Results             	4433    	8777    	10374   


In [23]:
# Sentence-length statistics: for every engine / stage / cutoff, collect the
# word count of each sentence whose cited paper falls within that cutoff, then
# print the number of citations and the min / max / average sentence length.
#
# Changes from the previous version of this cell:
#   * the stray `n_sql += 1` / `n_tfidf += 1` lines were removed - they were
#     never reported here, required another cell to have defined the counters,
#     and silently doubled them on every run;
#   * an empty bucket no longer raises ValueError / ZeroDivisionError;
#   * the word count is computed once per sentence instead of up to 18 times.


def _record_rank(rank, top1, top20, top100, n_words):
    """Append ``n_words`` to each cutoff list that ``rank`` falls within.

    rank: 1-based position of the cited paper.
    top1, top20, top100: lists collecting word counts for rank == 1,
        rank <= 20 and rank <= 100 respectively (mutated in place).
    n_words: word count of the citing sentence.
    """
    if rank == 1:
        top1.append(n_words)
    if rank <= 20:
        top20.append(n_words)
    if rank <= 100:
        top100.append(n_words)


def _print_word_stats(label, word_counts):
    """Print one table row: label, number of citations, min, max and mean words.

    label: row label, left-aligned and padded to 20 characters.
    word_counts: list of sentence lengths (in words), one per citation.

    An empty list prints a count of 0 with 'n/a' placeholders rather than
    failing in min()/max() or dividing by zero.
    """
    if not word_counts:
        print(f"{label:20}\t{0:<16}\t{'n/a':<16}\t{'n/a':<16}\t{'n/a':<16}")
        return
    print(f"{label:20}\t{len(word_counts):<16}\t{min(word_counts):<16}\t{max(word_counts):<16}\t{sum(word_counts)/len(word_counts):<16}")


# One list per engine (sql / tfidf / pbm), stage (se = search, re = rerank) and cutoff.
n_sql_se_1, n_sql_se_20, n_sql_se_100 = [], [], []
n_sql_re_1, n_sql_re_20, n_sql_re_100 = [], [], []
n_tfidf_se_1, n_tfidf_se_20, n_tfidf_se_100 = [], [], []
n_tfidf_re_1, n_tfidf_re_20, n_tfidf_re_100 = [], [], []
n_pbm_se_1, n_pbm_se_20, n_pbm_se_100 = [], [], []
n_pbm_re_1, n_pbm_re_20, n_pbm_re_100 = [], [], []
for k, v in results_bm.items():
    # Restrict to sentences returned by all three engines.
    if k in results_sql and k in results_tfidf:
        # Whitespace-token count of the sentence key, exactly as before.
        n_words = len(k.split())
        for i in v:
            # Restrict to citations found by all three engines.
            if i in results_sql[k] and i in results_tfidf[k]:
                # Each entry is a list whose first item is (search rank, rerank rank).
                pbm_entry = results_bm[k][i][0]
                sql_entry = results_sql[k][i][0]
                tfidf_entry = results_tfidf[k][i][0]

                _record_rank(pbm_entry[0], n_pbm_se_1, n_pbm_se_20, n_pbm_se_100, n_words)
                _record_rank(pbm_entry[1], n_pbm_re_1, n_pbm_re_20, n_pbm_re_100, n_words)

                _record_rank(sql_entry[0], n_sql_se_1, n_sql_se_20, n_sql_se_100, n_words)
                _record_rank(sql_entry[1], n_sql_re_1, n_sql_re_20, n_sql_re_100, n_words)

                _record_rank(tfidf_entry[0], n_tfidf_se_1, n_tfidf_se_20, n_tfidf_se_100, n_words)
                _record_rank(tfidf_entry[1], n_tfidf_re_1, n_tfidf_re_20, n_tfidf_re_100, n_words)

print(f"{' ':20}\t{'# of Citations':16}\t{'Min # of Words':16}\t{'Max # of Words':16}\t{'Avg # of Words':16}")
print(f"{115*'-'}")
# (engine label, cutoff label, search bucket, rerank bucket) in display order.
_length_groups = [
    ('SQL', 'Top1', n_sql_se_1, n_sql_re_1),
    ('SQL', 'Top20', n_sql_se_20, n_sql_re_20),
    ('SQL', 'Top100', n_sql_se_100, n_sql_re_100),
    ('TFIDF', 'Top1', n_tfidf_se_1, n_tfidf_re_1),
    ('TFIDF', 'Top20', n_tfidf_se_20, n_tfidf_re_20),
    ('TFIDF', 'Top100', n_tfidf_se_100, n_tfidf_re_100),
    ('PBM', 'Top1', n_pbm_se_1, n_pbm_re_1),
    ('PBM', 'Top20', n_pbm_se_20, n_pbm_re_20),
    ('PBM', 'Top100', n_pbm_se_100, n_pbm_re_100),
]
for _idx, (_engine, _cutoff, _searched, _reranked) in enumerate(_length_groups):
    if _idx:
        print()  # blank line between groups, none after the last
    _print_word_stats(f"{_engine} Search {_cutoff}", _searched)
    _print_word_stats(f"{_engine} Rerank {_cutoff}", _reranked)

                    	# of Citations  	Min # of Words  	Max # of Words  	Avg # of Words  
-------------------------------------------------------------------------------------------------------------------
SQL Search Top1     	3613            	5               	50              	20.099916966509827
SQL Rerank Top1     	4204            	5               	50              	20.00023786869648

SQL Search Top20    	7642            	5               	50              	20.076681496990318
SQL Rerank Top20    	8871            	5               	50              	20.08071243377297

SQL Search Top100   	9394            	5               	50              	20.06642537790079
SQL Rerank Top100   	10433           	5               	50              	20.082526598293875

TFIDF Search Top1   	2249            	5               	50              	19.895064473099154
TFIDF Rerank Top1   	3700            	5               	50              	20.253783783783785

TFIDF Search Top20  	5956            	5               	50         

In [71]:
# Pick one sentence key (index 6 in results_bm's key order) as a worked example
# and show its {citation: [(search rank, rerank rank)]} entries for the
# SQL, TFIDF and BM result sets side by side.
sent = list(results_bm.keys())[6]
sent, results_sql[sent], results_tfidf[sent], results_bm[sent]

('include cellprofiler imagej bioimagexd icy omero ebimage|24564609',
 {'17936939': [(42, 58)], '20338898': [(2, 12)]},
 {'17936939': [(88, 37)], '20338898': [(1, 11)]},
 {'17936939': [(7, 7)], '20338898': [(155, 14)]})

In [67]:
# Copy, verbatim, every SQL_BM25 test-set row that belongs to the example
# sentence `sent` and the citation PMID below into a small file for inspection.
# Rows are tab-separated even though the output file is named .csv.
# NOTE(review): the PMID is hard-coded and is not among the citations shown for
# `sent` in the example output above - confirm the two still belong together.
test_path = f"{base_path}/test_dataset_sql_bm25"
with open("sent10_sql_bm25.csv", "w", encoding="utf-8") as fwriter:
    for file in os.listdir(test_path):
        with open(f"{test_path}/{file}") as freader:
            next(freader)  # skip the first line of each file
            for line in freader:
                row = line.strip().split("\t")
                sentence = "|".join((row[0], row[1]))
                citation = row[3]
                if (sentence, citation) == (sent, '18977449'):
                    fwriter.write(line)

In [68]:
# Copy, verbatim, every PubMed_TFIDF test-set row that belongs to the example
# sentence `sent` and the citation PMID below into a small file for inspection.
# Rows are tab-separated even though the output file is named .csv.
# NOTE(review): the PMID is hard-coded and is not among the citations shown for
# `sent` in the example output above - confirm the two still belong together.
test_path = f"{base_path}/test_dataset_pubmed_tfidf"
with open("sent10_pubmed_tfidf.csv", "w", encoding="utf-8") as fwriter:
    for file in os.listdir(test_path):
        with open(f"{test_path}/{file}") as freader:
            next(freader)  # skip the first line of each file
            for line in freader:
                row = line.strip().split("\t")
                sentence = "|".join((row[0], row[1]))
                citation = row[3]
                if (sentence, citation) == (sent, '18977449'):
                    fwriter.write(line)

In [69]:
# Copy, verbatim, every PubMed_BM test-set row that belongs to the example
# sentence `sent` and the citation PMID below into a small file for inspection.
# Rows are tab-separated even though the output file is named .csv.
# NOTE(review): the PMID is hard-coded and is not among the citations shown for
# `sent` in the example output above - confirm the two still belong together.
test_path = f"{base_path}/test_dataset_pubmed_bm"
with open("sent10_pubmed_bm.csv", "w", encoding="utf-8") as fwriter:
    for file in os.listdir(test_path):
        with open(f"{test_path}/{file}") as freader:
            next(freader)  # skip the first line of each file
            for line in freader:
                row = line.strip().split("\t")
                sentence = "|".join((row[0], row[1]))
                citation = row[3]
                if (sentence, citation) == (sent, '18977449'):
                    fwriter.write(line)

In [39]:
# Preview the OR-joined keyword query built from the sentence text
# (the part of `sent` before the '|' separator, split on whitespace).
' OR '.join(sent.split('|')[0].strip().split())

'include OR cellprofiler OR imagej OR bioimagexd OR icy OR omero OR ebimage'

In [None]:
# Show the test_pubmed_bm entry for the example sentence; the cells below read
# its 'year' field. test_pubmed_bm is built outside this section.
test_pubmed_bm[sent]

In [42]:
# OR-joined keyword query built from the sentence text (the part before '|').
query = ' OR '.join(sent.split('|')[0].strip().split())
# 'year' field of the sentence's test_pubmed_bm entry.
# NOTE(review): presumably the publication year of the citing paper - confirm.
year = test_pubmed_bm[sent]['year']

In [46]:
# Run the query through pubmedweb_search (defined outside this section).
# NOTE(review): presumably returns a hit count and the returned PMIDs, with
# `year` limiting the publication date - confirm against its definition.
counts, pmids = pubmedweb_search(query, year)

In [None]:
# Display what the search returned.
counts, pmids

Output 100 sentences for search on Google Scholar

In [46]:
# Sentences that have results from all three engines (BM, SQL and TFIDF).
sents = [
    k
    for k in results_bm
    if k in results_sql and k in results_tfidf
]

In [None]:
# Keep the (sentence, citation) pairs whose cited paper has a single BM rank
# entry and sits within the top 20 *search* results of BM, SQL and TFIDF alike.
sents_top20 = []
n = 0
for sent in sents:
    for cite, ranks in results_bm[sent].items():
        if len(ranks) != 1 or ranks[0][0] > 20:
            continue
        if cite not in results_sql[sent] or results_sql[sent][cite][0][0] > 20:
            continue
        if cite not in results_tfidf[sent] or results_tfidf[sent][cite][0][0] > 20:
            continue
        sents_top20.append((sent, cite))
        n += 1
print(n)

In [49]:
# All (sentence, citation) pairs whose citation was returned by BM, SQL and
# TFIDF. Despite the variable name, no top-20 rank cutoff is applied here.
sents_top20 = []
n = 0
for sent in sents:
    for cite, ranks in results_bm[sent].items():
        if cite not in results_sql[sent] or cite not in results_tfidf[sent]:
            continue
        sents_top20.append((sent, cite))
        n += 1
print(n)

11145


In [42]:
import random
import csv

In [None]:
# Draw a seeded random sample of 100 (sentence, citation) pairs and write one
# CSV row per pair: the sentence, the cited paper's title (scraped from its
# PubMed page), the three search ranks, and two Google Scholar search links
# (plain sentence text and OR-joined keywords).
random.seed(13)
with open("sentences_for_google_scholar_search.csv", "w", encoding="utf-8", newline="") as f:
    fwriter = csv.writer(f)
    fwriter.writerow([
        'sent_pmid', 'sentence', 'sent_pub_year', 'citation_pmid', 'citation_title',
        'mbm_search_rank', 'tfidf_search_rank', 'bm_search_rank',
        'gs_search_link', 'gs_search_link_or',
    ])
    for pair in random.sample(sents_top20, 100):
        # pair is (sentence key, citation PMID)
        cite = pair[1]
        pubmed_url = f"https://pubmed.ncbi.nlm.nih.gov/{cite}/"
        soup = BeautifulSoup(get_page(pubmed_url), 'xml')
        title = soup.find('h1', attrs={'class':'heading-title'}).text.strip()

        # Sentence key has the form "text|pmid".
        sent = pair[0]
        key_parts = sent.split('|')
        pmid = key_parts[1]
        txt = key_parts[0]
        query = ' OR '.join(txt.strip().split())
        year = test_pubmed_bm[sent]['year']

        # Both links share the base URL and the trailing language/date filter.
        ggs_url = 'https://scholar.google.com/scholar'
        ggs_filter = f'&hl=en&as_sdt=0,10&as_ylo=1977&as_yhi={year}'
        ggs_url_txt = f'{ggs_url}?q=site:pubmed.ncbi.nlm.nih.gov {txt}{ggs_filter}'
        ggs_url_or = f'{ggs_url}?q=site:pubmed.ncbi.nlm.nih.gov {query}{ggs_filter}'

        fwriter.writerow([
            pmid, txt, year, cite, title,
            results_sql[sent][cite][0][0],
            results_tfidf[sent][cite][0][0],
            results_bm[sent][cite][0][0],
            ggs_url_txt, ggs_url_or,
        ])

In [70]:
# Re-read the exported sample and tally, per engine, how many sampled citations
# are at rank 1 / within the top 20 before (search) and after (rerank, "rr_")
# reranking. Each row's rank entries are also printed for inspection.
top1_sql, top1_tfidf, top1_bm = 0, 0, 0
top20_sql, top20_tfidf, top20_bm = 0, 0, 0
rr_top1_sql, rr_top1_tfidf, rr_top1_bm = 0, 0, 0
rr_top20_sql, rr_top20_tfidf, rr_top20_bm = 0, 0, 0
with open("sentences_for_google_scholar_search.csv", "r", encoding="utf-8", newline="") as f:
    freader = csv.reader(f)
    next(freader)  # skip the header row
    for row in freader:
        # Rebuild the "text|pmid" sentence key from the sentence and sent_pmid columns.
        sent = f"{row[1]}|{row[0]}"
        cite = row[3]

        sql_entry = results_sql[sent][cite][0]
        if sql_entry[0] == 1:
            top1_sql += 1
        if sql_entry[1] == 1:
            rr_top1_sql += 1
        if sql_entry[0] <= 20:
            top20_sql += 1
        if sql_entry[1] <= 20:
            rr_top20_sql += 1

        tfidf_entry = results_tfidf[sent][cite][0]
        if tfidf_entry[0] == 1:
            top1_tfidf += 1
        if tfidf_entry[1] == 1:
            rr_top1_tfidf += 1
        if tfidf_entry[0] <= 20:
            top20_tfidf += 1
        if tfidf_entry[1] <= 20:
            rr_top20_tfidf += 1

        bm_entry = results_bm[sent][cite][0]
        if bm_entry[0] == 1:
            top1_bm += 1
        if bm_entry[1] == 1:
            rr_top1_bm += 1
        if bm_entry[0] <= 20:
            top20_bm += 1
        if bm_entry[1] <= 20:
            rr_top20_bm += 1

        print(f"{results_sql[sent][cite]}\t{results_tfidf[sent][cite]}\t{results_bm[sent][cite]}")
# Summary rows, columns SQL / TFIDF / BM:
# search top 20, search top 1, rerank top 20, rerank top 1.
print(f"{top20_sql}\t{top20_tfidf}\t{top20_bm}")
print(f"{top1_sql}\t{top1_tfidf}\t{top1_bm}")
print(f"{rr_top20_sql}\t{rr_top20_tfidf}\t{rr_top20_bm}")
print(f"{rr_top1_sql}\t{rr_top1_tfidf}\t{rr_top1_bm}")

[(1, 1)]	[(4, 1)]	[(7, 1)]
[(2, 121)]	[(3, 157)]	[(230, 25)]
[(1, 161)]	[(4, 178)]	[(20, 23)]
[(1, 1)]	[(448, 1)]	[(360, 1)]
[(183, 10)]	[(146, 4)]	[(3, 6)]
[(217, 347)]	[(23, 352)]	[(249, 294)]
[(1, 1)]	[(2, 1)]	[(4, 1)]
[(58, 2)]	[(458, 1)]	[(138, 2)]
[(1, 1)]	[(1, 1)]	[(5, 1)]
[(78, 150)]	[(303, 250)]	[(890, 209)]
[(69, 17)]	[(1, 147)]	[(2, 2)]
[(1, 1)]	[(1, 1)]	[(2, 1)]
[(2, 6)]	[(2, 32)]	[(3, 3)]
[(1, 1)]	[(1, 2)]	[(6, 1)]
[(3, 1)]	[(496, 1)]	[(260, 1)]
[(13, 47)]	[(282, 32)]	[(29, 157)]
[(348, 19)]	[(67, 33)]	[(97, 9)]
[(1, 481)]	[(1, 43)]	[(5, 1)]
[(1, 1)]	[(2, 1)]	[(21, 1)]
[(51, 18)]	[(4, 20)]	[(322, 9)]
[(2, 6)]	[(10, 3)]	[(103, 2)]
[(11, 19)]	[(68, 13)]	[(474, 8)]
[(4, 67)]	[(6, 189)]	[(379, 109)]
[(9, 2)]	[(9, 13)]	[(769, 4)]
[(4, 1)]	[(5, 9)]	[(16, 2)]
[(1, 3)]	[(859, 4)]	[(40, 3)]
[(10, 1)]	[(1, 1)]	[(659, 1)]
[(1, 1)]	[(2, 1)]	[(42, 1)]
[(6, 10)]	[(4, 15)]	[(52, 1)]
[(59, 2)]	[(60, 5)]	[(4, 2)]
[(1, 9)]	[(9, 10)]	[(7, 8)]
[(8, 193)]	[(7, 272)]	[(181, 167)]
[(1, 1)]	[(1, 