### Use NLP to help with literature searches: extract key sentences

- Run a literature search on keywords and export the results to a CSV file
- Read the titles and URLs from the CSV file
- Assume the most frequently used words in an article are the most important, so weight each word by its frequency
- Rank sentences by importance: the sum of the word-frequency weights in the sentence divided by the number of words in the sentence
- Create a summary from the 2 most 'important' sentences per article

- Alternatively, the most 'important' sentences may simply be those with the highest total sum of word-frequency weights (without dividing by sentence length)?
- Need to test which method works best - or whether the approach is useful at all :)


In [1]:
import bs4 as BeautifulSoup
import urllib.request  

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize

In [2]:
import pandas as pd
import string

In [3]:
# Load the exported PubMed search results.
# index_col=False tells pandas not to use the first column as the row index.
input_file = "pubmed_result.csv"
data_file = pd.read_csv(filepath_or_buffer=input_file, index_col=False)

In [4]:
data_file.head(2)

Unnamed: 0,Title,URL,Description,Details,ShortDetails,Resource,Type,Identifiers,Db,EntrezUID,Properties
0,Total knee arthroplasty: posterior tibial slop...,/pubmed/32030500,"Ismailidis P, Kremo V, Mündermann A, Müller-Ge...",Knee Surg Sports Traumatol Arthrosc. 2020 Feb ...,Knee Surg Sports Traumatol Arthrosc. 2020,PubMed,citation,PMID:32030500,pubmed,32030500,create date:2020/02/08 | first author:Ismailid...
1,The use of an asymmetrical tibial tray in TKA ...,/pubmed/32006074,"Okazaki Y, Pujol N.",Knee Surg Sports Traumatol Arthrosc. 2020 Jan ...,Knee Surg Sports Traumatol Arthrosc. 2020,PubMed,citation,PMID:32006074,pubmed,32006074,create date:2020/02/02 | first author:Okazaki Y


In [5]:
# Number of articles = number of rows in the search-results table
number_articles = len(data_file)
print(number_articles)

234


In [6]:
# Number of top-ranked sentences to keep in each article's summary
number_sentences = 2

In [7]:
# Keep only the columns needed to fetch and label each article
url_list = data_file.loc[:, ['Title', 'URL']]

In [8]:
url_list.head(3)

Unnamed: 0,Title,URL
0,Total knee arthroplasty: posterior tibial slop...,/pubmed/32030500
1,The use of an asymmetrical tibial tray in TKA ...,/pubmed/32006074
2,Revisiting the tibial crest as reference for t...,/pubmed/32002790


In [9]:
url_list['URL'][0]

'/pubmed/32030500'

In [10]:
def get_article_content(url):
    """Download a web page and return the text of its <p> elements.

    Paragraphs that contain known NCBI site boilerplate (JavaScript notice,
    bibliography loader, citation-export hint, postal address) are skipped.
    Could be improved further by skipping other sections that are not useful
    for a summary.

    Parameters
    ----------
    url : str
        Absolute URL of the page to fetch.

    Returns
    -------
    str
        Text of the remaining paragraphs, separated by single spaces.
        Empty string if the page has no usable paragraphs.
    """
    # A paragraph containing any of these phrases is site boilerplate, not article text
    boilerplate_markers = (
        "The NCBI web site requires JavaScript to function",
        "Fetching bibliography",
        'Generate a file for use with external citation management software',
        'Rockville Pike, Bethesda',
    )

    # The context manager closes the HTTP response even if reading fails
    with urllib.request.urlopen(url) as response:
        article = response.read()

    # Parse the page content
    article_parsed = BeautifulSoup.BeautifulSoup(article, 'html.parser')

    # Use <p> tags to split into paragraphs, dropping the boilerplate ones
    paragraphs = [
        p.text
        for p in article_parsed.find_all('p')
        if not any(marker in p.text for marker in boilerplate_markers)
    ]

    # Join with a space so the last sentence of one paragraph is not fused
    # with the first sentence of the next (which would defeat sentence splitting)
    return ' '.join(paragraphs)

In [11]:
# Tokens to be ignored when counting words:
# English stop words, and punctuation characters (plus the '...' ellipsis)
stop_words = set(stopwords.words("english"))
punctuation = string.punctuation + '...'

In [12]:
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~...'

In [13]:
def create_dictionary_table(text_string):
    """Build a frequency table of stemmed words for a piece of text.

    Stop words (from the module-level ``stop_words`` set) and tokens that
    contain no letters or digits (punctuation, quote marks, dashes, ...) are
    excluded.

    Parameters
    ----------
    text_string : str
        Text to analyse.

    Returns
    -------
    dict
        Maps each lower-cased word stem to the number of times it occurs.
    """
    # Reduce words to their root form
    stemmer = PorterStemmer()

    frequency_table = dict()
    for word in word_tokenize(text_string):
        lowered = word.lower()

        # Skip tokens with no alphanumeric character. This covers every
        # single punctuation mark as well as multi-character tokens such as
        # "...", "``", "''" and "--" produced by the tokenizer.
        if not any(ch.isalnum() for ch in lowered):
            continue

        stemmed = stemmer.stem(lowered)

        # Test the unstemmed word against the stop list: stemming first turns
        # e.g. "was" into "wa" and "this" into "thi", which are not in the
        # list and would wrongly be counted. The stemmed form is still checked
        # as well, so anything filtered previously stays filtered.
        if lowered in stop_words or stemmed in stop_words:
            continue

        frequency_table[stemmed] = frequency_table.get(stemmed, 0) + 1

    return frequency_table

In [14]:
def calculate_sentence_scores(sentences, frequency_table):
    """Score each sentence by its average word-frequency weight.

    Every token of a sentence is lower-cased and stemmed, then looked up in
    ``frequency_table``; tokens not in the table contribute 0. The score is
    the sum of those weights divided by the number of tokens in the sentence.

    Parameters
    ----------
    sentences : iterable of str
        Sentences to score.
    frequency_table : dict
        Maps lower-cased word stems to their frequency in the article.

    Returns
    -------
    dict
        Maps each sentence to its score. Sentences that tokenize to nothing
        are omitted. Identical sentences share a single entry.
    """
    stemmer = PorterStemmer()
    sentence_weight = dict()

    for sentence in sentences:
        tokens = word_tokenize(sentence)
        if not tokens:
            # Nothing to score, and dividing by zero tokens would raise
            continue

        # Match on stemmed tokens rather than substrings of the raw sentence:
        # a stem such as "arthroplasti" is not a substring of "arthroplasty",
        # while short stems would match inside unrelated words.
        sentence_sum = sum(
            frequency_table.get(stemmer.stem(token.lower()), 0)
            for token in tokens
        )

        sentence_weight[sentence] = sentence_sum / len(tokens)

    return sentence_weight

In [15]:
def run_article_summary(article, number_sentences):
    """Return the highest-scoring sentences of an article.

    Parameters
    ----------
    article : str
        Full article text.
    number_sentences : int
        Maximum number of sentences to return.

    Returns
    -------
    list of str
        Up to ``number_sentences`` sentences, highest score first. Sentences
        with equal scores keep their order of appearance in the article.
    """
    # Word-frequency table for the whole article
    frequency_table = create_dictionary_table(article)

    # Split the article into sentences
    sentences = sent_tokenize(article)

    # Score each sentence by its word-frequency weights
    sentence_scores = calculate_sentence_scores(sentences, frequency_table)

    # Rank by score. Sorting the dict without a key would order the sentence
    # strings alphabetically and ignore the scores entirely.
    sentence_sorted = sorted(sentence_scores, key=sentence_scores.get, reverse=True)

    return sentence_sorted[:number_sentences]

In [16]:
summary_filename = "SummaryFile.txt"
url_start = "https://www.ncbi.nlm.nih.gov"

# Empty table with the summary columns; it is not populated in this cell
summary = pd.DataFrame(columns = ['Title','URL','Summary'])

# Write one entry per article: quoted title, URL, summary sentences, blank line.
# The context manager closes the file even if a download or summary fails
# part-way through; utf-8 avoids encoding errors on non-ASCII article text.
with open(summary_filename, "w", encoding="utf-8") as file1:
    for title, url_path in zip(url_list['Title'], url_list['URL']):
        # URLs in the CSV are site-relative (e.g. '/pubmed/32030500')
        url = url_start + url_path
        article_content = get_article_content(url)

        summary_results = run_article_summary(article_content, number_sentences)

        summary_line_1 = '"' + title + '"\n' + url + "\n"
        summary_line_2 = str(summary_results) + "\n"
        file1.write(summary_line_1)
        file1.write(summary_line_2)
        file1.write("\n")