In [1]:
# !pip3 install Biopython
# !pip3 install pymongo==3.11.2

In [3]:
from Bio import Entrez
from pymongo import MongoClient

from io import StringIO
from html.parser import HTMLParser
from collections import defaultdict

In [None]:
import os
import environ

env = environ.Env()
env.read_env(env.str('ENV_PATH', '.env'))

In [None]:
mongo_cli_username = os.environ.get('MONGO_CLI_USERNAME')
mongo_cli_password = os.environ.get('MONGO_CLI_PASSWORD')

In [8]:
client = MongoClient("mongodb+srv://{}:{}@cluster0.plop5.mongodb.net/myFirstDatabase?retryWrites=true&w=majority".format(mongo_cli_username, mongo_cli_password))
db = client['healdash']

In [9]:
# input location
input_location = 'data/input/diseases-english.txt'

In [10]:
# keywords list
keywords = []

with open(input_location) as my_file:
    for line in my_file:
        keywords.append(line.replace("\n", ""))

In [11]:
class MLStripper(HTMLParser):
    def __init__(self):
        super().__init__()
        self.reset()
        self.strict = False
        self.convert_charrefs= True
        self.text = StringIO()
    def handle_data(self, d):
        self.text.write(d)
    def get_data(self):
        return self.text.getvalue()

def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()

In [12]:
class PubMed:
    
    # for testing purposes
    """for i, paper in enumerate(papers['PubmedArticle']):
        print(paper['MedlineCitation']['Article'].keys())

        for j in paper['MedlineCitation']['Article']['Abstract']['AbstractText']:
            print(j)"""
    
    def __init__(self, email: str, extension="xml") -> None:
        self.email = email
        self.extension = extension
        self.keyword_dict = defaultdict(list)
        
    
    def search(self, query: str):
        Entrez.email = self.email
        handle = Entrez.esearch(db='pubmed', 
                                sort='relevance', 
                                retmax='20',
                                retmode=self.extension, 
                                term=query)
        results = Entrez.read(handle)
        return results

    def fetch_details(self, id_list: list):
        ids = ','.join(id_list)
        Entrez.email = self.email
        handle = Entrez.efetch(db='pubmed',
                               retmode=self.extension,
                               id=ids)
        results = Entrez.read(handle)
        return results
    
    def get_results(self, keywords: list) -> None:
        for keyword in keywords:
            try:
                results = self.search(keyword)
                id_list = results['IdList']
                papers = self.fetch_details(id_list)
                
                for paper in papers['PubmedArticle']:

                    abstract_texts = []

                    for abstract_text in paper['MedlineCitation']['Article']['Abstract']['AbstractText']:
                        abstract_texts.append(strip_tags(abstract_text))

                    self.keyword_dict[keyword].append({
                        "loc_id": paper['MedlineCitation']['Article']['ELocationID'], 
                        "title": strip_tags(paper['MedlineCitation']['Article']['ArticleTitle']), 
                        "abstract_texts": abstract_texts
                    })

                # add data to mongodb
                db.articles.update_many({"keyword": keyword}, {"$set": {"articles": self.keyword_dict[keyword]}}, upsert=True)
            except:
                print("{} - No complete information".format(keyword))

In [13]:
pubmed = PubMed("drgoktugasci@gmail.com")

In [14]:
# keywords[:2]

In [None]:
pubmed.get_results(keywords)

aphthous cancer - No complete information
behçet syndrome - No complete information
herpes - No complete information
mumps - No complete information
sialadenitis - No complete information
esophageal web - No complete information
zenker diverticulum - No complete information
mallory-weiss syndrome - No complete information
esophagel varices - No complete information
gastroesophageal reflux disesase  - No complete information
pyloric stenosis - No complete information
acute gastritis - No complete information
peptic ulcer disease - No complete information
gastric carcinoma - No complete information
duodenal atresia - No complete information
small bowel infarction - No complete information
tropical sprue - No complete information
whipple disease - No complete information
abetalipoproteinemia - No complete information
colonic diverticula - No complete information
angiodysplasia - No complete information
juvenile polyp - No complete information
gardner syndrome - No complete information
tur