In [2]:
#imports all the needed methods
from Bio import Entrez
from Bio import Medline
from bs4 import BeautifulSoup
import requests
import urllib
import pandas as pd
from datetime import datetime
from dateutil import relativedelta
import calendar

In [3]:
start = {'year': 2021, 'month': 1, 'day': 15}
end = {'year': 2021, 'month': 2, 'day': 5}
phrases = ['machine learning']
terms = ['find my code', 'link to my code', 'is my code'] #the desired tearms to be searched for in each paper

In [4]:
def getAmount(queries):
    handle = Entrez.egquery(term=queries) 
    record = Entrez.read(handle) 
    for row in record["eGQueryResult"]: 
        print(row["DbName"], row["Count"]) 
        
x = getAmount(queries)

NameError: name 'queries' is not defined

In [150]:
def getQuery(remainderFirstDays, lastDays, phrases, start, end):
    queries = []
    for phrase in phrases:
        queries.append(f"{phrase}[title/absract] AND {start['year']}/{start['month']}/{start['day']}:{end['year']}/{end['month']}/{end['day']}[EDAT]")
    return queries   

#gets the IDs of the articles; max of 50 articles; takes a query, in this case machine learning; database is pubmed
def search(queries):      
    Entrez.email = 'eli.krasnoff@gmail.com'
    handle = Entrez.esearch(db="pubmed", term=queries, retmax=50)
    record = Entrez.read(handle)
    handle.close()
    return record

#from the IDs of the articles that meet the parameters efetch gets all the details associated with the papers and
#stores the list as a dictionary with the key being the ID
def fetchDetails(IdList):
    handle = Entrez.efetch(db="pubmed", id=IdList, rettype="medline", retmode="text")
    details = Medline.parse(handle)
    detailsDict = {}
    for detail in details:
        detailsDict[detail['PMID']] = detail
    return detailsDict #contains 50 papers

def cleanDetails(detailsDict):
    badIdl = []
    for idl, detail in detailsDict.items():
        if 'LID' not in detail.keys():
            badIdl.append(idl)
    for idl in badIdl:
        detailsDict.pop(idl, None)
    return detailsDict

#a function to remove [doi] from a string
def removeSubstring(originalString):
    newString = originalString.replace(' [doi]', '')
    return newString
   
#creates a dictionary with all the DOIs connected to the key of the article; stores it under doiDict
def getDOI(detailsDict):
    doiDict = {}
    for idl, detail in detailsDict.items():
        lid = detail['LID']
        if '[doi]' in lid:
            if '[pii]' in lid:
                lid = lid.split('[pii] ', 1)
                if len(lid) > 0:
                    lid = lid[1]
            lid = removeSubstring(lid)
        doiDict[idl] = lid
    return doiDict

#creates a dictionary of the amount of times each term you want to search for pops up; stores the terms as two dictionaries
#with the key being the term and those dictionaries are under the key of the associated ID
def getCount(terms, doiDict):
    countDict = {}
    for idl, doi in doiDict.items():
        url = f"https://doi.org/{doi}" #creates the full url with the specific doi for each term
        headers = {"User-Agent" : "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"} #tells the server not to compress it
        html = requests.get(url, headers=headers)
        html = html.text
        htmlParse = BeautifulSoup(html, 'html.parser')
        res = htmlParse.get_text()
        countDict[idl] = {}
        for term in terms:
            countDict[idl][term] = res.count(term)
    return countDict

#creates a dictionary of the titles associated with the IDs of the articles
def getTitle(detailsDict):
    titleDict = {}
    for idl, detail in detailsDict.items():
        titleDict[idl] = detail['TI']
    return titleDict



In [151]:
queries = getQuery(remainderFirstDays, lastDays, phrases, start, end)
IdList = []
for q in queries:
    record = search(q) #the title of the paper and date constraints
    IdList.append(record['IdList']) #list of all the IDs
   

detailsDict = fetchDetails(IdList) #dictionary of all necessary details surrounding the paper
detailsDict = cleanDetails(detailsDict)
titleDict = getTitle(detailsDict) #dictionary with titles
doiDict = getDOI(detailsDict) #dictionary with DOIs
countDict = getCount(terms, doiDict) #dictionary with amount of hits per term

df = pd.DataFrame()

#assings values to the columns
df['ID'] = titleDict.keys()
df['Title'] = titleDict.values()
df['DOI'] = doiDict.values()
for term in terms:
    df[term + '|mentions'] = [x[term] for x in countDict.values()]


display(df)

df.to_csv('pubmed_searcher.csv') #saves data to a csv file named pubmed_searcher.csv


Unnamed: 0,ID,Title,DOI,find my code|mentions,link to my code|mentions,is my code|mentions
0,33540076,A tongue features fusion approach to predictin...,10.1016/j.jbi.2021.103693,0,0,0
1,33539308,Using Automated Machine Learning to Predict th...,10.2196/23458,0,0,0
2,33539291,Predicting Machine Learning Pipeline Runtimes ...,10.1109/TPAMI.2021.3056950,0,0,0
3,33539249,Predicting postoperative liver cancer death ou...,10.1080/03007995.2021.1885361,0,0,0
4,33538696,Machine Learning-Based Early Warning Systems f...,10.2196/25187,0,0,0
5,33538601,Physically Compatible Machine Learning Study o...,10.1021/acs.jpclett.0c03600,0,0,0
6,33538294,Machine learning application for patient strat...,10.1093/bib/bbaa434,0,0,0
7,33538187,Machine learning algorithms to predict seizure...,10.1177/0960327121991910,0,0,0
8,33538134,Machine-Learning-Based Approach to Differentia...,10.3947/ic.2020.0104,0,0,0
9,33537313,Identification of Genome Sequences of Polyphos...,10.3389/fcell.2020.626221,0,0,0


In [153]:
#possible ways to seperate the dates
    
'''
start_date = datetime.strptime(f"{start['day']}/{start['month']}/{start['year']}", "%d/%m/%Y")    
end_date = datetime.strptime(f"{end['day']}/{end['month']}/{end['year']}", "%d/%m/%Y")
delta = relativedelta.relativedelta(end_date, start_date)
months = delta.months + (delta.years * 12)
'''
def getFirstDays(start):
    firstMonthDays = 0
    startMonth = calendar.monthcalendar(start['year'], start['month'])
    for y in startMonth:
        for z in y:
            if z > 0:
                c = 1
            else:
                c = 0
            firstMonthDays += c
    remainderFirstDays = firstMonthDays - start['day'] #add one if range should be inclusive of start date
    return remainderFirstDays

remainderFirstDays = getFirstDays(start)
print(remainderFirstDays)

def getLastDays(end):
    lastDays = end['day']
    return lastDays

lastDays = getLastDays(end)
print(lastDays)

def regularStart(start):
    firstMonth = start['month'] + 1
    if start['month'] + 1 > 12:
        firstMonth = 1
    return firstMonth

firstMonth = regularStart(remainderFirstDays, start)
print(firstMonth)



16
5
2
