In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pymongo
import time
import glob
import os

In [33]:
from pymongo import MongoClient

In [34]:
mongoClient = MongoClient("mongodb://localhost:27017/")
db = mongoClient['t2dm']
col = db['allData']

In [35]:
query = col.find({'insulin': 1, 'mortality': 1, 'odds_ratio': 1, 'is_downloaded': 0})
pmids = []
titles = []

In [36]:
for data in query:
    pmids.append(data['pmid'])
    titles.append(data['title'])

In [37]:
pmids

[30694322.0,
 19318384.0,
 29488676.0,
 12771873.0,
 19623047.0,
 20715734.0,
 29769112.0,
 23271103.0,
 26088909.0,
 30535668.0,
 30686567.0,
 17599483.0,
 25888011.0,
 22452807.0,
 18503925.0,
 11729370.0,
 21963581.0,
 17434094.0,
 19387173.0,
 24420499.0,
 8203424.0,
 20888671.0,
 24857051.0,
 26119654.0,
 25999212.0,
 25770704.0,
 21248294.0,
 16098296.0,
 20561594.0,
 28838399.0,
 12438289.0,
 15886232.0,
 8960845.0,
 6140355.0,
 15976800.0,
 20815045.0,
 16504835.0,
 10526912.0,
 9207639.0]

In [38]:
titles

['Effect of Hydrocortisone Therapy Initiated 7 to 14 Days After Birth on Mortality or Bronchopulmonary Dysplasia Among Very Preterm Infants Receiving Mechanical Ventilation: A Randomized Clinical Trial',
 'Intensive versus conventional glucose control in critically ill patients',
 'Treatment with insulin is associated with worse outcome in patients with chronic heart failure and diabetes',
 'Continuous insulin infusion reduces mortality in patients with diabetes undergoing coronary artery bypass grafting',
 'Hypoglycemia with intensive insulin therapy in critically ill patients: predisposing factors and association with mortality',
 'Glycaemic fluctuation predicts mortality in critically ill patients',
 'Association between β-blocker use and mortality in critically ill patients: a nested cohort study',
 'The potential impact of admission insulin levels on patient outcome in the intensive care unit',
 'Intensive versus conventional glucose control in critically ill patients with traumat

In [39]:
pmids = [int(x) for x in pmids]

In [40]:
print(pmids)

[30694322, 19318384, 29488676, 12771873, 19623047, 20715734, 29769112, 23271103, 26088909, 30535668, 30686567, 17599483, 25888011, 22452807, 18503925, 11729370, 21963581, 17434094, 19387173, 24420499, 8203424, 20888671, 24857051, 26119654, 25999212, 25770704, 21248294, 16098296, 20561594, 28838399, 12438289, 15886232, 8960845, 6140355, 15976800, 20815045, 16504835, 10526912, 9207639]


In [70]:
import selenium

from selenium import webdriver

# driver = webdriver.Chrome(r'C:/chromedriver/chromedriver.exe')

In [71]:
options = webdriver.ChromeOptions()
prefs = {
    'download.default_directory': r"E:\Repos\GitHub\source\t2dm\temp",
      "download.prompt_for_download": False
}
options.add_experimental_option('prefs', prefs)

In [72]:
driver = webdriver.Chrome(r'C:/chromedriver/chromedriver.exe', chrome_options = options)

  driver = webdriver.Chrome(r'C:/chromedriver/chromedriver.exe', chrome_options = options)


In [74]:
def download(pmids, titles):
    print(f"Total files are: {len(pmids)}")
    i = 1
    for pmid, title in zip(pmids, titles):
        print(f"Iteration {i}")
        i += 1
        # go to link
        print("accessing link")
        driver.get("https://sci-hub.se/")
        search_box = driver.find_element_by_name("request")
        
        # send pmid to selenium browser
        print("sending keys")
        search_box.send_keys(str(pmid))
        driver.execute_script('''
            var elem = arguments[0];
            var value = arguments[1]
            elem.value = value;
        ''',search_box, str(pmid))
        print("executing script")
        value = driver.execute_script('return arguments[0].value', search_box)
        
        # check whether values are correct
        print(f"value returned: {value}")
        print("pmid: " + str(pmid))
        print("check whether values match")
        if(int(pmid) == int(value)):
            print("values verified")
        else:
            break
        
        # click submit form
        print("going to pdf")
        driver.execute_script("javascript:document.forms[0].submit()")
        
        # access pdf download buutons
        print("accessing pdf download buttons")
        element = driver.find_element_by_xpath("//div[@id='article']/iframe")
        print(element.get_attribute("src"))
        
        # download pdf
        print("downloading pdf")
        driver.get(str(element.get_attribute('src')))
        
        time.sleep(10)
        
        # get_pdfs
        pdf_files = get_pdfs()

        # check whether size is 1
        if len(pdf_files) != 1:
            print("Cannot rename file, exiting")
            break
            
        title = title.replace(" ", "_")
        # rename files
        rename_files(pdf_files[0], str(pmid), 'insulin_mortality_odds_ratio')
        print(f"{pdf_files[0]} renamed to {title}.pdf")
        
        # update equivalent document in mongodb
        print("updating mongodb")
        update(pmid)
        
        
        time.sleep(300)

In [75]:
def get_pdfs():
    path = 'E:/Repos/GitHub/source/t2dm/temp'
    pdf_files = [f for f in os.listdir(path) if f.endswith('.pdf')]
    return pdf_files

In [76]:
pdf_files = get_pdfs()

In [77]:
def rename_files(src: str,dst: str, folder_name: str):
    filepath = f"temp/{src}"
    filepath = filepath.replace("/", "\\")
    cwd = os.getcwd()
    old_path = os.path.join(cwd, filepath)
#     newfilepath = f"papers/{folder_name}/{dst}.pdf"
    newfilepath = f"papers/{folder_name}/"
    newfilepath = newfilepath.replace("/", "\\")
    new_path = os.path.join(cwd, newfilepath)
    print("Old path " + old_path)
    print(newfilepath)
    if os.path.exists(old_path):
        os.rename(old_path, new_path + dst + ".pdf")
    else:
        print("File does not exist")

In [78]:
def update(pmid: int):
    myquery = {"pmid": pmid}
    newvalues = {"$set": {"is_downloaded": 1}}
    col.update_one(myquery, newvalues)

In [79]:
def get_actual_filename(name):
    dirs = name.split('\\')
    # disk letter
    test_name = [dirs[0].upper()]
    for d in dirs[1:]:
        test_name += ["%s[%s]" % (d[:-1], d[-1])]
    res = glob.glob('\\'.join(test_name))
    if not res:
        #File not found
        return None
    return res[0]

In [80]:
# rename_files("hi.pdf", "Effect of Hydrocortisone Therapy Initiated 7 to 14 Days After Birth on Mortality or Bronchopulmonary Dysplasia among Very Preterm Infants Receiving Mechanical Ventilation- A Randomized Clinical Trial", 'insulin_mortality_odds_ratio')
rename_files("sales2014.pdf", "xyz1423W", 'insulin_mortality_odds_ratio')

Old path E:\Repos\GitHub\source\t2dm\temp\sales2014.pdf
papers\insulin_mortality_odds_ratio\
File does not exist


In [None]:
download(pmids, titles)

Total files are: 39
Iteration 1
accessing link
sending keys
executing script
value returned: 30694322
pmid: 30694322
check whether values match
values verified
going to pdf
accessing pdf download buttons
https://dacemirror.sci-hub.se/journal-article/c8a7ccd7bb0b194a20f2a8a7a0fe345d/onland2019.pdf#view=FitH
downloading pdf
Old path E:\Repos\GitHub\source\t2dm\temp\onland2019.pdf
papers\insulin_mortality_odds_ratio\
onland2019.pdf renamed to Effect_of_Hydrocortisone_Therapy_Initiated_7_to_14_Days_After_Birth_on_Mortality_or_Bronchopulmonary_Dysplasia_Among_Very_Preterm_Infants_Receiving_Mechanical_Ventilation:_A_Randomized_Clinical_Trial.pdf
updating mongodb
