# Web Scraper for Systematic Reviews

In [None]:
import os

# Working directory for every file this notebook downloads or exports.
# The base directory is chosen first and the 'SistRev' subfolder appended
# afterwards: a conditional expression binds looser than '+', so writing
# `a if cond else b + c` would append the subfolder only in the else branch
# and leave Windows working directly inside %TEMP%.
base_tmp = os.getenv("temp") if os.name == 'nt' else '/tmp'
temp_folder = base_tmp + '/SistRev/'
os.makedirs(temp_folder, exist_ok=True)

## Keywords to search

In [None]:
# Search keywords typed by the user; passed to the metadata scraper's
# scrape() call in the next section.
search = input("What are we researching today? ")

## Metadata Scraper

In [None]:
from metadataScraper import metadataScraper

# Files produced by the scraper; consumed by the Data Cleaner section.
files = []

# Downloads are written into the notebook's temporary working folder.
scraper = metadataScraper(download_path=temp_folder)

try:
    scraped = scraper.scrape(search)    # returns a list
    files.extend(scraped)
finally:
    # Always release the driver, even when scraping raises, so a failed
    # search does not leave the driver running.
    scraper.quit_driver()
    

## Data Cleaner

In [None]:
from datacleaner import DataCleaner

cleaner = DataCleaner()

# The input list (`files`) is filled by the metadata scraper section, so the
# hard-coded list below stays disabled.
# data_files_path = ['testdata/testdata.ris', 'testdata/artf-intl-wos.ris']

# Echo each file name as it is handed to the cleaner.
for entry_file in files:
    print(entry_file)
    cleaner.add_file(entry_file)

print('Added all files')
total_imported = cleaner.count_in_entries()
print(f'Total number of articles imported: {total_imported}')

In [None]:
# clean_entries() yields four counters, unpacked in this order:
# duplicated, missing title, missing abstract, missing DOI.
cleanup_counts = cleaner.clean_entries()
n_dup, n_no_title, n_no_abst, n_no_doi = cleanup_counts

# Number of entries remaining after the cleanup.
clean = cleaner.count_out_entries()

# One report line per counter, emitted with a single print call.
print(
    f'Number of duplicated entries: {n_dup}',
    f'Number of entries without title: {n_no_title}',
    f'Number of entries without abstract: {n_no_abst}',
    f'Number of entries without doi: {n_no_doi}',
    f'Number of entries after cleanup: {clean}',
    sep='\n',
)

In [None]:
from pandas import Series

# Candidate slices besides the clean data: (count, legend label).
optional_slices = (
    (n_dup, f'Duplicated: {n_dup}'),
    (n_no_title, f'No Title: {n_no_title}'),
    (n_no_abst, f'No Abstract: {n_no_abst}'),
    (n_no_doi, f'No DOI: {n_no_doi}'),
)
# Only categories that actually occurred get a slice.
shown = [(count, label) for count, label in optional_slices if count > 0]

# The clean-data slice is always present and always first.
datapoints = [clean] + [count for count, label in shown]
index = [f'Clean Data: {clean}'] + [label for count, label in shown]

ser = Series(datapoints, index=index)


def _absolute_count(pct):
    """Turn a slice percentage back into its absolute count for the label."""
    if pct > 0:
        return '{:.0f}'.format(pct * ser.sum() / 100)
    return ''


ser.plot(
    kind='pie',
    title='Cleanup results',
    figsize=(6, 6),
    fontsize='small',
    autopct=_absolute_count,
    labels=None,
    legend=True,
)

In [None]:
# Output file: export the cleaned entries into the temporary working folder.
# NOTE(review): the ASREVIEW section loads f'{temp_folder}/out.ris', so
# export_data presumably writes a file named out.ris there - confirm in
# datacleaner.
cleaner.export_data(path=temp_folder)

## ASReview

In [None]:
import asreview_interfacer as asr

# Create the review project; an empty answer asks for a generated name
# (as stated in the prompt text).
proj = asr.createProject(input("What should we name the project?\nLeave empty for a generated name. "))

# Load the file exported by the Data Cleaner section.
proj.add_dataset(f'{temp_folder}/out.ris')

interface = asr.launch_interface()

try:
    print("The interface should start in a few seconds...")
    print("When the review is finished, export the data (relevant) as RIS to use the PDF scraper. Put the exported file on it's own folder. The PDFs are downloaded next to the file.")
    # Block here until the user is done reviewing.
    input("Press ENTER/OK kill the web server (You need to close the browser tab manually). ")
finally:
    # Kill the interface even if the wait above is interrupted (e.g. the
    # kernel is interrupted), so the web server is not left running.
    interface.kill()

## PDF Scraper

In [None]:
from pdfscraper import PDFScraper

# Drop any double quotes from the typed path before using it.
typed_path = input("Path for the .ris file to import: ")
risfile = typed_path.replace('"', '')

# download_from_ris returns the failed DOIs and the total count.
failed_downloads, total = PDFScraper.download_from_ris(risfile)

# Report header: blank line, title, underline.
print()
print(" Failed downloads ", "------------------", sep="\n")

# One resolver URL per failed DOI.
for failed_doi in failed_downloads:
    print("https://doi.org/" + failed_doi)

failed_count = len(failed_downloads)
print("Total Failed", failed_count, "/", total)
print(
    "Try checking the publisher using the addresses above or searching for the DOI in "
    "https://www.researchgate.net/"
)