In [1]:
import ast
from collections import namedtuple
import os
import re
from time import sleep

from bs4 import BeautifulSoup
import pandas as pd
import requests
from selenium import webdriver
from tqdm import tqdm

# Discover-page URL for the search whose results are scraped below.
SEARCH_URL = "https://psyarxiv.com/discover?q=depression"

# One record per search result; the field names become the DataFrame columns.
PsyArXivTuple = namedtuple(
    "PsyArXivTuple",
    ["PaperName", "URL", "LastEdited", "Disciplines"],
)

In [2]:
def get_page_html(url, timeout=30):
    """Fetch a page over HTTP and parse it into a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout ``requests`` may block indefinitely on a host
        that never answers.

    Returns
    -------
    BeautifulSoup
        The response body parsed with the ``lxml`` parser.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If no response arrives within ``timeout`` seconds.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error statuses rather than parsing the error page as if
    # it were the requested content.
    response.raise_for_status()
    return BeautifulSoup(response.text, "lxml")


def download_pdf(pdf_url, file_destination, timeout=60):
    """Download a single file from ``pdf_url`` and save it to ``file_destination``.

    Parameters
    ----------
    pdf_url : str
        Address of the file to download.
    file_destination : str or path-like
        Where to write the downloaded bytes. The parent directory must
        already exist; an existing file is overwritten.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 60).

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status. Nothing is written in
        that case, so an error page is never saved under the target name.
    requests.Timeout
        If no response arrives within ``timeout`` seconds.
    """
    response = requests.get(pdf_url, timeout=timeout)
    # Check the status before opening the file so a failed request does not
    # leave an empty or bogus file behind.
    response.raise_for_status()
    with open(file_destination, "wb") as f:
        f.write(response.content)

## Get initial preprint info

In [3]:
# Set up the Selenium driver: starts a Firefox browser session that the cells
# below reuse for every page load.
# NOTE(review): no driver.quit() appears in the visible cells -- close the
# browser once scraping is finished.
driver = webdriver.Firefox()

In [4]:
# Open the first page of search results in the browser and parse the page
# source as the browser currently holds it.
driver.get(SEARCH_URL)
sleep(5)  # presumably lets the page's scripts finish rendering -- TODO confirm 5 s is enough
page_html = BeautifulSoup(driver.page_source, "lxml")

In [5]:
# Read the total page count from the pagination control: take the last
# <ul class="pagination"> on the page, then its last <li class="ember-view">,
# and parse that item's text as an integer.
# NOTE(review): both [-1] lookups raise IndexError when no matching element is
# present (e.g. the page has not rendered yet, or presumably when the results
# fit on a single page) -- confirm against the live site.
pagination_element = page_html.find_all("ul", attrs={"class":"pagination"})[-1]
last_page_link = pagination_element.find_all("li", attrs={"class":"ember-view"})[-1]
number_of_pages = int(last_page_link.text.strip())
number_of_pages

91

In [8]:
def get_OSF_paper_info(paper_element):
    """Build a PsyArXivTuple from one search-result element.

    The title is taken from the element's <h4>: when the heading contains a
    link, the link text and its href are used; otherwise the text of the
    heading's <span> is used and the URL is left as an empty string.

    The last-edited value is the stripped text of the first <em>, minus its
    first 13 and last 4 characters (presumably a fixed label prefix and a
    timezone suffix -- confirm against the live markup).

    Disciplines are the stripped texts of every <span class="subject-preview">
    inside the element; the list is empty when there are none.
    """
    heading = paper_element.find("h4")
    anchor = heading.a
    if anchor:
        title, url = anchor.text.strip(), anchor["href"]
    else:
        title, url = heading.span.text.strip(), ""

    edited_text = paper_element.find("em").text.strip()
    subject_spans = paper_element.find_all("span", attrs={"class": "subject-preview"})

    return PsyArXivTuple(
        PaperName=title,
        URL=url,
        LastEdited=edited_text[13:-4],
        Disciplines=[span.text.strip() for span in subject_spans],
    )

In [9]:
# Walk through every results page and collect one PsyArXivTuple per preprint.
preprint_tuples = []
for page_number in tqdm(range(1, number_of_pages + 1)):
    driver.get(f"https://psyarxiv.com/discover?page={page_number}&q=depression")
    sleep(5)  ## try to avoid overwhelming the site

    page_html = BeautifulSoup(driver.page_source, "lxml")
    # The results live in the second "col-sm-8" <div>; each of its direct
    # child <div class="ember-view"> elements is handed to get_OSF_paper_info.
    results_column = page_html.find_all("div", attrs={"class": "col-sm-8"})[1]
    preprint_tuples.extend([
        get_OSF_paper_info(entry)
        for entry in results_column.find_all(
            "div", attrs={"class": "ember-view"}, recursive=False
        )
    ])

100%|██████████| 91/91 [08:06<00:00,  5.35s/it]


In [10]:
# One row per scraped preprint; the columns come from the PsyArXivTuple field names.
psyarxiv = pd.DataFrame(preprint_tuples)

In [11]:
# Sanity check: fraction of preprints that were scraped without a link (empty URL).
psyarxiv["URL"].eq("").mean()

0.0

In [55]:
# Persist the scraped metadata so the download stage can start from the file.
# index=False keeps the row index out of the CSV; otherwise read_csv reloads it
# as a stray "Unnamed: 0" column, which the final save (also index=False) would
# then write back into the dataset.
psyarxiv.to_csv("psyarxiv.csv", index=False)

## Getting the preprints themselves

In [4]:
# Reload the saved search results. Disciplines was written to the CSV as the
# text form of a Python list, so it has to be parsed back into a list.
# ast.literal_eval only accepts Python literals, so -- unlike eval -- it cannot
# execute code that ends up in the file.
psyarxiv = pd.read_csv("psyarxiv.csv", converters={"Disciplines": ast.literal_eval})

In [25]:
save_directory = "preprints/"
# download_pdf opens the destination with open(), which does not create
# missing folders, so make sure the target directory exists first.
os.makedirs(save_directory, exist_ok=True)

tag_lists = []
# iterrows() is a generator without a length; passing total lets tqdm draw a
# real progress bar with an ETA instead of a bare iteration counter.
for _, row in tqdm(psyarxiv.iterrows(), total=len(psyarxiv)):
    driver.get(row.URL)
    sleep(5)
    page_html = BeautifulSoup(driver.page_source, "lxml")

    # Take the file extension from the file name shown on the page, but name
    # the saved file after the preprint title.
    if (file_name_element := page_html.find("span", attrs={"id":"selectedFileName"})):
        file_name = file_name_element.text.strip()
        # splitext yields "" when the name has no dot, so extension-less files
        # are saved without a suffix instead of raising.
        extension = os.path.splitext(file_name)[1]
        # Drop characters that are unsafe in file names and cap the length.
        paper_name = row.PaperName.replace("/", "-").replace(":", "")[:250]
        destination = os.path.join(save_directory, paper_name + extension)
        try:
            # assumes row.URL ends with "/" so that appending "download" forms
            # the download link -- TODO confirm
            download_pdf(row.URL + "download", destination)
        except requests.RequestException as error:
            # One failed request should not abort a run that takes hours;
            # report it and carry on with the next preprint.
            tqdm.write(f"Download failed for {row.URL}: {error}")
        tags = page_html.find_all("span", attrs={"class":"badge"})
    else:
        tags = []

    # Exactly one entry per row, so tag_lists stays aligned with psyarxiv.
    tag_lists.append([t.text for t in tags])
        

64it [07:41,  7.22s/it]


In [29]:
# Attach the scraped tags as a new column. The download loop appends one entry
# to tag_lists per row, in row order.
# NOTE(review): this requires the loop to have run over every row of psyarxiv;
# otherwise the lengths will not match.
psyarxiv["Tags"] = tag_lists

In [31]:
# Overwrite the earlier CSV with the enriched table (now including Tags),
# leaving the row index out of the file.
psyarxiv.to_csv("psyarxiv.csv", index=False)