## Find abstract and references

In [1]:
from bs4 import BeautifulSoup

In [2]:
import requests

In [3]:
import pandas as pd
import re

In [4]:
from functions import log, isInRef

In [5]:
from urllib.parse import urlparse

In [6]:
# Load the tab-separated table of article links that main() iterates over (df.link)
df = pd.read_csv('links.1.csv', sep='\t')

In [7]:
# Lookup table mapping a URL's domain to the journal name.
# Usage: journal_ref[domain] -> journal name
journals_df = pd.read_csv('domain_to_journal.csv', delimiter='\t')
journal_ref = journals_df.set_index('domain')['journal'].to_dict()

In [8]:
# Browser-like User-Agent sent with every page request
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
headers = {"User-Agent": USER_AGENT}

In [9]:
# Module-level accumulators filled by main(), one entry per processed link:
#   abstracts      -> abstract text
#   all_references -> list of reference links (so this is a nested list)
#   journal_names  -> journal name looked up from the link's domain
abstracts, all_references, journal_names = [], [], []

In [10]:
# return a reference's link only if it exists and is not the '#' placeholder
def check_ref(ref):
    """Return the href of the link nested at ``ref.ul.li.a``, or None if unusable.

    Parameters
    ----------
    ref : object
        A reference entry exposing the ``ul`` -> ``li`` -> ``a`` attribute
        chain (main() passes the ``<li>`` elements found by BeautifulSoup).

    Returns
    -------
    The value of the anchor's ``href`` attribute, or None when any element of
    the chain is missing, or when the href is the placeholder ``'#'``.
    """
    try:
        href = ref.ul.li.a.get('href')
    except AttributeError:
        # Some element of the ul/li/a chain is absent, so there is no link.
        return None
    # '#' marks a fake link; report it the same way as a missing one.
    return None if href == '#' else href

In [11]:
# Matches element ids that start with "ref" followed by at least one digit.
# Raw string avoids the invalid "\d" escape; compiled once instead of per page.
_REF_ID_PATTERN = re.compile(r'^ref\d+')


def _extract_abstract(soup):
    """Return the text of the first <p> inside div.abstract-content, or None if absent."""
    container = soup.find('div', 'abstract-content')
    paragraph = container.p if container is not None else None
    return paragraph.get_text() if paragraph is not None else None


def _extract_reference_links(soup):
    """Return the usable links of the numbered entries in ol.references ([] if no list)."""
    ref_list = soup.find('ol', 'references')
    if ref_list is None:
        return []
    entries = ref_list.find_all('li', id=_REF_ID_PATTERN)
    checked = (check_ref(entry) for entry in entries)
    return [link for link in checked if link is not None]


def main(timeout=30):
    """Scrape abstract, reference links and journal name for every link in ``df``.

    Results are written to the module-level lists ``abstracts``,
    ``all_references`` and ``journal_names``. Exactly one entry is appended to
    each list per link, so all three stay aligned with the rows of ``df``:

    * a page without an abstract paragraph yields ``None`` in ``abstracts``;
    * a page without a reference list yields ``[]`` in ``all_references``;
    * a domain for which ``isInRef`` is false yields ``None`` in ``journal_names``.

    Finally the total number of reference links is logged and printed.

    Parameters
    ----------
    timeout : float, optional
        Seconds to wait for each HTTP request before requests raises
        (default 30). Without a timeout a stalled server would block forever.

    Raises
    ------
    requests.exceptions.RequestException
        If a page cannot be fetched (including on timeout).
    """
    # Start from empty accumulators so re-running this cell does not append
    # a second copy of every result.
    abstracts.clear()
    all_references.clear()
    journal_names.clear()

    for link in df.link:
        domain = urlparse(link).netloc
        # NOTE(review): isInRef presumably reports whether the domain is a key
        # of journal_ref; confirm against functions.py.
        journal = journal_ref[domain] if isInRef(domain) else None

        page = requests.get(link, headers=headers, timeout=timeout).content
        soup = BeautifulSoup(page, 'html.parser')

        # Append to all three lists together, only after the page was fetched,
        # so a failure never leaves the lists with different lengths.
        abstracts.append(_extract_abstract(soup))
        all_references.append(_extract_reference_links(soup))
        journal_names.append(journal)

    # count references
    count = sum(len(refs) for refs in all_references)

    log(f'Ran get_contents.2.ipynb succesfully, finding {count} references')
    print(count)
        


In [12]:
# Scrape every link in df: fills abstracts, all_references and journal_names,
# then logs and prints the total number of reference links found.
main()

305


In [13]:
# Attach the scraped results to df as new columns (same names and order as before)
for column_name, column_values in (
    ("abstract", abstracts),
    ("references", all_references),
    ("journal", journal_names),
):
    df[column_name] = column_values

In [14]:
# Display the frame with the newly added abstract, references and journal columns
df

Unnamed: 0,link,title,abstract,references,journal
0,https://journals.plos.org/plosone/article?id=1...,Multi-methodological approach for the Quality ...,We set forth to assess the quality of an herba...,"[https://doi.org/10.1016/j.jpba.2011.05.004, h...",PLOS
1,https://journals.plos.org/plosone/article?id=1...,"Differential COVID-19 testing, admissions, and...",Understanding of COVID-19 acquisition and seve...,"[https://doi.org/10.1111/joim.13117, https://d...",PLOS
2,https://journals.plos.org/plosone/article?id=1...,DAO-CP: Data-Adaptive Online CP decomposition ...,How can we accurately and efficiently decompos...,"[https://doi.org/10.1109/TSP.2017.2690524, htt...",PLOS
3,https://journals.plos.org/plosone/article?id=1...,Retinal biomarkers of Cerebral Small Vessel Di...,"Cerebral Small Vessel Disease (CSVD), a progre...","[https://doi.org/10.1212/WNL.0000000000007654,...",PLOS
4,https://journals.plos.org/plosone/article?id=1...,Blockchain-based healthcare management system ...,The lack of data outsourcing in healthcare man...,"[https://doi.org/10.3390/sym10100470, https://...",PLOS
5,https://journals.plos.org/plosone/article?id=1...,Analysis of vibratory mode changes in symmetri...,Investigations of neuromuscular control of voi...,"[https://doi.org/10.2174/157489311796904637, h...",PLOS
6,https://journals.plos.org/plosone/article?id=1...,A study on pick cutting properties with full-s...,To investigate the cutting forces on road-head...,"[https://doi.org/10.1007/s00603-015-0834-7, ht...",PLOS
7,https://journals.plos.org/plosone/article?id=1...,Is it really organic? Credibility factors of o...,Consumer trust and organic food product credib...,[https://doi.org/10.1080/00207144.2020.1756695...,PLOS
8,https://journals.plos.org/plosone/article?id=1...,Contractility analysis of human engineered 3D ...,The use of Engineered Heart Tissues (EHT) as i...,"[https://doi.org/10.1161/CIR.0000000000000570,...",PLOS
9,https://journals.plos.org/plosone/article?id=1...,Experimental study on engineering properties o...,Carbide slag has been used to prepare solidifi...,[https://doi.org/10.1016/j.conbuildmat.2019.05...,PLOS


In [15]:
# Persist the augmented frame as a tab-separated file, without the index column
df.to_csv(path_or_buf='results.2.csv', sep='\t', index=False)

In [16]:
# Number of per-article reference lists collected (main appends one per link)
len(all_references)

13