<img src="http://imgur.com/1ZcRyrc.png" style="float: left; margin: 20px; height: 55px">

# Data scraping

# Imports

In [1]:
import os
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import requests
from bs4 import BeautifulSoup
import os

import time
import random
from tqdm import tqdm 

# Data download
- [Setting up Beautiful soup for ACS: Analytical Chemistry Journal (low_HI)](Setting-up-Beautiful-soup-for-ACS:-Analytical-Chemistry-Journal-(low_HI)) - set up catalogue for downloading later.
- [Setting up Beautiful soup for: Analytical and Bioanalytical Chemistry (high_HI)](Setting-up-Beautiful-soup-for-ACS:-Analytical-and-Bioanalytical-Chemistry-(high_HI)) - set up catalogue for downloading later.
- [Set scraping function](Set-scraping-function) - to automate the webscraping process.
- [Scrape & save](Scrape-&-save) - Export data for use in next section.

## Setting up Beautiful soup for ACS: Analytical Chemistry Journal (low_HI)
- Set up Beautiful soup
- Pull and filter the initial catalogue of issues for low_HI journal.
- Data starts from 2000 since the low_HI only has publications available starting from that year.
- Export list for storage and use by `datagrab` function later

In [2]:
#set url, check reponse
low_url = "https://link.springer.com/journal/10809/volumes-and-issues"
low_res = requests.get(low_url)
low_res.status_code == 200

True

In [3]:
#create soup object with html parser
low_soup = BeautifulSoup(low_res.content, 'html.parser')

#lookthrough
low_soup.extract()

<!DOCTYPE html>

<html class="no-js" lang="en">
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="pc,mobile" name="applicable-device"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<title>Journal of Analytical Chemistry | Volumes and issues</title>
<meta content="telephone=no" name="format-detection"/>
<link href="/oscar-static/img/favicons/springer/apple-touch-icon-1ee298e6a2.png" rel="apple-touch-icon" sizes="180x180"/>
<link href="/oscar-static/img/favicons/springer/favicon-32x32-da5cbe81a8.png" rel="icon" sizes="32x32" type="image/png"/>
<link href="/oscar-static/img/favicons/springer/favicon-16x16-b154d15d9a.png" rel="icon" sizes="16x16" type="image/png"/>
<link data-test="shortcut-icon" href="/oscar-static/img/favicons/springer/favicon-d1430c5053.ico" rel="shortcut icon"/>
<link href="/oscar-static/app-springercom/manifest-31044b92ab.json" rel="manifest"/>
<meta content="/oscar-static/app-springercom/brow

In [4]:
#test search
low_soup.select('ul.c-list-group a')[310].text.strip()

'February 2000, issue 2'

In [5]:
#find catalogue via <a> elements within the <ul> element with class="c-list-group"
low_a_elements = low_soup.select('ul.c-list-group a')

low_catalogue = []
# Loop through the <a> elements and extract the href and text data
for a in low_a_elements:
    if a.text.strip() == 'January 2000, issue 1':
        break
    else:
        catalogue_issue = {'text' : a.text.strip(),    #strip to remove whitespaces
                           'href': a['href'],
                           'url' : 'https://link.springer.com' + a['href']
                  }
        low_catalogue.append(catalogue_issue)

#data export
if not os.path.exists('data'):
    os.makedirs('data')
pd.DataFrame(low_catalogue).to_csv('data/low_catalogue.csv', index = False)

len(low_catalogue)

311

## Setting up Beautiful soup for: Analytical and Bioanalytical Chemistry (high_HI)
- Set up Beautiful soup
- Pull and filter the initial catalogue of issues for high_HI journal.
- Data to be filtered starting from 2000 since the low_HI only has publications available starting from that year.
- Export list for storage and use by `datagrab` function later

In [6]:
#set url, check reponse for valid site
high_url = "https://link.springer.com/journal/216/volumes-and-issues"
res = requests.get(high_url)
res.status_code == 200

True

In [7]:
#create soup object with html parser
high_soup = BeautifulSoup(res.content, 'html.parser')

#lookthrough
high_soup.extract()

<!DOCTYPE html>

<html class="no-js" lang="en">
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="pc,mobile" name="applicable-device"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<title>Analytical and Bioanalytical Chemistry | Volumes and issues</title>
<meta content="telephone=no" name="format-detection"/>
<link href="/oscar-static/img/favicons/springer/apple-touch-icon-1ee298e6a2.png" rel="apple-touch-icon" sizes="180x180"/>
<link href="/oscar-static/img/favicons/springer/favicon-32x32-da5cbe81a8.png" rel="icon" sizes="32x32" type="image/png"/>
<link href="/oscar-static/img/favicons/springer/favicon-16x16-b154d15d9a.png" rel="icon" sizes="16x16" type="image/png"/>
<link data-test="shortcut-icon" href="/oscar-static/img/favicons/springer/favicon-d1430c5053.ico" rel="shortcut icon"/>
<link href="/oscar-static/app-springercom/manifest-31044b92ab.json" rel="manifest"/>
<meta content="/oscar-static/app-springerc

In [8]:
#test search
high_soup.select('ul.c-list-group a')[615].text.strip()

'January 2000, issue 2'

In [9]:
#find catalogue via <a> elements within the <ul> element with class="c-list-group"
high_a_elements = high_soup.select('ul.c-list-group a')

high_catalogue = []
# Loop through the <a> elements and extract the href and text data
for a in high_a_elements:
    if a.text.strip() == 'January 2000, issue 1':      #set date
        break
    else:
        catalogue_issue = {'text' : a.text.strip(),    #strip to remove whitespaces
                           'href': a['href'],          #retain just incase.
                           'url' : 'https://link.springer.com' + a['href'] #form url
                  }
        high_catalogue.append(catalogue_issue)

#data export
if not os.path.exists('data'):
    os.makedirs('data')
pd.DataFrame(high_catalogue).to_csv('data/high_catalogue.csv', index = False)

len(high_catalogue)

616

## Set scraping function
- Test and refine manual call of data
- combinne into a function

In [10]:
#import back in catalogue
low_catalogue_df = pd.read_csv('data/low_catalogue.csv')
low_catalogue = low_catalogue_df.to_dict('records')

In [11]:
low_catalogue

[{'text': 'August 2023, issue 8',
  'href': '/journal/10809/volumes-and-issues/78-8',
  'url': 'https://link.springer.com/journal/10809/volumes-and-issues/78-8'},
 {'text': 'July 2023, issue 7',
  'href': '/journal/10809/volumes-and-issues/78-7',
  'url': 'https://link.springer.com/journal/10809/volumes-and-issues/78-7'},
 {'text': 'June 2023, issue 6',
  'href': '/journal/10809/volumes-and-issues/78-6',
  'url': 'https://link.springer.com/journal/10809/volumes-and-issues/78-6'},
 {'text': 'May 2023, issue 5',
  'href': '/journal/10809/volumes-and-issues/78-5',
  'url': 'https://link.springer.com/journal/10809/volumes-and-issues/78-5'},
 {'text': 'April 2023, issue 4',
  'href': '/journal/10809/volumes-and-issues/78-4',
  'url': 'https://link.springer.com/journal/10809/volumes-and-issues/78-4'},
 {'text': 'March 2023, issue 3',
  'href': '/journal/10809/volumes-and-issues/78-3',
  'url': 'https://link.springer.com/journal/10809/volumes-and-issues/78-3'},
 {'text': 'February 2023, issue

In [12]:
high_catalogue

[{'text': 'September 2023, issue 21',
  'href': '/journal/216/volumes-and-issues/415-21',
  'url': 'https://link.springer.com/journal/216/volumes-and-issues/415-21'},
 {'text': 'August 2023, issue 20',
  'href': '/journal/216/volumes-and-issues/415-20',
  'url': 'https://link.springer.com/journal/216/volumes-and-issues/415-20'},
 {'text': 'August 2023, issue 19',
  'href': '/journal/216/volumes-and-issues/415-19',
  'url': 'https://link.springer.com/journal/216/volumes-and-issues/415-19'},
 {'text': 'July 2023, issue 18',
  'href': '/journal/216/volumes-and-issues/415-18',
  'url': 'https://link.springer.com/journal/216/volumes-and-issues/415-18'},
 {'text': 'July 2023, issue 17',
  'href': '/journal/216/volumes-and-issues/415-17',
  'url': 'https://link.springer.com/journal/216/volumes-and-issues/415-17'},
 {'text': 'July 2023, issue 16',
  'href': '/journal/216/volumes-and-issues/415-16',
  'url': 'https://link.springer.com/journal/216/volumes-and-issues/415-16'},
 {'text': 'June 202

In [13]:
#data export
if not os.path.exists('data'):
    os.makedirs('data')
pd.DataFrame(high_catalogue).to_csv('data/high_catalogue.csv', index = False)

In [14]:
#sample issues
low_catalogue[10]['url']

'https://link.springer.com/journal/10809/volumes-and-issues/77-12'

In [15]:
#manual test
test_url = low_catalogue[0]['url']
test_res = requests.get(test_url)
test_soup = BeautifulSoup(test_res.content, 'html.parser')
test_a_elements = test_soup.select('li.c-list-group__item a')

In [16]:
paper_url = 'https://link.springer.com/article/10.1134/S106193482209009X'
paper_res = requests.get(paper_url)
paper_soup = BeautifulSoup(paper_res.content, 'html.parser')
abstract_section = paper_soup.find('div', {'id': 'Abs1-content'})
content_type_section = paper_soup.find('li', {'data-test': 'article-category'})
published_date_section = paper_soup.find('time', 'datetime')

In [17]:
abstract_text = abstract_section.find('p').text.replace("\xa0", " ").strip()
abstract_text 

'The paper presents one of lines of scientific inquiry of the Department of Analytical Chemistry and Chemical Ecology of Saratov University – establishing a relationship of the physicochemical, analytical, and other properties, reactivity, and biological activity of substances with the energetics, spatial and electronic structure, molecular descriptors of reactants, transition states, intermediates, nanoclusters in the ground and excited states, generalization of views on the mechanisms and regioselectivity of reactions, study of the effects of electrostatic and hydrophobic factors, and medium on various properties and the course of chemical processes; quantum-chemical study of the electronic effects of atomic groups in molecules, hydrogen bonding; revealing the nature of interactions between unbound atoms in small and/or strained molecular systems, in small and medium quasicycles; elucidation (on the basis of NBO analysis of hybridization) of the question of the differential participa

In [18]:
content_type_section = paper_soup.find('li', {'data-test': 'article-category'})
published_date_section = paper_soup.find('a', {'data-track': 'click', 'data-track-action': 'publication date'})
content_type_section = paper_soup.find('li', {'data-test': 'article-category'})
published_date_section = paper_soup.find('a', {'data-track': 'click', 
                                               'data-track-action': 'publication date'})

In [22]:
#set up function to call data from preloaded catalogue list with start stop range to break up scraping
def datagrab(catalogues, start=0, stop=100):
    #set up list for paper_data dump
    journal_data = []

    #loop through catalogues list to get each url
    for n in tqdm(range(start, stop)):
        issue_url = catalogues[n]['url']
        issue_res = requests.get(issue_url)

        #instantiate and set blank list
        soup = BeautifulSoup(issue_res.content, 'html.parser')

        #finding a & meta elements to extract the href (as url), abstract_section, content_type_section, published_date_section
        a_elements = soup.select('li.c-list-group__item a')

        # Loop through the <a> elements and extract the href and text data
        for i, a in enumerate(a_elements):
            paper_url = a['href']
            paper_res = requests.get(paper_url)
            paper_soup = BeautifulSoup(paper_res.content, 'html.parser')
            abstract_section = paper_soup.find('div', {'id': 'Abs1-content'})
            content_type_section = paper_soup.find('li', {'data-test': 'article-category'})
            published_date_section = paper_soup.find('a', {'data-track': 'click', 
                                                           'data-track-action': 'publication date'})
            #some pages does not have abstract. content type etc or built differently, try-except needed to prevent break.
            try:                #strip to remove whitespaces
                abstract_text = abstract_section.find('p').text.replace("\xa0", " ").strip()
                content_type_text = content_type_section.text.strip()
                published_date_text = published_date_section.find('time').text.strip()
                
                #combine all data into a dictionary and throw back to list.
                paper_data = {'title' : a_elements[i].text.strip(),    
                             'issue' : catalogues[n]['text'],
                             'issue_href' : catalogues[n]['href'],
                             'url': a['href'],
                             'content_type' : content_type_text,
                             'publish_date' : published_date_text,
                             'abstract' : abstract_text
                            }
                journal_data.append(paper_data)
#optionl cooldown timer per paper to lower hashrate and prevent kick out.
                cooldown_time = random.uniform(0.1, 2.5)
                time.sleep(cooldown_time)
            except:
              continue
#optional cooldown per issue
        cooldown_time = random.uniform(0.1, 10.5)
        time.sleep(cooldown_time)
    #combine into df to return for export later
    journal_df = pd.DataFrame(journal_data)
    return journal_df

## Scrape & save
- Load back previously generated catalogue to scrape in sections to prevent blocks or other issues.
- Concatenate data
- Export

In [23]:
#import back in catalogue
high_catalogue_df = pd.read_csv('data/high_catalogue.csv')
high_catalogue = high_catalogue_df.to_dict('records')

low_catalogue_df = pd.read_csv('data/low_catalogue.csv')
low_catalogue = low_catalogue_df.to_dict('records')

In [24]:
#download high_HI catalogue
high_papers_df = datagrab(high_catalogue, 0, 2)  #set to 2 for demonstration purpose only
high_papers_df

100%|███████████████████████████████████████████| 2/2 [03:25<00:00, 102.96s/it]


Unnamed: 0,title,issue,issue_href,url,content_type,publish_date,abstract
0,Rules for mass spectrometry applications in th...,"September 2023, issue 21",/journal/216/volumes-and-issues/415-21,https://link.springer.com/article/10.1007/s002...,Feature Article,25 March 2023,MS-based analytical methods now play an import...
1,Strategies for automating analytical and bioan...,"September 2023, issue 21",/journal/216/volumes-and-issues/415-21,https://link.springer.com/article/10.1007/s002...,Feature Article,13 May 2023,Analytical measurement methods are used in dif...
2,The role of comprehensive two-dimensional gas ...,"September 2023, issue 21",/journal/216/volumes-and-issues/415-21,https://link.springer.com/article/10.1007/s002...,Critical Review,06 May 2023,Mineral oil hydrocarbons (MOH) contain a wide ...
3,Confocal micro X-ray fluorescence analysis for...,"September 2023, issue 21",/journal/216/volumes-and-issues/415-21,https://link.springer.com/article/10.1007/s002...,Critical Review,24 July 2023,Confocal micro X-ray fluorescence (CMXRF) spec...
4,Advances in testing for sample manipulation in...,"September 2023, issue 21",/journal/216/volumes-and-issues/415-21,https://link.springer.com/article/10.1007/s002...,Critical Review,05 May 2023,"In many countries, adherence testing is used t..."
5,Advances in testing for sample manipulation in...,"September 2023, issue 21",/journal/216/volumes-and-issues/415-21,https://link.springer.com/article/10.1007/s002...,Critical Review,28 April 2023,"As a continuation of part A, focusing on advan..."
6,Integrated microscale immiscible phase extract...,"September 2023, issue 21",/journal/216/volumes-and-issues/415-21,https://link.springer.com/article/10.1007/s002...,Research Paper,18 May 2023,Gonorrhea is the second most common sexually t...
7,Immunomagnetic separation coupled with flow cy...,"September 2023, issue 21",/journal/216/volumes-and-issues/415-21,https://link.springer.com/article/10.1007/s002...,Research Paper,18 May 2023,Legionella pneumophila are pathogenic bacteria...
8,Tailored extraction and ion mobility-mass spec...,"September 2023, issue 21",/journal/216/volumes-and-issues/415-21,https://link.springer.com/article/10.1007/s002...,Research Paper,22 June 2023,Climate change directs the focus in biotechnol...
9,"Detection, chemical analysis, and pharmacologi...","September 2023, issue 21",/journal/216/volumes-and-issues/415-21,https://link.springer.com/article/10.1007/s002...,Research Paper,13 May 2023,The emergence of structurally diverse new synt...


In [25]:
#download low_HI catalogue
low_papers_df = datagrab(low_catalogue, 0, 2)    #set to 2 for demonstration purpose only/
low_papers_df.head()

100%|████████████████████████████████████████████| 2/2 [02:15<00:00, 67.57s/it]


Unnamed: 0,title,issue,issue_href,url,content_type,publish_date,abstract
0,Proteins: Templates and Matrices in Molecular ...,"August 2023, issue 8",/journal/10809/volumes-and-issues/78-8,https://link.springer.com/article/10.1134/S106...,REVIEWS,02 August 2023,This review considers the issues of molecular ...
1,Determination and Quantification of Cetirizine...,"August 2023, issue 8",/journal/10809/volumes-and-issues/78-8,https://link.springer.com/article/10.1134/S106...,ARTICLES,02 August 2023,"A simple, fast, sensitive, cost-effective, and..."
2,Experience of the Determination of Fluorine in...,"August 2023, issue 8",/journal/10809/volumes-and-issues/78-8,https://link.springer.com/article/10.1134/S106...,ARTICLES,02 August 2023,The main fluorine-concentrating minerals in ro...
3,A Green-Solvent Based Dispersive Liquid–Liquid...,"August 2023, issue 8",/journal/10809/volumes-and-issues/78-8,https://link.springer.com/article/10.1134/S106...,ARTICLES,02 August 2023,This research presents the use of deep eutecti...
4,Adaptive Calibration in Electrothermal Atomic ...,"August 2023, issue 8",/journal/10809/volumes-and-issues/78-8,https://link.springer.com/article/10.1134/S106...,ARTICLES,02 August 2023,A method is developed for determining concentr...
5,Effect of Organic Additives on the Intensity o...,"August 2023, issue 8",/journal/10809/volumes-and-issues/78-8,https://link.springer.com/article/10.1134/S106...,ARTICLES,02 August 2023,The effect of the nature and concentration of ...
6,Characterization and Comparison of Automobile ...,"August 2023, issue 8",/journal/10809/volumes-and-issues/78-8,https://link.springer.com/article/10.1134/S106...,ARTICLES,02 August 2023,Automotive paint flakes are significant for th...
7,Using Highly Sensitive Piezo Sensors in an Ope...,"August 2023, issue 8",/journal/10809/volumes-and-issues/78-8,https://link.springer.com/article/10.1134/S106...,ARTICLES,02 August 2023,A single sensor with a piezoelectric quartz tr...
8,Development of a Bioanalytical Method for the ...,"August 2023, issue 8",/journal/10809/volumes-and-issues/78-8,https://link.springer.com/article/10.1134/S106...,ARTICLES,02 August 2023,Since fungal infections are on the rise and wi...
9,Profiling Detection and Validation of Six Sart...,"August 2023, issue 8",/journal/10809/volumes-and-issues/78-8,https://link.springer.com/article/10.1134/S106...,ARTICLES,02 August 2023,"Six sartan substances including telmisartan, l..."


In [26]:
high_papers_df.head()

Unnamed: 0,title,issue,issue_href,url,content_type,publish_date,abstract
0,Rules for mass spectrometry applications in th...,"September 2023, issue 21",/journal/216/volumes-and-issues/415-21,https://link.springer.com/article/10.1007/s002...,Feature Article,25 March 2023,MS-based analytical methods now play an import...
1,Strategies for automating analytical and bioan...,"September 2023, issue 21",/journal/216/volumes-and-issues/415-21,https://link.springer.com/article/10.1007/s002...,Feature Article,13 May 2023,Analytical measurement methods are used in dif...
2,The role of comprehensive two-dimensional gas ...,"September 2023, issue 21",/journal/216/volumes-and-issues/415-21,https://link.springer.com/article/10.1007/s002...,Critical Review,06 May 2023,Mineral oil hydrocarbons (MOH) contain a wide ...
3,Confocal micro X-ray fluorescence analysis for...,"September 2023, issue 21",/journal/216/volumes-and-issues/415-21,https://link.springer.com/article/10.1007/s002...,Critical Review,24 July 2023,Confocal micro X-ray fluorescence (CMXRF) spec...
4,Advances in testing for sample manipulation in...,"September 2023, issue 21",/journal/216/volumes-and-issues/415-21,https://link.springer.com/article/10.1007/s002...,Critical Review,05 May 2023,"In many countries, adherence testing is used t..."
5,Advances in testing for sample manipulation in...,"September 2023, issue 21",/journal/216/volumes-and-issues/415-21,https://link.springer.com/article/10.1007/s002...,Critical Review,28 April 2023,"As a continuation of part A, focusing on advan..."
6,Integrated microscale immiscible phase extract...,"September 2023, issue 21",/journal/216/volumes-and-issues/415-21,https://link.springer.com/article/10.1007/s002...,Research Paper,18 May 2023,Gonorrhea is the second most common sexually t...
7,Immunomagnetic separation coupled with flow cy...,"September 2023, issue 21",/journal/216/volumes-and-issues/415-21,https://link.springer.com/article/10.1007/s002...,Research Paper,18 May 2023,Legionella pneumophila are pathogenic bacteria...
8,Tailored extraction and ion mobility-mass spec...,"September 2023, issue 21",/journal/216/volumes-and-issues/415-21,https://link.springer.com/article/10.1007/s002...,Research Paper,22 June 2023,Climate change directs the focus in biotechnol...
9,"Detection, chemical analysis, and pharmacologi...","September 2023, issue 21",/journal/216/volumes-and-issues/415-21,https://link.springer.com/article/10.1007/s002...,Research Paper,13 May 2023,The emergence of structurally diverse new synt...


In [27]:
#data export #with _demo
if not os.path.exists('data'):
    os.makedirs('data')
    pd.DataFrame(high_papers_df).to_csv('data/high_journal_demo.csv', index = False)
    pd.DataFrame(low_papers_df).to_csv('data/low_journal_demo.csv', index = False)

else:
    try:
        #load & concat
        high_journal_df = pd.read_csv('data/high_journal_demo.csv')
        high_journal_df = pd.concat(high_journal_df, high_papers_df)
        low_journal_df = pd.read_csv('data/low_journal_demo.csv')
        low_journal_df = pd.concat(low_journal_df, low_papers_df)
        #export
        pd.DataFrame(high_journal_df).to_csv('data/high_journal_demo.csv', index = False)
        pd.DataFrame(low_journal_df).to_csv('data/low_journal_demo.csv', index = False)
    except:
        pd.DataFrame(high_papers_df).to_csv('data/high_journal_demo.csv', index = False)
        pd.DataFrame(low_papers_df).to_csv('data/low_journal_demo.csv', index = False)