### Full scraping workflow using Requests, BeautifulSoup combined with Regex

First we call the libraries needed.

In [1]:
import sys
import time
import bibtexparser
import itertools
import requests
import re
import pandas as pd
from bs4 import BeautifulSoup

We load the input .csv file containing the MR numbers and convert it to a Python `list`.

In [2]:
# Read the MR numbers from the input file and flatten the frame into one plain list.
input_test = pd.read_csv('test_input.csv')
type_change = input_test.values.tolist()  # list of rows, each row itself a list
# from_iterable avoids unpacking every row into a separate positional argument.
mrn_numbers_only = list(itertools.chain.from_iterable(type_change))
mrn_numbers_only

[4044696, 2900886, 3169623, 4180136, 7777777]

In [3]:
mrn = []  # well-formed identifiers ('MR' + exactly 7 digits) that will be searched for citations
non_standard_mrn = []  # raw input values that are not 7-digit numbers, set aside for manual fixing

for raw_value in mrn_numbers_only:
    # The values are not guaranteed to be strings (the sample input holds ints),
    # so normalise to text before validating the format.
    digits = str(raw_value).strip()
    if re.fullmatch(r'[0-9]{7}', digits):
        mrn.append('MR' + digits)
    else:
        non_standard_mrn.append(raw_value)

print('Total input elements:')
print(len(mrn_numbers_only))
print('Number of non-standart elements isolated for updating:')
print(len(non_standard_mrn))
print('Number of standard MR elements, that will be searched for GAP citations:')
print(len(mrn))

Total input elements:
5
Number of non-standart elements isolated for updating:
0
Number of standard MR elements, that will be searched for GAP citations:
5


We define two functions that are used together to find all GAP citations by HTML element and the text contained inside it. They can be re-used in future web-scraping projects too.

In [4]:
MATCH_ALL = r'.*'


def like(string):
    """
    Build a compiled pattern that accepts any text containing ``string``.

    The argument is taken literally (regex metacharacters are escaped) and
    wrapped in ``MATCH_ALL`` on both sides, so ``like("hello")`` compiles
    ``r".*hello.*"``. Non-string arguments are converted with ``str`` first.
    ``re.DOTALL`` lets the surrounding wildcards span line breaks.
    """
    literal = string if isinstance(string, str) else str(string)
    pattern = f"{MATCH_ALL}{re.escape(literal)}{MATCH_ALL}"
    return re.compile(pattern, re.DOTALL)


def find_by_text(soup, text, tag, mrn, **kwargs):
    """
    Collect the text of every ``tag`` element in ``soup`` that matches the
    provided kwargs and contains ``text``.

    Parameters:
        soup:   parsed document to search; must offer ``find_all``.
        text:   literal text an element has to contain (see ``like``).
        tag:    name of the elements to inspect, e.g. ``'li'``.
        mrn:    identifier of the page; it is stored in front of every hit.
        kwargs: extra filters forwarded unchanged to ``soup.find_all``.

    Returns:
        A flat list that alternates ``mrn + ':'`` and the stripped text of a
        matching element (two entries per hit), or ``None`` when nothing
        matched. No exception is raised for "no match"; callers test the
        result against ``None``.
    """
    matches = []
    for element in soup.find_all(tag, **kwargs):
        # `string=` is the current name of the filter formerly spelled `text=`.
        if element.find(string=like(text)):
            matches.append(mrn + ':')
            matches.append(element.text.strip())
    if not matches:
        return None
    return matches

In [5]:
base_URL = "D:\\"  # folder holding the saved pages, one '<MR id>.html' file per MR number

all_matches = []     # one result list per page that contained at least one hit
review_later = []    # MR ids whose page produced no hit
actual_scrapes = []  # MR ids whose page produced at least one hit

for i, mr_id in enumerate(mrn):
    url = base_URL + mr_id + '.html'
    # The context manager closes each page again; a bare open() would leave
    # one file handle open per scanned page.
    with open(url, encoding="utf8") as page:
        soup = BeautifulSoup(page, 'html.parser')
    match = find_by_text(soup, 'GAP', 'li', mr_id)
    if match is None:
        review_later.append(mr_id)
    else:
        all_matches.append(match)
        actual_scrapes.append(mr_id)
    # The following print statements allow the user to track progress.
    print('Working on page:')
    print(i)
    print('from a total of:')
    print(len(mrn))
    print('Citations found in page:')
    print(match)
    print(' ') # to skip a line for better readability
    # time.sleep(5) # adding 5 seconds rest interval between iterations
    # to avoid overloading the source website and also not to risk activating
    # their security sentinel algorithms
print('Finished GAP citation scan...')

Working on page:
0
from a total of:
5
Citations found in page:
['MR4044696:', 'The GAP Group, GAP – groups, algorithms and programming, version 4.10, Available from http://www.gap-system.org, 2018.']
 
Working on page:
1
from a total of:
5
Citations found in page:
['MR2900886:', 'The GAP Group, $GAP$ groups, algorithms, and programming, version 4.4.12 (2008), http://www.gap-system.org.']
 
Working on page:
2
from a total of:
5
Citations found in page:
['MR3169623:', 'Distler, A., Mitchell, J. D. (2011). Smallsemi - A Library of Small Semigroups. http://tinyurl.com/jdmitchell/smallsemi/, Oct A GAP 4 package [5], Version 0.6.4.', 'MR3169623:', 'The GAP Group, (2008). (http://www.gap-system.org). GAP–Groups, Algorithms, and Programming, Version 4.4.12.']
 
Working on page:
3
from a total of:
5
Citations found in page:
['MR4180136:', 'The GAP Group, 2019. GAP – Groups, Algorithms, and Programming, Version 4.10.1; https://www.gap-system.org.']
 
Working on page:
4
from a total of:
5
Citatio

In [6]:
# Save the MR numbers for which the scan found no GAP citation, so they can be
# checked separately.
further_review = pd.DataFrame(review_later)
further_review.to_csv('review.csv', index=False, encoding='utf-8')

Some of the test HTMLs did not contain the word GAP and they returned NoneType elements. Using the following list comprehension we will remove them from the results before we continue.

In [7]:
# Keep only real result lists. The scan loop only appends non-None results, so
# this acts as a safeguard rather than an active filter.
all_matches = [found for found in all_matches if found is not None]
all_matches

[['MR4044696:',
  'The GAP Group, GAP – groups, algorithms and programming, version 4.10, Available from http://www.gap-system.org, 2018.'],
 ['MR2900886:',
  'The GAP Group, $GAP$ groups, algorithms, and programming, version 4.4.12 (2008), http://www.gap-system.org.'],
 ['MR3169623:',
  'Distler, A., Mitchell, J. D. (2011). Smallsemi - A Library of Small Semigroups. http://tinyurl.com/jdmitchell/smallsemi/, Oct A GAP 4 package [5], Version 0.6.4.',
  'MR3169623:',
  'The GAP Group, (2008). (http://www.gap-system.org). GAP–Groups, Algorithms, and Programming, Version 4.4.12.'],
 ['MR4180136:',
  'The GAP Group, 2019. GAP – Groups, Algorithms, and Programming, Version 4.10.1; https://www.gap-system.org.']]

In [8]:
# Sanity checks on the scan results.
print(type(match))  # `match` still holds the result of the last page scanned
print(type(all_matches))
print('Results count is:', len(all_matches))
print(all_matches[2])  # third collected result list

<class 'NoneType'>
<class 'list'>
Results count is: 4
['MR3169623:', 'Distler, A., Mitchell, J. D. (2011). Smallsemi - A Library of Small Semigroups. http://tinyurl.com/jdmitchell/smallsemi/, Oct A GAP 4 package [5], Version 0.6.4.', 'MR3169623:', 'The GAP Group, (2008). (http://www.gap-system.org). GAP–Groups, Algorithms, and Programming, Version 4.4.12.']


In [9]:
# Flatten the per-page result lists into one list of alternating
# 'MR...:' markers and citation strings.
joined = list(itertools.chain.from_iterable(all_matches))
joined

['MR4044696:',
 'The GAP Group, GAP – groups, algorithms and programming, version 4.10, Available from http://www.gap-system.org, 2018.',
 'MR2900886:',
 'The GAP Group, $GAP$ groups, algorithms, and programming, version 4.4.12 (2008), http://www.gap-system.org.',
 'MR3169623:',
 'Distler, A., Mitchell, J. D. (2011). Smallsemi - A Library of Small Semigroups. http://tinyurl.com/jdmitchell/smallsemi/, Oct A GAP 4 package [5], Version 0.6.4.',
 'MR3169623:',
 'The GAP Group, (2008). (http://www.gap-system.org). GAP–Groups, Algorithms, and Programming, Version 4.4.12.',
 'MR4180136:',
 'The GAP Group, 2019. GAP – Groups, Algorithms, and Programming, Version 4.10.1; https://www.gap-system.org.']

In [10]:
# Spot-check the flattened list: one citation string, the container type and
# the type of a single element.
print(joined[3])
print(type(joined))
print(type(joined[1]))
print('Now the Results count is:', len(joined), ' which confirms that our program also catches GAP Packages citation as separate results.')

The GAP Group, $GAP$ groups, algorithms, and programming, version 4.4.12 (2008), http://www.gap-system.org.
<class 'list'>
<class 'str'>
Now the Results count is: 10  which confirms that our program also catches GAP Packages citation as separate results.


In [11]:
# Number of entries in the flattened list (markers and citations together).
print(f'Total number of results is: {len(joined)}')

Total number of results is: 10


In [12]:
# Strip surrounding whitespace from every entry. A comprehension avoids the
# index-based loop and does not leave helper variables behind in the notebook
# namespace.
final = [entry.strip() for entry in joined]
final

['MR4044696:',
 'The GAP Group, GAP – groups, algorithms and programming, version 4.10, Available from http://www.gap-system.org, 2018.',
 'MR2900886:',
 'The GAP Group, $GAP$ groups, algorithms, and programming, version 4.4.12 (2008), http://www.gap-system.org.',
 'MR3169623:',
 'Distler, A., Mitchell, J. D. (2011). Smallsemi - A Library of Small Semigroups. http://tinyurl.com/jdmitchell/smallsemi/, Oct A GAP 4 package [5], Version 0.6.4.',
 'MR3169623:',
 'The GAP Group, (2008). (http://www.gap-system.org). GAP–Groups, Algorithms, and Programming, Version 4.4.12.',
 'MR4180136:',
 'The GAP Group, 2019. GAP – Groups, Algorithms, and Programming, Version 4.10.1; https://www.gap-system.org.']

### Converting our data to Pandas dataframe for further analysis

In [13]:
# Single-column frame (column label 0) built from the flat list of alternating
# MR markers and citation strings.
df = pd.DataFrame(final)
df

Unnamed: 0,0
0,MR4044696:
1,"The GAP Group, GAP – groups, algorithms and pr..."
2,MR2900886:
3,"The GAP Group, $GAP$ groups, algorithms, and p..."
4,MR3169623:
5,"Distler, A., Mitchell, J. D. (2011). Smallsemi..."
6,MR3169623:
7,"The GAP Group, (2008). (http://www.gap-system...."
8,MR4180136:
9,"The GAP Group, 2019. GAP – Groups, Algorithms,..."


Some MR numbers have more than one GAP citation, so the same MR number can appear several times in the data. The entries alternate: the first, third, fifth, ... element (index 0, 2, 4, ...) is an MR number and the element right after it is the corresponding citation. We put every MR number in its own row of a column called 'MR' and place the matching citation next to it in a second column called 'Citation'.

In [14]:
# `df` alternates MR marker and citation text, so rows at even index positions
# (0, 2, 4, ...) are MR numbers and rows at odd positions are their citations.
# An odd row count would mean a marker without a citation; fail loudly instead
# of silently padding the shorter column.
assert len(df) % 2 == 0, 'expected alternating MR number / citation rows'
check = df.index % 2 == 0  # True for the rows that hold an MR marker
final_df = pd.DataFrame({
    'MR': df.loc[check, 0].str.strip(':').tolist(),  # drop the ':' added during scraping
    'Citation': df.loc[~check, 0].tolist(),
})

In [15]:
# Display the two-column MR / Citation table.
final_df

Unnamed: 0,MR,Citation
0,MR4044696,"The GAP Group, GAP – groups, algorithms and pr..."
1,MR2900886,"The GAP Group, $GAP$ groups, algorithms, and p..."
2,MR3169623,"Distler, A., Mitchell, J. D. (2011). Smallsemi..."
3,MR3169623,"The GAP Group, (2008). (http://www.gap-system...."
4,MR4180136,"The GAP Group, 2019. GAP – Groups, Algorithms,..."


The resulting Pandas DataFrame has two columns. Now we can export it to a .csv file, which will be picked up by the next Jupyter Notebook in our pipeline.

In [16]:
# Write the MR / Citation table to disk without the row index.
final_df.to_csv('local_test_output.csv', index=False, encoding='utf-8')

### Obtaining more data

In [None]:
# bibtexparser is already imported in the first cell of the notebook.
# The context manager closes the .bib file as soon as it has been parsed.
with open('gap-publishednicer.bib.txt', encoding='utf-8') as bibtex_file:
    bib_data = bibtexparser.load(bibtex_file)

In [17]:
# Entries of the parsed bibliography. Needs the previous cell to have been run,
# because that is where `bib_data` is defined.
bib = bib_data.entries

NameError: name 'bib_data' is not defined

In [None]:
# MR-number column of the citation table.
mrs = final_df['MR']

In [None]:
# Inspect the parsed bibliography: container type, type of one entry, and the
# content of a sample entry.
print(type(bib))
print(type(bib[5]))
print(bib[5])

In [None]:
# Rich display of the same sample entry.
bib[5]

In [None]:
# Look at the fields of the sample entry. Jupyter only echoes the last
# expression of a cell, so the first two lines produce no visible output.
bib[5].keys()
bib[5]['year']
bib[5]['fjournal']

In [None]:
# For every MR number in the citation table, look up the bibliography entry
# with the same ID and record its publication year and journal.
# NOTE(review): assumes the entries' 'ID' field uses the same 'MR1234567' form
# as final_df['MR'] - confirm against the .bib file. The previous loop read the
# 'mrreviewer' field into `year`; 'year' and 'fjournal' are used here to match
# the list names - confirm that this is the intended data.
bib_by_id = {entry['ID']: entry for entry in bib}

years = []
journals = []
for mr_number in mrs:
    entry = bib_by_id.get(mr_number)
    # Keep both lists aligned with `mrs`: None marks an MR number without a
    # bibliography entry or an entry without that field.
    years.append(entry.get('year') if entry is not None else None)
    journals.append(entry.get('fjournal') if entry is not None else None)
   

In [None]:
# Show the values collected in `journals`.
journals

# Data pre-processing