### Full scraping workflow using Requests, BeautifulSoup combined with Regex

First we call the libraries needed.

In [1]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
import itertools

We load the input .csv file containing the MR numbers and convert it to a Python `list`.

In [2]:
# Read the input file and pull its '0' column out as a plain Python list;
# the later cells iterate over `mrn` to build one URL per entry.
input_test = pd.read_csv('test_input.csv')
mrn = input_test['0'].tolist()
print(mrn)

['MR3', 'MR4044696', 'MR2900886', 'MR3169623', 'MR4180136', 'MR11', 'MR1111111', 'MR5', 'MR7', 'MR9', 'MRMR4044697']


We define two functions that are used together to find all GAP citations by HTML element and by the text contained inside it. They can be re-used in future web-scraping projects too.

In [12]:
MATCH_ALL = r'.*'


def like(string):
    """
    Build a compiled pattern that accepts ``string`` surrounded by
    arbitrary text on either side.

    The argument is converted with ``str()`` when it is not already a
    string, and it is escaped so that it is matched literally. For
    example, ``like("hello")`` compiles r".*hello.*". The pattern is
    compiled with ``re.DOTALL`` so the surrounding wildcards also span
    newlines.
    """
    needle = string if isinstance(string, str) else str(string)
    pattern = "".join((MATCH_ALL, re.escape(needle), MATCH_ALL))
    return re.compile(pattern, flags=re.DOTALL)


def find_by_text(soup, text, tag, mrn, **kwargs):
    """
    Collect every ``tag`` element in ``soup`` that matches ``kwargs`` and
    contains ``text`` in one of its strings.

    Parameters
    ----------
    soup : object exposing ``find_all`` (e.g. a parsed BeautifulSoup page)
    text : value to look for; it is turned into a pattern by ``like``
    tag : tag name forwarded to ``soup.find_all``
    mrn : str, label written (followed by ':') in front of every hit
    **kwargs : extra filters forwarded unchanged to ``soup.find_all``

    Returns
    -------
    list or None
        A flat list alternating ``mrn + ':'`` and the stripped text of a
        matching element, in document order. ``None`` is returned when
        nothing matched; no exception is raised in that case, so callers
        are expected to filter out the ``None`` entries themselves.
    """
    # Compile the pattern once instead of once per candidate element.
    pattern = like(text)
    matches = []
    for element in soup.find_all(tag, **kwargs):
        # `string=` is the current name of the keyword formerly called `text=`.
        if element.find(string=pattern):
            matches.extend((mrn + ':', element.text.strip()))
    # Keep the historical contract: an empty result is reported as None.
    return matches or None

In [13]:
base_URL = "https://sis1.host.cs.st-andrews.ac.uk/GAP/"
# Seconds to wait for the server; without a timeout a stalled connection
# would block the notebook indefinitely.
REQUEST_TIMEOUT = 30
url_list = []  # kept for record keeping only, not needed by later cells
all_matches = []

for mr_number in mrn:
    url = base_URL + mr_number + '.html'
    url_list.append(url)
    page = requests.get(url, timeout=REQUEST_TIMEOUT)
    if page.ok:
        soup = BeautifulSoup(page.content, 'html.parser')
        match = find_by_text(soup, 'GAP', 'li', mr_number)
    else:
        # An HTTP error page is not a publication page: record "no match"
        # instead of scraping the error body. One entry per MR number is
        # still appended so positions stay aligned with `mrn`.
        match = None
    all_matches.append(match)

all_matches

[None,
 ['MR4044696:',
  'The GAP Group, GAP – groups, algorithms and programming, version 4.10, Available from http://www.gap-system.org, 2018.'],
 ['MR2900886:',
  'The GAP Group, $GAP$ groups, algorithms, and programming, version 4.4.12 (2008), http://www.gap-system.org.'],
 ['MR3169623:',
  'Distler, A., Mitchell, J. D. (2011). Smallsemi - A Library of Small Semigroups. http://tinyurl.com/jdmitchell/smallsemi/, Oct A GAP 4 package [5], Version 0.6.4.',
  'MR3169623:',
  'The GAP Group, (2008). (http://www.gap-system.org). GAP–Groups, Algorithms, and Programming, Version 4.4.12.'],
 ['MR4180136:',
  'The GAP Group, 2019. GAP – Groups, Algorithms, and Programming, Version 4.10.1; https://www.gap-system.org.'],
 ['MR11:', 'GAP group'],
 ['MR1111111:',
  'Distler, A., Mitchell, J. D. (2011). Smallsemi - A Library of Small Semigroups. http://tinyurl.com/jdmitchell/smallsemi/, Oct A GAP 4 package [5], Version 0.6.4.',
  'MR1111111:',
  'The GAP Group, (2008). (http://www.gap-system.org).

In [14]:
# Inspect what kind of object the first scraped page produced.
first_entry = all_matches[0]
print(type(first_entry))

<class 'NoneType'>


Some of the test HTML pages did not contain the word GAP, so `None` was returned for them. Using the following list comprehension, we remove these `None` entries from the results before we continue.

In [15]:
# Keep only the pages for which a list of matches was returned.
all_matches = [found for found in all_matches if found is not None]
all_matches

[['MR4044696:',
  'The GAP Group, GAP – groups, algorithms and programming, version 4.10, Available from http://www.gap-system.org, 2018.'],
 ['MR2900886:',
  'The GAP Group, $GAP$ groups, algorithms, and programming, version 4.4.12 (2008), http://www.gap-system.org.'],
 ['MR3169623:',
  'Distler, A., Mitchell, J. D. (2011). Smallsemi - A Library of Small Semigroups. http://tinyurl.com/jdmitchell/smallsemi/, Oct A GAP 4 package [5], Version 0.6.4.',
  'MR3169623:',
  'The GAP Group, (2008). (http://www.gap-system.org). GAP–Groups, Algorithms, and Programming, Version 4.4.12.'],
 ['MR4180136:',
  'The GAP Group, 2019. GAP – Groups, Algorithms, and Programming, Version 4.10.1; https://www.gap-system.org.'],
 ['MR11:', 'GAP group'],
 ['MR1111111:',
  'Distler, A., Mitchell, J. D. (2011). Smallsemi - A Library of Small Semigroups. http://tinyurl.com/jdmitchell/smallsemi/, Oct A GAP 4 package [5], Version 0.6.4.',
  'MR1111111:',
  'The GAP Group, (2008). (http://www.gap-system.org). GAP–Gr

In [16]:
# Sanity checks on the filtered results.
print(type(match))  # `match` still holds the value from the last loop pass
print(type(all_matches))
result_count = len(all_matches)
print('Results count is:', result_count)
print(all_matches[2])

<class 'NoneType'>
<class 'list'>
Results count is: 9
['MR3169623:', 'Distler, A., Mitchell, J. D. (2011). Smallsemi - A Library of Small Semigroups. http://tinyurl.com/jdmitchell/smallsemi/, Oct A GAP 4 package [5], Version 0.6.4.', 'MR3169623:', 'The GAP Group, (2008). (http://www.gap-system.org). GAP–Groups, Algorithms, and Programming, Version 4.4.12.']


In [19]:
# Flatten the per-page lists into one list of alternating labels and citations.
joined = list(itertools.chain.from_iterable(all_matches))
joined

['MR4044696:',
 'The GAP Group, GAP – groups, algorithms and programming, version 4.10, Available from http://www.gap-system.org, 2018.',
 'MR2900886:',
 'The GAP Group, $GAP$ groups, algorithms, and programming, version 4.4.12 (2008), http://www.gap-system.org.',
 'MR3169623:',
 'Distler, A., Mitchell, J. D. (2011). Smallsemi - A Library of Small Semigroups. http://tinyurl.com/jdmitchell/smallsemi/, Oct A GAP 4 package [5], Version 0.6.4.',
 'MR3169623:',
 'The GAP Group, (2008). (http://www.gap-system.org). GAP–Groups, Algorithms, and Programming, Version 4.4.12.',
 'MR4180136:',
 'The GAP Group, 2019. GAP – Groups, Algorithms, and Programming, Version 4.10.1; https://www.gap-system.org.',
 'MR11:',
 'GAP group',
 'MR1111111:',
 'Distler, A., Mitchell, J. D. (2011). Smallsemi - A Library of Small Semigroups. http://tinyurl.com/jdmitchell/smallsemi/, Oct A GAP 4 package [5], Version 0.6.4.',
 'MR1111111:',
 'The GAP Group, (2008). (http://www.gap-system.org). GAP–Groups, Algorithms, a

In [20]:
# Look at one flattened entry, the container and element types, and the
# total number of entries in the flattened list.
sample_entry = joined[3]
print(sample_entry)
print(type(joined))
print(type(joined[1]))
total_entries = len(joined)
print('Now the Results count is:', total_entries, ' which confirms that our program also catches GAP Packages citation as separate results.')

The GAP Group, $GAP$ groups, algorithms, and programming, version 4.4.12 (2008), http://www.gap-system.org.
<class 'list'>
<class 'str'>
Now the Results count is: 32  which confirms that our program also catches GAP Packages citation as separate results.


In [24]:
# Strip surrounding whitespace from every flattened entry.
final = [entry.strip() for entry in joined]
final

['MR4044696:',
 'The GAP Group, GAP – groups, algorithms and programming, version 4.10, Available from http://www.gap-system.org, 2018.',
 'MR2900886:',
 'The GAP Group, $GAP$ groups, algorithms, and programming, version 4.4.12 (2008), http://www.gap-system.org.',
 'MR3169623:',
 'Distler, A., Mitchell, J. D. (2011). Smallsemi - A Library of Small Semigroups. http://tinyurl.com/jdmitchell/smallsemi/, Oct A GAP 4 package [5], Version 0.6.4.',
 'MR3169623:',
 'The GAP Group, (2008). (http://www.gap-system.org). GAP–Groups, Algorithms, and Programming, Version 4.4.12.',
 'MR4180136:',
 'The GAP Group, 2019. GAP – Groups, Algorithms, and Programming, Version 4.10.1; https://www.gap-system.org.',
 'MR11:',
 'GAP group',
 'MR1111111:',
 'Distler, A., Mitchell, J. D. (2011). Smallsemi - A Library of Small Semigroups. http://tinyurl.com/jdmitchell/smallsemi/, Oct A GAP 4 package [5], Version 0.6.4.',
 'MR1111111:',
 'The GAP Group, (2008). (http://www.gap-system.org). GAP–Groups, Algorithms, a

### Converting our data to Pandas dataframe for further analysis

In [80]:
# Put the cleaned entries into a single-column frame (the column is labelled 0).
df = pd.DataFrame(data=final)
df

Unnamed: 0,0
0,MR4044696:
1,"The GAP Group, GAP – groups, algorithms and pr..."
2,MR2900886:
3,"The GAP Group, $GAP$ groups, algorithms, and p..."
4,MR3169623:
5,"Distler, A., Mitchell, J. D. (2011). Smallsemi..."
6,MR3169623:
7,"The GAP Group, (2008). (http://www.gap-system...."
8,MR4180136:
9,"The GAP Group, 2019. GAP – Groups, Algorithms,..."


In [95]:
# Labels and citations alternate in consecutive rows, so even index
# positions hold the MR labels and odd positions hold the citation text.
check = df.index % 2 == 0
mr_labels = df.loc[check, 0].str.strip(':').tolist()
citation_texts = df.loc[~check, 0].tolist()
# Build a two-row frame and transpose it to get one row per citation.
separated = pd.DataFrame(
    [mr_labels, citation_texts],
    index=['MR', 'Citation'],
).T

In [96]:
# Display the frame with the 'MR' and 'Citation' columns built in the previous cell.
separated

Unnamed: 0,MR,Citation
0,MR4044696,"The GAP Group, GAP – groups, algorithms and pr..."
1,MR2900886,"The GAP Group, $GAP$ groups, algorithms, and p..."
2,MR3169623,"Distler, A., Mitchell, J. D. (2011). Smallsemi..."
3,MR3169623,"The GAP Group, (2008). (http://www.gap-system...."
4,MR4180136,"The GAP Group, 2019. GAP – Groups, Algorithms,..."
5,MR11,GAP group
6,MR1111111,"Distler, A., Mitchell, J. D. (2011). Smallsemi..."
7,MR1111111,"The GAP Group, (2008). (http://www.gap-system...."
8,MR5,"V. A. Artamonov and A. A. Bovdi, Integral gro ..."
9,MR5,"V. Bovdi, A. Grishkov and A. Konovalov, Kimmer..."


In [97]:
# Write the MR / Citation table to disk, without the row index.
separated.to_csv(
    'output.csv',
    encoding='utf-8',
    index=False,
)