In [1]:
from lxml import html
import requests

In [2]:
# Fetch one known results page and parse it into an lxml element tree.
# A timeout keeps the cell from hanging forever on an unresponsive server.
page = requests.get('https://sis1.host.cs.st-andrews.ac.uk/GAP/MR2900886.html', timeout=30)
# Fail here on an HTTP error instead of silently parsing an error page.
page.raise_for_status()
tree = html.fromstring(page.content)

### Manual selection

Using the Chrome browser, I highlight the GAP citation text, right-click then select Inspect, then right-click on the text and select Copy Xpath - that gives me:

```/html/body/li[15]/text()```

In [3]:
# Absolute XPath copied from the browser inspector: selects the text nodes of
# the 15th <li> directly under <body>, so it only fits this page's layout.
gap = tree.xpath('/html/body/li[15]/text()')

In [4]:
# Show the list of text nodes returned by the XPath query.
print(gap)

['\n  The GAP Group, ', ' groups, algorithms, and programming, version 4.4.12 (2008), http://www.gap-system.org. \n\n']


The above works, but I manually pointed it to the location of the citation.
I need to be able to find GAP citations in different result pages hence I will try the following:

### Xpath 'contains'

In [5]:
# Select every <li> whose full text contains "gap" or "GAP" (case-sensitive
# substring match), wherever it sits in the document.
targets = tree.xpath('//li[contains(., "gap") or contains(.,"GAP")]')
for target in targets:
    # .text is only the text that precedes the element's first child tag,
    # so citations with nested markup are printed truncated.
    print(target.text)


  The GAP Group, 


This fails because some `<li>` elements contain nested tags: `target.text` only returns the text up to the first nested tag (the first '<' inside the element), so the rest of the citation is not displayed.

### With multiple URLs

In [58]:
# Every results page lives at <base_URL><MR number>.html
base_URL = "https://sis1.host.cs.st-andrews.ac.uk/GAP/"
mrn = ["MR4044696", "MR2900886", "MR3169623", "MR4180136"]

# Build the full page URL for each MR number, keeping the input order.
url_list = [base_URL + number + '.html' for number in mrn]

# Uncomment to inspect the generated URLs:
#print(url_list)

['https://sis1.host.cs.st-andrews.ac.uk/GAP/MR4044696.html', 'https://sis1.host.cs.st-andrews.ac.uk/GAP/MR2900886.html', 'https://sis1.host.cs.st-andrews.ac.uk/GAP/MR3169623.html', 'https://sis1.host.cs.st-andrews.ac.uk/GAP/MR4180136.html']


In [50]:
# Display the current value of base_URL.
base_URL

'https://sis1.host.cs.st-andrews.ac.uk/GAP/'

In [7]:
# Run the same XPath search against every page in `url_list`.
# (The original looped over `url_lst`, a name that is never defined.)
for url3 in url_list:
    # Keep the URL and the response in separate names so neither is clobbered.
    page3 = requests.get(url3)
    tree3 = html.fromstring(page3.content)
    targets3 = tree3.xpath('//li[contains(., "gap") or contains(.,"GAP")]')
    for target in targets3:
        # .text stops at the first nested tag, so some citations print truncated.
        print(target.text)


  The GAP Group, GAP â groups, algorithms and programming, version 4.10, Available from http://www.gap-system.org, 2018. 



  The GAP Group, 

  Distler, A., Mitchell, J. D. (2011). 

  The GAP Group, (2008). (http://www.gap-system.org). 

  Alonso, J., Brady, T., Cooper, D., Ferlini, V., Lustig, M., Mihalik, M., Shapiro, M., Short, H., 1991. Notes on word-hyperbolic groups. In: Ghys, E., Haefliger, A., Verjovsky, A. (Eds.), Proceedings of the Conference 

  The GAP Group, 2019. GAP â Groups, Algorithms, and Programming, Version 4.10.1; https://www.gap-system.org. 




In [8]:
# Inspect what the last loop iteration left behind: the parsed tree and the
# result of the XPath query.
print(type(tree3))
print(type(targets3))

<class 'lxml.html.HtmlElement'>
<class 'list'>


It yields good results, but the problem with extra tags inside ```<li>``` elements remains. Let's try BeautifulSoup and regex instead.

### Full scraping workflow using Requests, BeautifulSoup combined with Regex

In [1]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
import itertools

In [13]:
# Regex fragment that matches any run of characters (newlines too under DOTALL).
MATCH_ALL = r'.*'


def like(string):
    """
    Build a compiled "contains" pattern for ``string``.

    The argument is converted with ``str()`` when it is not already a str,
    escaped so it is matched literally, and wrapped in ``.*`` on both sides.
    For example ``like("hello")`` compiles ``r".*hello.*"``. The pattern is
    compiled with ``re.DOTALL`` so the wildcards also span line breaks.
    """
    needle = string if isinstance(string, str) else str(string)
    pattern = f"{MATCH_ALL}{re.escape(needle)}{MATCH_ALL}"
    return re.compile(pattern, flags=re.DOTALL)


def find_by_text(soup, text, tag, mrn, **kwargs):
    """
    Collect the text of every ``tag`` element in ``soup`` that contains ``text``.

    Parameters
    ----------
    soup : object searched with ``soup.find_all(tag, **kwargs)``
        (a BeautifulSoup document in this notebook).
    text : substring to look for; matched literally via ``like``.
    tag : tag name handed to ``find_all``, e.g. ``'li'``.
    mrn : str
        Identifier prefixed to every match as ``"<mrn>: "``.
    **kwargs : extra filters forwarded unchanged to ``find_all``.

    Returns
    -------
    list of str or None
        One ``"<mrn>: <stripped element text>"`` string per matching element,
        in document order. ``None`` (not an empty list, and no exception) when
        nothing matches; callers filter these ``None`` entries out.
    """
    # Compile the search pattern once instead of once per element.
    pattern = like(text)
    matches = [
        mrn + ": " + element.text.strip()
        for element in soup.find_all(tag, **kwargs)
        # `string=` is the current name of the argument formerly called `text=`.
        if element.find(string=pattern)
    ]
    # Preserve the original contract: no matches -> None.
    return matches or None

In [35]:
base_URL = "https://sis1.host.cs.st-andrews.ac.uk/GAP/"
mrn = ["MR3.html", "MR4044696", "MR2900886", "MR3169623", "MR4180136"]
url_list = []     # record of the pages requested; not needed for the results
all_matches = []  # one entry per page: whatever find_by_text returned

# Download each page, parse it, and collect the <li> elements mentioning 'GAP'.
for number in mrn:
    url = f"{base_URL}{number}.html"
    url_list.append(url)
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    match = find_by_text(soup, 'GAP', 'li', number)
    all_matches.append(match)

print(all_matches)

[None, ['MR4044696: The GAP Group, GAP – groups, algorithms and programming, version 4.10, Available from http://www.gap-system.org, 2018.'], ['MR2900886: The GAP Group, $GAP$ groups, algorithms, and programming, version 4.4.12 (2008), http://www.gap-system.org.'], ['MR3169623: Distler, A., Mitchell, J. D. (2011). Smallsemi - A Library of Small Semigroups. http://tinyurl.com/jdmitchell/smallsemi/, Oct A GAP 4 package [5], Version 0.6.4.', 'MR3169623: The GAP Group, (2008). (http://www.gap-system.org). GAP–Groups, Algorithms, and Programming, Version 4.4.12.'], ['MR4180136: The GAP Group, 2019. GAP – Groups, Algorithms, and Programming, Version 4.10.1; https://www.gap-system.org.']]


In [36]:
# Check the type of the first page's entry in all_matches.
print(type(all_matches[0]))

<class 'NoneType'>


In [37]:
# Drop the entries of pages that yielded no matches (stored as None).
all_matches = [page_matches for page_matches in all_matches if page_matches is not None]

In [38]:
# Show the per-page match lists that remain after filtering.
print(all_matches)

[['MR4044696: The GAP Group, GAP – groups, algorithms and programming, version 4.10, Available from http://www.gap-system.org, 2018.'], ['MR2900886: The GAP Group, $GAP$ groups, algorithms, and programming, version 4.4.12 (2008), http://www.gap-system.org.'], ['MR3169623: Distler, A., Mitchell, J. D. (2011). Smallsemi - A Library of Small Semigroups. http://tinyurl.com/jdmitchell/smallsemi/, Oct A GAP 4 package [5], Version 0.6.4.', 'MR3169623: The GAP Group, (2008). (http://www.gap-system.org). GAP–Groups, Algorithms, and Programming, Version 4.4.12.'], ['MR4180136: The GAP Group, 2019. GAP – Groups, Algorithms, and Programming, Version 4.10.1; https://www.gap-system.org.']]


In [39]:
# `match` still holds the result for the last page processed by the loop.
print(type(match))
print(type(all_matches))
print(type(match[0]))
# all_matches has one entry per page, not one per citation.
print('Results count is:', len(all_matches))
print(all_matches[2])

<class 'list'>
<class 'list'>
<class 'str'>
Results count is: 4
['MR3169623: Distler, A., Mitchell, J. D. (2011). Smallsemi - A Library of Small Semigroups. http://tinyurl.com/jdmitchell/smallsemi/, Oct A GAP 4 package [5], Version 0.6.4.', 'MR3169623: The GAP Group, (2008). (http://www.gap-system.org). GAP–Groups, Algorithms, and Programming, Version 4.4.12.']


In [40]:
# Flatten the per-page lists into a single list of citation strings.
joined = list(itertools.chain.from_iterable(all_matches))
print(joined)

['MR4044696: The GAP Group, GAP – groups, algorithms and programming, version 4.10, Available from http://www.gap-system.org, 2018.', 'MR2900886: The GAP Group, $GAP$ groups, algorithms, and programming, version 4.4.12 (2008), http://www.gap-system.org.', 'MR3169623: Distler, A., Mitchell, J. D. (2011). Smallsemi - A Library of Small Semigroups. http://tinyurl.com/jdmitchell/smallsemi/, Oct A GAP 4 package [5], Version 0.6.4.', 'MR3169623: The GAP Group, (2008). (http://www.gap-system.org). GAP–Groups, Algorithms, and Programming, Version 4.4.12.', 'MR4180136: The GAP Group, 2019. GAP – Groups, Algorithms, and Programming, Version 4.10.1; https://www.gap-system.org.']


In [41]:
# Spot-check one flattened entry and the container/element types.
print(joined[3])
print(type(joined))
print(type(joined[1]))
# After flattening, the count is per citation rather than per page.
print('Now the Results count is:', len(joined), ' which confirms that our program also catches GAP Packages citation as separate results.')

MR3169623: The GAP Group, (2008). (http://www.gap-system.org). GAP–Groups, Algorithms, and Programming, Version 4.4.12.
<class 'list'>
<class 'str'>
Now the Results count is: 5  which confirms that our program also catches GAP Packages citation as separate results.


In [43]:
# Strip leading/trailing whitespace from every citation string.
final = [citation.strip() for citation in joined]
final

['MR4044696: The GAP Group, GAP – groups, algorithms and programming, version 4.10, Available from http://www.gap-system.org, 2018.',
 'MR2900886: The GAP Group, $GAP$ groups, algorithms, and programming, version 4.4.12 (2008), http://www.gap-system.org.',
 'MR3169623: Distler, A., Mitchell, J. D. (2011). Smallsemi - A Library of Small Semigroups. http://tinyurl.com/jdmitchell/smallsemi/, Oct A GAP 4 package [5], Version 0.6.4.',
 'MR3169623: The GAP Group, (2008). (http://www.gap-system.org). GAP–Groups, Algorithms, and Programming, Version 4.4.12.',
 'MR4180136: The GAP Group, 2019. GAP – Groups, Algorithms, and Programming, Version 4.10.1; https://www.gap-system.org.']

### Converting our data to Pandas dataframe for further analysis

In [44]:
# Build a DataFrame from the list of cleaned citation strings.
df = pd.DataFrame(final)

In [45]:
# Peek at the row at positional index 1 (the second citation).
display(df.iloc[1])

0    MR2900886: The GAP Group, $GAP$ groups, algori...
Name: 1, dtype: object

In [46]:
# Display the full frame of citations.
df

Unnamed: 0,0
0,"MR4044696: The GAP Group, GAP – groups, algori..."
1,"MR2900886: The GAP Group, $GAP$ groups, algori..."
2,"MR3169623: Distler, A., Mitchell, J. D. (2011)..."
3,"MR3169623: The GAP Group, (2008). (http://www...."
4,"MR4180136: The GAP Group, 2019. GAP – Groups, ..."


Now we convert the dataframe to a .csv file which can be loaded by the next notebook for pre-processing and analysis.

In [66]:
# Write the citations to gathered.csv as UTF-8, omitting the row index.
df.to_csv('gathered.csv', encoding='utf-8', index=False)

In [67]:
# Put the input identifiers (mrn) into a DataFrame of their own.
input_file_pd = pd.DataFrame(mrn)

In [68]:
# Write the input identifiers to test_input.csv as UTF-8, omitting the row index.
input_file_pd.to_csv('test_input.csv', encoding='utf-8', index=False)

In [69]:
# Read both CSV files back in to check what was written.
out_test = pd.read_csv('gathered.csv')
input_test = pd.read_csv('test_input.csv')

In [75]:
# Display the citations as re-read from gathered.csv.
out_test

Unnamed: 0,0
0,"MR4044696: The GAP Group, GAP – groups, algori..."
1,"MR2900886: The GAP Group, $GAP$ groups, algori..."
2,"MR3169623: Distler, A., Mitchell, J. D. (2011)..."
3,"MR3169623: The GAP Group, (2008). (http://www...."
4,"MR4180136: The GAP Group, 2019. GAP – Groups, ..."


In [72]:
# Display the identifiers as re-read from test_input.csv.
input_test

Unnamed: 0,0
0,MR3.html
1,MR4044696
2,MR2900886
3,MR3169623
4,MR4180136


In [77]:
# Turn the re-read column back into a plain Python list; note the column is
# addressed by the string label '0' here.
print(input_test['0'].tolist())

['MR3.html', 'MR4044696', 'MR2900886', 'MR3169623', 'MR4180136']
