In [1]:
from lxml import html
import requests

In [2]:
page = requests.get('https://sis1.host.cs.st-andrews.ac.uk/GAP/MR2900886.html')
tree = html.fromstring(page.content)

### Manual selsection

Using the Chrome browser, I highlight the GAP citation text, right-click then select Inspect, then right-click on the text and select Copy Xpath - that gives me:

```/html/body/li[15]/text()```

In [3]:
gap = tree.xpath('/html/body/li[15]/text()')

In [4]:
print(gap)

['\n  The GAP Group, ', ' groups, algorithms, and programming, version 4.4.12 (2008), http://www.gap-system.org. \n\n']


The above works, but I manually pointed it to the location of the citation.
I need to be able to find GAP citations in different result pages hence I will try the following:

### Xpath 'contains'

In [5]:
targets = tree.xpath('//li[contains(., "gap") or contains(.,"GAP")]')
for target in targets:
    print(target.text)


  The GAP Group, 


Fails due to extra tags inside some < li > elements - so it only displays the first part of the < li > element until it reachers the '<' of the extra tag inside, then it stops.

### With multiple URLs

To update with '+'

In [6]:
url_lst = ['https://sis1.host.cs.st-andrews.ac.uk/GAP/MR4044696.html',
           'https://sis1.host.cs.st-andrews.ac.uk/GAP/MR2900886.html',
           'https://sis1.host.cs.st-andrews.ac.uk/GAP/MR3169623.html',
           'https://sis1.host.cs.st-andrews.ac.uk/GAP/MR4180136.html']


In [7]:
for page3 in url_lst:
    page3 = requests.get(page3)
    tree3 = html.fromstring(page3.content)
    targets3 = tree3.xpath('//li[contains(., "gap") or contains(.,"GAP")]')
    for target in targets3:
        print(target.text)


  The GAP Group, GAP â groups, algorithms and programming, version 4.10, Available from http://www.gap-system.org, 2018. 



  The GAP Group, 

  Distler, A., Mitchell, J. D. (2011). 

  The GAP Group, (2008). (http://www.gap-system.org). 

  Alonso, J., Brady, T., Cooper, D., Ferlini, V., Lustig, M., Mihalik, M., Shapiro, M., Short, H., 1991. Notes on word-hyperbolic groups. In: Ghys, E., Haefliger, A., Verjovsky, A. (Eds.), Proceedings of the Conference 

  The GAP Group, 2019. GAP â Groups, Algorithms, and Programming, Version 4.10.1; https://www.gap-system.org. 




In [8]:
print(type(tree3))
print(type(targets3))

<class 'lxml.html.HtmlElement'>
<class 'list'>


It yields good results but the problem with extra tags inside < li > elements remains.

### Attempt to flatten the html using 'minidom'

In [9]:
from xml.dom import minidom
xmldoc = minidom.parse("MR4044696.html",)
itemlist = xmldoc.getElementsByTagName('li')
print(len(itemlist))
for s in itemlist:
    print(s.attributes['li'].value)

ExpatError: not well-formed (invalid token): line 9, column 50

Another attempt

In [None]:
import xml.etree.ElementTree as ET
root = ET.parse('MR4044696.html').getroot()

In [None]:
import html2text
html = open("mr4044696.html").read()
print(html2text.html2text(html))

##### Finally a successful attempt using BeautifulSoup

In [1]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
import itertools

In [2]:
MATCH_ALL = r'.*'


def like(string):
    """
    Return a compiled regular expression that matches the given
    string with any prefix and postfix, e.g. if string = "hello",
    the returned regex matches r".*hello.*"
    """
    string_ = string
    if not isinstance(string_, str):
        string_ = str(string_)
    regex = MATCH_ALL + re.escape(string_) + MATCH_ALL
    return re.compile(regex, flags=re.DOTALL)


def find_by_text(soup, text, tag, **kwargs):
    """
    Find the tag in soup that matches all provided kwargs, and contains the
    text.

    If no match is found, raise ValueError.
    """
    elements = soup.find_all(tag, **kwargs)
    matches = []
    for element in elements:
        if element.find(text=like(text)):
            matches.append(element)
    if len(matches) == 0:
        raise ValueError("No matching citations were found")
    else:
        return matches

In [3]:
url_lst = ['https://sis1.host.cs.st-andrews.ac.uk/GAP/MR4044696.html',
           'https://sis1.host.cs.st-andrews.ac.uk/GAP/MR2900886.html',
           'https://sis1.host.cs.st-andrews.ac.uk/GAP/MR3169623.html',
           'https://sis1.host.cs.st-andrews.ac.uk/GAP/MR4180136.html']
all_content = []
all_matches = []

In [4]:
for page in url_lst:
    page = requests.get(page)
    soup = BeautifulSoup(page.content, 'html.parser')
    match = find_by_text(soup, 'GAP', 'li')
    all_matches.append(match)
print(all_matches)

[[<li>
  The GAP Group, GAP – groups, algorithms and programming, version 4.10, Available from http://www.gap-system.org, 2018. 

</li>], [<li>
  The GAP Group, <span class="MathTeX">$GAP$</span><script type="math/tex">GAP</script> groups, algorithms, and programming, version 4.4.12 (2008), http://www.gap-system.org. 

</li>], [<li>
  Distler, A., Mitchell, J. D. (2011). <span class="it">Smallsemi - A Library of Small Semigroups.</span> http://tinyurl.com/jdmitchell/smallsemi/, Oct A GAP 4 package [5], Version 0.6.4. 

</li>, <li>
  The GAP Group, (2008). (http://www.gap-system.org). <span class="it">GAP–Groups, Algorithms, and Programming, Version 4.4.12.</span>
</li>], [<li>
  The GAP Group, 2019. GAP – Groups, Algorithms, and Programming, Version 4.10.1; https://www.gap-system.org. 

</li>]]


In [5]:
print(all_matches)

[[<li>
  The GAP Group, GAP – groups, algorithms and programming, version 4.10, Available from http://www.gap-system.org, 2018. 

</li>], [<li>
  The GAP Group, <span class="MathTeX">$GAP$</span><script type="math/tex">GAP</script> groups, algorithms, and programming, version 4.4.12 (2008), http://www.gap-system.org. 

</li>], [<li>
  Distler, A., Mitchell, J. D. (2011). <span class="it">Smallsemi - A Library of Small Semigroups.</span> http://tinyurl.com/jdmitchell/smallsemi/, Oct A GAP 4 package [5], Version 0.6.4. 

</li>, <li>
  The GAP Group, (2008). (http://www.gap-system.org). <span class="it">GAP–Groups, Algorithms, and Programming, Version 4.4.12.</span>
</li>], [<li>
  The GAP Group, 2019. GAP – Groups, Algorithms, and Programming, Version 4.10.1; https://www.gap-system.org. 

</li>]]


In [6]:
print(type(match))
print(type(all_matches))
print(type(match[0]))

<class 'list'>
<class 'list'>
<class 'bs4.element.Tag'>


In [7]:
len(all_matches)

4

In [8]:
print(all_matches[3])

[<li>
  The GAP Group, 2019. GAP – Groups, Algorithms, and Programming, Version 4.10.1; https://www.gap-system.org. 

</li>]


In [9]:
for i in all_matches:
    print(i[0].text.strip() + "\n")

The GAP Group, GAP – groups, algorithms and programming, version 4.10, Available from http://www.gap-system.org, 2018.

The GAP Group, $GAP$ groups, algorithms, and programming, version 4.4.12 (2008), http://www.gap-system.org.

Distler, A., Mitchell, J. D. (2011). Smallsemi - A Library of Small Semigroups. http://tinyurl.com/jdmitchell/smallsemi/, Oct A GAP 4 package [5], Version 0.6.4.

The GAP Group, 2019. GAP – Groups, Algorithms, and Programming, Version 4.10.1; https://www.gap-system.org.



In [65]:
final = []
for i in all_matches:
    clean = (i[0].text.strip())
    final.append(clean)
final

['The GAP Group, GAP – groups, algorithms and programming, version 4.10, Available from http://www.gap-system.org, 2018.',
 'The GAP Group, $GAP$ groups, algorithms, and programming, version 4.4.12 (2008), http://www.gap-system.org.',
 'The GAP Group, (2008). (http://www.gap-system.org). GAP–Groups, Algorithms, and Programming, Version 4.4.12.',
 'The GAP Group, 2019. GAP – Groups, Algorithms, and Programming, Version 4.10.1; https://www.gap-system.org.']

### Converting our data to Pandas dataframe for further analysis

In [40]:
df = pd.DataFrame(final)

In [53]:
display(df.iloc[1])

0    The GAP Group, $GAP$ groups, algorithms, and p...
Name: 1, dtype: object

In [59]:
df

Unnamed: 0,0
0,"The GAP Group, GAP – groups, algorithms and pr..."
1,"The GAP Group, $GAP$ groups, algorithms, and p..."
2,"The GAP Group, (2008). (http://www.gap-system...."
3,"The GAP Group, 2019. GAP – Groups, Algorithms,..."
