In [21]:
%pip install bs4
from bs4 import BeautifulSoup
import re
from bs4 import Tag
import math
import copy
import os
import glob

Note: you may need to restart the kernel to use updated packages.


In [22]:
## Cleanses files and removes unnecessary contextual and bibliographical information.
## USES keywords to filter out irrelevant sections

def assertIsTag(tag: Tag, msg = ""):
	if tag is None or not isinstance(tag, Tag):
		raise Exception('Error:', msg,'The following is not a Tag: ', tag, "<<END TAG>>")

def calcTokens(tag: Tag):
	return math.ceil(len(str(tag))/5)

def printNumTokens(tag: Tag, msg = "Token count:"):
	print(msg, calcTokens(tag)//100*100)
	return calcTokens(tag)

def keywordFilter(tag: Tag, keywords: list):
	found = False
	for keyword in keywords:
		if re.search(keyword, str(tag), re.IGNORECASE):
			found = True
			break
	if not found:
		tag.decompose()
		return False
	return True



In [23]:
## Split a full document into pages. These pages will then be hydrated with their context
## and given to the next step in the pipeline.
## USES MAX_TOKENS to determine the maximum number of tokens per page

# <ref type="table" target="#tab_13">
# <figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_0">

class Figure:
	def __init__(self, tag : Tag):
		self.tag = tag
		self.ref = tag.attrs['xml:id']
	
	def tokens(self):
		return calcTokens(self.tag)
	
	def print(self):
		print(self.tag)

# TODO Definitely room for optimization here if necessary
class Page:

	def __init__(self, title: Tag, abstract: Tag):
		self.tags = []
		self.figures = []
		self.figureIndex = {}

		seed = '''
		<?xml version="1.0" encoding="UTF-8"?>
		<TEI xml:space="preserve"
			xmlns="http://www.tei-c.org/ns/1.0"
			xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.tei-c.org/ns/1.0 https://raw.githubusercontent.com/kermitt2/grobid/master/grobid-home/schemas/xsd/Grobid.xsd"
			xmlns:xlink="http://www.w3.org/1999/xlink">
			<teiHeader xml:lang="en">
				<fileDesc>
					<titleStmt></titleStmt>
				</fileDesc>
				<profileDesc></profileDesc>
			</teiHeader>
			<text>
				<body></body>
			</text>
		</TEI>'''
		self.pageDoc = BeautifulSoup(seed, features="xml")
		self.pageDoc.find('titleStmt').append(title)
		self.pageDoc.find('profileDesc').append(abstract)
		self.title = title.string


	def print(self, label = "", verbose = False):
		print("============", label, '============')
		print("Token count:", self.tokens())
		print("Tag token count:", sum([calcTokens(tag) for tag in self.tags]))
		print("Figure index:", self.figureIndex)
		print("Figure token count:", sum([self.figureIndex[fig] for fig in self.figureIndex]))
		if verbose:
			print("------- List Tags -------")
			for tag in self.tags:
				print(tag.prettify())
			print("----------END---------\n------- List Figures-------")
			for fig in self.figures:
				print(fig.tag)
			print("------- END -------\n------- Title -------")
			if self.title is None:
				print("ERROR: Title is None")
			else:
				print(self.title.prettify())
			print("------- END -------\n------- Abstract -------")
			if self.abstract is None:
				print("ERROR: Abstract is None")
			else:
				print(self.abstract.prettify())
			print("------- END -------")
		print("============ END", label, '============')
	
	def pageString(self):
		doc = copy.deepcopy(self.pageDoc)
		body = doc.find('body')
		for tag in self.tags:
			body.append(tag)
		for fig in self.figures:
			body.append(fig.tag)

		return doc.prettify()

	def tokens(self, overrideIndex = None):
		ret = sum([calcTokens(tag) for tag in self.tags])
		if overrideIndex is not None:
			ret += sum([overrideIndex[fig] for fig in overrideIndex])
		else:
			ret += sum([self.figureIndex[fig] for fig in self.figureIndex])
		ret += calcTokens(self.pageDoc)
		return ret
	
	# TODO optimize
	def prospectiveTokens(self, tag: Tag, docCopy : Tag):
		index : dict[str, int] = self.figureIndex.copy()
		for ref in tag.find_all('ref', target=True):
			currRef = ref.attrs['target'].strip('#')

			foundFigure = docCopy.find('figure', attrs={'xml:id': currRef})
			if isinstance(foundFigure, Tag):
				index[str(currRef)] = calcTokens(foundFigure)
			else:
				print("ERROR: Figure not found for reference:", currRef)
		
		return self.tokens(overrideIndex=index) + calcTokens(tag)
	
	def addTag(self, tag: Tag, docCopy : Tag):
		# Add the tag
		self.tags.append(tag)

		# Add the references
		for ref in tag.find_all('ref', target=True):
			currRef = ref.attrs['target'].strip('#')

			foundFigure = docCopy.find('figure', attrs={'xml:id': currRef})
			if isinstance(foundFigure, Tag):
				self.figureIndex[currRef] = calcTokens(foundFigure)
			else:
				print("ERROR: Figure not found for reference:", currRef)
	
	def hydrate(self, docCopy : Tag):
		for ref in self.figureIndex:
			foundFigure = docCopy.find('figure', attrs={'xml:id': ref})
			if isinstance(foundFigure, Tag):
				self.figures.append(Figure(foundFigure))
			else:
				print("ERROR: Figure not found for reference:", ref)



	

In [24]:
## Filter
def filter(soup: BeautifulSoup, keywords: list[str]):
	# Load text from file
	textObj = soup.find('text')
	assertIsTag(textObj, "Text not found")
	docCopy = copy.deepcopy(textObj) # TODO will this cause odd reference issues?

	beforeTokens = printNumTokens(textObj)

	# Destroy reference list
	textObj.back.find(type='references').decompose()

	# Destroy bibliographical references in text
	for bibl in textObj.findChildren('ref', type='bibr'):
		bibl.decompose()
	
	#Filter divs based on keywords
	for div in [child for child in textObj.children if child.name == 'div']:
		assertIsTag(div)
		keywordFilter(div, keywords)

	# TODO this may break some references, and those should be removed somehow
	#Filter figures based on keywords
	for div in textObj.find_all('figure'):
		assertIsTag(div)
		keywordFilter(div, keywords)

	total_tokens = printNumTokens(textObj)
	print("Tokens removed:", beforeTokens - total_tokens, "\nToken reduction:", 100-(total_tokens/beforeTokens*10000//1)/100, "%")
	ret = 0
	for div in textObj.find_all('div'):
		ret += calcTokens(div)
	print("Total tokens in divs:", ret)
	return (soup, textObj, total_tokens, docCopy)

## Paginate
def paginate(soup: BeautifulSoup, textObj: Tag, total_tokens: int, docCopy: Tag):
	# Calculate universal context size
	title : Tag = soup.find('title')
	abstract : Tag = soup.find('abstract')
	universalCtxSize = Page(title, abstract).tokens()

	# Calculate page size
	page_num = math.ceil(total_tokens/(MAX_TOKENS))
	page_size = math.ceil(total_tokens/page_num)

	print("Total tokens:", total_tokens, "\nPage size:", page_size, "\nNumber of pages:", page_num)
	print("Universal context size:", universalCtxSize)

	# Split text into pages
	# TODO error check for div larger than page size
	pages = []
	currentPage = Page(title, abstract)
	for div in textObj.find_all('div'):
		assertIsTag(div)
		if currentPage.prospectiveTokens(div, docCopy) > page_size:
			if currentPage.tokens() > 0:
				pages.append(copy.deepcopy(currentPage))
			currentPage = Page(title, abstract)
			currentPage.addTag(div, docCopy)
			continue
		currentPage.addTag(div, docCopy)
	pages.append(currentPage)

	# Hydrate pages with universal and dynamic context
	for page in pages:
		page.hydrate(docCopy)

	# Print pages
	for i, page in enumerate(pages):
		title = page.title
		if len(title) > 200:
			title = title[:200]

		string = "./PAGES/" + title + "_PAGE-" + str(i) + ".xml"
		string = string.replace(" ", "_")
		with open(string, "w", encoding='utf-8') as file:
			file.write(page.pageString())

In [25]:
## File & Init
MAX_TOKENS = 3000
keywords = ['modulus']

for filename in glob.glob(os.path.join('./TEI/', '*.xml')):
	with open(filename, 'r', encoding='utf-8') as file:
		xml_doc = file.read()
	soup = BeautifulSoup(xml_doc, features="xml")
	o = filter(soup, keywords)
	paginate(*o)

Token count: 11400
Tag:  <figure xml:id="fig_0" xmlns="http://www.tei-c.org/ns/1.0">
<head>Figure 1 .</head>
<label>1</label>
<figDesc>Figure 1. 3D ion maps of all major alloying elements (and their oxides) before and after oxidation at either 120 °C-10 mbar O2-5 minutes or 300 °C-10 mbar O2-2 minutes in high purity O2 gas. APT reconstructions shown are cropped to ~10 nm thick to clearly show the oxide/metal interface.</figDesc>
<graphic coords="7,200.28,72.00,247.32,327.00" type="bitmap"/>
</figure>
Tag:  <figure xml:id="fig_1" xmlns="http://www.tei-c.org/ns/1.0">
<head>Figure 2 .</head>
<label>2</label>
<figDesc>Figure 2. Two-dimensional compositional contour plots illustrating elemental fractions across the Ni38Fe20Cr22Mn10Co10 alloy/oxide interface after oxidation at (a) 120°C-10 mbar-5 minutes, and (b) 300 °C-10 mbar-2 minutes. The approximate location of the oxide/metal interface is indicated by a black dashed line in each plot. Contour plots were generated from reconstructed APT

TypeError: object of type 'NoneType' has no len()