In [22]:
#Author: Michael McDowall
%pip install bs4
%pip install grobid-client-python
from grobid_client.grobid_client import GrobidClient
import os
from bs4 import BeautifulSoup, Tag
import re
import math
import copy





In [23]:
## Initialize constants

MAX_TOKENS = 3000
keywords = ['hardness']

#determine and initialize environment
grobidClient = "http://localhost:8070"
inputDir = "ALLPDF"
outputDir = "TEI"

if os.getenv('FLAG_EXTRACT_PAPER_IS_DOCKER') is not None:
    grobidClient = "http://grobid:8070"
    inputDir = "ALLPDF"
    outputDir = "TEI"
print(grobidClient)

# Initialize grobid client configuration
configFile = open("./config.json", "w")
configText = '''
{
    "grobid_server": "''' + grobidClient + '''",
    "batch_size": 1000,
    "sleep_time": 5,
    "timeout": 60000,
    "coordinates": [ "persName", "figure", "ref", "biblStruct", "formula", "s" ]
}
'''
print(configText)
print("Inputdir", inputDir)
print("Outputdir", outputDir)
configFile.write(configText)
configFile.close()

http://grobid:8070

{
    "grobid_server": "http://grobid:8070",
    "batch_size": 1000,
    "sleep_time": 5,
    "timeout": 60000,
    "coordinates": [ "persName", "figure", "ref", "biblStruct", "formula", "s" ]
}

Inputdir ALLPDF
Outputdir TEI


In [24]:
## Run grobid

client = GrobidClient(config_path="./config.json")
client.process("processFulltextDocument", inputDir, n=1)

GROBID server is up and running
Processing of ALLPDF/MnFeNiCuPt and MnFeNiCuCo high-entropy alloys designed based on.pdf failed with error 500 , [NO_BLOCKS] PDF parsing resulted in empty content
Processing of ALLPDF/Processing of AlCoCrFeNiTi high entropy alloy by.pdf failed with error 500 , [NO_BLOCKS] PDF parsing resulted in empty content
Processing of ALLPDF/Synthesis and thermoelectric properties of high-entropy.pdf failed with error 500 , [NO_BLOCKS] PDF parsing resulted in empty content
Processing of ALLPDF/Influence of thermal and thermal mechanical treatments on.pdf failed with error 500 , [NO_BLOCKS] PDF parsing resulted in empty content
Processing of ALLPDF/Exceptionally high spallation strength for a high-entropy alloy demonstrated by experiments and simulations.pdf failed with error 500 , [NO_BLOCKS] PDF parsing resulted in empty content
Processing of ALLPDF/Effect of entropy-packing fraction relation on the formation of.pdf failed with error 500 , [NO_BLOCKS] PDF parsing r

In [25]:
## TEI Processing functions

## Cleanses files and removes unnecessary contextual and bibliographical information.
## USES keywords to filter out irrelevant sections

def assertIsTag(tag: Tag, msg = ""):
	if tag is None or not isinstance(tag, Tag):
		raise Exception('Error:', msg,'The following is not a Tag: ', tag, "<<END TAG>>")

def calcTokens(tag: Tag):
	return math.ceil(len(str(tag))/5)

def printNumTokens(tag: Tag, msg = "Token count:"):
	print(msg, calcTokens(tag)//100*100)
	return calcTokens(tag)

def keywordFilter(tag: Tag, keywords: list):
	found = False
	for keyword in keywords:
		if re.search(keyword, str(tag), re.IGNORECASE):
			found = True
			break
	if not found:
		tag.decompose()
		return False
	return True

def filterTei(soup: BeautifulSoup):
	# Load text from file
	textObj = soup.find('text')
	assertIsTag(textObj, "Text not found")
	docCopy = copy.deepcopy(textObj) # TODO will this cause odd reference issues?

	beforeTokens = printNumTokens(textObj)

	# Destroy reference list
	textObj.back.find(type='references').decompose()

	# Destroy bibliographical references in text
	for bibl in textObj.find_all('ref', type='bibr'):
		bibl.decompose()

	#Filter divs based on keywords
	for div in textObj.find_all('div'):
		assertIsTag(div)
		keywordFilter(div, keywords)

	# TODO this may break some references, and those should be removed somehow
	#Filter figures based on keywords
	for div in textObj.find_all('figure'):
		assertIsTag(div)
		keywordFilter(div, keywords)

	total_tokens = printNumTokens(textObj)
	print("Tokens removed:", beforeTokens - total_tokens, "\nToken reduction:", 100-(total_tokens/beforeTokens*10000//1)/100, "%")
	ret = 0
	for div in textObj.find_all('div'):
		ret += calcTokens(div)
	print("Total tokens in divs:", ret)

	return (soup, total_tokens, textObj, docCopy) #TODO optimize this behavior. rn it is odd data passing

In [26]:
## Split a full document into pages. These pages will then be hydrated with their context
## and given to the next step in the pipeline.
## USES MAX_TOKENS to determine the maximum number of tokens per page

# <ref type="table" target="#tab_13">
# <figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_0">

class Figure:
	def __init__(self, tag : Tag):
		self.tag = tag
		self.ref = tag.attrs['xml:id']
	
	def tokens(self):
		return calcTokens(self.tag)
	
	def print(self):
		print(self.tag)

# TODO Definitely room for optimization here if necessary
class Page:

	def __init__(self, title: Tag, abstract: Tag):
		self.tags = []
		self.figures = []
		self.figureIndex = {}

		seed = '''
		<?xml version="1.0" encoding="UTF-8"?>
		<TEI xml:space="preserve"
			xmlns="http://www.tei-c.org/ns/1.0"
			xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.tei-c.org/ns/1.0 https://raw.githubusercontent.com/kermitt2/grobid/master/grobid-home/schemas/xsd/Grobid.xsd"
			xmlns:xlink="http://www.w3.org/1999/xlink">
			<teiHeader xml:lang="en">
				<fileDesc>
					<titleStmt></titleStmt>
				</fileDesc>
				<profileDesc></profileDesc>
			</teiHeader>
			<text>
				<body></body>
			</text>
		</TEI>'''
		self.pageDoc = BeautifulSoup(seed, features="xml")
		self.pageDoc.find('titleStmt').append(title)
		self.pageDoc.find('profileDesc').append(abstract)


	def print(self, label = "", verbose = False):
		print("============", label, '============')
		print("Token count:", self.tokens())
		print("Tag token count:", sum([calcTokens(tag) for tag in self.tags]))
		print("Figure index:", self.figureIndex)
		print("Figure token count:", sum([self.figureIndex[fig] for fig in self.figureIndex]))
		if verbose:
			print("------- List Tags -------")
			for tag in self.tags:
				print(tag.prettify())
			print("----------END---------\n------- List Figures-------")
			for fig in self.figures:
				print(fig.tag)
			print("------- END -------\n------- Title -------")
			if self.title is None:
				print("ERROR: Title is None")
			else:
				print(self.title.prettify())
			print("------- END -------\n------- Abstract -------")
			if self.abstract is None:
				print("ERROR: Abstract is None")
			else:
				print(self.abstract.prettify())
			print("------- END -------")
		print("============ END", label, '============')
	
	def promptPrint(self):
		doc = copy.deepcopy(self.pageDoc)
		body = doc.find('body')
		for tag in self.tags:
			body.append(tag)
		for fig in self.figures:
			body.append(fig.tag)

		print(doc.prettify())

	def tokens(self, overrideIndex = None):
		ret = sum([calcTokens(tag) for tag in self.tags])
		if overrideIndex is not None:
			ret += sum([overrideIndex[fig] for fig in overrideIndex])
		else:
			ret += sum([self.figureIndex[fig] for fig in self.figureIndex])
		ret += calcTokens(self.pageDoc)
		return ret
	
	# TODO optimize
	def prospectiveTokens(self, tag: Tag, docCopy : Tag):
		index : dict[str, int] = self.figureIndex.copy()
		for ref in tag.find_all('ref', target=True):
			currRef = ref.attrs['target'].strip('#')

			foundFigure = docCopy.find('figure', attrs={'xml:id': currRef})
			if isinstance(foundFigure, Tag):
				index[str(currRef)] = calcTokens(foundFigure)
			else:
				print("ERROR: Figure not found for reference:", currRef)
		
		return self.tokens(overrideIndex=index) + calcTokens(tag)
	
	def addTag(self, tag: Tag, docCopy : Tag):
		# Add the tag
		self.tags.append(tag)

		# Add the references
		for ref in tag.find_all('ref', target=True):
			currRef = ref.attrs['target'].strip('#')

			foundFigure = docCopy.find('figure', attrs={'xml:id': currRef})
			if isinstance(foundFigure, Tag):
				self.figureIndex[currRef] = calcTokens(foundFigure)
			else:
				print("ERROR: Figure not found for reference:", currRef)
	
	def hydrate(self, docCopy : Tag):
		for ref in self.figureIndex:
			foundFigure = docCopy.find('figure', attrs={'xml:id': ref})
			if isinstance(foundFigure, Tag):
				self.figures.append(Figure(foundFigure))
			else:
				print("ERROR: Figure not found for reference:", ref)
				
def paginateTei(soup: BeautifulSoup, total_tokens: int, textObj: Tag, docCopy: Tag):
	# Calculate universal context size
	title : Tag = soup.find('title')
	abstract : Tag = soup.find('abstract')
	universalCtxSize = Page(title, abstract).tokens()

	# Calculate page size
	page_num = math.ceil(total_tokens/(MAX_TOKENS))
	page_size = math.ceil(total_tokens/page_num)

	print("Total tokens:", total_tokens, "\nPage size:", page_size, "\nNumber of pages:", page_num)
	print("Universal context size:", universalCtxSize)

	# Split text into pages
	# TODO error check for div larger than page size
	pages = []
	currentPage = Page(title, abstract)
	for div in textObj.find_all('div'):
		assertIsTag(div)
		if currentPage.prospectiveTokens(div, docCopy) > page_size:
			if currentPage.tokens() > 0:
				pages.append(copy.deepcopy(currentPage))
			currentPage = Page(title, abstract)
			currentPage.addTag(div, docCopy)
			continue
		currentPage.addTag(div, docCopy)
	pages.append(currentPage)

	# Hydrate pages with universal and dynamic context
	for page in pages:
		page.hydrate(docCopy)

	# Print pages
	for i, page in enumerate(pages):
		# page.print("Page " + str(i))
		page.promptPrint()


	

In [27]:
with open('/TEI/Pal_Sanhita_202211_MSc.grobid.tei.xml', 'r') as file:
	xml_doc = file.read()
soup = BeautifulSoup(xml_doc, features="xml")
data = filterTei(soup)
paginateTei(data[0], data[1], data[2], data[3])

Token count: 60400
Token count: 12600
Tokens removed: 47741 
Token reduction: 79.0 %
Total tokens in divs: 10824
Total tokens: 12691 
Page size: 2539 
Number of pages: 5
Universal context size: 503
ERROR: Figure not found for reference: foot_1
ERROR: Figure not found for reference: foot_1
<?xml version="1.0" encoding="utf-8"?>
<?xml version="1.0" encoding="UTF-8"?>
<TEI xml:space="preserve" xmlns="http://www.tei-c.org/ns/1.0" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.tei-c.org/ns/1.0 https://raw.githubusercontent.com/kermitt2/grobid/master/grobid-home/schemas/xsd/Grobid.xsd">
 <teiHeader xml:lang="en">
  <fileDesc>
   <titleStmt>
    <title level="a" type="main">
     Development of Multi-Functional Flame Sprayed High Entropy Alloy (HEA) Coatings
    </title>
   </titleStmt>
  </fileDesc>
  <profileDesc>
   <abstract>
    <div xmlns="http://www.tei-c.org/ns/1.0">
     <p>
      High entropy alloys (HE