## Install dependencies

In [None]:
!pip install beautifulsoup4==4.12.3 grobid-client-python==0.0.8 lxml==5.3.0

## Extract from the sample PDF

In [None]:
import multiprocessing
import os

from bs4 import BeautifulSoup
from grobid_client.grobid_client import GrobidClient

Before creating the client, make sure that the GROBID server is up and running:

```sh
docker compose up grobid
```

In [None]:
# Base URL of the GROBID REST service. Set GROBID_SERVICE_URL to override it when the
# server is not on localhost. The fallback needs "//" after the scheme to be a
# well-formed URL with a host component.
grobid_server = os.environ.get("GROBID_SERVICE_URL", "http://localhost:8070")
# Passed as `n` to client.process(): twice the CPU count reported by the OS.
# NOTE(review): cpu_count() usually reports logical CPUs already - confirm 2x is intended.
n = 2 * multiprocessing.cpu_count()  # Assumes hyperthreading
# Directory passed to client.process() as `output`.
output = "../data/grobid"
# Directory containing the example PDFs to process.
pdfs = "../aicacia/extraction/example/pdf"

client = GrobidClient(grobid_server=grobid_server)

In [None]:
# Send the PDFs under `pdfs` to GROBID's full-text service; results go to `output`.
client.process(
    "processFulltextDocument",
    pdfs,
    output=output,
    n=n,
)

## Parsing the TEI output with Beautiful Soup

In [None]:
# Build the path from `output` so this cell stays in sync with the directory the
# processing cell wrote to.
tei_path = os.path.join(output, "sample.grobid.tei.xml")

# Open in binary mode so the XML parser decodes the bytes itself (using the
# document's own encoding information) instead of the platform's default text encoding.
with open(tei_path, "rb") as f:
    soup = BeautifulSoup(f, "lxml-xml")

### Extracting text

In [None]:
# Locate the first <titleStmt> element, then display the text of its <title> child.
title_stmt = soup.find("titleStmt")
title_stmt.find("title").get_text()

'Bi-criteria Algorithm for Scheduling Jobs on Cluster Platforms'

In [None]:
# Print the text content of the first <abstract> element.
abstract = soup.find("abstract")
print(abstract.get_text())

In [None]:
# Collect every <div> beneath <text>/<body>; the first one is used as the sample section.
text = soup.find("text").find("body").find_all("div")
first_div = text[0]

# Heading and first paragraph of that section.
first_paragraph_title = first_div.find("head").get_text()
first_paragraph = first_div.find("p").get_text()

# Heading, a blank line, then the paragraph.
print(first_paragraph_title, first_paragraph, sep="\n\n")

### Extracting metadata

In [None]:
# TODO(jason.prasad): metadata extraction is still a work in progress. GROBID can optionally
# consolidate metadata via an external service such as Crossref: https://grobid.readthedocs.io/en/latest/Consolidation/