In [2]:
# importing modules

from libs.ai.openai import ICNAzureChatOpenAI, ICNOpenAIEmbeddings
import requests
from langchain.prompts import (
	ChatPromptTemplate,
	PromptTemplate,
	SystemMessagePromptTemplate,
	AIMessagePromptTemplate,
	HumanMessagePromptTemplate,
)


### Overview
In this notebook, I will be going over the workflow needed to translate a PDF:
1) Create Prompt Templates for passing into the model
2) Convert the PDF to HTML and ensure text is accurately extracted
3) Parse through HTML to reference English text locations
4) Pass in each block of English text to model and return translated text back
5) Replace the original text with translated text
6) Output new translated HTML back and convert to PDF format

In [13]:
# --- Helper Functions ---

def split_string_by_chunks(s: str, n):
	return [s[i:i+n] for i in range(0, len(s), n)]

def get_prompt(lang_to, text):
	template = """
	Translate the text from English to {lang_to}. \
	If the text is blank, then return an empty string. \
	Return the translated text within triple backticks. \
	text: {text}
	"""

	prompt_template = HumanMessagePromptTemplate.from_template(template)
	prompt_to_ask = prompt_template.format(
		lang_to=lang_to,
		text=text
	)
	return prompt_to_ask

def get_query_string(prompt):
	headers = {
	"x-simon-accesstoken": "eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJzdWIiOiIxMjM0In0.GCNRhCKtAWxuPO2XRviKbY6i1Xo_tHvM8JrInE8SAz4"
	}
	payload = {
		"chatId": None,
		"query": prompt
	}
	response = requests.post("http://127.0.0.1:8000/simon/api/v1/ai-chat/query",
					  json=payload,
					  headers=headers)
	response = response.json()
	return response['messages'][1]['message']['data']['content']

In [14]:
# --- Reading in PDF as HTML ---
from langchain.document_loaders import PDFMinerPDFasHTMLLoader
loader = PDFMinerPDFasHTMLLoader("Blackstone4Q22EarningsPressRelease.pdf")
data = loader.load()[0]
with open("blackstone.html", "w") as f:
	f.write(data.page_content) # writing to HTML file for reading later

When LangChain converts PDFs to HTML:
- Graphics (i.e. images, charts) are lost and are rendered as boxes
- All simple markup is extracted and put into <span> tags with breaks in between to resemble the original PDF

Using an HTML parser allows me to grab the English text and pass it through the model for translation

In [15]:
from bs4 import BeautifulSoup
file = open("blackstone.html", "r")
html = file.read()
soup = BeautifulSoup(html)
span = soup.find_all("span") # grabbing all text and assigning it an id number for reference
for tag in span:
	tag.attrs['id'] = span.index(tag)

with open('prac.html', 'w') as f:
	f.write(str(soup))

In [16]:
# --- Opening newly written HTML file for parsing ---
file = open('prac.html', 'r')
html = file.read()
soup = BeautifulSoup(html)
span = soup.find_all("span")

In [18]:
x = span[1]
print(x)

<span id="1" style="font-family: Georgia-Bold; font-size:19px">Blackstone Reports Fourth Quarter and 
<br/>Full Year 2022 Results</span>


In [21]:
p = get_prompt("French", x)
get_query_string(p.content)

'Bonjour! I am happy to help you translate the text from English to French. The text you provided reads "Blackstone Reports Fourth Quarter and Full Year 2022 Results." In French, it would be "Blackstone rapporte les résultats du quatrième trimestre et de l\'année complète 2022." Here is the translated text within triple backticks:\n```\nBlackstone rapporte les résultats du quatrième trimestre et de l\'année complète 2022.\n```'

In [22]:
response_list = {}
for s in span:
	# print(s.get('id'))
	i = s.get('id')
	print(i)
	text = s.text
	if s.text != "": # as long as the span is not empty (meaning there is text to be translated)
		p = get_prompt("French", text)
		response = get_query_string(p.content)
		print(response + "\n")
		response_list[i] = response

0
1
Bonjour! Je suis heureux de vous aider à traduire ce texte en français. Voici la traduction du texte:

```
Rapports Blackstone Quatrième Trimestre et Résultats Année Complète 2022
```

J'espère que cela vous aide! Si vous avez d'autres questions, n'hésitez pas à demander.

2
Bonjour! I can definitely help you with that translation. However, since the text provided is blank, I cannot provide a translation. Is there any other way I can assist you?

3
Bonjour! Je suis heureux de vous aider. Le texte en anglais est "New York, January 26, 2023". Je vais maintenant le traduire en français. 

```Nouvelle York, 26 janvier 2023``` 

J'espère que cela vous a aidé! Si vous avez d'autres questions, n'hésitez pas à me les poser.

4
Bonjour! Je suis heureux de vous aider à traduire le texte en français. Voici la traduction: 

```
Blackstone (NYSE:BX) a annoncé aujourd'hui ses résultats du quatrième trimestre et de l'ensemble de l'année 2022. Si le texte est vide, la chaîne de caractères retourné

Since there are "br" tags in the English text, I need to replace them with linebreaks so that I can replace the text in place on the HTML

In [23]:
clean = BeautifulSoup(str(soup).replace("<br/>", "\n"))
clean

<html><head>
<meta content="text/html" http-equiv="Content-Type"/>
</head><body>
<span id="0" style="position:absolute; border: gray 1px solid; left:0px; top:50px; width:720px; height:540px;"></span>
<div style="position:absolute; top:50px;"><a name="1">Page 1</a></div>
<div style="position:absolute; border: textbox 1px solid; writing-mode:lr-tb; left:72px; top:172px; width:413px; height:42px;"><span id="1" style="font-family: Georgia-Bold; font-size:19px">Blackstone Reports Fourth Quarter and 

Full Year 2022 Results</span><span id="2" style="font-family: Georgia-Bold; font-size:19px">
</span></div><div style="position:absolute; border: textbox 1px solid; writing-mode:lr-tb; left:72px; top:229px; width:562px; height:10px;"><span id="3" style="font-family: TrebuchetMS-Bold; font-size:10px">New York, January 26, 2023</span><span id="4" style="font-family: TrebuchetMS; font-size:10px">: Blackstone (NYSE:BX) today reported its fourth quarter and full year 2022 results. 

</span></div><div

In [24]:
import re
for i, x in response_list.items():

	reg = re.findall("```([^`]+)```", x) # regex to extract the translated text from the chat model
	if len(reg) > 0:
		text = reg[0]
		print(i)
		print(text)

		print(clean.find(id=i).string.replace_with(text))

1

Rapports Blackstone Quatrième Trimestre et Résultats Année Complète 2022

Blackstone Reports Fourth Quarter and 

Full Year 2022 Results
3
Nouvelle York, 26 janvier 2023
New York, January 26, 2023
4

Blackstone (NYSE:BX) a annoncé aujourd'hui ses résultats du quatrième trimestre et de l'ensemble de l'année 2022. Si le texte est vide, la chaîne de caractères retournée est vide. 

: Blackstone (NYSE:BX) today reported its fourth quarter and full year 2022 results. 


5

Stephen A. Schwarzman, président-directeur général, a déclaré,

Stephen A. Schwarzman, Chairman and Chief Executive Officer, said, 
7

Malgré l'un des contextes de marché les plus difficiles de l'histoire, Blackstone a réussi à satisfaire nos clients en 2022. Nous avons protégé le capital des investisseurs en nous concentrant sur les bons secteurs, ce qui a entraîné un afflux supplémentaire de 226 milliards de dollars pour l'année, dont 43 milliards de dollars au quatrième trimestre. Notre total d'actifs sous gestion a

In [25]:
clean # newly translated HTML string

<html><head>
<meta content="text/html" http-equiv="Content-Type"/>
</head><body>
<span id="0" style="position:absolute; border: gray 1px solid; left:0px; top:50px; width:720px; height:540px;"></span>
<div style="position:absolute; top:50px;"><a name="1">Page 1</a></div>
<div style="position:absolute; border: textbox 1px solid; writing-mode:lr-tb; left:72px; top:172px; width:413px; height:42px;"><span id="1" style="font-family: Georgia-Bold; font-size:19px">
Rapports Blackstone Quatrième Trimestre et Résultats Année Complète 2022
</span><span id="2" style="font-family: Georgia-Bold; font-size:19px">
</span></div><div style="position:absolute; border: textbox 1px solid; writing-mode:lr-tb; left:72px; top:229px; width:562px; height:10px;"><span id="3" style="font-family: TrebuchetMS-Bold; font-size:10px">Nouvelle York, 26 janvier 2023</span><span id="4" style="font-family: TrebuchetMS; font-size:10px">
Blackstone (NYSE:BX) a annoncé aujourd'hui ses résultats du quatrième trimestre et de l

In [26]:
with open("translated_prac.html", "w") as f:
	f.write(str(clean))

In [29]:
options = {
    'page-height': '10in',
    'page-width': '7.5in',
}

In [30]:
import pdfkit
pdfkit.from_file("translated_prac.html", "translated_prac.pdf", options=options)

Loading pages (1/6)
Counting pages (2/6)                                               
Resolving links (4/6)                                                       
Loading headers and footers (5/6)                                           
Printing pages (6/6)
Done                                                                      


True

### Next Steps

#### Technicals
- Need output parser to ensure that only translated text is outputted, not entire chats
- Use agents to distinguish words and sentences that CAN be translated (ie anything not proper nouns, punctuation, etc)
- Find way to either keep or remove graphics so that text format remains the same throughout process

#### Styling
- Work on creating a pdf that fills the whole page
- Allow for different language characters in generated PDF
- Remove unnecessary page indicators on top of the generated pdf
- Work on formatting the HTML better so that the text itself is less cramped due to loss of graphics
- Find ways to adapt this approach for websites that do not have set structures like this example