In [43]:
import fitz
import numpy as np
import langchain
from langchain.chat_models.openai import ChatOpenAI
from langchain.chains import LLMChain
import dotenv
from tqdm import tqdm
import time
import json

from utils import count_tokens

_ = dotenv.load_dotenv(dotenv.find_dotenv())

## Load document

In [3]:
pdf = fitz.open('data/kallan.pdf')
pdf.page_count

216

In [4]:
# Pages to keep: from 3 to 211 (zero-indexed, extremes included)
idxs_pages_to_keep = list(range(3, 211 + 1))
len(idxs_pages_to_keep)

209

In [5]:
pdf_text = ''
for i, page in enumerate(pdf):
    if i in idxs_pages_to_keep:
        pdf_text += page.get_text()
        
len(pdf_text)

397670

In [6]:
print(pdf_text[:500])

1
Det började med några egendomliga ord som den där underliga flickan sa.
Hon talade om en skogskälla som hennes mor sett som barn och berättat
om. Vad skulle det vara för en källa, och vad var det för märkvärdigt med
den? hade han frågat. Och flickan sa att det varken kunde eller fick hon
säga, för det var en hemlighet, och om hon i förtid förrådde den
hemligheten skulle källan ifråga bara försvinna. Detta sällsamma tal väckte
hans undran och intresse. Och han kunde inte tiga. Så han frågade va


In [7]:
total_tokens = count_tokens(pdf_text)
total_tokens

141446

## Costs

In [8]:
# Text-to-speech model ('tts-1')
cost_tts_1k_tokens = 0.015 

# GPT3.5 Turbo ('gpt-3.5-turbo-1106')
cost_input_1k_tokens = 0.001
cost_output_1k_tokens = 0.002

# # GPT4 Turbo ('gpt-4-1106-preview')
# cost_input_1k_tokens = 0.01
# cost_output_1k_tokens = 0.03

# Assuming same length of input and output (1 chunk in, 1 chunk out)
total_cost_1 = (total_tokens / 1000) * (cost_input_1k_tokens + cost_output_1k_tokens + cost_tts_1k_tokens)

# Assuming to feed more chunks in input for context (3 chunk in, 1 chunk out)
total_cost_2 = (total_tokens / 1000) * (3 * cost_input_1k_tokens + cost_output_1k_tokens + cost_tts_1k_tokens)

round(total_cost_1, 2), round(total_cost_2, 2)

# GPT3.5: (2.55, 2.83)
# GPT4: (7.78, 10.61)

(2.55, 2.83)

## Split in chunks

In [9]:
# Split in chunks corresponding to chapters
pattern = r'\d+\n'
chunks = re.split(pattern, pdf_text)
chunks = [chunk for chunk in chunks if chunk != '']
len(chunks)

26

In [9]:
# Chunk sizes
sizes = [count_tokens(chunk) for chunk in chunks]
min(sizes), np.mean(sizes).round(), max(sizes)

# Chunk size is well under the GPT models context limit

(1482, 5438.0, 7836)

## Test translation quality of a long text

In [11]:
with open('data/example.txt', 'r') as file:
    example = file.read()
    
count_tokens(example)

7991

In [12]:
prompt_template = '''Please translate the following excerpt from a novel into English. 
Aim to preserve the author's original style and word choice as closely as possible. 
Here is the excerpt: {text}'''

In [13]:
# llm = langchain.llms.OpenAI(model='gpt-3.5-turbo', temperature=0, max_tokens=50000)
# llm_chain = langchain.chains.LLMChain(
#     llm=llm, 
#     prompt=langchain.prompts.PromptTemplate.from_template(prompt_template)
# )

In [15]:
# chat = langchain.chat_models.openai.ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
# prompt = langchain.prompts.PromptTemplate.from_template(prompt_template)
# llm_chain = langchain.chains.LLMChain(llm=chat, prompt=prompt)

In [16]:
model_name = "gpt-3.5-turbo-1106" # max 16K tokens
# model_name = "gpt-4-1106-preview" # max 128K tokens (10 times the price!)

chat = ChatOpenAI(model_name=model_name, temperature=0, max_tokens=None)
prompt = langchain.prompts.PromptTemplate.from_template(prompt_template)
llm_chain = LLMChain(llm=chat, prompt=prompt)

In [17]:
output = llm_chain(inputs={'text': example})

In [18]:
count_tokens(output['text'])

4096

Both `gpt-3.5-turbo-1106` and `gpt-4-1106-preview` have output token limit equal to 4096, so it's not possible to translate a longer text. So shorter chunks are required.

## Split in smaller chunks

In [10]:
sentences = []
for chunk in chunks:
    sentences += chunk.split('.')
    
len(sentences)

4182

In [24]:
n_sentences = 20

In [25]:
# idxs = list(range(0, len(sentences), 10))
# groups = []
# for idx in idxs:
#     start = 0 + idx
#     end = n_sentences + idx
#     # print(start, end)
#     group = sentences[start:end] # (0, 10), (10, 20), (20, 30), ...)
#     groups.append(group)

In [29]:
groups = [sentences[i:i + n_sentences] for i in range(0, len(sentences), n_sentences)]
len(groups)

210

In [30]:
minichunks = []
for group in groups:
    group_text = ''
    for sentence in group:
        group_text += sentence + '.' + ' '
    
    minichunks.append(group_text)
    
len(minichunks)

210

In [31]:
sizes = [count_tokens(minichunk) for minichunk in minichunks]
min(sizes), int(np.mean(sizes)), max(sizes)

# 10: (95, 347, 1363)
# 20: (124, 693, 1841)

(124, 693, 1841)

## Translate

In [38]:
prompt_template = '''Please translate the following excerpt from a novel into English. 
Aim to preserve the author's original style and word choice as closely as possible.
You must answer with just the translated text and nothing else.
Here is the excerpt: {text}'''

In [39]:
model_name = "gpt-3.5-turbo-1106" # max 16K tokens
# model_name = "gpt-4-1106-preview" # max 128K tokens (10 times the price!)

chat = ChatOpenAI(model_name=model_name, temperature=0, max_tokens=None)
prompt = langchain.prompts.PromptTemplate.from_template(prompt_template)
llm_chain = LLMChain(llm=chat, prompt=prompt)

In [40]:
output = llm_chain(inputs={'text': minichunks[54]})
print(output['text'])

It was this quality in Felix that she called carefulness. It had been a long time since she dared to think about the day when Carl-Gustaf slammed his fist on the dining table and demanded that she remain silent on a matter of right or wrong just because he had a different opinion than she did, for it was the only time in her married life that she felt its foundation waver beneath her. She met her husband's gaze steadily, but what she saw in it - hatred and unforgiveness - she hoped she would never have to see again. For if it happened again, she did not know if she could overcome herself. But today she had thought about it. Why? She thought about it now - and shivered - when she saw the two mismatched brothers walking together among the graves and trees in the cemetery. She thought about how while Carl-Gustaf had a mild and good-natured face that he only managed to make look stern with a certain effort, Felix had a naturally stern face, a pair of coolly observant granite-gray eyes unde

It was this quality in Felix that she called gentleness. It had been a long time since she dared to think about the day when Carl-Gustaf slammed his fist on the coffee table and demanded that she remain silent on a matter of right or wrong just because he had a different opinion than she did, for it was the only time in her married life that she felt its foundation waver beneath her. She met her husband's gaze steadily, but what she saw in it - hatred and unforgiveness - she hoped she would never have to see again. For if it happened again, she did not know if she could overcome herself. But today she had thought about it. Why? She thought about it now - and shivered - when she saw the two mismatched brothers walking together among the graves and trees in the cemetery. She thought about how while Carl-Gustaf had a mild and good-natured face that he only managed to make look stern with a certain effort, Felix had a naturally stern face, a pair of coolly observant granite-gray eyes under imposing, bushy black eyebrows, which he struggled to transform into something friendly and genial. In the end, they had both almost succeeded in reshaping their faces: over the years, they had acquired some of the character they wanted to have. She stood at the window curtain and watched the serious, stern brothers. Now they stood solemnly absorbed at the sight of their parents' grave; she could almost only guess that they were there, for they were half hidden by the trees - which were in full bloom and buzzing with bees. They were middle-aged now, the brothers, she thought. They had begun to look back, in a kind of shared reconciliation and reflection, and in this she saw the best thing that had happened - since before that day. That was when little Eugen came down the road, a little rascal of eight years. She opened the window, leaned out, and called to him: Eugen, have you seen Jerk? No, Eugen hadn't. He had just come here thinking Jerk was at home. After about half an hour, Carl-Gustaf came home alone. Felix had accompanied Eugen Agrell down to the village to visit the Grandien brothers. Well, well - what's the matter with you, little Olga? her husband said when she rushed up to him and kissed him, once, twice, three times. You look as if you've seen something terrible. What has happened?

It was this quality in Felix that she called carefulness. It had been a long time since she dared to think about the day when Carl-Gustaf slammed his fist on the dining table and demanded that she remain silent on a matter of right or wrong just because he had a different opinion than she did, for it was the only time in her married life that she felt its foundation waver beneath her. She met her husband's gaze steadily, but what she saw in it - hatred and unforgiveness - she hoped she would never have to see again. For if it happened again, she did not know if she could overcome herself. But today she had thought about it. Why? She thought about it now - and shivered - when she saw the two mismatched brothers walking together among the graves and trees in the cemetery. She thought about how while Carl-Gustaf had a mild and good-natured face that he only managed to make look stern with a certain effort, Felix had a naturally stern face, a pair of coolly observant granite-gray eyes under imposing, bushy black eyebrows, which he with difficulty transformed into something friendly and genial. In the end, they had both almost succeeded in reshaping their faces: over the years they had acquired some of the character they wanted to have. She stood by the window curtain and watched the serious, sternly stretched brothers. Now they stood solemnly sunk at the sight of their parents' grave; she could almost only guess that they were standing there, for they were half hidden by the trees - which were in full bloom and buzzing with bees. They were middle-aged now, the brothers, she thought. They had begun to look back, in a kind of common reconciliation and reflection, and in this she saw the best thing that had happened - since before that day. It was then that little Eugen came down the road, a little rascal of eight years. She opened the window, leaned out and called to him: Eugen, have you seen Jerk? No, Eugen hadn't. He had just come here thinking Jerk was home. After about half an hour, Carl-Gustaf came home alone. Felix had accompanied Eugen Agrell down to the village to visit the Grandien brothers. Well, well - what's the matter with you, little Olga? her husband said when she rushed up to him and kissed him, once, twice, three times. You look as if you've seen something terrible. What has happened?

In [45]:
translations = []
for minichunk in tqdm(minichunks):
    time.sleep(3)
    output = llm_chain(inputs={'text': minichunk})
    translations.append(output['text'])

100%|██████████| 210/210 [30:20<00:00,  8.67s/it]


In [48]:
with open('translations_202312021800.json', 'w') as file:
    json.dump(translations, file)