Given input text in the source language, identify terminology and dialogue styles.

In [3]:
# Load the source text. Every line is stripped of surrounding whitespace and
# re-terminated with exactly one newline, so blank lines become bare "\n".
# The encoding is given explicitly so decoding does not depend on the
# platform default.
# NOTE(review): assumes example2.txt is stored as UTF-8 — confirm.
with open("example2.txt", "r", encoding="utf-8") as file:
    # Join once instead of growing the string with repeated "+=".
    source_text = "".join(f"{line.strip()}\n" for line in file)

In [4]:
import tiktoken

GPT_MODEL = "gpt-4"
def num_tokens(text: str, model: str = GPT_MODEL) -> int:
    """Count how many tokens ``text`` occupies under ``model``'s tiktoken encoding."""
    token_ids = tiktoken.encoding_for_model(model).encode(text)
    return len(token_ids)

In [5]:
def chunk_source_text(source_text: str, max_tokens: int = 8192) -> list[str]:
    """Split the source text into consecutive chunks that respect a token budget.

    Lines are never split. A new chunk is started when either

    * the current line is blank and the chunk built so far is larger than
      ``max_tokens / 8`` tokens (prefer cutting at paragraph breaks), or
    * adding the current line would push the chunk past ``max_tokens``.

    The line that triggers a cut becomes the first line of the next chunk.
    Concatenating the returned chunks yields ``source_text`` with one extra
    trailing newline.

    The chunk size is kept as a running sum of per-line token counts (each
    line counted together with its newline) instead of re-tokenizing the
    whole accumulated chunk on every line. The sum is an estimate of the
    joined chunk's token count; it may differ slightly from tokenizing the
    chunk as one string.

    Args:
        source_text: The full text to split.
        max_tokens: Token budget for a single chunk.

    Returns:
        The chunks in source order. At least one chunk is always returned.

    Raises:
        ValueError: If a single line (with its newline) exceeds ``max_tokens``.
    """
    chunks = []
    current_pieces = []  # lines (newline included) of the chunk being built
    current_tokens = 0   # running token estimate for current_pieces

    for line in source_text.split('\n'):
        piece = line + '\n'
        piece_tokens = num_tokens(piece)

        if piece_tokens > max_tokens:
            raise ValueError(f"Line with more than {max_tokens} tokens")

        at_paragraph_break = not line.strip() and current_tokens > max_tokens / 8
        over_budget = current_tokens + piece_tokens > max_tokens

        if at_paragraph_break or over_budget:
            chunks.append("".join(current_pieces))
            current_pieces = [piece]
            current_tokens = piece_tokens
        else:
            current_pieces.append(piece)
            current_tokens += piece_tokens

    chunks.append("".join(current_pieces))  # Append the last chunk
    return chunks

# chunks = chunk_source_text(source_text)
# for chunk in chunks:
#     print(f"Chunk with {num_tokens(chunk)} tokens:\n{chunk}\n")

In [6]:
import os
from openai import OpenAI

client = OpenAI(
    # The key is read from the project-specific OPENAI_MANGA_API_KEY variable,
    # so it is passed explicitly here.
    # NOTE(review): an earlier comment said this argument "can be omitted";
    # that presumably only holds for the library's own default variable
    # (OPENAI_API_KEY) — confirm before removing it.
    api_key=os.environ.get("OPENAI_MANGA_API_KEY"),
)

In [7]:
import google.generativeai as genai
# Configure the Gemini SDK with the key from the GOOGLE_API_KEY environment
# variable and create the 'gemini-pro' model handle used by the Gemini helpers.
genai.configure(api_key=os.getenv('GOOGLE_API_KEY'))
gemini = genai.GenerativeModel('gemini-pro')

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# Prompt fragments. They are concatenated in the order 1, (optional) 2, 3 to
# build the instructions sent to the model, so every fragment that is followed
# by another one ends with a space to keep the sentences from running together.
instructions_1 = "You are localizing an anime light novel. Identify named entities (such as proper nouns and unique terminologies) in the provided text passage and suggest English translations. "
instructions_2 = "Do not redefine terms already in the glossary. "
instructions_3 = "Output in JSON format: `{ \"original_word\": \"translated_word\", ... }`."

def get_named_entities_from_gpt3(text: str, glossary: dict = None) -> str:
    """Query gpt-3.5-turbo for the named entities found in ``text``.

    The system message is built from the module-level instruction fragments.
    When a non-empty ``glossary`` is given, the "do not redefine" fragment is
    included and the glossary entries whose key occurs in ``text`` are sent
    as an assistant message placed between the system and user messages.

    Returns the content of the first completion choice, unparsed.
    """
    prompt_parts = [instructions_1]
    if glossary:
        prompt_parts.append(instructions_2)
    prompt_parts.append(instructions_3)

    conversation = [{"role": "system", "content": "".join(prompt_parts)}]

    if glossary:
        # Only forward the entries that actually appear in this passage.
        relevant_terms = {}
        for term, translation in glossary.items():
            if term in text:
                relevant_terms[term] = translation
        conversation.append(
            {"role": "assistant", "content": f"Current glossary: {str(relevant_terms)}"}
        )

    conversation.append({"role": "user", "content": text})

    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=conversation,
        temperature=0,
    )
    return response.choices[0].message.content

def get_named_entities_from_gpt4(text: str, glossary: dict = None) -> str:
    """Query gpt-4-turbo-preview for the named entities found in ``text``.

    Works like the gpt-3.5 variant but additionally requests a
    ``json_object`` response format. A non-empty ``glossary`` adds the
    "do not redefine" instruction and an assistant message listing the
    glossary entries whose key occurs in ``text``.

    Returns the content of the first completion choice, unparsed.
    """
    has_glossary = bool(glossary)
    system_prompt = (
        instructions_1
        + (instructions_2 if has_glossary else "")
        + instructions_3
    )

    system_message = {"role": "system", "content": system_prompt}
    user_message = {"role": "user", "content": text}

    if has_glossary:
        known_terms = {
            term: translation
            for term, translation in glossary.items()
            if term in text
        }
        glossary_message = {
            "role": "assistant",
            "content": f"Current glossary: {str(known_terms)}",
        }
        conversation = [system_message, glossary_message, user_message]
    else:
        conversation = [system_message, user_message]

    request = {
        "model": "gpt-4-turbo-preview",
        "messages": conversation,
        "response_format": {"type": "json_object"},
        "temperature": 0,
    }
    completion = client.chat.completions.create(**request)
    return completion.choices[0].message.content

def get_named_entities_from_gemini(text: str, glossary: dict = None) -> str:
    """Ask the Gemini model to extract named entities from ``text``.

    The instructions, the optional glossary subset (entries whose key occurs
    in ``text``) and the passage itself are combined into a single prompt
    string. Returns the ``text`` attribute of the model response, unparsed.
    """
    prompt_parts = [instructions_1]

    if glossary:
        known_terms = {
            term: translation
            for term, translation in glossary.items()
            if term in text
        }
        prompt_parts.append(instructions_2)
        prompt_parts.append(instructions_3)
        prompt_parts.append(f" Current glossary: {str(known_terms)}")
    else:
        prompt_parts.append(instructions_3)

    instructions = "".join(prompt_parts)
    prompt = f"{instructions}\n\nText:\n```{text}```"

    return gemini.generate_content(prompt).text

In [10]:
import json
import re

def update_glossary(glossary: dict, new_terms: dict, max_term_length: int = 10) -> None:
    """Merge newly extracted terms into ``glossary`` in place.

    Entries whose key is longer than ``max_term_length`` characters are
    skipped. A key that is already present in ``glossary`` is overwritten
    with the new value.

    Args:
        glossary: Mapping to update; it is mutated.
        new_terms: Mapping of candidate terms to their translations.
        max_term_length: Longest key (in characters) that is accepted.
            Defaults to 10, the limit that was previously hard-coded.
    """
    for term, translation in new_terms.items():
        if len(term) > max_term_length:  # too long
            continue
        glossary[term] = translation

def build_glossary_pipeline_gpt3(source_text: str) -> dict:
    """Extract named entities from source text and build terminology dictionary.

    The text is chunked with ``chunk_source_text``; each chunk is sent to
    ``get_named_entities_from_gpt3`` together with the glossary built so far,
    and the terms found in the reply are merged in with ``update_glossary``.
    The glossary is printed after every chunk as a progress indicator.

    Returns:
        The accumulated glossary mapping original terms to translations.

    Raises:
        json.JSONDecodeError: If the ``{...}`` span found in a reply is not
            valid JSON.
    """
    glossary = {}

    for chunk in chunk_source_text(source_text):
        named_entities = get_named_entities_from_gpt3(chunk, glossary)

        # The gpt-3.5 request does not ask for a JSON response format, so the
        # reply may wrap the object in prose or a code fence. Pull out the
        # outermost {...} span first, the same way the Gemini pipeline does;
        # a reply without such a span contributes nothing.
        # NOTE(review): `or ""` guards against an empty/None reply — confirm
        # whether the client can actually return None here.
        match = re.search(r"{.*}", named_entities or "", re.DOTALL)
        if match:
            update_glossary(glossary, json.loads(match.group()))
        print(str(glossary), end='\r')

    return glossary

def build_glossary_pipeline_gemini(source_text: str) -> dict:
    """Build a terminology dictionary from the source text using Gemini.

    Each chunk from ``chunk_source_text`` is sent in a fresh chat with
    temperature 0. The prompt carries the instruction fragments and, once the
    glossary is non-empty, the glossary entries whose key occurs in the
    chunk. The outermost ``{...}`` span of the reply is parsed as JSON and
    merged via ``update_glossary``; replies without such a span are skipped.

    Returns the accumulated glossary.
    """
    glossary = {}

    for passage in chunk_source_text(source_text):
        prompt_parts = [instructions_1]
        if glossary:
            known_terms = {
                term: translation
                for term, translation in glossary.items()
                if term in passage
            }
            prompt_parts.append(instructions_2)
            prompt_parts.append(instructions_3)
            prompt_parts.append(f" Current glossary: {str(known_terms)}")
        else:
            prompt_parts.append(instructions_3)

        instructions = "".join(prompt_parts)
        prompt = f"{instructions}\n\nText:\n```{passage}```"

        session = gemini.start_chat()
        deterministic = genai.types.GenerationConfig(temperature=0)
        reply = session.send_message(prompt, generation_config=deterministic)

        found = re.search(r"{.*}", reply.text, re.DOTALL)
        if found is None:
            # Nothing that looks like a JSON object in this reply.
            continue

        update_glossary(glossary, json.loads(found.group()))
        print(str(glossary), end='\r')

    return glossary

In [11]:
# Smoke test: chunk the text with the default token budget and show the raw
# Gemini reply for the first chunk only (no glossary is passed).
chunks = chunk_source_text(source_text)
entities = get_named_entities_from_gemini(chunks[0])
print(entities)

```json
{
  "中考": "entrance exam",
  "城原千太郎": "Shirohara Sentaro",
  "雾乃雫": "Kirino Shizuku",
  "樱": "Sakura",
  "石田": "Ishida",
  "BUNNYS": "Bunny's",
  "神奈川县逗子海岸店": "Kanagawa Prefecture Zushi Coast Store"
}
```


In [68]:
# Smoke test with a much larger budget (120000 tokens per chunk): only the
# first chunk is sent to GPT-4, without a glossary, and the raw reply printed.
chunks = chunk_source_text(source_text, max_tokens=120000)
entities = get_named_entities_from_gpt4(chunks[0])
print(entities)

{
  "城原千太郎": "Jōhara Chitarō",
  "雾乃雫": "Kirino Shizuku",
  "樱": "Sakura",
  "石田": "Ishida",
  "BUNNYS": "BUNNYS",
  "神奈川县逗子海岸店": "Kanagawa Prefecture Zushi Coast Store",
  "辻桥高中": "Tsujibashi High School",
  "监督": "Director",
  "棒球教练": "Baseball Coach",
  "工程监理": "Construction Supervisor",
  "旧视听室": "Old Audio-Visual Room"
}


In [127]:
# Run the full gpt-3.5 glossary pipeline over the whole text and print the result.
glossary = build_glossary_pipeline_gpt3(source_text)
print(glossary)

{'城原千太郎': 'Chitara Johara', '雾乃雫': 'Kasumi Kirino', '电影导演': 'film director', '超级帅气角色': 'super handsome character', '樱': 'Sakura', '石田': 'Ishida', '中考': 'entrance exam', '学妹': 'junior student', '自行车': 'bicycle', '学长': 'senior student', '神奈川县逗子海岸店': 'Zushi Coast branch in Kanagawa Prefecture', '四月中旬': 'mid-April', '海风': 'sea breeze', '哈欠': 'yawn', '神奈川县立辻桥高中': 'Kanagawa Tsujiki High School', 'BUNNYS': 'BUNNYS', '芭菲': 'parfait', '喵喵收藏品': 'Meow Meow collectible', '辻桥高中': 'Tsujiki High School', 'A型血': 'blood type A', '归宅部': 'After-School Club', '家庭餐厅': 'family restaurant', '儿童午餐': "children's lunch", '监督': 'supervisor', '导演': 'director', '棒球教练': 'baseball coach', '工程监理': 'engineering supervisor', '英语笔记': 'English notes', '二年级': 'second year', '精通七国语言': 'fluent in seven languages', '不及格': 'failing grade', '黑暗': 'darkness', '玻璃杯': 'glass', '店长': 'store manager', '炒鱿鱼': 'get fired', '城原同学': 'Hirohara', '鹰野店长': 'Takanashi', '逗子海岸店': 'Zushi Coast Store', '逗子的人鱼': 'Zushi Mermaid', '逗子的鱼人': 'Zushi

In [12]:
# Run the full Gemini glossary pipeline over the whole text and print the result.
glossary = build_glossary_pipeline_gemini(source_text)
print(glossary)

{'城原千太郎': 'Chihara Sentaro', '雾乃雫': 'Kirino Shizuku', '樱': 'Sakura', '石田': 'Ishida', 'BUNNYS': 'BUNNYS', '神奈川县逗子海岸店': 'Kanagawa Prefecture Zushi Coast Store', '神奈川县立辻桥高中': 'Kanagawa Prefectural Tsujihashi High School', '辻桥高中': 'Tsujihashi High School', '辻桥高中二年B班': 'Tsujihashi High School, Class 2-B', '城原千太郎学长': 'Senior Chihara Sentaro', '监督': 'Supervisor', '城原同——学': 'Shirohara-kun', '鹰野': 'Takano', '逗子海岸': 'Zushi Kaigan', '逗子的人鱼': 'Mermaid of Zushi', '逗子的鱼人': 'Fishman of Zushi', '微观管理': 'Micromanagement', '葛格': 'Onii-chan', '灯里': 'Hotaru', '光之美少女角色扮演': 'PreCure cosplay', '危机(crisis)': 'crisis', '训斥场所': 'scolding place', '演员': 'Actor', '旧视听室': 'Old Audio-Visual Room', '学妹': 'Junior', '店长': 'Manager'}
