In [None]:
! pip install openai --upgrade
! pip install langchain --upgrade
! pip install python-dotenv
! pip install srt
! pip install codecs
! pip install tiktoken

In [7]:
import os
import openai
import langchain as lc
from langchain.document_loaders import SRTLoader
from dotenv import load_dotenv
import srt
import codecs
import tiktoken
from IPython.display import clear_output

# Load variables from a .env file into the process environment
load_dotenv()

openai_api_key = os.environ.get("OPENAI_API_KEY")

if openai_api_key:
    # Key is present: register it on the openai module
    openai.api_key = openai_api_key
else:
    # Fail fast so later API calls never run without credentials
    raise ValueError("OPENAI_API_KEY not found in environment variables.")

# Default chat model for completion requests
GPT_MODEL_NAME = "gpt-4-turbo-preview"

In [8]:
from openai import OpenAI

# Shared API client, authenticated with the key taken from the environment
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

def num_tokens_from_string(string: str) -> int:
    """Count the tokens produced by encoding ``string`` with the ``cl100k_base`` encoding.

    Args:
        string: Text to tokenize.

    Returns:
        The length of the token sequence returned by the encoder.
    """
    tokenizer = tiktoken.get_encoding("cl100k_base")
    return len(tokenizer.encode(string))

def get_completion(prompt, model=GPT_MODEL_NAME):
    """Send a single-turn chat request and return the assistant's reply text.

    Args:
        prompt: Text sent as the user message.
        model: Name of the chat model to query. Defaults to ``GPT_MODEL_NAME``.

    Returns:
        The ``content`` of the first choice's message in the API response.
    """
    response = client.chat.completions.create(
        temperature=0,
        # Honor the caller-supplied model rather than always using the module default.
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant with expertise in english language reading and writing. You are an expert copywriter."},
            {"role": "user", "content": prompt}
        ]
    )
    return response.choices[0].message.content

def write_to_file(content: str, file_path: str):
    """Save ``content`` to ``file_path`` as UTF-8 text, replacing any existing file.

    Args:
        content: Text to store.
        file_path: Destination path; the file is opened in write mode.
    """
    with open(file_path, mode='w', encoding='utf-8') as out_file:
        out_file.write(content)

def read_srt_file(file_path: str):
    """Read and parse an SRT file.

    The file is decoded as ``utf-8-sig`` so a leading byte-order mark, if
    present, is dropped before parsing.

    Args:
        file_path: Path of the SRT file to read.

    Returns:
        A list with every item produced by ``srt.parse`` for the file's text.
    """
    # Built-in open() replaces the legacy codecs.open(); newline='' disables
    # newline translation so the parser receives the same text as before.
    with open(file_path, 'r', encoding='utf-8-sig', newline='') as srt_file:
        return list(srt.parse(srt_file.read()))

### Transcript Translation

In [9]:
from typing import List
import srt

def translate_with_context(subtitles: List[srt.Subtitle], index: int) -> str:
    """Translate one subtitle, giving the model its neighbours as context.

    Up to three subtitles before and three after ``subtitles[index]`` are
    joined with spaces and placed in the prompt next to the segment itself.

    Args:
        subtitles: The full, ordered list of subtitles.
        index: Position in ``subtitles`` of the segment to translate.

    Returns:
        Whatever ``get_completion`` returns for the constructed prompt.
    """
    # Clamp the start at 0 so a negative start cannot wrap around the list;
    # the end of a slice is clamped by Python automatically.
    preceding = subtitles[max(0, index - 3):index]
    context_before = ' '.join(entry.content for entry in preceding)

    following = subtitles[index + 1:index + 4]
    context_after = ' '.join(entry.content for entry in following)

    current_segment = subtitles[index].content

    # Assemble the translation request
    prompt = (
        "Translate the following segment into American English, considering the previous and next context for a natural and accurate translation. Only provide the translated segment that represents the segment to translate.\n\n"
        f"Previous context: {context_before}\n"
        f"Segment to translate: {current_segment}\n"
        f"Next context: {context_after}\n"
        "Translation:"
    )

    return get_completion(prompt)

def translate_srt_file(file_path: str, output_file_path: str):
    """Translate an SRT file into American English and write the result to a new file.

    Each subtitle is translated with ``translate_with_context`` and re-emitted
    with its original index and timing.

    Args:
        file_path: Path of the source SRT file.
        output_file_path: Path the translated SRT text is written to.
    """
    subtitles = read_srt_file(file_path)
    translated_subtitles = []

    for index, subtitle in enumerate(subtitles):
        # Translate each subtitle with extended context
        translated_text = translate_with_context(subtitles, index)
        # The reply may be None or blank; calling .strip() on None would raise,
        # so fall back to the source text and keep the cue non-empty.
        content = (translated_text or "").strip() or subtitle.content
        translated_subtitles.append(
            srt.Subtitle(
                index=subtitle.index,
                start=subtitle.start,
                end=subtitle.end,
                content=content,
            )
        )

    # Reuse the shared helper so output is written the same way everywhere
    write_to_file(srt.compose(translated_subtitles), output_file_path)

# Example usage: translate the sample transcript and write the English version next to it
translate_srt_file(
    file_path='2024-03-22-Github-Copilot-Nedir.srt',
    output_file_path='2024-03-22-Github-Copilot-Nedir-EN.srt',
)
