# Define the transcript-cleaning functions

In [1]:
from openai import OpenAI
from typing import List, Tuple, Dict
import json
import os
from dotenv import load_dotenv
import time
# Load variables from a local .env file into the process environment;
# OPENAI_API_KEY is later read from the environment via os.getenv.
load_dotenv()


def read_txt_file(file_path: str) -> List[Dict]:
    """
    Read a transcript TXT file and return its utterances in file order.

    Every non-empty line is expected to look like ``Speaker: text``. The
    line is split on the first colon only, so colons inside the spoken text
    are preserved. A non-empty line that contains no colon is treated as a
    continuation of the previous utterance instead of raising
    ``ValueError``; if there is no previous utterance yet, it is stored with
    an empty speaker.

    Args:
        file_path: Path to the UTF-8 encoded transcript file. A leading
            byte-order mark, if present, is ignored.

    Returns:
        A list of ``{'speaker': ..., 'text': ...}`` dictionaries with
        surrounding whitespace stripped from both values.
    """
    parsed_blocks = []

    # 'utf-8-sig' decodes plain UTF-8 unchanged but drops a leading BOM, which
    # would otherwise end up inside the first speaker name.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        # Iterate the file lazily instead of loading every line up front.
        for line in file:
            stripped = line.strip()
            if not stripped:  # Ignore empty lines
                continue

            speaker, separator, text = stripped.partition(':')
            if separator:
                parsed_blocks.append({
                    'speaker': speaker.strip(),
                    'text': text.strip()
                })
            elif parsed_blocks:
                # No "Speaker:" prefix: the line continues the previous
                # utterance (e.g. a wrapped line in the transcript).
                previous = parsed_blocks[-1]
                previous['text'] = f"{previous['text']} {stripped}".strip()
            else:
                # Nothing to attach to yet; keep the text, speaker unknown.
                parsed_blocks.append({'speaker': '', 'text': stripped})

    return parsed_blocks


def clean_transcript_chunk(chunk: List[Dict],
                           context: str,
                           client: "OpenAI",
                           ) -> Dict:
    """
    Clean a chunk of transcript text while maintaining context and style.

    The chunk and the previous context are serialized to JSON and sent to
    the ``gpt-4o`` chat model together with the cleaning instructions. The
    model is asked to answer with a JSON object, which is parsed and
    returned as-is.

    Args:
        chunk: List of dictionaries with 'speaker' and 'text'
        context: Previous context to maintain consistency
        client: OpenAI client instance

    Returns:
        The parsed JSON object from the model. The prompt requests the shape
        ``{"chunk": [{"speaker": ..., "text": ...}, ...]}``, but this
        function does not validate it; callers should check for the
        ``"chunk"`` key.

    Raises:
        ValueError: If the model returns no content, or content that is not
            valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
    """
    prompt = {
        "context": context,
        "chunk": chunk
    }

    system_message = """You are cleaning a transcript. Your task is to:
                1. Fix any obvious transcription errors
                2. Maintain the speaking style and informality while making the text more readable and continuous
                3. Keep all original meaning intact
                4. Use the previous context to maintain consistency
                5. In some cases, the text from a speaker might get added to the next speaker. Ensure that the diarization is done properly. Combine the text if needed. But apply extra caution while combining multiple blocks into one
                6. Remove fillers like "um", "uh", "you know", repeated "okays" that are redundant.
                Background to the conversation: A researcher is talking to a school teacher regarding a tool named Shiksha Copilot that can be used to generate lesson plans and lesson resources. The organization Sikshana Foundation is helping with the tool implementation. The conversation is in English and Kannada. A translator is also present to help with the conversation. The Kannada excerpts are already translated to English in the transcript. The conversation happened in a school office so in some cases there are other teachers and students interrupting in between.
                
                Return json output without context in the format: {"chunk": [{'speaker': '', 'text':''},...]}.
                """

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": json.dumps(prompt)}
        ],
        # Low temperature keeps the cleanup close to the original wording.
        temperature=0.2,
        # Ask the API to answer with a single JSON object.
        response_format={"type": "json_object"}
    )
    # Echo the raw API response so each chunk can be inspected while running.
    print(response)

    content = response.choices[0].message.content
    if content is None:
        # json.loads(None) would fail with an opaque TypeError; surface the
        # real problem instead.
        raise ValueError("Model returned no content for this chunk")

    cleaned_chunk = json.loads(content)
    return cleaned_chunk

def clean_txt_file(input_file: str, 
                   output_file: str,
                   chunk_size: int = 5) -> None:
    """
    Clean an entire TXT file while maintaining context.

    The transcript is read with ``read_txt_file``, split into consecutive
    chunks of ``chunk_size`` lines and each chunk is cleaned with
    ``clean_transcript_chunk``. The cleaned text of one chunk is passed as
    context for the next one. The cleaned blocks are written to
    ``output_file`` as ``speaker: text`` entries separated by blank lines.

    Args:
        input_file: Path to input TXT file
        output_file: Path to save cleaned TXT file
        chunk_size: Number of conversation lines to process at once

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
        RuntimeError: If a chunk cannot be cleaned after all retries.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")

    # Create the destination directory up front so a missing folder is not
    # discovered only after every chunk has already been sent to the API.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
    txt_blocks = read_txt_file(input_file)
    cleaned_blocks = []
    context = ""

    # Ceiling division: an exact multiple of chunk_size must not report an
    # extra, non-existent chunk.
    total_chunks = (len(txt_blocks) + chunk_size - 1) // chunk_size
    max_retries = 5
    retry_delay_seconds = 5

    # Process in chunks to maintain context
    for chunk_number, start in enumerate(
            range(0, len(txt_blocks), chunk_size), start=1):
        print(f"Processing chunk {chunk_number}/{total_chunks}")
        chunk = txt_blocks[start:start + chunk_size]

        response = None
        last_error = None
        for attempt in range(1, max_retries + 1):
            try:
                cleaned_chunk = clean_transcript_chunk(chunk, context, client)
                if "chunk" not in cleaned_chunk:
                    raise KeyError("Missing 'chunk' in response")
                candidate = cleaned_chunk["chunk"]
                # Validate here, inside the retry loop, so a malformed reply
                # is retried instead of crashing later when it is used.
                if not isinstance(candidate, list) or not all(
                        isinstance(block, dict)
                        and "speaker" in block
                        and "text" in block
                        for block in candidate):
                    raise KeyError("Malformed 'chunk' in response")
                response = candidate
                break  # Exit loop if the response is correct
            except (KeyError, ValueError, TypeError) as e:
                # ValueError covers invalid JSON (json.JSONDecodeError);
                # TypeError covers an empty or non-object reply.
                last_error = e
                print(f"Error: {e}. Retrying {attempt}/{max_retries}...")
                if attempt < max_retries:
                    # No point in waiting after the final attempt.
                    time.sleep(retry_delay_seconds)

        if response is None:
            raise RuntimeError(
                "Failed to clean chunk after multiple retries"
            ) from last_error

        # Append cleaned blocks
        cleaned_blocks.extend(response)

        # Update context with the cleaned text
        context = ' '.join(str(block['text']) for block in response)

    # Write cleaned TXT file
    with open(output_file, 'w', encoding='utf-8') as file:
        for block in cleaned_blocks:
            file.write(f"{block['speaker']}: {block['text']}\n\n")
    print(f"Cleaned transcript saved to {output_file}")

# Set the file locations and run the functions

In [2]:
# Raw transcript to read and the path the cleaned transcript is written to.
input_file = "./data/input/siddayya.txt"
output_file = "./data/clean/siddayya_clean.txt"

# Clean the transcript, sending ten conversation lines per request.
clean_txt_file(
    input_file=input_file,
    output_file=output_file,
    chunk_size=10,
)

Processing chunk 1/10
ChatCompletion(id='chatcmpl-Ay1YQkDEELhQAmCZ3lqSsQYoYNQMP', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='{"chunk": [{"speaker": "Speaker 2", "text": "OK, we can start. So, how long have you been working with the Sikshana Foundation, sir?"}, {"speaker": "Speaker 0", "text": "Sir, 11 years."}, {"speaker": "Speaker 2", "text": "Oh, OK, sir. And currently, how many teachers are you supporting for the Siksha Copilot tool implementation?"}, {"speaker": "Speaker 0", "text": "250 teachers, sir. I am working on June."}, {"speaker": "Speaker 2", "text": "187 schools, 250 teachers. Okay, sir. This is which zone, sir? And how many training sessions have you conducted with the teachers, sir?"}, {"speaker": "Speaker 0", "text": "I also have completed a school-level training. In total, we have completed 246 teachers\' training in person, at school level, and virtually, sir."}, {"speaker": "Speaker 2", "text": "Were they in-