In [None]:
# Use the %pip magic (not !pip) so the package is installed into the
# environment of the kernel that runs this notebook.
%pip install nltk

In [None]:
import os
import openai
import tiktoken
from pathlib import Path
from dotenv import load_dotenv
import nltk
# Download the 'punkt' resource through NLTK's downloader.
# NOTE(review): presumably this is the model behind nltk.sent_tokenize, which
# the chunking cell calls; newer NLTK releases may need 'punkt_tab' -- verify.
nltk.download('punkt')

# Load environment variables
# load_dotenv() runs before the os.environ lookups below, so any values it
# loads (presumably from a local .env file) are visible to them.
load_dotenv()

# Module-level settings on the openai package, taken from the environment.
# NOTE(review): the Processing cell builds its own AzureOpenAI client from the
# same key, endpoint and API version -- confirm these globals are still needed.
openai.api_type = "azure"
openai.base_url = os.environ.get("OPENAI_API_BASE")
openai.api_key = os.environ.get("OPENAI_API_KEY")
openai.api_version = "2023-05-15"

# Chat deployment name read from OPENAI_CHAT_DEPLOYMENT_NAME; os.environ.get
# yields None when the variable is unset. Used as the default `model` of
# get_completion.
CHAT_MODEL =  os.environ.get("OPENAI_CHAT_DEPLOYMENT_NAME") 
# tiktoken encoding object for 'cl100k_base'.
# NOTE(review): not referenced by any other visible cell.
encoding = tiktoken.get_encoding('cl100k_base')

### Chunking

In [None]:
import os
# Define the path to the large text file
file_path = 'script.txt'

# Maximum size of one chunk. The limit is applied with len(), so it counts
# characters of text, not model tokens.
MAX_CHUNK_CHARS = 5000


def _group_sentences(sentences, max_chars=MAX_CHUNK_CHARS):
    """Pack consecutive sentences into chunks of at most ``max_chars`` characters.

    Sentences are joined with a single space so that the boundary between two
    sentences survives inside a chunk. A sentence that is longer than
    ``max_chars`` on its own is kept whole and becomes a chunk by itself.

    Args:
        sentences: Iterable of sentence strings, in document order.
        max_chars: Upper bound on ``len(chunk)`` for any chunk built from more
            than one sentence.

    Returns:
        List of non-empty chunk strings, in document order. Empty input
        yields an empty list.
    """
    grouped = []
    current = ""
    for sentence in sentences:
        # Only insert the separator between sentences, never at the start.
        candidate = f"{current} {sentence}" if current else sentence
        if len(candidate) <= max_chars:
            current = candidate
        else:
            # Skip the flush when nothing has been collected yet (first
            # sentence already over the limit), so no empty chunk is emitted.
            if current:
                grouped.append(current)
            current = sentence
    if current:
        grouped.append(current)  # append the last chunk
    return grouped


# Open the file and read its content
with open(file_path, 'r') as file:
    text = file.read()

# Split the text into sentences. sent_tokenize returns the sentences without
# the whitespace that separated them, which is why _group_sentences puts a
# space back when joining.
sentences = nltk.sent_tokenize(text)

# Group sentences into chunks of at most MAX_CHUNK_CHARS characters
chunks = _group_sentences(sentences)

# Create a new directory named 'chunks' if it doesn't exist
folder_path = 'chunks'
Path(folder_path).mkdir(parents=True, exist_ok=True)

# Delete all files in the 'chunks' directory so that chunks from a previous
# run do not mix with the ones written below. A separate name is used for the
# paths being removed so `file_path` keeps pointing at the input file.
for filename in os.listdir(folder_path):
    stale_path = os.path.join(folder_path, filename)
    if os.path.isfile(stale_path):
        os.remove(stale_path)

# Save each chunk into a separate file in the 'chunks' directory
for i, chunk in enumerate(chunks):
    with open(os.path.join(folder_path, f'chunk_{i}.txt'), 'w') as file:
        file.write(chunk)

### Processing

In [2]:
from openai import AzureOpenAI

# Azure OpenAI client used by get_completion. The endpoint and key come from
# the OPENAI_API_BASE / OPENAI_API_KEY environment variables; the API version
# is pinned to "2023-05-15".
client = AzureOpenAI(
    azure_endpoint=os.environ.get("OPENAI_API_BASE"),
    api_version="2023-05-15",
    api_key=os.environ.get("OPENAI_API_KEY"),
)

def get_completion(prompt, model=CHAT_MODEL):
    """Send ``prompt`` to the chat model and return the text of its reply.

    The request carries a fixed system message plus ``prompt`` as the single
    user message, with ``temperature=0``.

    Args:
        prompt: Text sent as the user message.
        model: Model (deployment) name passed to the API. Defaults to the
            value CHAT_MODEL had when this function was defined.

    Returns:
        The ``content`` of the first choice's message in the API response.
    """
    response = client.chat.completions.create(
        temperature=0,
        # Pass the caller's `model` through; previously the global CHAT_MODEL
        # was always used and the parameter had no effect.
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant with expertise in reading, writing and notetaking."},
            {"role": "user", "content": prompt}
        ]
    )
    return response.choices[0].message.content

In [10]:
promptPreFix = "Read the transcript below. Summarize the highlights about OpenAI, its history and relevant informtion in a bullet list. Keep only the highlights. I will use these as talking points when I present about OpenAI. Keep all text concise. Provide chronological sense."

os.makedirs('chunk-summary', exist_ok=True)
folder_path = 'chunks'
summary_folder_path = 'chunk-summary'


def _chunk_number(name, suffix):
    """Return N for a name shaped 'chunk_N<suffix>' (N in ASCII digits), else None."""
    prefix = 'chunk_'
    if not (name.startswith(prefix) and name.endswith(suffix)):
        return None
    digits = name[len(prefix):len(name) - len(suffix)]
    return int(digits) if digits.isascii() and digits.isdigit() else None


# Collect only the files named chunk_<number>.txt, ordered by number. Counting
# every directory entry would make the loop look for chunk files that do not
# exist whenever the folder holds anything else (e.g. hidden OS files).
chunk_files = sorted(
    (_chunk_number(name, '.txt'), name)
    for name in os.listdir(folder_path)
    if _chunk_number(name, '.txt') is not None
)
file_count = len(chunk_files)

# Drop summaries whose chunk no longer exists (left over from an earlier run
# with more chunks); otherwise they would be merged into the combined summary.
current_numbers = {number for number, _ in chunk_files}
for name in os.listdir(summary_folder_path):
    number = _chunk_number(name, '_summary.txt')
    if number is not None and number not in current_numbers:
        os.remove(os.path.join(summary_folder_path, name))

# Read each chunk and call the get_completion function
for i, chunk_name in chunk_files:
    with open(os.path.join(folder_path, chunk_name), 'r') as file:
        chunk_content = file.read()

    # The instruction comes first, then the chunk text fenced by '---' lines.
    prompt = f"""
        {promptPreFix}
        ---
        {chunk_content}
        ---
        """

    # Call the get_completion function and get the response
    response = get_completion(prompt)

    # Save the response to a separate file in the 'chunk-summary' directory
    with open(os.path.join(summary_folder_path, f'chunk_{i}_summary.txt'), 'w') as file:
        file.write(response)

In [11]:
import os

# Create a new directory named 'combined' if it doesn't exist
combined_folder_path = 'combined'
os.makedirs(combined_folder_path, exist_ok=True)

chunk_summary_folder_path = 'chunk-summary'


def _summary_number(file_name):
    """Return N for a name shaped 'chunk_N_summary.txt', or None for any other name."""
    prefix, suffix = 'chunk_', '_summary.txt'
    if not (file_name.startswith(prefix) and file_name.endswith(suffix)):
        return None
    digits = file_name[len(prefix):len(file_name) - len(suffix)]
    return int(digits) if digits.isascii() and digits.isdigit() else None


# Get the list of chunk summary files, ordered by chunk number.
# os.listdir() returns names in arbitrary order, and a plain string sort would
# put chunk_10 before chunk_2, so sort on the numeric part to keep the
# summaries in the order of the original text. Entries that are not
# chunk_<number>_summary.txt are skipped.
chunk_summary_files = sorted(
    (name for name in os.listdir(chunk_summary_folder_path)
     if _summary_number(name) is not None),
    key=_summary_number,
)

# Combine the content of all chunk summary files into a single text,
# terminating each summary with a newline.
summaries = []
for file_name in chunk_summary_files:
    file_path = os.path.join(chunk_summary_folder_path, file_name)
    with open(file_path, 'r') as file:
        summaries.append(file.read())
combined_text = "".join(summary + "\n" for summary in summaries)

# Save the combined text into a single file in the 'combined' directory
combined_file_path = os.path.join(combined_folder_path, 'combined_summary.txt')
with open(combined_file_path, 'w') as file:
    file.write(combined_text)
