In [3]:
import tiktoken
def count_tokens_in_messages(messages, model="gpt-4o-mini"):
    """Estimate the number of tokens in a list of chat messages using tiktoken.

    Args:
        messages: Iterable of dicts; the "role" and "content" values of each
            message are tokenized. Missing or ``None`` values count as empty
            strings.
        model: Model name used to select the tokenizer. If tiktoken raises
            ``KeyError`` for it, the "cl100k_base" encoding is used instead.

    Returns:
        int: Token count of every role and content string plus a flat
        4-token allowance per message for chat formatting (an estimate).
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        encoding = tiktoken.get_encoding("cl100k_base")

    total_tokens = 0
    for message in messages:
        # `.get(key, "")` still returns None when the key is present with a
        # None value, so coerce falsy values to "" before encoding.
        role = message.get("role") or ""
        content = message.get("content") or ""
        total_tokens += len(encoding.encode(role))
        total_tokens += len(encoding.encode(content))
        # Add extra tokens for message formatting (estimated)
        total_tokens += 4

    return total_tokens

model = "gpt-4o-mini"
# The encoding depends only on the model, so resolve it once instead of on
# every loop iteration.
encoding = tiktoken.encoding_for_model(model)
# Print the raw-text token count of each loaded transcript.
for content in transcripts.values():
    current_tokens = len(encoding.encode(content))
    print(current_tokens)

16094


In [14]:
import concurrent
from tqdm import tqdm
import tiktoken
import json
import requests
from collections import defaultdict

# from . import prompts
from openai import RateLimitError, APITimeoutError
import time
from pydantic import BaseModel
from typing import Dict, List
def request_gpt(
    client, messages, model="gpt-4o-mini", temperature=0.5, format=None, seed=None,
    max_retries=None, retry_delay=5,
):
    """Send a chat completion request and return the first choice's text.

    Rate-limit and timeout errors are retried after a pause. Every retry
    reuses exactly the same request arguments (including ``seed``), so a
    retried call is configured identically to the first attempt.

    Args:
        client: Object exposing ``chat.completions.create(**kwargs)``.
        messages: Chat messages forwarded as the ``messages`` argument.
        model: Model name forwarded to the API.
        temperature: Sampling temperature forwarded to the API.
        format: When equal to ``"json"``, the request also carries
            ``response_format={"type": "json_object"}``.
        seed: Seed forwarded to the API.
        max_retries: Maximum number of retries after a failed attempt.
            ``None`` (the default) retries indefinitely.
        retry_delay: Seconds to sleep before each retry.

    Returns:
        ``response.choices[0].message.content`` of the successful response.

    Raises:
        RateLimitError, APITimeoutError: Re-raised once ``max_retries``
            retries have been exhausted.
    """
    request_kwargs = {
        "model": model,
        "messages": messages,
        "temperature": temperature,
        "seed": seed,
    }
    if format == "json":
        request_kwargs["response_format"] = {"type": "json_object"}

    failed_attempts = 0
    # Loop instead of recursing so long outages cannot exhaust the call stack.
    while True:
        try:
            response = client.chat.completions.create(**request_kwargs)
            return response.choices[0].message.content
        except RateLimitError as e:
            print("RateLimitError")
            print(e)
            last_error = e
        except APITimeoutError as e:
            print("APITimeoutError")
            print(messages)
            last_error = e

        failed_attempts += 1
        if max_retries is not None and failed_attempts > max_retries:
            raise last_error
        time.sleep(retry_delay)

In [9]:
import re

def parseTranscript(content: str):
    """Split a raw transcript into a list of per-speaker message dicts.

    A line of the form ``[Speaker Name] hh:mm:ss`` (after stripping
    whitespace) opens a new message. Every following line, blank ones
    included, is stripped and appended to that message's ``content``
    followed by a newline, until the next speaker line. Lines that appear
    before the first speaker line are ignored.

    Returns:
        list[dict]: One dict per message with the keys ``speaker``,
        ``timestamp`` and ``content``, in transcript order.
    """
    header_pattern = re.compile(r'^\[([^\]]+)\]\s+(\d{1,2}:\d{2}:\d{2})$')
    parsed = []
    active = None

    for raw_line in content.split('\n'):
        text = raw_line.strip()
        header = header_pattern.match(text)

        if header is None:
            # Body line (possibly blank): belongs to the open message, if any.
            if active is not None:
                active['content'] += text + "\n"
            continue

        # A speaker header closes the message being built and opens a new one.
        if active is not None:
            parsed.append(active)
        speaker, timestamp = header.groups()
        active = {
            'speaker': speaker,
            'timestamp': timestamp,
            'content': ""
        }

    # Flush the message still open when the input ends.
    if active is not None:
        parsed.append(active)

    return parsed

def messages_to_string(messages):
    """Render messages as one string of ``<index>: [<speaker>] <content>`` entries.

    Entries are concatenated with no extra separator, so any line breaks
    come from each message's own ``content``.
    """
    return "".join(
        f"{position}: [{entry['speaker']}] {entry['content']}"
        for position, entry in enumerate(messages)
    )

# Test the function with the Alex transcript
# NOTE: `transcripts` must already be populated (it is built by the cell that
# reads transcripts/*.txt) before this cell runs. The `messages_str` global
# set here is what the segmentation prompt cell substitutes for its
# `{transcript}` placeholder, so it is only defined when an 'Alex' transcript
# was loaded.
if 'Alex' in transcripts:
    parsed_messages = parseTranscript(transcripts['Alex'])
    messages_str = messages_to_string(parsed_messages)
    print(messages_str)

0: [Alex Marasigan] Uh, hello?

1: [Chifang Chou] Hello, can you hear me? Okay, so, um, first of all, can you see my screen as well?

2: [Alex Marasigan] Yes.

3: [Alex Marasigan] Yes.

4: [Chifang Chou] Okay, so thank you so much for, um, signing up for the study, and later the process will be, like, um, I'll read you through the letter of information first, and then I will demo the study process for you, and then we will go straight to the study process.

5: [Chifang Chou] So, if you have any questions regarding the study, you can just interrupt me at any time, and I'll answer you directly.

6: [Alex Marasigan] Okay, thank you.

7: [Chifang Chou] Okay, so for the lot of information here, basically, our purpose here is that you'll later be asked to use an AI system.

8: [Chifang Chou] Which is ChatGPT here to explore a dataset and generate visualizations using natural language prompts.

9: [Chifang Chou] Describe your insights from the chart, reveal their clarity and usefulness, and c

In [11]:
prompt = [
    {
        "role": "system",
        "content": """You are a helpful assistant that segment transcripts.
        The user will give you a transcript with indices for each message, and the criteria for segmentation.
        You will follow the criteria to segment the transcript into sections, providing the start and end indices for each segment.
        Reply in the following JSON format:
        {
            "segments": [
                {
                    "start_index": <int>,
                    "end_index": <int>,
                    "title": "<str>"
                },
                ...
            ]
        }
        """
    },
    {
        "role": "user",
        "content": """
        This transcript is from a user study. The study is divided into an introduction session, three scenario/task sessions, each followed by a brief questionnaire, and then a final interview session. 
        Here is the transcript:
        {transcript}
        
        Please segment the transcript into sections based on the following criteria:
        - The first segment is the introduction, where one speaker introduces the topic and procedure.
        - The second segment is the first scenario/task session with its questionnaire.
        - The third segment is the second scenario/task session with its questionnaire.
        - The fourth segment is the third scenario/task session with its questionnaire.
        - The final segment is the interview session.
        Return the segments in the specified JSON format.
        """.format(transcript=messages_str)
    }
]

In [20]:
import os
import glob
from openai import OpenAI
# read txt files from transcripts/*.txt
transcript_files = glob.glob("transcripts/*.txt")
transcripts = {}

for file_path in transcript_files:
    # Extract filename without extension to use as key
    filename = os.path.splitext(os.path.basename(file_path))[0]
    
    # Read the content of each file
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
        transcripts[filename] = content

print(f"Found {len(transcripts)} transcript files:")
for name in transcripts.keys():
    print(f"- {name}")
    
client = OpenAI()
responses = []
for name, content in transcripts.items():
    messages = parseTranscript(content)
    messages_to_str = messages_to_string(messages)
    segmentation_response = request_gpt(client, prompt, model="gpt-4o-mini", temperature=0, format="json")
    responses.append((name, messages, segmentation_response))

Found 1 transcript files:
- Alex


In [18]:
# Each entry of `responses` is (name, messages, raw JSON string); the raw
# response is the third element, so decode index 2 (index 1 is the parsed
# message list, which json.loads cannot accept).
segmentation_response = json.loads(responses[0][2])
segmentation_response

{'segments': [{'start_index': 0, 'end_index': 46, 'title': 'Introduction'},
  {'start_index': 47,
   'end_index': 145,
   'title': 'First Scenario/Task Session'},
  {'start_index': 226,
   'end_index': 413,
   'title': 'Second Scenario/Task Session'},
  {'start_index': 414,
   'end_index': 601,
   'title': 'Third Scenario/Task Session'},
  {'start_index': 602, 'end_index': 626, 'title': 'Final Interview Session'}]}

In [30]:
def parse_index(segment, messages):
    """Attach the messages covered by a segment to the segment dict.

    Reads ``segment["start_index"]`` and ``segment["end_index"]`` (treated as
    inclusive), stores the matching slice of ``messages`` under
    ``segment["messages"]``, and returns the same (mutated) dict.
    """
    first = segment["start_index"]
    last = segment["end_index"]
    # +1 because the end index is inclusive while slice stops are exclusive.
    segment.update(messages=messages[slice(first, last + 1)])
    return segment
import os

# Write every segment of every transcript to segmented/<name>_<index>.json.
# open(..., "w") does not create missing directories, so make sure the output
# folder exists first.
os.makedirs("segmented", exist_ok=True)
for name, messages, segmentation_response_str in responses:
    segmentation_list = json.loads(segmentation_response_str)
    print(segmentation_list)
    for index, segment_response in enumerate(segmentation_list["segments"]):
        # parse_index adds the covered messages to the segment dict.
        segmented_with_messages = parse_index(segment_response, messages)
        print(f"Transcript: {name}")
        with open(f"segmented/{name}_{index}.json", "w", encoding="utf-8") as f:
            json.dump(segmented_with_messages, f, indent=4)

{'segments': [{'start_index': 0, 'end_index': 46, 'title': 'Introduction'}, {'start_index': 47, 'end_index': 145, 'title': 'First Scenario/Task Session'}, {'start_index': 226, 'end_index': 413, 'title': 'Second Scenario/Task Session'}, {'start_index': 414, 'end_index': 601, 'title': 'Third Scenario/Task Session'}, {'start_index': 602, 'end_index': 626, 'title': 'Final Interview Session'}]}
Transcript: Alex
Transcript: Alex
Transcript: Alex
Transcript: Alex
Transcript: Alex
