In [37]:
import tiktoken
def count_tokens_in_messages(messages, model="gpt-4o-mini"):
    """Count tokens in messages using tiktoken"""
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        encoding = tiktoken.get_encoding("cl100k_base")

    total_tokens = 0
    for message in messages:
        # Count tokens for role and content
        total_tokens += len(encoding.encode(message.get("role", "")))
        total_tokens += len(encoding.encode(message.get("content", "")))
        # Add extra tokens for message formatting (estimated)
        total_tokens += 4

    return total_tokens

# model = "gpt-4o-mini"
# for name, content in transcripts.items():
#     encoding = tiktoken.encoding_for_model(model)
#     current_tokens = len(encoding.encode(content))
#     print(current_tokens)

In [38]:
import concurrent
from tqdm import tqdm
import tiktoken
import json
import requests
from collections import defaultdict

# from . import prompts
from openai import RateLimitError, APITimeoutError
import time
from pydantic import BaseModel
from typing import Dict, List
def request_gpt(
    client, messages, model="gpt-4o-mini", temperature=0.5, format=None, seed=None
):
    try:
        if format == "json":
            response = client.chat.completions.create(
                model=model,
                messages=messages,
                response_format={"type": "json_object"},
                temperature=temperature,
                seed=seed,
            )
        else:
            response = client.chat.completions.create(
                model=model, messages=messages, temperature=temperature, seed=seed
            )
        return response.choices[0].message.content
    except RateLimitError as e:
        print("RateLimitError")
        print(e)
        time.sleep(5)
        return request_gpt(client, messages, model, temperature, format)
    except APITimeoutError as e:
        print("APITimeoutError")
        print(messages)
        time.sleep(5)
        return request_gpt(client, messages, model, temperature, format)

In [39]:
import re

def parseTranscript(content: str):
    messages = []
    lines = content.split('\n')
    current_message = None

    for i, line in enumerate(lines):
        line = line.strip()
        
        # Check if line matches speaker pattern: [Speaker Name] hh:mm:ss
        speaker_match = re.match(r'^\[([^\]]+)\]\s+(\d{1,2}:\d{2}:\d{2})$', line)
        
        if speaker_match:
            # If we were building a previous message, save it
            if current_message:
                messages.append(current_message)
            
            # Start a new message
            current_message = {
                'speaker': speaker_match.group(1),
                'timestamp': speaker_match.group(2),
                'content': ""
            }
        elif current_message and line != '':
            # Add content line to current message
            current_message['content'] += line + "\n"
        elif current_message and line == '':
            # Empty line - could be end of message or just spacing
            # We'll keep building the current message until we hit a new speaker
            current_message['content'] += "\n"
    
    # Don't forget to add the last message if it exists
    if current_message:
        messages.append(current_message)
    
    return messages

def messages_to_string(messages):
    messages_str = ""
    for index, message in enumerate(messages):
        messages_str += f"{index}: [{message['speaker']}] {message['content']}"
    return messages_str

# Test the function with the Alex transcript
# if 'Alex' in transcripts:
#     parsed_messages = parseTranscript(transcripts['Alex'])
#     messages_str = messages_to_string(parsed_messages)
#     print(messages_str)

In [81]:
def segmentation_prompt(messages_str):
    return [
        {
            "role": "system",
            "content": """You are a helpful assistant that segment transcripts.
            The user will give you a transcript with indices for each message, and the criteria for segmentation.
            You will follow the criteria to segment the transcript into sections, providing the start and end indices for each segment.
            Reply in the following JSON format:
            {
                "segments": [
                    {
                        "start_index": <int>,
                        "end_index": <int>,
                        "title": "<str>"
                    },
                    ...
                ]
            }
            """
        },
        {
            "role": "user",
            "content": """
            This transcript is from a user study. The study is divided into an introduction session, three scenario/task sessions, each followed by a brief questionnaire, and then a final interview session. 
            Here is the transcript:
            {transcript}
            
            Please segment the transcript into sections based on the following criteria:
            - The first segment is the introduction, where one speaker introduces the topic and procedure.
            - The second segment is the first scenario/task session with its questionnaire.
            - The third segment is the second scenario/task session with its questionnaire.
            - The fourth segment is the third scenario/task session with its questionnaire.
            - The final segment is the interview session.
            Return the segments in the specified JSON format.
            The start and end indices must cover the entire transcript without gaps or overlaps.
            """.format(transcript=messages_str)
        }
    ]

In [64]:
import os
import glob
from openai import OpenAI
# read txt files from transcripts/**/*.txt
transcript_files = glob.glob("transcripts/Cars/*.txt") + glob.glob("transcripts/Movies/*.txt")
transcripts = {}

for file_path in transcript_files:
    print(f"Reading transcript file: {file_path}")
    # Extract filename without extension to use as key
    filename = os.path.splitext(os.path.basename(file_path))[0]
    
    # Read the content of each file
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
        transcripts[filename] = content

print(f"Found {len(transcripts)} transcript files:")
for name in transcripts.keys():
    print(f"- {name}")

Reading transcript file: transcripts/Cars/Shasha.txt
Reading transcript file: transcripts/Cars/Chris.txt
Reading transcript file: transcripts/Cars/Sage.txt
Reading transcript file: transcripts/Cars/Sebastian.txt
Reading transcript file: transcripts/Cars/Ederson.txt
Reading transcript file: transcripts/Cars/Sarah.txt
Reading transcript file: transcripts/Cars/Advait.txt
Reading transcript file: transcripts/Cars/Alex.txt
Reading transcript file: transcripts/Cars/Ezekiel.txt
Reading transcript file: transcripts/Cars/Ian.txt
Reading transcript file: transcripts/Movies/Shay.txt
Reading transcript file: transcripts/Movies/Zoe.txt
Reading transcript file: transcripts/Movies/Jacob.txt
Reading transcript file: transcripts/Movies/Maria.txt
Reading transcript file: transcripts/Movies/Hailey.txt
Reading transcript file: transcripts/Movies/Atharva.txt
Reading transcript file: transcripts/Movies/Crosby.txt
Reading transcript file: transcripts/Movies/Ronny.txt
Reading transcript file: transcripts/Movi

In [91]:
client = OpenAI()
responses = []
for name, content in transcripts.items():
    # if name != "Jacob": continue
    messages = parseTranscript(content)
    messages_to_str = messages_to_string(messages)
    prompt = segmentation_prompt(messages_to_str)
    segmentation_response = request_gpt(client, prompt, model="gpt-4o-mini", temperature=0, format="json")
    segmentation_response = validate_segmentation(segmentation_response, messages)
    responses.append((name, messages, segmentation_response))

[{'start_index': 0, 'end_index': 31, 'title': 'Introduction'}, {'start_index': 32, 'end_index': 83, 'title': 'First Scenario/Task Session with Questionnaire'}, {'start_index': 84, 'end_index': 301, 'title': 'Second Scenario/Task Session with Questionnaire'}, {'start_index': 302, 'end_index': 373, 'title': 'Third Scenario/Task Session with Questionnaire'}, {'start_index': 374, 'end_index': 376, 'title': 'Interview Session'}]
[{'start_index': 0, 'end_index': 19, 'title': 'Introduction'}, {'start_index': 20, 'end_index': 66, 'title': 'Scenario 1 and Questionnaire'}, {'start_index': 188, 'end_index': 201, 'title': 'Scenario 2 and Questionnaire'}, {'start_index': 232, 'end_index': 275, 'title': 'Scenario 3 and Questionnaire'}, {'start_index': 278, 'end_index': 358, 'title': 'Interview Session'}]
[{'start_index': 0, 'end_index': 62, 'title': 'Introduction'}, {'start_index': 63, 'end_index': 60, 'title': 'First Scenario/Task Session with Questionnaire'}, {'start_index': 162, 'end_index': 229,

In [86]:
def validate_segmentation(segmentation_response, messages):
    try:
        segmentation_list = json.loads(segmentation_response)['segments']
        print(segmentation_list)
        for i in range(len(segmentation_list)):
            start_index = segmentation_list[i]['start_index']
            end_index = segmentation_list[i]['end_index']
            if end_index < start_index:
                if i < len(segmentation_list) - 1:
                    end_index = segmentation_list[i + 1]['start_index'] - 1
                else:
                    end_index = len(messages) - 1
            segmentation_list[i]['start_index'] = start_index
            segmentation_list[i]['end_index'] = end_index
        for i in range(len(segmentation_list) - 1):
            this_end_index = segmentation_list[i]['end_index']
            next_start_index = segmentation_list[i + 1]['start_index']
            if this_end_index + 1 != next_start_index:
                segmentation_list[i]['end_index'] = next_start_index - 1
            segmentation_list[i]['end_index'] = segmentation_list[i]['end_index']
        if segmentation_list[-1]['end_index'] != len(messages) - 1:
            segmentation_list[-1]['end_index'] = len(messages) - 1
        return {
            "segments": segmentation_list
        }
        
        
    except (json.JSONDecodeError, AssertionError) as e:
        print("Segmentation response validation failed:")
        print(e)
        raise
# validate_segmentation(responses[0][2], responses[0][1])

In [92]:
def parse_index(segment, messages):
    start_index = segment["start_index"]
    end_index = segment["end_index"]
    assert end_index >= start_index, f"end_index {end_index} is less than start_index {start_index}"
    segment_messages = messages[start_index:end_index + 1]
    segment["messages"] = segment_messages
    return segment

for name, messages, segmentation_list in responses:
    # segmentation_list = json.loads(segmentation_response_str)
    print(segmentation_list)
    # create directory segmented/{name}
    
    os.makedirs(f"test/segmented/{name}", exist_ok=True)
    for index, segment_response in enumerate(segmentation_list["segments"]):
        segmented_with_messages = parse_index(segment_response, messages)
        print(f"Transcript: {name}")
        with open(f"test/segmented/{name}/{index}.json", "w") as f:
            json.dump(segmented_with_messages, f, indent=4)

{'segments': [{'start_index': 0, 'end_index': 31, 'title': 'Introduction'}, {'start_index': 32, 'end_index': 83, 'title': 'First Scenario/Task Session with Questionnaire'}, {'start_index': 84, 'end_index': 301, 'title': 'Second Scenario/Task Session with Questionnaire'}, {'start_index': 302, 'end_index': 373, 'title': 'Third Scenario/Task Session with Questionnaire'}, {'start_index': 374, 'end_index': 376, 'title': 'Interview Session'}]}
Transcript: Shasha
Transcript: Shasha
Transcript: Shasha
Transcript: Shasha
Transcript: Shasha
{'segments': [{'start_index': 0, 'end_index': 19, 'title': 'Introduction'}, {'start_index': 20, 'end_index': 187, 'title': 'Scenario 1 and Questionnaire'}, {'start_index': 188, 'end_index': 231, 'title': 'Scenario 2 and Questionnaire'}, {'start_index': 232, 'end_index': 277, 'title': 'Scenario 3 and Questionnaire'}, {'start_index': 278, 'end_index': 359, 'title': 'Interview Session'}]}
Transcript: Chris
Transcript: Chris
Transcript: Chris
Transcript: Chris
Tr

In [None]:
de_id_list = open("de_id", 'r').read().splitlines()
print(de_id_list)
de_id_dict = {}
for (original_name, de_id_name) in zip(de_id_list[:20], de_id_list[20:]):
    print(original_name, "-", de_id_name)
    de_id_dict[original_name] = de_id_name

['Esther', 'Malik', 'Jaylon ', 'Ronny', 'Chris', 'Zoe', 'Advait ', 'Shay', 'Alex', 'Atharva ', 'jacob', 'ederson', 'Ina', 'Sebastian', 'Ian', 'Maria', 'Jian', 'Sarah', 'Crosby', 'Sage', 'Maya', 'Oliver', 'Serena', 'Devin', 'Lina', 'Jasper', 'Naomi', 'Theo', 'Carmen', 'Felix', 'Aria', 'Marcus', 'Tessa', 'Gabriel', 'Rhea', 'Daniel', 'Ivy', 'Victor', 'Elinor', 'Nolan']
Esther - Maya
Malik - Oliver
Jaylon  - Serena
Ronny - Devin
Chris - Lina
Zoe - Jasper
Advait  - Naomi
Shay - Theo
Alex - Carmen
Atharva  - Felix
jacob - Aria
ederson - Marcus
Ina - Tessa
Sebastian - Gabriel
Ian - Rhea
Maria - Daniel
Jian - Ivy
Sarah - Victor
Crosby - Elinor
Sage - Nolan


In [99]:
import shutil
import os

# Create the de_id_segmented directory
os.makedirs("de_id_segmented", exist_ok=True)

# Get list of folders in segmented directory
segmented_folders = [item for item in os.listdir("test/segmented") if os.path.isdir(os.path.join("test/segmented", item))]
print("Original folders in segmented directory:", segmented_folders)

# Create a mapping from lowercase original names to de-id names
lowercase_de_id_dict = {}
for original_name, de_id_name in de_id_dict.items():
    lowercase_de_id_dict[original_name.lower().strip()] = de_id_name

print("\nLowercase de_id mapping:")
for orig, de_id in lowercase_de_id_dict.items():
    print(f"  '{orig}' -> '{de_id}'")

# Copy folders with name replacement
for folder in segmented_folders:
    folder_lower = folder.lower().strip()
    print("Processing folder:", folder_lower) 
    # Find matching de-id name (case-insensitive)
    de_id_folder_name = None
    for orig_name_lower, de_id_name in lowercase_de_id_dict.items():
        if folder_lower == orig_name_lower:
            de_id_folder_name = de_id_name
            break
    
    if de_id_folder_name:
        # Copy the entire folder with new name
        src_path = os.path.join("test/segmented", folder)
        dst_path = os.path.join("de_id_segmented", de_id_folder_name)
        shutil.copytree(src_path, dst_path, dirs_exist_ok=True)
        print(f"Copied '{folder}' -> '{de_id_folder_name}'")
    else:
        print(f"Warning: No de-id mapping found for folder '{folder}'")

print("\nDe-identification completed!")
print("Contents of de_id_segmented directory:")
for item in sorted(os.listdir("de_id_segmented")):
    print(f"  {item}")

Original folders in segmented directory: ['Alex', 'Zoe', 'Ian', 'Shay', 'Shasha', 'Jian', 'Hailey', 'Advait', 'Ina', 'Jacob', 'Ezekiel', 'Ronny', 'Jaylon', 'Chris', 'Crosby', 'Maria', 'Sebastian', 'Sarah', 'Ederson', 'Atharva', 'Sage']

Lowercase de_id mapping:
  'esther' -> 'Maya'
  'malik' -> 'Oliver'
  'jaylon' -> 'Serena'
  'ronny' -> 'Devin'
  'chris' -> 'Lina'
  'zoe' -> 'Jasper'
  'advait' -> 'Naomi'
  'shay' -> 'Theo'
  'alex' -> 'Carmen'
  'atharva' -> 'Felix'
  'jacob' -> 'Aria'
  'ederson' -> 'Marcus'
  'ina' -> 'Tessa'
  'sebastian' -> 'Gabriel'
  'ian' -> 'Rhea'
  'maria' -> 'Daniel'
  'jian' -> 'Ivy'
  'sarah' -> 'Victor'
  'crosby' -> 'Elinor'
  'sage' -> 'Nolan'
Processing folder: alex
Copied 'Alex' -> 'Carmen'
Processing folder: zoe
Copied 'Zoe' -> 'Jasper'
Processing folder: ian
Copied 'Ian' -> 'Rhea'
Processing folder: shay
Copied 'Shay' -> 'Theo'
Processing folder: shasha
Processing folder: jian
Copied 'Jian' -> 'Ivy'
Processing folder: hailey
Processing folder: adv

In [60]:
# Remove the previous incomplete attempt
if os.path.exists("de_id_segmented"):
    shutil.rmtree("de_id_segmented")

# Create the de_id_segmented directory
os.makedirs("de_id_segmented", exist_ok=True)

# Get list of folders in segmented directory
segmented_folders = [item for item in os.listdir("segmented") if os.path.isdir(os.path.join("segmented", item))]
print("Original folders in segmented directory:", segmented_folders)

print("\nDe_id_dict mapping:")
for orig, de_id in de_id_dict.items():
    print(f"  '{orig}' -> '{de_id}'")

# Copy folders with name replacement using partial matching
for folder in segmented_folders:
    folder_lower = folder.lower().strip()
    
    # Find matching de-id name by checking if folder name appears in the original names
    de_id_folder_name = None
    for orig_name, de_id_name in de_id_dict.items():
        orig_name_lower = orig_name.lower().strip()
        
        # Check if folder name matches the beginning of original name or is contained in it
        if (folder_lower in orig_name_lower or 
            orig_name_lower.startswith(folder_lower) or
            any(word.startswith(folder_lower) for word in orig_name_lower.split())):
            de_id_folder_name = de_id_name
            print(f"Matched '{folder}' with '{orig_name}' -> '{de_id_name}'")
            break
    
    if de_id_folder_name:
        # Copy the entire folder with new name
        src_path = os.path.join("segmented", folder)
        dst_path = os.path.join("de_id_segmented", de_id_folder_name)
        shutil.copytree(src_path, dst_path)
        print(f"Copied '{folder}' -> '{de_id_folder_name}'")
    else:
        print(f"Warning: No de-id mapping found for folder '{folder}'")

# Also copy the README.md file if it exists
readme_src = "segmented/README.md"
readme_dst = "de_id_segmented/README.md"
if os.path.exists(readme_src):
    shutil.copy2(readme_src, readme_dst)
    print("Copied README.md")

print("\nDe-identification completed!")
print("Contents of de_id_segmented directory:")
for item in sorted(os.listdir("de_id_segmented")):
    print(f"  {item}")

Original folders in segmented directory: ['Alex', 'Zoe', 'Ian', 'Shay', 'Shasha', 'Jian', 'Hailey', 'Advait', 'Ina', 'Jacob', 'Ezekiel', 'Ronny', 'Jaylon', 'Chris', 'Crosby', 'Maria', 'Sebastian', 'Sarah', 'Ederson', 'Atharva', 'Sage']

De_id_dict mapping:
  'Esther Biden' -> 'Maya'
  'Malik Ashan' -> 'Oliver'
  'Jaylon Cooper' -> 'Serena'
  'Ronny' -> 'Devin'
  'Chris Martin ' -> 'Lina'
  'Zoe Cheng' -> 'Jasper'
  'Advait ' -> 'Naomi'
  'Shay Drake' -> 'Theo'
  'Alex Marasigan' -> 'Carmen'
  'Atharva Harshe' -> 'Felix'
  'jacob adams' -> 'Aria'
  'jack ederson' -> 'Marcus'
  'Ina Chou' -> 'Tessa'
  'Sebastian' -> 'Gabriel'
  'Ian Wong' -> 'Rhea'
  'Maria Rodriguez' -> 'Daniel'
  'Jian Chen' -> 'Ivy'
  'Sarah Odinma ' -> 'Victor'
  'Crosby Cox' -> 'Elinor'
  'Sage Zope' -> 'Nolan'
Matched 'Alex' with 'Alex Marasigan' -> 'Carmen'
Copied 'Alex' -> 'Carmen'
Matched 'Zoe' with 'Zoe Cheng' -> 'Jasper'
Copied 'Zoe' -> 'Jasper'
Matched 'Ian' with 'Sebastian' -> 'Gabriel'
Copied 'Ian' -> 'Gabr

FileExistsError: [Errno 17] File exists: 'de_id_segmented/Gabriel'

In [100]:
import json
import glob

# Create a reverse mapping from de-id names back to original names
original_to_de_id = {}
for orig_name, de_id_name in de_id_dict.items():
    original_to_de_id[orig_name.lower().strip()] = de_id_name

print("Original to de-id mapping:")
for orig, de_id in original_to_de_id.items():
    print(f"  '{orig}' -> '{de_id}'")

# Get all JSON files in de_id_segmented directory
json_files = glob.glob("de_id_segmented/*/*.json")
print(f"\nFound {len(json_files)} JSON files to process")

# Process each JSON file
for json_file in json_files:
    print(f"\nProcessing: {json_file}")
    
    # Extract folder name (which is the de-identified name)
    folder_name = os.path.basename(os.path.dirname(json_file))
    
    # Read the JSON file
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    # Track changes for logging
    changes_made = 0
    
    # Process each message
    for message in data.get('messages', []):
        original_speaker = message.get('speaker', '')
        
        # Skip if speaker is already "Chifang Chou"
        if original_speaker == "Chifang Chou":
            continue
            
        # Find if this speaker should be replaced with the folder name
        speaker_lower = original_speaker.lower().strip()
        should_replace = False
        
        # Check if this speaker corresponds to the folder's original name
        for orig_name_lower, de_id_name in original_to_de_id.items():
            if de_id_name == folder_name:
                # Check if the speaker matches this original name
                if (speaker_lower == orig_name_lower or 
                    speaker_lower in orig_name_lower or 
                    orig_name_lower.split()[0] == speaker_lower.split()[0]):
                    should_replace = True
                    break
        
        # Replace speaker name with folder name if it should be replaced
        if should_replace:
            message['speaker'] = folder_name
            changes_made += 1
            print(f"  Changed '{original_speaker}' -> '{folder_name}'")
    
    # Write back the updated JSON file if changes were made
    if changes_made > 0:
        with open(json_file, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=4)
        print(f"  Updated {changes_made} speaker entries in {json_file}")
    else:
        print(f"  No changes needed for {json_file}")

print("\nSpeaker de-identification completed!")

Original to de-id mapping:
  'esther' -> 'Maya'
  'malik' -> 'Oliver'
  'jaylon' -> 'Serena'
  'ronny' -> 'Devin'
  'chris' -> 'Lina'
  'zoe' -> 'Jasper'
  'advait' -> 'Naomi'
  'shay' -> 'Theo'
  'alex' -> 'Carmen'
  'atharva' -> 'Felix'
  'jacob' -> 'Aria'
  'ederson' -> 'Marcus'
  'ina' -> 'Tessa'
  'sebastian' -> 'Gabriel'
  'ian' -> 'Rhea'
  'maria' -> 'Daniel'
  'jian' -> 'Ivy'
  'sarah' -> 'Victor'
  'crosby' -> 'Elinor'
  'sage' -> 'Nolan'

Found 90 JSON files to process

Processing: de_id_segmented/Devin/0.json
  Changed 'Ronny' -> 'Devin'
  Changed 'Ronny' -> 'Devin'
  Changed 'Ronny' -> 'Devin'
  Changed 'Ronny' -> 'Devin'
  Changed 'Ronny' -> 'Devin'
  Changed 'Ronny' -> 'Devin'
  Changed 'Ronny' -> 'Devin'
  Changed 'Ronny' -> 'Devin'
  Changed 'Ronny' -> 'Devin'
  Changed 'Ronny' -> 'Devin'
  Changed 'Ronny' -> 'Devin'
  Changed 'Ronny' -> 'Devin'
  Changed 'Ronny' -> 'Devin'
  Changed 'Ronny' -> 'Devin'
  Updated 14 speaker entries in de_id_segmented/Devin/0.json

Proces

In [None]:
# Verification: Check speaker names in a few sample files
sample_files = [
    "de_id_segmented/Carmen/0.json",
    "de_id_segmented/Jasper/1.json", 
    "de_id_segmented/Rhea/2.json"
]

print("Verification - Sample speaker names after de-identification:")
for file_path in sample_files:
    if os.path.exists(file_path):
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        
        speakers = set()
        for message in data.get('messages', []):
            speakers.add(message.get('speaker', ''))
        
        print(f"\n{file_path}:")
        for speaker in sorted(speakers):
            print(f"  - {speaker}")

# Count total files processed
total_files = len(glob.glob("de_id_segmented/*/*.json"))
print(f"\nTotal JSON files processed: {total_files}")
print("All speaker names should now be either 'Chifang Chou' or the folder name (de-identified name)")