In [46]:
# First let's make sure to read the transcription correctly

import pandas as pd

def read_transcription_file(file_path):
    """Load a pipe-delimited transcription file into a DataFrame.

    Each data line is expected to look like ``start|end|speaker|text``.
    Lines that start with ``#`` and blank lines are skipped.

    A line is split on its first three ``|`` characters only, so pipes and
    quote characters inside the transcription text are kept verbatim instead
    of being interpreted by a CSV parser. Every field is whitespace-stripped
    and kept as a string; fields missing from a short line become ``''``.
    A file without any data line yields an empty frame.

    Args:
        file_path: Path of the UTF-8 encoded transcription file.

    Returns:
        pandas.DataFrame with the columns ``Start Time``, ``End Time``,
        ``Speaker`` and ``Transcription`` (one row per data line).
    """
    columns = ['Start Time', 'End Time', 'Speaker', 'Transcription']
    rows = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            if line.startswith('#') or line.strip() == '':
                continue
            # maxsplit keeps any further '|' inside the transcription text.
            fields = [field.strip() for field in line.split('|', len(columns) - 1)]
            # Pad short lines so every row has one value per column.
            fields.extend([''] * (len(columns) - len(fields)))
            rows.append(fields)

    return pd.DataFrame(rows, columns=columns)

# Transcript to analyse; swap in one of the commented-out paths to test another session.
file_path = 'Test/full/NVTV25-R (SUSPECTED)_full.txt'
# file_path = 'Test/full/NATA41 (SUSPECTED)_full.txt'
# file_path = 'Test/full/NVTV35 (SUSPECTED)_full.txt'
# NVTV26 (SUSPECTED)_full
# NVTV25-R (SUSPECTED)_full

# Parse the file, then sanity-check the first rows and the total row count.
df = read_transcription_file(file_path)
print(df.head())
print(len(df))

     Start Time      End Time    Speaker  \
0  00:00:00.000  00:00:01.050  speaker_2   
1  00:00:02.220  00:00:03.450  speaker_2   
2  00:00:04.219  00:00:04.729  Speaker 1   
3  00:00:06.059  00:00:06.684  Speaker 1   
4  00:00:06.684  00:00:08.250  speaker_2   

                                     Transcription  
0                       I'm going to send you the.  
1                         I guess it is reporting.  
2  Take two thdays and.... speak to you tatue ball  
3                                  congratulations  
4                   Yep, this has changed already.  
79


In [38]:
import re

def normalize_speaker_label(s):
    """Map a raw speaker label onto a canonical form.

    A label containing digits becomes ``"Speaker <first digit run>"``, so
    ``"speaker_2"`` and ``"Speaker 2"`` both normalize to ``"Speaker 2"``.
    A label without digits is stripped and title-cased.

    Args:
        s: Raw label. Non-string values (e.g. NaN or None from a missing
            field) are returned unchanged instead of raising TypeError.

    Returns:
        The normalized label, or ``s`` itself when it is not a string.
    """
    if not isinstance(s, str):
        return s
    match = re.search(r'(\d+)', s)
    if match:
        return f"Speaker {match.group(1)}"
    return s.strip().title()

# Overwrites the Speaker column of df with the normalized labels.
df['Speaker'] = df['Speaker'].apply(normalize_speaker_label)

In [39]:
df['Speaker'].unique()

array(['Speaker 0', 'Speaker 1', 'Speaker 2'], dtype=object)

# LLM

In [4]:
import os
from openai import OpenAI

# The API key is read from the OPENAI_API_KEY environment variable
# (os.getenv returns None when the variable is not set).
client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),
)

In [8]:
def get_sample_dialogue(df):
    """Render a transcript frame as plain text.

    Args:
        df: DataFrame with ``Speaker`` and ``Transcription`` columns.

    Returns:
        One ``"<speaker>: <text>"`` line per row, in row order, joined with
        newlines (an empty string for an empty frame).
    """
    return "\n".join(
        f'{row["Speaker"]}: {row["Transcription"]}'
        for _, row in df.iterrows()
    )

In [9]:
def identify_roles_with_gpt(conversation_text):
    """Ask the chat model which speaker is Ali and which is Rowan.

    Uses the module-level ``client``. The prompt describes a two-person
    conversation and asks for an answer for "Speaker 1" and "Speaker 2" only.

    Args:
        conversation_text: Transcript text inserted verbatim into the prompt.

    Returns:
        The content of the first choice returned by the chat completion.
    """
    instructions = (
        "You are an assistant helping identify speaker roles in a two-person conversation. "
        "Ali is the person offering money. Rowan is the one deciding to accept or reject."
    )
    question = (
        "Given the following conversation, determine who is Ali and who is Rowan.\n\n"
        f"{conversation_text}\n\n"
        "Please respond in the format:\n"
        "Speaker 1 is <NAME>\n"
        "Speaker 2 is <NAME>"
    )
    chat = [
        {"role": "system", "content": instructions},
        {"role": "user", "content": question},
    ]

    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=chat,
        temperature=0.3,
    )

    first_choice = completion.choices[0]
    return first_choice.message.content

In [10]:
# Send the whole transcript to the chat model and show its raw answer.
sample_text = get_sample_dialogue(df)
llm_output = identify_roles_with_gpt(sample_text)
print(llm_output)

RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

# Role classification via Embedding

In [30]:
# Semantic (embedding-based) role classification

from sentence_transformers import SentenceTransformer, util
import pandas as pd
import torch

# Sentence-embedding model shared by the role-classification functions in this notebook.
model = SentenceTransformer('all-mpnet-base-v2')

# Example sentences a speaker in the "giver" role might say.
giver_prompts = [
    "I offer you dollars.",
    "I am offering you some dollars",
    "Here is the amount I'm giving.",
    "I decide how much to allocate.",
    "This is my offer",
    "Here is my offer",
    "I decide how much money to give",
    "I am going to give you",
]
# Example sentences a speaker in the "receiver" role might say.
receiver_prompts = [
    "I will accept your offer.",
    "I am rejecting the offer.",
    "I reject that",
    "I need to think about your offer",
    "You are offering me money?",
    "I get to decide if the offer is fair.",
    "I am going to accept it",
    "I will reject it",
    "No, I don't want it",
]

# Each role prototype is the mean of its example-sentence embeddings
# (sentences are encoded one at a time, stacked, then averaged over dim 0).
giver_embed = torch.stack([model.encode(p, convert_to_tensor=True) for p in giver_prompts]).mean(dim=0)
receiver_embed = torch.stack([model.encode(p, convert_to_tensor=True) for p in receiver_prompts]).mean(dim=0)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [27]:
import pandas as pd

def assign_roles(df, text_col="Transcription", speaker_col="Speaker", top_k=50):
    """Label every speaker "Giver" or "Receiver" by embedding similarity.

    For each distinct speaker, the first ``top_k`` utterances (in row order)
    are encoded with the module-level ``model`` and compared by cosine
    similarity with ``giver_embed`` and ``receiver_embed``. The mean
    similarity over those utterances decides the label; a tie goes to
    "Receiver". Both scores are printed per speaker.

    Args:
        df: Transcript frame.
        text_col: Column holding the utterance text.
        speaker_col: Column holding the speaker label.
        top_k: Maximum number of utterances considered per speaker.

    Returns:
        dict mapping speaker label to "Giver" or "Receiver".
    """
    labels = {}
    for who in df[speaker_col].unique():
        # First top_k utterances of this speaker, not the best-scoring ones.
        utterances = df.loc[df[speaker_col] == who, text_col].tolist()[:top_k]
        if len(utterances) == 0:
            continue
        embedded = model.encode(utterances, convert_to_tensor=True)
        giver_score = util.cos_sim(embedded, giver_embed).mean().item()
        receiver_score = util.cos_sim(embedded, receiver_embed).mean().item()
        if giver_score > receiver_score:
            labels[who] = "Giver"
        else:
            labels[who] = "Receiver"
        print(f"speaker: {who} --> scores: {giver_score}, {receiver_score}")
    return labels

In [28]:
def determine_roles_by_embedding(df):
    """Pick one Giver and one Receiver by comparing whole-speaker embeddings.

    All utterances of a speaker are joined into one text, encoded with the
    module-level ``model`` and compared by cosine similarity with
    ``giver_embed`` and ``receiver_embed``. Speakers are ranked by
    ``giver - receiver``; the top one is labelled "Giver" and the bottom one
    "Receiver". Speakers in between get no label. The per-speaker score
    differences are printed.

    Args:
        df: Transcript frame with ``Speaker`` and ``Transcription`` columns.

    Returns:
        dict mapping speaker label to role. Empty when ``df`` has no
        speakers; a single ``"Giver"`` entry when there is only one speaker.
    """
    # Group all text per speaker
    speaker_texts = {
        speaker: " ".join(df[df["Speaker"] == speaker]["Transcription"].tolist())
        for speaker in df["Speaker"].unique()
    }

    scores = {}

    for speaker, text in speaker_texts.items():
        if not text.strip():
            # Nothing to embed; treat as neutral.
            scores[speaker] = {"giver": 0.0, "receiver": 0.0}
            continue

        text_embed = model.encode(text, convert_to_tensor=True)
        scores[speaker] = {
            "giver": util.cos_sim(text_embed, giver_embed).item(),
            "receiver": util.cos_sim(text_embed, receiver_embed).item()
        }

    # Compute score difference
    role_scores = {
        speaker: score["giver"] - score["receiver"]
        for speaker, score in scores.items()
    }

    sorted_speakers = sorted(role_scores, key=role_scores.get, reverse=True)
    print(role_scores)
    if not sorted_speakers:
        # No speakers at all: previously this fell through to an IndexError.
        return {}
    if len(sorted_speakers) >= 2:
        return {
            sorted_speakers[0]: "Giver",
            sorted_speakers[-1]: "Receiver"
        }
    # Fallback (only one speaker)
    return {sorted_speakers[0]: "Giver"}

In [29]:
role_map = assign_roles(df)
role_map

speaker: Speaker 2 --> scores: 0.76962810754776, 0.7655006051063538
speaker: Speaker 1 --> scores: 0.7702497839927673, 0.776236891746521


{'Speaker 2': 'Giver', 'Speaker 1': 'Receiver'}

In [74]:
# NVTV25
determine_roles_by_embedding(df)

{'Speaker 2': -0.07934367656707764, 'Speaker 1': 0.0026491880416870117}


{'Speaker 1': 'Giver', 'Speaker 2': 'Receiver'}

In [80]:
# NATA41
determine_roles_by_embedding(df)

{'Speaker 0': -0.03853234648704529, 'Speaker 1': 0.01988561451435089, 'Speaker 2': 0.0289078950881958}


{'Speaker 2': 'Giver', 'Speaker 0': 'Receiver'}

In [84]:
# NVTV35
determine_roles_by_embedding(df)

{'Speaker 1': 0.06889933347702026, 'speaker_4': 0.067948117852211, 'Speaker 2': 0.0006491243839263916, 'speaker_2': 0.03777685761451721, 'speaker_3': 0.06741087883710861}


{'Speaker 1': 'Giver', 'Speaker 2': 'Receiver'}

In [6]:
# NVTV35
determine_roles_by_embedding(df)

{'Speaker 1': 0.06889933347702026, 'Speaker 4': 0.067948117852211, 'Speaker 2': 0.024840623140335083, 'Speaker 3': 0.06741087883710861}


{'Speaker 1': 'Giver', 'Speaker 2': 'Receiver'}

## TF-IDF + LR classifier

In [31]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Example labeled training data
data = [
    ("I offer you five dollars.", "Giver"),
    ("Here is the amount I'm giving.", "Giver"),
    ("You can take this money.", "Giver"),
    ("I think five is fair.", "Giver"),
    ("I will accept your offer.", "Receiver"),
    ("Why are you giving me money?", "Receiver"),
    ("I reject that offer.", "Receiver"),
    ("I don't think it's enough.", "Receiver"),
    # Optionally add neutral/other roles with "Other"
]

# Keep the toy training set in its own frame: assigning it to `df` would
# overwrite the transcript DataFrame that the role-detection cells read.
train_df = pd.DataFrame(data, columns=["utterance", "role"])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    train_df["utterance"], train_df["role"], test_size=0.2, random_state=42
)

# Define pipeline: TF-IDF + Logistic Regression
pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(ngram_range=(1, 2))),
    ("clf", LogisticRegression())
])

# Train model
pipeline.fit(X_train, y_train)

# Evaluate model. With a test set this small a class may never be predicted;
# zero_division=0 reports 0.0 for the undefined metric instead of warning.
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred, zero_division=0))

# Predict on new utterances
new_utterances = [
    "I'm offering you ten dollars.",
    "I think this is a fair deal.",
    "I'm not sure about your offer.",
    "Yes, I accept your offer.",
    "Why would you offer that?"
]

predictions = pipeline.predict(new_utterances)

for utterance, role in zip(new_utterances, predictions):
    print(f"{utterance} --> {role}")

              precision    recall  f1-score   support

       Giver       0.50      1.00      0.67         1
    Receiver       0.00      0.00      0.00         1

    accuracy                           0.50         2
   macro avg       0.25      0.50      0.33         2
weighted avg       0.25      0.50      0.33         2

I'm offering you ten dollars. --> Giver
I think this is a fair deal. --> Giver
I'm not sure about your offer. --> Receiver
Yes, I accept your offer. --> Receiver
Why would you offer that? --> Receiver


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Keyword matching

In [41]:
import pandas as pd
import re
from collections import defaultdict, Counter

def preprocess(text):
    """Lower-case ``text`` and return its runs of word characters as a list."""
    lowered = text.lower()
    return re.findall(r'\w+', lowered)

# Single-word cues, matched against the tokens produced by preprocess();
# identify_roles_with_observer_check adds 1 to the role score per hit.
giver_keywords = {"offer", "give", "giving", "allocate", "amount", "money", "dollars"}
receiver_keywords = {"accept", "reject", "take", "fair", "decline", "deal"}

# Multi-word cues, matched as substrings of the lower-cased utterance;
# identify_roles_with_observer_check adds 3 to the role score per hit.
giver_phrases = ["i'm offering", "i offer", "i'm supposed to offer", "i decide how much"]
receiver_phrases = ["i accept", "i reject", "do you accept", "i'll accept", "they're offering me"]

# Observer indicators: substrings of the lower-cased utterance; one hit flags the speaker as an observer.
observer_keywords = ["end the call", "have a couple surveys", "see you", "i'll just end"]

def identify_roles_with_observer_check(df):
    """Assign Giver / Receiver / Observer / Unknown roles from keyword cues.

    For every row the speaker label is stripped and lower-cased and the text
    is lower-cased. An utterance containing any ``observer_keywords``
    substring flags the speaker as an observer and is not scored. Otherwise
    each ``giver_phrases`` / ``receiver_phrases`` substring adds 3 and each
    token in ``giver_keywords`` / ``receiver_keywords`` adds 1 to the
    corresponding score.

    Every speaker that appears in ``df`` is present in the result; a speaker
    with no cue hits (or tied scores) is reported as "Unknown" rather than
    being left out.

    Args:
        df: Transcript frame with string ``Speaker`` and ``Transcription``
            columns.

    Returns:
        dict mapping the lower-cased speaker label to "Observer", "Giver",
        "Receiver" or "Unknown".
    """
    speaker_scores = defaultdict(lambda: {"giver": 0, "receiver": 0, "is_observer": False})

    for _, row in df.iterrows():
        speaker = row["Speaker"].strip().lower()
        text = row["Transcription"].lower()

        # Create the entry up front so speakers without any hit are still reported.
        entry = speaker_scores[speaker]

        if any(obs in text for obs in observer_keywords):
            entry["is_observer"] = True
            continue

        # Phrase-based boosting
        entry["giver"] += 3 * sum(phrase in text for phrase in giver_phrases)
        entry["receiver"] += 3 * sum(phrase in text for phrase in receiver_phrases)

        # Token-based keyword scoring
        for token in preprocess(text):
            if token in giver_keywords:
                entry["giver"] += 1
            if token in receiver_keywords:
                entry["receiver"] += 1

    # Assign final roles; the observer flag wins over any accumulated score.
    roles = {}
    for speaker, scores in speaker_scores.items():
        if scores["is_observer"]:
            roles[speaker] = "Observer"
        elif scores["giver"] > scores["receiver"]:
            roles[speaker] = "Giver"
        elif scores["receiver"] > scores["giver"]:
            roles[speaker] = "Receiver"
        else:
            roles[speaker] = "Unknown"
    return roles

In [47]:
# NVTV25
# print(df.head())
identify_roles_with_observer_check(df)

{'speaker 1': 'Receiver', 'speaker 2': 'Receiver'}

In [42]:
#NATA41
# print(df.head())
identify_roles_with_observer_check(df)

{'speaker 0': 'Observer', 'speaker 1': 'Receiver', 'speaker 2': 'Giver'}

In [None]:
# NVTV35 (this label read "NATA35"; presumably a typo, since the transcripts
# loaded in this notebook are NVTV25, NVTV26, NVTV35 and NATA41 — confirm)
# print(df.head())
identify_roles_with_observer_check(df)

# ConLL

In [13]:
import re
from collections import defaultdict, Counter

# Whitelist of expected participant names, stored lower-case
VALID_NAMES = {'ali', 'rowan', 'rohan', 'sam', 'bailey'}

def clean_name(name):
    """Return the capitalized name when it is whitelisted, else ``None``.

    ``name`` is stripped and lower-cased before being looked up in
    ``VALID_NAMES``.
    """
    candidate = name.strip().lower()
    # Only whitelisted names are accepted; anything else is discarded.
    return candidate.capitalize() if candidate in VALID_NAMES else None

def parse_transcription(transcript_text):
    """Parse ``start|end|speaker|text`` lines into (speaker, utterance) pairs.

    Lines that start with ``#`` or contain no ``|`` are skipped, as are lines
    with fewer than four fields. Only the first three ``|`` are treated as
    separators, so an utterance that itself contains ``|`` is kept whole
    instead of the line being dropped.

    Args:
        transcript_text: Full transcript file content.

    Returns:
        list of ``(speaker, utterance)`` tuples; the speaker is stripped and
        lower-cased, the utterance is stripped.
    """
    speaker_lines = []
    for line in transcript_text.strip().split('\n'):
        if '|' not in line or line.startswith('#'):
            continue
        parts = line.strip().split('|', 3)
        if len(parts) != 4:
            continue
        _, _, speaker, utterance = parts
        speaker_lines.append((speaker.strip().lower(), utterance.strip()))
    return speaker_lines

def extract_names_with_whitelist(speaker_lines):
    """Find each speaker's self-introduced name, restricted to the whitelist.

    Every utterance is lower-cased and scanned with the self-introduction
    patterns in order. All matches of a pattern are checked with
    ``clean_name``; the first whitelisted name found gives that speaker one
    vote and ends the scan of that utterance. A non-name match such as
    "I'm good" no longer stops the scan, so "I'm good, I'm Ali" still yields
    "Ali".

    Args:
        speaker_lines: Iterable of ``(speaker, utterance)`` pairs.

    Returns:
        dict mapping speaker to its most frequently detected name. Speakers
        without any whitelisted name are absent.
    """
    patterns = [
        r"\bmy name is (\w+)",
        r"\bi am (\w+)",
        r"\bi'm (\w+)",
        r"\bthis is (\w+)",
        r"\b(\w+) is my name",
    ]

    candidates = defaultdict(list)

    for speaker, utt in speaker_lines:
        utt_lower = utt.lower()
        for pattern in patterns:
            cleaned = (clean_name(m.group(1)) for m in re.finditer(pattern, utt_lower))
            name = next((c for c in cleaned if c), None)
            if name:
                # One vote per utterance at most.
                candidates[speaker].append(name)
                break

    return {
        speaker: Counter(name_list).most_common(1)[0][0]
        for speaker, name_list in candidates.items()
    }


In [14]:
file_path = 'Test/full/NVTV25-R (SUSPECTED)_full.txt'
# Read as UTF-8 explicitly, like the other cells that open transcripts, so
# the result does not depend on the platform's default encoding.
with open(file_path, "r", encoding="utf-8") as f:  # Or directly paste the content as a string
    transcription = f.read()

speaker_lines = parse_transcription(transcription)
detected_names = extract_names_with_whitelist(speaker_lines)

print("Detected speaker names:")
for speaker, name in detected_names.items():
    print(f"{speaker}: {name}")

Detected speaker names:
speaker 1: Ali
speaker 2: Rohan


## Using NER to detect different variations

In [17]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m27.0 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hInstalling collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [60]:
import re
from collections import defaultdict, Counter
from thefuzz import fuzz

# Known participant names that captured candidates are fuzzy-matched against.
VALID_NAMES = ['ali', 'rowan', 'sam', 'bailey']
# Lower-case words rejected outright when captured by a self-introduction pattern.
BAD_NAME_TOKENS = {'okay', 'good', 'sure', 'fine', 'yes', 'no', 'cool', 'hi', 'bye', 'hello', 'alright'}

# Self-introduction patterns; group 1 captures the candidate name.
SELF_PATTERNS = [
    r"\bmy name is (\w+)\b",
    r"\bi am (\w+)\b",
    r"\bi'm (\w+)\b",
    r"\bthis is (\w+)\b",
    r"\b(\w+) is my name\b"
]

def parse_transcription(transcript_text):
    """Parse ``start|end|speaker|text`` lines into (speaker, utterance) pairs.

    Lines that start with ``#`` or contain no ``|`` are skipped, as are lines
    with fewer than four fields. Only the first three ``|`` are treated as
    separators, so an utterance that itself contains ``|`` is kept whole
    instead of being cut at its first pipe.

    Args:
        transcript_text: Full transcript file content.

    Returns:
        list of ``(speaker, utterance)`` tuples; the speaker is stripped and
        lower-cased, the utterance is stripped.
    """
    lines = []
    for line in transcript_text.strip().split('\n'):
        if line.startswith('#') or '|' not in line:
            continue
        parts = line.split('|', 3)
        if len(parts) == 4:
            _, _, speaker, utterance = parts
            lines.append((speaker.strip().lower(), utterance.strip()))
    return lines

def extract_names_by_pattern(lines):
    """Guess speaker names from self-introductions using fuzzy matching.

    For each utterance only the first ``SELF_PATTERNS`` entry that matches is
    used. Its captured word is discarded when shorter than three characters
    or listed in ``BAD_NAME_TOKENS``; otherwise it is compared with every
    entry of ``VALID_NAMES`` via ``fuzz.token_sort_ratio`` and, when the best
    score is at least 60, the best name counts as one vote for the speaker.
    A debug line is printed for every scored candidate.

    Args:
        lines: Iterable of ``(speaker, utterance)`` pairs.

    Returns:
        dict mapping speaker to its most frequently voted name (capitalized).
    """
    votes = defaultdict(list)

    for speaker, utt in lines:
        lowered = utt.lower()

        # Lazily search the patterns in order and keep the first hit only.
        hit = next(
            (found for found in (re.search(p, lowered) for p in SELF_PATTERNS) if found),
            None,
        )
        if hit is None:
            continue

        raw_name = hit.group(1).strip().lower()
        if len(raw_name) < 3 or raw_name in BAD_NAME_TOKENS:
            continue

        # Fuzzy token sort ratio against every known name
        ranked = [(fuzz.token_sort_ratio(raw_name, known), known) for known in VALID_NAMES]
        best_score, best_match = max(ranked, key=lambda pair: pair[0])
        print(f"raw name: {raw_name}, best mathc: {best_match}, score: {best_score}")
        if best_score >= 60:  # lowered threshold for rare names like Rola → Rowan
            votes[speaker].append(best_match.capitalize())

    return {
        speaker: Counter(names).most_common(1)[0][0]
        for speaker, names in votes.items()
        if names
    }

In [58]:
# Run the pattern + fuzzy-match name extraction on the NVTV25-R transcript.
file_path = 'Test/full/NVTV25-R (SUSPECTED)_full.txt'
with open(file_path, "r", encoding="utf-8") as f:
    transcript = f.read()

parsed_lines = parse_transcription(transcript)
speaker_names = extract_names_by_pattern(parsed_lines)

print("Detected speaker names:")
for speaker, name in speaker_names.items():
    print(f"{speaker}: {name}")

Detected speaker names:
speaker 1: Ali
speaker 2: Rowan


In [61]:
# Run the pattern + fuzzy-match name extraction on the NATA41-R transcript.
file_path = 'Test/full/NATA41-R (UNPINNED)_full.txt'

with open(file_path, "r", encoding="utf-8") as f:
    transcript = f.read()

parsed_lines = parse_transcription(transcript)
speaker_names = extract_names_by_pattern(parsed_lines)

print("Detected speaker names:")
for speaker, name in speaker_names.items():
    print(f"{speaker}: {name}")

raw name: just, best mathc: sam, score: 29
raw name: just, best mathc: sam, score: 29
raw name: going, best mathc: rowan, score: 40
raw name: just, best mathc: sam, score: 29
raw name: rola, best mathc: rowan, score: 67
raw name: ali, best mathc: ali, score: 100
raw name: studying, best mathc: ali, score: 18
raw name: liking, best mathc: ali, score: 44
raw name: currently, best mathc: rowan, score: 29
raw name: more, best mathc: sam, score: 29
raw name: not, best mathc: rowan, score: 25
raw name: supposed, best mathc: sam, score: 18
raw name: offering, best mathc: rowan, score: 31
raw name: gonna, best mathc: rowan, score: 40
raw name: kind, best mathc: ali, score: 29
Detected speaker names:
speaker 1: Rowan
speaker 2: Ali


In [51]:
import re
from collections import defaultdict, Counter
from thefuzz import fuzz
import spacy

# Load spaCy English model
nlp = spacy.load("en_core_web_sm")

# Known valid participant names
VALID_NAMES = ['ali', 'rowan', 'sam', 'bailey']

# Regex patterns for self-identification
SELF_PATTERNS = [
    r"\bmy name is (\w+)",
    r"\bi am (\w+)",
    r"\bi'm (\w+)",
    r"\bthis is (\w+)",
    r"\b(\w+) is my name",
]

# Words that must never be treated as a participant name when captured by a
# self-introduction pattern. All entries are lower-case, so a candidate has
# to be lower-cased before it is looked up here.
INVALID_NAME_TOKENS = {
    # Verbs (base and present participle forms)
    "offering", "offer", "giving", "give", "rejecting", "reject", "accepting", "accept",
    "doing", "going", "thinking", "think", "studying", "study", "saying", "say",
    "living", "live", "coming", "come", "taking", "take", "asking", "ask",
    "speaking", "speak", "sitting", "sit", "watching", "watch", "talking", "talk",
    "waiting", "wait", "starting", "start", "working", "work", "recording", "record",

    # Auxiliary verbs (forms of be / do / have)
    "am", "is", "are", "was", "were", "be", "been", "being", "do", "does", "did", "have", "has", "had",

    # Common adverbs and quantifiers
    "just", "more", "most", "some", "any", "not", "even", "currently", "later", "now", "soon", "only",

    # Pronouns and function words
    "i", "you", "he", "she", "we", "they", "it", "your", "my", "this", "that", "those", "these",

    # Conjunctions, prepositions
    "and", "or", "but", "if", "because", "so", "also", "then", "when", "while", "where", "in", "on", "at", "of",

    # Noise tokens and fillers
    # NOTE: "okay." can never equal a (\w+) capture, because "." is not a word character.
    "uh", "um", "yeah", "okay", "okay.", "yes", "no", "thanks", "hello", "hi", "bye"
}

def clean_name(raw):
    """Return the lower-cased, stripped ``raw`` when it is in ``VALID_NAMES``, else ``None``."""
    normalized = raw.lower().strip()
    if normalized in VALID_NAMES:
        return normalized
    return None

def is_likely_name(word):
    """Return True when spaCy tags any token of ``word`` as a proper noun (PROPN) or a common noun (NOUN)."""
    name_like_tags = {"PROPN", "NOUN"}
    for token in nlp(word):
        if token.pos_ in name_like_tags:
            return True
    return False

def parse_transcription(text):
    """Parse ``start|end|speaker|text`` lines into (speaker, utterance) pairs.

    Lines that start with ``#`` or contain no ``|`` are skipped, as are lines
    with fewer than four fields. Only the first three ``|`` are treated as
    separators, so an utterance that itself contains ``|`` is kept whole
    instead of being cut at its first pipe.

    Args:
        text: Full transcript file content.

    Returns:
        list of ``(speaker, utterance)`` tuples; the speaker is stripped and
        lower-cased, the utterance is stripped.
    """
    lines = []
    for line in text.strip().split('\n'):
        if line.startswith('#') or '|' not in line:
            continue
        parts = line.split('|', 3)
        if len(parts) == 4:
            _, _, speaker, utterance = parts
            lines.append((speaker.strip().lower(), utterance.strip()))
    return lines

def extract_names_by_max_score(lines):
    """Guess speaker names by accumulating fuzzy scores of self-introductions.

    Each utterance is searched (case-insensitively) with ``SELF_PATTERNS`` in
    order. A captured word is skipped when its lower-cased form is in
    ``INVALID_NAME_TOKENS`` or when ``is_likely_name`` rejects it, and the
    next pattern is tried. The first accepted word adds its ``fuzz.ratio``
    against every entry of ``VALID_NAMES`` to the speaker's totals and ends
    the scan of that utterance. Debug output is printed along the way.

    The blocklist holds lower-case words while the capture keeps the
    utterance's casing, so the lookup is done on the lower-cased capture;
    otherwise "I" or "Just" would pass the filter.

    Args:
        lines: Iterable of ``(speaker, utterance)`` pairs.

    Returns:
        dict mapping speaker to the capitalized ``VALID_NAMES`` entry with
        the highest accumulated score. Speakers without any accepted
        candidate are absent.
    """
    speaker_name_scores = defaultdict(lambda: defaultdict(int))  # speaker → name → total score

    for speaker, utt in lines:
        for pattern in SELF_PATTERNS:
            match = re.search(pattern, utt, re.IGNORECASE)
            if not match:
                continue

            raw_name = match.group(1).strip()
            candidate = raw_name.lower()
            # Cheap blocklist lookup first, then the spaCy part-of-speech check.
            if candidate in INVALID_NAME_TOKENS:
                continue
            if not is_likely_name(raw_name):
                continue

            print(f"[DEBUG] Speaker: {speaker}, Matched pattern: '{pattern}', Raw name: '{raw_name}', Utterance: '{utt}'")
            for valid in VALID_NAMES:
                speaker_name_scores[speaker][valid] += fuzz.ratio(candidate, valid)
            break  # stop at first accepted candidate

    final_names = {}
    print(speaker_name_scores)
    for speaker, scores in speaker_name_scores.items():
        if scores:
            best_match = max(scores.items(), key=lambda x: x[1])
            final_names[speaker] = best_match[0].capitalize()
    return final_names

def infer_roles_from_behavior(speaker_lines):
    """Infer who is "Ali" (offers) and who is "Rowan" (accepts or rejects).

    Every utterance is lower-cased and counted as at most one action for its
    speaker: an offer, an accept or a reject, checked in that order by
    substring match. The speaker with the most offers becomes "Ali"; among
    the remaining speakers the one with the most accept + reject actions
    becomes "Rowan".

    A role is only assigned when there is evidence for it: a speaker with
    zero offers is never labelled "Ali" and a speaker with zero
    accept/reject actions is never labelled "Rowan".

    Args:
        speaker_lines: Iterable of ``(speaker, utterance)`` pairs.

    Returns:
        dict mapping speaker to "Ali" or "Rowan"; may be empty or contain
        only one of the two roles.
    """
    offer_phrases = ["i got", "i offer", "i'm offering", "i am offering", "i'll give", "i will give", "i'm giving", 'i am giving', "i give", "you rejected it", "you approved it", "you take it", "you took it"]
    accept_phrases = ['i accept', "i'll accept", "i agree", "i am going to accept", "i'm going to accept", "i'm gonna accept", "i am gonna accept"]
    reject_phrases = ['i reject', "i don't accept", "i refuse", "i'm gonna reject", "i am gonna reject", "i'm going to reject", "i am going to reject"]

    role_actions = defaultdict(Counter)
    for speaker, utt in speaker_lines:
        utt_lower = utt.lower()
        if any(phrase in utt_lower for phrase in offer_phrases):
            role_actions[speaker]['offer'] += 1
        elif any(phrase in utt_lower for phrase in accept_phrases):
            role_actions[speaker]['accept'] += 1
        elif any(phrase in utt_lower for phrase in reject_phrases):
            print(f"speaker {speaker} is rejecting with utternace: {utt_lower}")
            role_actions[speaker]['reject'] += 1

    inferred_roles = {}
    print(role_actions)
    if not role_actions:
        return inferred_roles

    offer_speaker, offer_counts = max(role_actions.items(), key=lambda x: x[1]['offer'])
    if offer_counts['offer'] > 0:
        inferred_roles[offer_speaker] = 'Ali'
    else:
        # Nobody made an offer, so nobody is excluded from the receiver search.
        offer_speaker = None

    accept_candidates = {s: v['accept'] + v['reject'] for s, v in role_actions.items() if s != offer_speaker}
    if accept_candidates:
        receive_speaker, decisions = max(accept_candidates.items(), key=lambda x: x[1])
        if decisions > 0:
            inferred_roles[receive_speaker] = "Rowan"
    return inferred_roles

def detect_speakers(text):
    """Combine behaviour-based and name-based speaker identification.

    Parses ``text``, runs ``extract_names_by_max_score`` and
    ``infer_roles_from_behavior`` on the parsed lines, prints the role
    guesses and merges both results. Where both produce an entry for the
    same speaker, the name-based guess wins.

    Returns:
        dict mapping speaker to the detected name.
    """
    parsed = parse_transcription(text)
    by_name = extract_names_by_max_score(parsed)
    by_role = infer_roles_from_behavior(parsed)
    print(by_role)
    merged = dict(by_role)
    merged.update(by_name)  # name-based guesses override role-based ones
    return merged

In [16]:
# Run the accumulated-score name extraction on the NATA41-R transcript.
file_path = 'Test/full/NATA41-R (UNPINNED)_full.txt'

with open(file_path, "r", encoding="utf-8") as f:
    transcript = f.read()

lines = parse_transcription(transcript)
speaker_names = extract_names_by_max_score(lines)

print("Detected speaker names:")
for speaker, name in speaker_names.items():
    print(f"{speaker}: {name}")

[DEBUG] Speaker: speaker 1, Matched pattern: '\bi'm (\w+)', Raw name: 'just', Utterance: 'So I'm just saying if you accept her.'
	[DEBUG] Skipping just as it is not noun
[DEBUG] Speaker: speaker_0, Matched pattern: '\bthis is (\w+)', Raw name: 'just', Utterance: 'This is just like you guys talk. You accept or reject their offer. That's it.'
	[DEBUG] Skipping just as it is not noun
[DEBUG] Speaker: speaker_0, Matched pattern: '\bi'm (\w+)', Raw name: 'I', Utterance: 'I Yeah, so you gotta turn your camera off Um, I figured that it would be um turned on so I was like, okay, I'll just stop. I did the same thing. I'm I'm based towards me Um, all right Rename your stuff. It's 41 Yeah, I was just doing that perfect That's good. Um, I'm gonna press record on oh I'm recording. Yeah. I forgot this You want to do'
	[DEBUG] Skipping I as it is not noun
[DEBUG] Speaker: speaker_0, Matched pattern: '\bi'm (\w+)', Raw name: 'going', Utterance: 'Just send a request. Okay. Um, so you guys know the rule

In [17]:
# Run the accumulated-score name extraction on the NATA41 transcript.
file_path = 'Test/full/NATA41 (SUSPECTED)_full.txt'

with open(file_path, "r", encoding="utf-8") as f:
    transcript = f.read()

lines = parse_transcription(transcript)
speaker_names = extract_names_by_max_score(lines)

print("Detected speaker names:")
for speaker, name in speaker_names.items():
    print(f"{speaker}: {name}")

[DEBUG] Speaker: speaker 1, Matched pattern: '\bi'm (\w+)', Raw name: 'just', Utterance: 'So I'm just saying if you accept her.'
	[DEBUG] Skipping just as it is not noun
[DEBUG] Speaker: speaker_0, Matched pattern: '\bthis is (\w+)', Raw name: 'just', Utterance: 'This is just like you guys talk. You accept or reject their offer. That's it.'
	[DEBUG] Skipping just as it is not noun
[DEBUG] Speaker: speaker_0, Matched pattern: '\bi'm (\w+)', Raw name: 'I', Utterance: 'I Yeah, so you gotta turn your camera off Um, I figured that it would be um turned on so I was like, okay, I'll just stop. I did the same thing. I'm I'm based towards me Um, all right Rename your stuff. It's 41 Yeah, I was just doing that perfect That's good. Um, I'm gonna press record on oh I'm recording. Yeah. I forgot this Uh'
	[DEBUG] Skipping I as it is not noun
[DEBUG] Speaker: speaker_0, Matched pattern: '\bi'm (\w+)', Raw name: 'going', Utterance: 'Just send a request. Okay. Um, so you guys know the rules. Um, I'm g

In [18]:
# Run the accumulated-score name extraction on the NVTV25-R transcript.
file_path = 'Test/full/NVTV25-R (SUSPECTED)_full.txt'

with open(file_path, "r", encoding="utf-8") as f:
    transcript = f.read()

lines = parse_transcription(transcript)
speaker_names = extract_names_by_max_score(lines)

print("Detected speaker names:")
for speaker, name in speaker_names.items():
    print(f"{speaker}: {name}")

[DEBUG] Speaker: speaker_2, Matched pattern: '\bi'm (\w+)', Raw name: 'going', Utterance: 'I'm going to send you the.'
	[DEBUG] Skipping going as it is not noun
[DEBUG] Speaker: speaker_2, Matched pattern: '\bi'm (\w+)', Raw name: 'going', Utterance: 'That's fine. I'm going to turn on our video because you guys got to do the same.'
	[DEBUG] Skipping going as it is not noun
[DEBUG] Speaker: speaker_2, Matched pattern: '\bi'm (\w+)', Raw name: 'going', Utterance: 'And then you guys should be able to see me. I'm going to have my project going to sit here. And then you guys can get straight.'
	[DEBUG] Skipping going as it is not noun
[DEBUG] Speaker: speaker 1, Matched pattern: '\bi'm (\w+)', Raw name: 'Ali', Utterance: 'I'm Ali.'
[DEBUG] Speaker: speaker 2, Matched pattern: '\bi'm (\w+)', Raw name: 'Rohan', Utterance: 'Hi, I'm Rohan.'
[DEBUG] Speaker: speaker 1, Matched pattern: '\bi'm (\w+)', Raw name: 'good', Utterance: 'I'm good. How are you?'
	[DEBUG] Skipping good as it is not noun
[

In [19]:
# Run the accumulated-score name extraction on the NVTV35 transcript.
file_path = 'Test/full/NVTV35 (SUSPECTED)_full.txt'
with open(file_path, "r", encoding="utf-8") as f:
    transcript = f.read()

lines = parse_transcription(transcript)
speaker_names = extract_names_by_max_score(lines)

print("Detected speaker names:")
for speaker, name in speaker_names.items():
    print(f"{speaker}: {name}")

[DEBUG] Speaker: speaker 1, Matched pattern: '\bi'm (\w+)', Raw name: 'going', Utterance: 'Yeah. All right. So during your interaction, I cannot be in the room. So I'm going to leave and then wait on the, like the right and front of the door. So once you're done, you can just open it and then come back and take you to. Okay.'
	[DEBUG] Skipping going as it is not noun
[DEBUG] Speaker: speaker_2, Matched pattern: '\bi'm (\w+)', Raw name: 'offering', Utterance: 'Okay, so I was given $10 and I'm offering you five.'
[DEBUG] Speaker: speaker_4, Matched pattern: '\bi'm (\w+)', Raw name: 'going', Utterance: 'I'm going to go ahead and see if I can get a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit 

In [52]:
# Transcript under test; the commented-out paths are other sessions tried with this cell.
file_path = 'Test/full/NVTV35 (SUSPECTED)_full.txt'
# file_path = 'Test/full/NATA41-R (UNPINNED)_full.txt'
# file_path = 'Test/full/NVTV25-R (SUSPECTED)_full.txt'
# file_path = 'Test/full/NVTV26 (SUSPECTED)_full.txt'
with open(file_path, "r", encoding="utf-8") as f:
    transcript = f.read()

# Combined detection: behaviour-based role guesses merged with name-based guesses.
detected = detect_speakers(transcript)

print("Detected speaker names:")
print(detected)
# for speaker, name in speaker_names.items():
    # print(f"{speaker}: {name}")

defaultdict(<function extract_names_by_max_score.<locals>.<lambda> at 0x7f401b10bac0>, {})
defaultdict(<class 'collections.Counter'>, {'speaker_2': Counter({'offer': 2})})
{'speaker_2': 'Ali'}
Detected speaker names:
{'speaker_2': 'Ali'}
