In [15]:
# as always, some boilerplate
from openai import OpenAI
import os
import base64
import requests
from tqdm import tqdm
from IPython.display import FileLink, display, Markdown
from dotenv import load_dotenv
from random import shuffle, randint, choice, random
from math import floor
from convokit import Corpus, Speaker, Utterance

# Load API key
_ = load_dotenv("../../../comm4190_F25/01_Introduction_and_setup/.env")
client = OpenAI()

# changing the topic to make it a bit more conversational too and less of a debate
TOPIC = """Code, testing, and infra as a source of truth versus comprehensive documentation."""

# we're interested in consensus
EVALUATION_PROMPT = """
Your objective is to analyze this conversation between a few speakers.
Your response should follow this organization:
- Dynamic: Collaborative (1) vs. Competitive (10)
- Conclusiveness: Consensus (1) vs. Divergence (10)
- Speaker Identity: Similarity (1) vs. Diversity (10)
- Speaker Fluidity: Malleability (1) vs. Consistency (10)
Please offer a score from 1 to 10 for each.
For each section, format your result as follows:
**[Section Name]:**

Score: [score]/10

Verdict: [a short summary]

Explanation: [reasoning with explicit examples from the conversation]

Use Markdown when convenient.
"""

def generate_llm_conversation_review(conversation: str):
    input_chat = [
        {
            "role": "system",
            "content": EVALUATION_PROMPT
        },
        {
            "role": "user",
            "content": "Here is the transcript\n" + conversation
        }
    ]
    response = client.chat.completions.create(
        model = "gpt-4o",
        messages = input_chat,
        store = False
    )
    display(Markdown(response.choices[0].message.content))

# code to save the conversation
def save_conversation(
    filename: str,
    conversation_history: list[dict]
) -> str:

    messages = []

    for record in conversation_history:

        if record["role"] == "user":
            messages.append("mediator:\n" + record["content"])
        
        if record["role"] == "assistant":
            messages.append(f"{record["name"]}:\n{record["content"]}")
    
    conversation_transcript = "\n\n".join(messages)
    
    with open(filename, "w", encoding="utf-8") as f:
        f.write(conversation_transcript)
    
    display(FileLink(filename))

    return conversation_transcript

## Generating a Conversation
Let's build a sufficiently long conversation so we can create a way to analyze it.

In [2]:
NEW_SYSTEM_PROMPT = (
    "You a participant in a conversation between experienced software engineers. "
    "Keep questions minimal and only use them when necessary. "
    "Please greet the other participants when you join."
)

def run_conversation(
    iterations: int, 
    openai_model_id: str,
    participant_count: int,
    participant_personas: list[str],
    topic: str,
    system_prompt: str,
    dropout_chance: float
) -> list[dict]:
    conversation_history = [
        {"role": "system", "content": f"{system_prompt} The topic is: {topic}"}
    ]

    ordering = list(range(1, participant_count + 1))
    last_speaker = -1

    def build_message(history, speaker_id, persona, message_window_size):

        speaker_messages = [
            msg for msg in history 
            if msg.get("name") == speaker_id
        ][-message_window_size:]
    
        other_messages = [
            msg for msg in history 
            if msg.get("name") not in (None, speaker_id)  # skip system, skip self
        ][-message_window_size:]

        transcript = []
        if speaker_messages:
            transcript.append("Recent messages from you:")
            transcript.extend(
                f"- {msg['content']}" for msg in speaker_messages
            )
        if other_messages:
            transcript.append("\nRecent messages from others:")
            transcript.extend(
                f"- {msg.get('name', msg['role'])}: {msg['content']}"
                for msg in other_messages
            )
    
        transcript_str = "\n".join(transcript)
        
        return history + [
            {
                "role": "user", 
                "content": (
                    f"{speaker_id}, please share your perspective with the others and engage "
                    f"with their responses."
                )
            },
            {
                "role": "assistant",
                "name": speaker_id,
                "content": (
                    f"I should remember that the following is the most current state of the conversation.\n"
                    f"{transcript_str}\n\n"
                    f"I also recall my identity is {persona}."
                )
            }
        ]

    def shuffle_order(ordering: list[int]) -> list[int]:
        first = choice(ordering[:-1])
        remaining = [p for p in ordering if p != first]
        shuffle(remaining)
        return [first] + remaining

    for i in tqdm(range(iterations)):

        # shuffle ordering
        if i > 0:
            ordering = shuffle_order(ordering)

        # follow ordering
        for participant_id in ordering:

            # chance to skip speaker and avoid double speak (1984)
            if random() < dropout_chance or last_speaker == participant_id:
                continue

            speaker_id = f"speaker_{participant_id}"
            persona = participant_personas[participant_id - 1]
            response = client.chat.completions.create(
                model = openai_model_id,
                messages=build_message(conversation_history, speaker_id, persona, 5),
                store = False
            )
            message = response.choices[0].message.content
            conversation_history.append({"role": "assistant", "name": speaker_id, "content": message})
            last_speaker = participant_id

    return conversation_history

In [5]:
personas = [
    "a software engineer in big tech with mainly internal work",
    "an open source developer with experience in major upstream projects",
    "a founder of a startup"
]
conversation = run_conversation(20, 'gpt-4o', 3, personas, TOPIC, NEW_SYSTEM_PROMPT, 0.3)

100%|██████████| 20/20 [02:52<00:00,  8.63s/it]


In [7]:
transcript = save_conversation("long_conversation.txt", conversation)

## Our Old LLM Analyzer
We love our good buddy GPT-4o. Not doing work is super exciting. 

In [12]:
import re
# load transcript from here to pick up where we left off
with open("long_conversation.txt", "r") as file:
    transcript = file.read()

pattern = re.compile(r'(speaker_\d+):\n(.*?)(?=\nspeaker_\d+:|$)', re.DOTALL)

matches = pattern.findall(transcript)

# Display results
conversation = []
for speaker, message in matches:
    message = message.strip().replace('\n', ' ')
    conversation.append({
        "name": speaker,
        "content": message,
        "role": "assistant"
    })

In [4]:
generate_llm_conversation_review(transcript)

**Dynamic:**

Score: 2/10

Verdict: The conversation is distinctly collaborative, with participants building on each other's points and sharing strategies.

Explanation: Speakers frequently express agreement and expand upon each other's ideas, as seen in speaker_2 supporting speaker_3's points and vice versa. Their discussions revolve around mutual challenges and solutions, recognizing each other's contributions positively.

**Conclusiveness:**

Score: 2/10

Verdict: The conversation leans heavily towards consensus, with participants finding common ground and aligning their views on documentation practices.

Explanation: All speakers articulate similar viewpoints on key topics, such as the balance between code and documentation, and the role of tools like AI and collaborative platforms. Despite different environments (startup, open-source, big tech), they converge on solutions, indicating low divergence.

**Speaker Identity:**

Score: 7/10

Verdict: The speakers exhibit distinct identities based on their environments (startup, open-source, big tech), affecting their approaches to documentation.

Explanation: Each speaker brings a unique perspective influenced by their specific context, such as speaker_3's agility focus in startups, speaker_1's emphasis on comprehensive documentation in big tech, and speaker_2's open-source considerations. However, they share a common interest in documentation improvements.

**Speaker Fluidity:**

Score: 8/10

Verdict: Speakers maintain consistent stances and identities throughout the conversation, contributing steadily from their perspectives.

Explanation: Each speaker consistently integrates their experiences and methodologies into the discussion, reinforcing their individual professional backgrounds. They stick to their respective contexts and offer specific examples (e.g., tools and strategies) relevant to their environments, reflecting consistent roles in the dialogue.

## Build a Corpus
Now it's time for us to assemble a ConvoKit corpus, which is a common data structure that forms the base of a lot of analysis pipelines in ConvoKit.

In [65]:
# filter messages for assistant messages only
assistant_messages = [
    message for message in conversation if message["role"] == "assistant"
]

speakers = {message["name"]: Speaker(id=message["name"]) for message in assistant_messages}

print(len(assistant_messages))
print(list(speakers))

utterances = [Utterance(
    id=str(i),
    speaker=speakers[message["name"]],
    text=message["content"],
    conversation_id="conversation_1",
    reply_to= str(i-1) if i != 0 else None
) for i, message in enumerate(assistant_messages)]

corpus = Corpus(utterances=utterances)

print(len(utterances))

32
['speaker_2', 'speaker_3', 'speaker_1']
32


In [9]:
!/opt/jupyterhub/share/jupyter/venv/python3-12_comm4190/bin/python -m spacy download en_core_web_sm


This process (pid=1347514) is multi-threaded, use of forkpty() may lead to deadlocks in the child.



Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m7.2 MB/s[0m  [33m0:00:01[0m6m0:00:01[0m00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


## Politeness/Collaboration
Let's get a coarse measure for politeness given ConvoKit's politeness strategies feature extraction results.

In [66]:
from convokit import TextParser, PolitenessStrategies
import json

# Analyze politeness
ps = PolitenessStrategies()
parser = TextParser()
text_corpus = parser.transform(corpus)
ps_corpus = ps.transform(text_corpus)

example_utterance = next(corpus.iter_utterances())
print(example_utterance.speaker)
# print(json.dumps(example_utterance.meta['politeness_strategies'], indent = 4))

# first (and only) metric for now, count strategies used
def get_politeness_metrics(corpus: Corpus) -> dict[str, int]:

    def get_strategy_count(utterance):
        utterance_meta = utterance.meta
        speaker_id = utterance.speaker.id
        strategies = utterance.meta['politeness_strategies']
        count = sum(strategies.values())
        return (speaker_id, count)
    
    utterances = corpus.iter_utterances()
    return {
        "counts": [get_strategy_count(u) for u in utterances]
    }

ps_metrics = get_politeness_metrics(ps_corpus)
print(ps_metrics)

Speaker(id: 'speaker_2', vectors: [], meta: ConvoKitMeta({}))
{'counts': [('speaker_2', 8), ('speaker_3', 8), ('speaker_2', 6), ('speaker_3', 7), ('speaker_2', 6), ('speaker_3', 5), ('speaker_2', 6), ('speaker_1', 7), ('speaker_3', 7), ('speaker_2', 6), ('speaker_1', 7), ('speaker_3', 7), ('speaker_2', 8), ('speaker_1', 5), ('speaker_3', 6), ('speaker_2', 6), ('speaker_3', 5), ('speaker_1', 6), ('speaker_3', 6), ('speaker_2', 6), ('speaker_1', 5), ('speaker_2', 5), ('speaker_3', 9), ('speaker_2', 7), ('speaker_3', 5), ('speaker_1', 8), ('speaker_3', 6), ('speaker_1', 7), ('speaker_2', 7), ('speaker_1', 6), ('speaker_3', 9), ('speaker_2', 6)]}


In [46]:
totals = {}
for speaker, ct in ps_metrics['counts']:
    if speaker not in totals:
        totals[speaker] = [1, ct]
    else:
        n, tct = totals[speaker]
        totals[speaker] = n + 1, ct + tct
for speaker, (n, tct) in totals.items():
    print(speaker)
    print("Utterances:", n)
    print("Strategies:", tct)
    print("Average:", round(tct / n, 3))
    print()

speaker_2
Utterances: 12
Strategies: 77
Average: 6.417

speaker_3
Utterances: 12
Strategies: 80
Average: 6.667

speaker_1
Utterances: 8
Strategies: 51
Average: 6.375



## Consensus: TF/IDF Similarity
We'll look at consensus overall as the TF/IDF similarity of messages in the conversation, potentially weighting them to make them heavier.

In [68]:
from convokit import ColNormedTfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Step 1: Parse corpus text (if not already done)
parser = TextParser()
tx_corpus = parser.transform(corpus)

# Step 2: Apply TF-IDF transformer
tfidf = ColNormedTfidfTransformer(input_field="text")
tx_corpus = tfidf.transform(tx_corpus)

# Step 3: Compute consensus as average pairwise cosine similarity
def compute_consensus(corpus: Corpus) -> float:
    utterances = list(corpus.iter_utterances())
    vectors = [utt.meta['tfidf'] for utt in utterances if 'tfidf' in utt.meta]

    if len(vectors) < 2:
        return 0.0  # Not enough data to compute consensus

    matrix = np.vstack(vectors)
    sim_matrix = cosine_similarity(matrix)

    # Extract upper triangle (excluding diagonal) for pairwise similarities
    n = sim_matrix.shape[0]
    upper_triangle = [sim_matrix[i, j] for i in range(n) for j in range(i+1, n)]

    return np.mean(upper_triangle)

# Run it
consensus_score = compute_consensus(tx_corpus)
print("Consensus score (TF-IDF similarity):", consensus_score)


ImportError: cannot import name 'ColNormedTfidfTransformer' from 'convokit' (/opt/jupyterhub/share/jupyter/venv/python3-12_comm4190/lib/python3.12/site-packages/convokit/__init__.py)

## Zooming Out

This is probably the closest I've come so far to having something that looks like an "analysis pipeline" instead of just vibes: generate a conversation, turn it into a corpus, run a few feature extractors, and spit out numbers. Even if the specific metrics here (politeness counts, TF–IDF consensus) are pretty rough, they at least give me a repeatable handle on how the same setup behaves across runs.

At the same time, these experiments make it very obvious how much judgment still sits outside the notebook. Deciding whether a consensus score is "good" or "bad" is still a human call, and a single number can't tell you whether a conversation is insightful, boring, or off-the-rails. My plan for the rest of the series is to keep layering these tools—LLM-based summaries, token accounting, TF–IDF, ConvoKit—into a shared toolkit I can reuse instead of reinventing everything from scratch each time.

> **Future Work:**
> - Apply the same pipeline to a handful of very different topics to see how stable these metrics are.
> - Compare human ratings of "good" conversations against the automatic scores here.
> - Use the token-impact estimator from Blog 9 to relate conversational quality back to actual resource usage.
