In [22]:
import json
import textwrap

import asyncio
from prompts.hypothesis_generation import HYPOTHESIS_GENERATION_PROMPT, HYPOTHESIS_EVALUATION_PROMPT, HYPOTHESIS_RANK_PROMPT
from response_formats import HypothesisResponse, HypothesisEvalResponse, HypothesisRankResponse
from utils import call_gpt
from dotenv import load_dotenv
from openai import AsyncOpenAI
load_dotenv()


True

In [23]:
# Shared async OpenAI client, constructed with no explicit arguments; it is the
# `client=` passed to every call_gpt(...) request in the cells below.
# NOTE(review): presumably picks up its API key from the environment populated
# by load_dotenv() in the import cell — confirm.
client = AsyncOpenAI()

In [24]:
# Identifier of the result file to analyse; it is interpolated into the paths below.
fidx = "40"

# Load the pipeline results. A context manager closes the handle deterministically
# (the bare json.load(open(...)) form leaves it to the garbage collector), and the
# encoding is pinned to UTF-8 so the read does not depend on the platform default.
with open(f'infact_dataset/results/llm_pipeline_alt/{fidx}_o4-mini_20251009.json', encoding="utf-8") as fh:
    data = json.load(fh)

# One "ID <key> | <description>" line per entry in the results mapping.
all_observations = [f"ID {k} | {v['description']}" for k, v in data.items()]

In [4]:
# Keep only the entries flagged as interesting (== 1) whose confidence exceeds 5.
options = [
    entry
    for entry in data.values()
    if 'interesting' in entry and entry['interesting'] == 1 and entry['confidence'] > 5
]

# Parallel lists, index-aligned with each other:
#   raw_obs      - the selected entry dicts themselves
#   observations - the formatted "Observation/Evidence" text sent to the model
#   tasks        - one pending call_gpt(...) request per observation, awaited in a later cell
raw_obs = []
observations = []
tasks = []
for obs in options:
    raw_obs.append(obs)
    obs_text = f"Observation: {obs['description']}\nEvidence: {obs['evidence']}"
    observations.append(obs_text)
    generation_prompt = HYPOTHESIS_GENERATION_PROMPT.format(observations=obs_text, limit=5)
    tasks.append(
        call_gpt(
            client=client,
            prompt=generation_prompt,
            model="gpt-5",
            resp_format=HypothesisResponse,
        )
    )
print(observations)

['Observation: Michelle’s workflow embodies a continuous, tightly coupled loop between software development, quantitative experimentation, and qualitative analysis, where insights from each domain immediately drive changes in the others.\nEvidence: [\'She runs best-of-N experiments in Jupyter and modular_pipeline.py, then pivots to editing Svelte components and survey content in VS Code and Obsidian (IDs 2, 127), and simultaneously refines interview summaries and usability feedback in Google Docs (IDs 225, 249, 274).\', \'Michelle simultaneously edits Svelte front-end components (ObservationSelector.svelte), tunes Python pipelines in Jupyter (modular_pipeline.py), and revises interview notes in Google Docs (‘Eval hub’), suggesting she leverages live user feedback to directly shape both experiment design and UI/backend code.\', "In every timestep (IDs 225, 274, 397) she simultaneously edits \'Eval hub\' interview bullets in Google Docs while debugging Python/Svelte code and running Jupy

In [108]:
# Print the stored reasoning for every entry that carries an 'interesting' field,
# whatever that field's value is.
for entry in data.values():
    if 'interesting' not in entry:
        continue
    print(entry['reasoning'])

This behavior—pushing sample sizes/parallelism for better best-of-N and then throttling due to 429s—is common and expected when working with API rate limits. It doesn’t uniquely reveal something about Michelle beyond standard performance tuning trade-offs, so it’s not particularly surprising or user-specific.
This describes a common developer workflow: keeping front-end and server running while focusing work on backend/model code. It doesn’t reveal a distinctive or unexpected personal pattern beyond typical role-based behavior, and could be reasonably guessed without observing this specific user.
Collapsing verbose logs and displaying only key columns are common notebook hygiene practices to manage clutter and focus on results. The behavior is expected for many users running experiments and doesn’t reveal a unique or surprising personal strategy beyond standard workflow optimization. The evidence doesn’t strongly support the more specific claim of using clear/collapse as stage markers,

In [109]:
resp = await asyncio.gather(*tasks, return_exceptions=True)
for r in resp:
    print(r)
    print('-'*100)


hypotheses=[Hypothesis(text='Shortening the Learning Loop', description='She prioritizes minimizing the time from user signal to product change to accelerate discovery and reduce rework, believing speed of iteration is the main competitive advantage.'), Hypothesis(text='Guarding Against Silo Drift', description='She distrusts handoffs and wants to preserve the nuance of user feedback, so she keeps research, experimentation, and coding tightly integrated to ensure fidelity to user needs.'), Hypothesis(text='Parallel Work as Focus Regulation', description='Her cognitive style benefits from high stimulation and continuous context linking; bouncing among code, experiments, and notes helps maintain flow and prevents loss of tacit insights.'), Hypothesis(text='Building Defensible Evidence', description='She anticipates stakeholder scrutiny and uses a mixed-methods trail—qual quotes, quant results, and immediate code changes—to justify decisions and secure buy-in.'), Hypothesis(text='Scarcity

In [110]:
# Index of the observation (and of its matching entry in `resp`) to inspect.
idx = 0

# Show the observation, wrapped to 80 columns, followed by blank lines.
print(textwrap.fill(options[idx]['description'], width=80))
print('\n')

# Then show each generated hypothesis: title, description, and a separator rule.
for hyp in resp[idx].hypotheses:
    for attr in ('text', 'description'):
        print(textwrap.fill(getattr(hyp, attr), width=80))
    print('-' * 100)

Michelle’s workflow embodies a continuous, tightly coupled loop between software
development, quantitative experimentation, and qualitative analysis, where
insights from each domain immediately drive changes in the others.


Shortening the Learning Loop
She prioritizes minimizing the time from user signal to product change to
accelerate discovery and reduce rework, believing speed of iteration is the main
competitive advantage.
----------------------------------------------------------------------------------------------------
Guarding Against Silo Drift
She distrusts handoffs and wants to preserve the nuance of user feedback, so she
keeps research, experimentation, and coding tightly integrated to ensure
fidelity to user needs.
----------------------------------------------------------------------------------------------------
Parallel Work as Focus Regulation
Her cognitive style benefits from high stimulation and continuous context
linking; bouncing among code, experiments, and notes

In [111]:
# Pair each selected observation with its generated hypotheses and persist the
# result as JSON for the ranking stage.
#
# `resp` comes from asyncio.gather(..., return_exceptions=True), so an entry may be
# an exception object instead of a response. Those entries have no `.hypotheses`;
# they are reported and skipped so one failed request does not abort the export.
output = []
for i, r in enumerate(resp):
    observation = raw_obs[i]
    if isinstance(r, BaseException):
        print(f"Skipping observation {i}: hypothesis generation failed with {r!r}")
        continue
    output.append({
        'observation': observation['description'],
        'evidence': observation['evidence'],
        'hypotheses': [h.model_dump() for h in r.hypotheses]
    })

# The context manager guarantees the file is flushed and closed before any later
# cell reads it back.
with open(f'infact_dataset/results/llm_pipeline_alt/{fidx}_o4-mini_20251009_hypotheses.json', 'w', encoding="utf-8") as fh:
    json.dump(output, fh)

In [None]:
# Reload the saved observation/hypotheses records from disk. The context manager
# closes the handle deterministically instead of leaving it to the garbage collector.
with open(f'infact_dataset/results/llm_pipeline_alt/{fidx}_o4-mini_20251009_hypotheses.json', encoding="utf-8") as fh:
    all_hypotheses = json.load(fh)

In [None]:
# Queue one ranking request per saved observation. Note that this rebinds `tasks`,
# replacing the list built for hypothesis generation.
tasks = []

for record in all_hypotheses:
    hypotheses = record['hypotheses']
    behavior = record['observation']
    # Letter labels for the candidate hypotheses; a later cell reads `options`
    # again to mark the selected letter.
    options = ["A", "B", "C", "D", "E"]
    fmt_hypotheses = [
        f"{options[pos]}. {hyp['text']}: {hyp['description']}"
        for pos, hyp in enumerate(hypotheses)
    ]
    # Context for the ranker: every other observation in `data`, excluding the one
    # whose description equals the behavior being explained.
    other_lines = [
        f"ID {key} | {val['description']}"
        for key, val in data.items()
        if val['description'] != behavior
    ]
    fmt_obs = "\n".join(other_lines)
    # NOTE(review): `fmt_hypotheses` is passed as a list, so str.format interpolates
    # its Python repr (brackets and quotes) rather than newline-joined text — confirm
    # this is intended.
    prompt = HYPOTHESIS_RANK_PROMPT.format(
        hypotheses=fmt_hypotheses,
        observations=fmt_obs,
        behavior=behavior,
    )
    tasks.append(
        call_gpt(
            client=client,
            prompt=prompt,
            model="gpt-5",
            resp_format=HypothesisRankResponse,
        )
    )


In [27]:
# Await everything currently queued in `tasks` (the ranking requests when cells are
# run in order). With return_exceptions=True a failed request appears in `resps` as
# an exception object instead of raising here, so entries may not all be responses.
resps = await asyncio.gather(*tasks, return_exceptions=True)

In [28]:
# Print each observation with its candidate hypotheses, prefixing the hypothesis
# chosen by the ranker with '*'.
#
# The loop variable is named `rank_resp` rather than `resp`, so it does not
# overwrite the `resp` list of generation results that earlier cells index into.
# `options` is the letter-label list left behind by the task-building cell.
for item, rank_resp in zip(all_hypotheses, resps):
    print((item['observation']))
    if isinstance(rank_resp, BaseException):
        # gather(..., return_exceptions=True) yields exception objects for failed
        # requests; they have no `.selection`, so list the hypotheses unmarked.
        print(f"[ranking failed: {rank_resp!r}]")
        selection = None
    else:
        selection = rank_resp.selection
    hypotheses = item['hypotheses']
    fmt_hypotheses = []
    for i, h in enumerate(hypotheses):
        marker = "*" if options[i] == selection else ""
        fmt_hypotheses.append(f"{marker}{options[i]}. {h['text']}: {h['description']}")
    print("\n".join(fmt_hypotheses))
    print('\n')

Dora’s proposal exhibits a lopsided focus on large-scale data sourcing while leaving the core model training strategy underdefined and repeatedly deferred to collaborators, creating a planning imbalance that threatens to bottleneck project execution.
A. Leaning Into Strengths, Avoiding Weaknesses: Dora is confident in data sourcing but lacks expertise in model training, so she over-specifies collection and defers modeling to reduce personal risk and anxiety. She prioritizes delivering where she feels competent and expects collaborators to cover the training gap.
B. Critical-Path Optimization: She believes data acquisition is the true schedule bottleneck and thus front-loads detailed plans there while leaving training flexible until data characteristics are known. This reflects pragmatic sequencing, even if it creates a short-term planning imbalance.
*C. Role Deference and Boundary Management: Team norms or prior ownership may assign training to Omar, prompting Dora to leave placeholder

In [29]:
# Display the raw gather results for inspection: one entry per ranking request,
# which is an exception object where that request failed.
resps

[HypothesisRankResponse(selection='C', reasoning='Repeated, explicit deferral of the Training section to Omar (IDs 15, 23, 289) alongside direct edit requests to “DORA/OMAR” (ID 63) and evidence of shared ownership norms (IDs 223, 227) point to role boundaries more than lack of capability. Dora demonstrates broad ML fluency (IDs 24, 285) and sophisticated planning, suggesting she’s not fundamentally unable but is respecting team division of labor. The strong, detailed emphasis on data sourcing (IDs 14, 260) creates the planning imbalance, but the decisive factor is deference to the teammate responsible for training rather than a purely strategic bottleneck focus (B) or signaling (D) or a belief that training is interchangeable (E). This pattern best fits Role Deference and Boundary Management.'),
 HypothesisRankResponse(selection='A', reasoning='Multiple observations show high uncertainty and pressure (IDs 15, 23, 61, 105, 149, 158, 165, 289, 291) alongside hyper-vigilant monitoring an