# Extract Goals from Transcripts

**RQ:** Does more or less transcript context improve goal extraction coverage?

**RQ:** How does high-temperature sampling affect goal extraction goverage?

**RQ:** How do hallucinations affect goal generation?


In [26]:
from openai import OpenAI

client = OpenAI()

def prompt_model(prompt):
    response = client.chat.completions.create(
      model="gpt-4o-2024-08-06",
      messages=[
        {
          "role": "system",
          "content": "You are a business analyst collecting requirements for a software application. Your job is to review interview transcripts between an interviewer who is a business analyst and a stakeholder who is a prospective user of the application. The interviewer will ask the stakeholder questions to identify their requirements for the application."
        },
        {
          "role": "user",
          "content": prompt
        }
      ]
    )
    return response.choices[0].message.content

In [77]:
import json

data_path = 'data1'

data = json.load(open('f24-transcript-data.json', 'r'))
print('Read %i transcripts' % len(data['transcript']))

Read 34 transcripts


In [84]:
import re

goals = []
for i in range(len(data['actions'])):
    g = []
    for line in data['actions'][i].split('\n'):
        if len(line) > 0 and line[0] == '*':
            g.append(line[1:].strip())
    goals.append(g)

    if i == 5:
        print(data['actions'][4])

Goals:
* Professional Communication - Primarily, Jacqueline’s email use is for career needs - receiving updates from recruiters, scheduling, and networking. Given this, reliable access to important emails from recruiters is critical.
* Staying Updated on Academic Announcements - Another goal is to use email to stay current with university communications, from notifications to updates from platforms like Canvas.
* Personal Organization – Other than goals related to school and career, Jacqueline uses email for organizing personal appointments (e.g., medical appointments) and to keep track of offers or discounts from merchants.
* Organizing Inbox - She prefers a clutter-free inbox, frequently categorizing, deleting, or tagging emails to streamline her interactions and ensure that critical emails are easy to access.
* Efficient Email Composition - Jacqueline values efficiency when composing emails and often relies on an AI assistant to make the process faster and more precise.

Actions:
* 

In [36]:
prompt2 = """Read the following goals in JSON format and identify the substrings in the interview transcript excerpt from which the goals were generated. Respond in JSON using the format {'goal': 'goal statement', 'phrases': ['phrase1', 'phrase2']}

%s

%s

Response: """

def trace_goals(transcript, goals):
    print(' .', end='')
    excerpt = '\n'.join(['%s: %s\n' % (t['speaker'], t['text']) for t in transcript])

    passed = False
    failed = 0
    while not passed and failed < 3:
        phrases = []
        p2 = prompt2 % (json.dumps(goals), excerpt)
        r2 = prompt_model(p2)
    
        if r2.startswith('```json') and r2.endswith('```'):
            try:
                phrases = json.loads(r2[7:-3])
                passed = True
            except Exception as e:
                failed += 1
  
    return {'excerpt': excerpt, 'goals': goals, 'phrases': phrases} 

results = {}
for i in range(len(data['transcript'])):
    d = data['transcript'][i]
    g = goals[i]
    print('Tracing goals to transcript %i with %i turns' % (i, len(d)), end='')
    results[i] = []
    for j in range(0, len(d), 2):
        e = trace_goals(d[j:j+4], g)
        results[i].append(e)
    print(' .done!')

Tracing goals to transcript 0 with 84 turns . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .done!
 . . . . . . . . . . . . . . . . . . . . . . . . .done!
 . . . . . . . . . . . . . .done!h 27 turns .
 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .done!
 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .done!
 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .done!
 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .done!
 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .done!
 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .done!
 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 

In [38]:
json.dump(results, open('%s/human-trace.json' % data_path, 'w+'))

In [39]:
results = json.load(open('%s/human-trace.json' % data_path, 'r'))

## Analyze Extracted Goals and Source Goal Phrases

In [60]:
from collections import Counter
import re

def annotate(excerpt, phrases):
    index = []
    for p in phrases:
        for m in re.finditer(p, excerpt):
            index.append([m.start(), '['])
            index.append([m.end(), ']'])

    
    for i, c in sorted(index, key = lambda x:x[0], reverse = True):
        excerpt = excerpt[:i] + c + excerpt[i:]
    return excerpt

def eval_extracted(extracted, counter=Counter(), base_turn=0):
    errors = []
    review = []
    for e in extracted:
        for g in e['phrases']:
            # if g is not a dict, then model failed to follow format instructions
            if not isinstance(g, dict):
                counter['type errors'] += 1
                continue

            # if g is missing phrases key, then model failed to follow format instructions
            elif not 'phrases' in g:
                counter['type errors'] += 1
                continue

            # if phrase goal doesn't match original generated goal
            if g['goal'] not in e['goals']:
                counter['goal mismatch'] += 1

            # if phrase list is empty
            if len(g['phrases']) == 0:
                counter['no phrases'] += 1

            # check for phrase match separately for interviewer and stakeholder
            i_c_total = 0
            s_c_total = 0
            i_c = 0
            s_c = 0
            turn_match = []
            for p in g['phrases']:
                turns = e['excerpt'].split('\n\n')
                i_c = 0
                s_c = 0
                for i, t in enumerate(turns):
                    i_p = re.findall(r'Interviewer:\s(.+)', t)
                    s_p = re.findall(r'Stakeholder:\s(.+)', t)
                    
                    i_c_t = sum([a.lower().count(p.lower()) for a in i_p])
                    s_c_t = sum([a.lower().count(p.lower()) for a in s_p])
                    if i_c_t + s_c_t > 0:
                        turn_match.append(base_turn + i)

                    i_c += i_c_t
                    s_c += s_c_t

                # if not match between interviewer and stakeholder
                if i_c + s_c == 0:
                    counter['unmatched phrase'] += 1
                    errors.append({'excerpt': e['excerpt'], 'phrase': p, 'goal': g['goal']})

                # if only one match between interviewer and stakeholder
                if i_c + s_c == 1:
                    counter['unique phrases'] += 1
                
                # count speaker role from which matches were found
                counter['from interviewer'] += i_c
                i_c_total += i_c
                s_c_total += s_c
                counter['from stakeholder'] += s_c
                counter['total phrases'] += 1
                
            # if interviewer and stakeholder matched for one goal, then count multi-turn goal
            multi = 'N'
            counter['interviewer goal'] += 1 if i_c > 0 else 0
            counter['stakeholder goal'] += 1 if s_c > 0 else 0
            if i_c_total > 0 and s_c_total > 0:
                counter['multi-turn goal'] += 1
                multi = 'Y'
            if i_c_total + s_c_total > 0:
                review.append({'index': len(review), 'goal': g['goal'], 'excerpt': annotate(e['excerpt'], g['phrases']), 's_match': s_c_total, 'i_match': i_c_total, 'turn_match': turn_match, 'multiturn': multi})
    return counter, errors, review

In [71]:
from collections import Counter
import csv, math

goals = {}
global_count = Counter()
counters = []
match_dist = Counter()
for i, samples in results.items():
    goals[i] = []
    reviews = []
    
    for j in range(len(samples)):
        counter, errors, review = eval_extracted([samples[j]], Counter(), base_turn = j * 2)
        counter['total turns'] = len(data['transcript'][int(i)])
        counters.append(counter)
        
        for k, c in counter.items():
            global_count[k] += c

        reviews.extend(review)
    
        # record population data for evaluating the prompt
        goals[i].append([])
        for r in review:
            goals[i][j].append(r['goal'])
            
            for m in r['turn_match']:
                norm_m = math.floor(10 * m / counter['total turns'])
                match_dist[norm_m] += 1

    with open('human3/%s-generated-goals.csv' % i, 'w+') as f:
        writer = csv.DictWriter(f, fieldnames=['index', 'excerpt', 'goal', 'i_match', 's_match', 'turn_match', 'multiturn'])
        writer.writeheader()
        writer.writerows(reviews)

# how does the distribution compare to 'lost in the middle'?
for i in sorted(match_dist.keys()):
    print('%i\t%i' % (i, match_dist[i]))

0	295
1	634
2	757
3	557
4	579
5	525
6	542
7	493
8	396
9	191


In [62]:
fieldnames = set([k for c in counters for k in c.keys()])
with open('%s/human-statistics.csv' % data_path, 'w+') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(counters)

In [63]:
print(global_count)

Counter({'total turns': 133084, 'total phrases': 5952, 'unique phrases': 4927, 'from stakeholder': 4668, 'stakeholder goal': 2327, 'unmatched phrase': 996, 'no phrases': 775, 'from interviewer': 326, 'goal mismatch': 184, 'type errors': 162, 'interviewer goal': 161, 'multi-turn goal': 135})


In [71]:
import statistics

turn_count = []
for i in range(0, len(counters), 10):
    turn_count.append(counters[i]['total turns'])
print(statistics.median(turn_count))
print(turn_count)

70
[70, 78, 27, 46, 70, 75, 19, 56, 50, 114, 116, 33, 163, 46, 84, 50, 27, 191, 104, 111, 59, 111, 86, 118, 65, 48, 98, 77, 36, 67, 60, 142, 118]


In [45]:
print(goals.keys())

dict_keys(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33'])


In [48]:
for i in goals['0']:
    print(goals['0'])

[[], [], [], [], [], [], [], [], [], [], ['Allow users to PIN or bookmark important chats at the top of the chat list.', 'Enable organization of chats based on importance, like family and friends.'], [], [], ['Enable users to search for messages within a chat by clicking a three-dot icon.', 'Allow users to enter search queries to find messages containing specific keywords.', 'Provide functionality to search for images containing specific keywords.'], [], [], [], [], [], [], [], [], ['The application should allow users to search for images using keywords.'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], []]
[[], [], [], [], [], [], [], [], [], [], ['Allow users to PIN or bookmark important chats at the top of the chat list.', 'Enable organization of chats based on importance, like family and friends.'], [], [], ['Enable users to search for messages within a chat by clicking a three-dot icon.', 'Allow users to enter search queries to find messages containing spe