# Extract Goals from Transcripts - Whole Transcript

This notebook extracts goals from interview transcripts in two stages: (1) generate goals from the whole transcript; (2) trace each generated goal back to the source text in the transcript.

In [1]:
from openai import OpenAI

# Module-level API client shared by the cells below. It is constructed with no
# explicit arguments, so credentials are not set in this notebook.
# NOTE(review): presumably the key is read from the environment (OPENAI_API_KEY) — confirm.
client = OpenAI()

def prompt_model(prompt):
    """Run one chat completion and return the text of its first choice.

    The request always carries the same system message (a business-analyst
    persona reviewing requirements interviews) followed by ``prompt`` as the
    single user message.

    Args:
        prompt: Text sent as the user message.

    Returns:
        The ``content`` of the first choice's message in the response.
    """
    system_message = {
        "role": "system",
        "content": "You are a business analyst collecting requirements for a software application. Your job is to review interview transcripts between an interviewer who is a business analyst and a stakeholder who is a prospective user of the application. The interviewer will ask the stakeholder questions to identify their requirements for the application.",
    }
    user_message = {"role": "user", "content": prompt}

    completion = client.chat.completions.create(
        model="gpt-4o-2024-08-06",
        messages=[system_message, user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [2]:
import json

data_path = 'data1'

# Load the transcript collection. Later cells read data['transcript'] (a list of
# transcripts, each a list of turns with 'speaker' and 'text') and data['ids'].
# A context manager is used so the file handle is closed deterministically.
with open('%s/transcripts.json' % data_path, 'r') as transcripts_file:
    data = json.load(transcripts_file)
print('Read %i transcripts' % len(data['transcript']))

Read 34 transcripts


In [3]:
# Stage 1 template: ask for goals over the whole transcript. The single %s is
# filled with the formatted transcript excerpt.
prompt1 = """Read the following interview transcript excerpt and respond with any goals that the speaker expresses. Write each goal in general language so that it describes only one action, and do not include references to applications, products or services in the goal. Only write goals that can be traced to specific phrases in the speech. Respond with the goals in a JSON list of strings.

%s

Goals: 
"""

# Stage 2 template: trace goals back to phrases. The first %s is filled with the
# goals JSON from stage 1, the second with the same transcript excerpt.
prompt2 = """Read the following goals in JSON format and identify the substrings in the interview transcript excerpt from which the goals were generated. Respond in JSON using the format {'goal': 'goal statement', 'phrases': ['phrase1', 'phrase2']}

%s

%s

Response: """


def _strip_code_fence(reply):
    """Return the payload of ``reply`` with any surrounding Markdown code fence removed.

    Handles a fenced block with a ``json`` tag, a bare fenced block, and an
    unfenced reply; surrounding whitespace outside the fence is ignored.
    """
    text = reply.strip()
    if text.startswith('```'):
        text = text[3:]
        if text.endswith('```'):
            text = text[:-3]
        # Drop the optional language tag that follows the opening fence.
        if text[:4].lower() == 'json':
            text = text[4:]
    return text


def _parse_json_reply(reply):
    """Parse a model reply as JSON; return ``(value, payload_text)`` or ``(None, payload_text)``."""
    if not isinstance(reply, str):
        # e.g. the model returned no text content at all
        return None, ''
    payload = _strip_code_fence(reply)
    try:
        return json.loads(payload), payload
    except json.JSONDecodeError:
        return None, payload


def extract_goals(transcript, prompt_fn=None):
    """Extract goals from one transcript and trace them back to source phrases.

    Stage 1 sends the whole transcript with ``prompt1`` and parses the reply as
    a JSON list of goals. Stage 2 runs only if stage 1 produced parseable JSON:
    it sends the goals JSON and the transcript with ``prompt2`` and parses the
    reply as the goal-to-phrases mapping.

    Replies are accepted with or without a Markdown code fence and with
    surrounding whitespace. A reply that cannot be parsed as JSON leaves the
    corresponding result as an empty list instead of raising, so one bad reply
    does not abort a long batch run.

    Args:
        transcript: Iterable of turns, each a mapping with 'speaker' and 'text'.
        prompt_fn: Optional callable taking a prompt string and returning the
            model's reply text. Defaults to the notebook-level ``prompt_model``.

    Returns:
        dict with keys 'excerpt' (the formatted transcript text), 'goals'
        (parsed stage-1 JSON, or []) and 'phrases' (parsed stage-2 JSON, or []).
    """
    if prompt_fn is None:
        # Looked up at call time so the default is whatever prompt_model is
        # currently defined in the notebook.
        prompt_fn = prompt_model

    excerpt = '\n'.join(['%s: %s\n' % (t['speaker'], t['text']) for t in transcript])

    goals = []
    phrases = []

    parsed_goals, goals_json = _parse_json_reply(prompt_fn(prompt1 % excerpt))
    if parsed_goals is not None:
        goals = parsed_goals

        parsed_phrases, _ = _parse_json_reply(prompt_fn(prompt2 % (goals_json, excerpt)))
        if parsed_phrases is not None:
            phrases = parsed_phrases

    return {'excerpt': excerpt, 'goals': goals, 'phrases': phrases}

# Run the two-stage extraction on every transcript and collect the runs per
# transcript id. Progress is printed as one dot per run.
results = {}
for i, transcript in enumerate(data['transcript']):
    print('Extracting goals from transcript %i with %i turns' % (i, len(transcript)), end='')

    runs = []
    results[data['ids'][i]] = runs
    # goal extraction performed 10 times per transcript
    for _ in range(10):
        print(' .', end='')
        runs.append(extract_goals(transcript))
    print(' .done!')

Extracting goals from transcript 0 with 84 turns . . . . . . . . . . .done!
 . . . . . . . . . .done!nscript 1 with 50 turns .
 . . . . . . . . . .done!nscript 2 with 27 turns .
 . . . . . . . . . .done!nscript 3 with 191 turns .
 . . . . . . . . . .done!nscript 4 with 104 turns .
 . . . . . . . . . .done!nscript 5 with 70 turns .
 . . . . . . . . . .done!nscript 6 with 111 turns .
 . . . . . . . . . .done!nscript 7 with 59 turns .
 . . . . . . . . . .done!nscript 8 with 111 turns .
 . . . . . . . . . .done!nscript 9 with 86 turns .
 . . . . . . . . . .done!nscript 10 with 56 turns .
 . . . . . . . . . .done!nscript 11 with 118 turns .
 . . . . . . . . . .done!nscript 12 with 46 turns .
 . . . . . . . . . .done!nscript 13 with 70 turns .
 . . . . . . . . . .done!nscript 14 with 114 turns .
 . . . . . . . . . .done!nscript 15 with 78 turns .
 . . . . . . . . . .done!nscript 16 with 65 turns .
 . . . . . . . . . .done!nscript 17 with 48 turns .
 . . . . . . . . . .done!nscript 18 with 98

In [4]:
# Persist all extraction runs, keyed by transcript id. The context manager
# guarantees the file is flushed and closed before the cell finishes.
with open('%s/extracted-whole.json' % data_path, 'w') as results_file:  # moved under data0/extracted-whole.json
    json.dump(results, results_file)

In [5]:
# Show the keys of `results`: the transcript ids that have extraction runs.
print(results.keys())

dict_keys(['32', '35', '34', '33', '20', '18', '27', '9', '11', '7', '29', '16', '6', '28', '17', '10', '19', '26', '8', '21', '36', '31', '30', '24', '23', '4', '15', '3', '12', '13', '5', '14', '22', '25'])
