# Extract Goals from Transcripts - Moving Window

This notebook extracts goals from interview transcripts in two stages: (1) generate goals from a moving window over each transcript; (2) trace the generated goals back to their source text in the transcript.

In [3]:
from openai import OpenAI

# Shared API client used by prompt_model below. No credentials are passed here;
# NOTE(review): presumably they are picked up from the environment — confirm.
client = OpenAI()

def prompt_model(prompt):
    """Send ``prompt`` to the chat model and return the text of its reply.

    The request always carries the same system message (a business-analyst
    persona reviewing requirements interviews) followed by ``prompt`` as the
    only user message.

    Args:
        prompt: Full text of the user message.

    Returns:
        The ``content`` of the first choice in the completion response.
    """
    system_message = {
        "role": "system",
        "content": "You are a business analyst collecting requirements for a software application. Your job is to review interview transcripts between an interviewer who is a business analyst and a stakeholder who is a prospective user of the application. The interviewer will ask the stakeholder questions to identify their requirements for the application.",
    }
    user_message = {"role": "user", "content": prompt}

    completion = client.chat.completions.create(
        model="gpt-4o-2024-08-06",
        messages=[system_message, user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [4]:
import json

# Directory holding the input transcripts; the output cell also writes here.
data_path = 'data1'

# Load the transcripts. Later cells read data['transcript'] (one list of turns
# per interview) and data['ids'] (the matching transcript identifiers).
# A context manager closes the file handle as soon as parsing finishes.
with open('%s/transcripts.json' % data_path, 'r') as transcripts_file:
    data = json.load(transcripts_file)
print('Read %i transcripts' % len(data['transcript']))

Read 34 transcripts


In [5]:
# NOTE(review): the sentence below reads like a prompt instruction that is not
# part of prompt1 as written; it appears to be kept here for reference only.
# Each goal should be written in the format of VERB[STATE] where VERB is one of Achieve, Maintain or Avoid, and STATE describes a system state to be achieved, maintained or avoided.

# Stage 1 prompt (goal generation). The single %s placeholder is filled with the
# formatted transcript excerpt built in extract_goals.
prompt1 = """Read the following interview transcript excerpt and respond with any goals that the speaker expresses. A goal describes a prescriptive statement of intent that the system should satisfy through the cooperating of its agents. Do not include references to applications, products or services in the goal statement. Only write goals that can be traced to specific phrases in the speech. Respond with the goals in a JSON list of strings.

%s

Response: 
"""

# Stage 2 prompt (tracing goals back to the text). The first %s receives the
# goals JSON text obtained from stage 1, the second %s the same excerpt.
prompt2 = """Read the following goals in JSON format and identify the substrings in the interview transcript excerpt from which the goals were generated. Respond in JSON using the format {'goal': 'goal statement', 'phrases': ['phrase1', 'phrase2']}

%s

%s

Response: """

def _strip_code_fence(text):
    """Return a model reply without its surrounding Markdown code fence.

    Handles replies with or without a ``` fence, with or without a ``json``
    language tag, and ignores leading/trailing whitespace. ``None`` yields ''.
    """
    payload = (text or '').strip()
    if len(payload) >= 6 and payload.startswith('```') and payload.endswith('```'):
        payload = payload[3:-3]
        # Drop the optional language tag that follows the opening fence.
        if payload[:4].lower() == 'json':
            payload = payload[4:]
    return payload.strip()


def _parse_json_reply(text):
    """Decode a model reply as JSON.

    Returns a ``(payload, value)`` pair: the fence-stripped reply text and the
    decoded value, or ``(payload, None)`` when the payload is not valid JSON.
    """
    payload = _strip_code_fence(text)
    try:
        return payload, json.loads(payload)
    except json.JSONDecodeError:
        # A malformed reply should not abort a long extraction run.
        return payload, None


def extract_goals(transcript):
    """Extract goals from a window of transcript turns and trace them to the text.

    Stage 1 sends the formatted excerpt to the model with ``prompt1`` and parses
    the reply as a JSON list of goals. Stage 2 sends those goals plus the excerpt
    with ``prompt2`` and parses the reply as the goal-to-phrases mapping.

    Replies are accepted with or without a Markdown code fence and with
    surrounding whitespace. A reply that is missing or not valid JSON is
    treated as "nothing extracted" instead of raising.

    Args:
        transcript: Sequence of turns, each a mapping with 'speaker' and 'text'.

    Returns:
        A dict with keys 'excerpt' (the formatted text sent to the model),
        'goals' (decoded stage 1 reply, or [] if none) and 'phrases' (decoded
        stage 2 reply, or [] if none or if stage 2 was skipped).
    """
    excerpt = '\n'.join(['%s: %s\n' % (t['speaker'], t['text']) for t in transcript])
    result = {'excerpt': excerpt, 'goals': [], 'phrases': []}

    # Nothing to analyse: avoid a pointless model call on an empty window.
    if not transcript:
        return result

    goals_json, goals = _parse_json_reply(prompt_model(prompt1 % excerpt))
    # Without any goals there is nothing to trace, so stage 2 is skipped.
    if not goals:
        return result
    result['goals'] = goals

    _, phrases = _parse_json_reply(prompt_model(prompt2 % (goals_json, excerpt)))
    if phrases is not None:
        result['phrases'] = phrases

    return result

# Moving-window parameters: each window covers WINDOW_SIZE consecutive turns and
# successive windows start WINDOW_STEP turns apart, so neighbouring windows overlap.
WINDOW_SIZE = 4
WINDOW_STEP = 2

# Maps each transcript id to the list of per-window extraction results.
results = {}
for i, turns in enumerate(data['transcript']):
    # flush=True keeps the progress line up to date during the long-running calls.
    print('Extracting goals from transcript %i with %i turns' % (i, len(turns)), end='', flush=True)

    transcript_id = data['ids'][i]
    results[transcript_id] = []
    for start in range(0, len(turns), WINDOW_STEP):
        print(' .', end='', flush=True)

        # Windows near the end of a transcript may hold fewer than WINDOW_SIZE turns.
        window = turns[start:start + WINDOW_SIZE]
        # Append as we go so partial results survive an interrupted run.
        results[transcript_id].append(extract_goals(window))
    print(' .done!')

Extracting goals from transcript 0 with 84 turns . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .done!
 . . . . . . . . . . . . . . . . . . . . . . . . .done!
 . . . . . . . . . . . . . .done!2 with 27 turns .
 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .done!
 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .done!
 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .done!
 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .done!
 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .done!
 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .done!
 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 

In [9]:
# Spot-check: print the goals extracted from every window of transcript '30'.
for window_result in results['30']:
    print(window_result['goals'])

['Help prepare for university applications by achieving a good contest score.', 'Use platforms to practice coding questions for job interviews.', 'Discuss difficult coding questions with friends easily.', 'Find more explained solutions with detailed descriptions and examples for coding problems.']
['Easily identify and refer to specific coding problems for discussion with others.', 'Access detailed explanations and walkthroughs for coding solutions.', 'Quickly practice specific coding questions for interview preparation.']
['Practice specific question numbers to prepare for interviews.', 'Send question numbers to friends for easy reference.', 'Receive question numbers from friends to easily locate specific questions.', 'Practice categorized and concise list of questions beneficial for job interviews.', 'Discover and learn new algorithms.', 'Explore a variety of different questions.']
['Practice coding skills to improve performance in job interviews.', 'Discover a variety of different q

In [7]:
# Persist all extraction results as JSON (moved under data0/extracted-moving.json).
# The context manager guarantees the file is flushed and closed once written.
with open('%s/extracted-moving.json' % data_path, 'w') as results_file:
    json.dump(results, results_file)

In [8]:
# List the transcript ids for which results were collected.
print(results.keys())

dict_keys(['32', '35', '34', '33', '20', '18', '27', '9', '11', '7', '29', '16', '6', '28', '17', '10', '19', '26', '8', '21', '36', '31', '30', '24', '23', '4', '15', '3', '12', '13', '5', '14', '22', '25'])
