# ChatGPT API
Using ChatGPT API for various reasons - summaries, theme annotation, etc

In [290]:
import sys, os
import pandas as pd
import numpy as np
from ast import literal_eval
from speach import elan
from difflib import SequenceMatcher
import re
import json
import time
from tqdm import tqdm

import IPython
import ipywidgets as widgets
from IPython.display import Markdown, display

In [11]:
# Loading and parametrizing
import openai
import tiktoken # get the number of tokens: use tiktoken

with open("../src/openai_apk", 'r') as f:
    d = {x.split('=')[0]:literal_eval(x.split('=')[1]) for x in f.readlines()}
openai.api_key = d['API_KEY']

## Functions

### ChatGPT prompt / calls

In [None]:
def get_number_tokens(s:str, model:str="gpt-3.5-turbo"):
    enc = tiktoken.get_encoding("cl100k_base") # can use for other models
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        encoding = tiktoken.get_encoding("cl100k_base")
    encs = enc.encode(s)
    return len(encs)

In [None]:
def create_prompt(s:str, speakers:list):
    s = f"""ce texte est extrait d'une conversation entre {' et '.join(speakers)}. lorsqu'un nouveau locuteur prend la parole, son nom est indiqué entre crochets (par exemple <{speakers[0]}>). Peux-tu me donner les différents thèmes de la conversation et me citer la phrase par laquelle ils débutent ?

conversation: "{s}" """
    return s

In [None]:
def call_api(s:str, model:str="gpt-3.5-turbo"):#, max_tokens:int=256):
    """
    Response:
    {
        'id': 'chatcmpl-6p9XYPYSTTRi0xEviKjjilqrWU2Ve',
        'object': 'chat.completion',
        'created': 1677649420,
        'model': 'gpt-3.5-turbo',
        'usage': {'prompt_tokens': 56, 'completion_tokens': 31, 'total_tokens': 87},
        'choices': [
        {
            'message': {
            'role': 'assistant',
            'content': 'The 2020 World Series was played in Arlington, Texas at the Globe Life Field, which was the new home stadium for the Texas Rangers.'},
            'finish_reason': 'stop', # 'length' (token limit), 'content_filter' (flagged), 'null' (still in progress)
            'index': 0
        }
        ]
    }
    """
    response = openai.ChatCompletion.create(
        model=model,
        messages=[ {"role": "user", "content": s} ],
# Parametrizing not supported
#        temperature=0.7,
#        max_tokens=max_tokens,
#        top_p=1,
#        frequency_penalty=0,
#        presence_penalty=0
    )
    res = response['choices'][0]['message']['content']
    nb_tok = {'total': response['usage']['total_tokens'], 'response': response['usage']['completion_tokens']}
    return res, nb_tok

### Prepare text for call

In [None]:
transcr_loc = "../data/transcript/current"
transcr_files = sorted([x for x in os.listdir(transcr_loc) if 'eaf' in x])
markers_path = "../data/video/markers_from_video_start.csv"
markers = pd.read_csv(markers_path).set_index('file')

In [None]:
def get_conv_df(file:str):
    _, date, group = file.split('.')[0].split('-')
    eaf = elan.read_eaf(os.path.join(transcr_loc, file))
    dial = pd.DataFrame(eaf.to_csv_rows(), columns=['speaker', '?', 'start','stop','duration','text'])
    dial.start = dial.start.astype(float)
    dial.stop = dial.stop.astype(float)
    dial = dial[~dial.text.isin(['#', ''])].sort_values('start').reset_index(drop=True)
    dial.drop(columns=['?'], inplace=True)
    dial.speaker = dial.speaker.apply(lambda x: x.split('-')[-1])
    mark = markers.loc[f'{date}_{group}']
    fconv = dial[dial.start >= (mark['End Task 1'] + 2)] # giving seconds to start task
    return dial, fconv

def ann_overlap(df):
    # df if ordered by start, stop. so the line before (more before if several during one ipu) should tell if 
    df['overlap'] = False
    for idx, row in df.iterrows():
        tmp = df.loc[(row.speaker != df.speaker) & (df.start <= row.start) & (df.stop >= row.stop)]
        if tmp.shape[0] > 0:
            df.loc[idx, 'overlap'] = True

def is_feedback(s:str, threshold:float=.5) -> bool:
    words = ['ouais','mh','hm', "c'est sûr", 'oui', 'ha','ah', '@', 'alors', 'voilà', 'ok', 'du coup']
    nw = re.findall(re.compile('('+'|'.join(words)+')'), s.lower())
    return (len(nw) / len(s.split())) > threshold

In [None]:
def create_texts(df:pd.DataFrame, max_nb_tokens:int=2048):
    """Max number of tokens for API: 2048/4000 - cannot use overlap since anwsers might not overlap"""
    memory = []
    # 1. get the number of tokens for every line in df
    ### Which is the best option so as not to get almost empty last line?
    df['text_updated'] = df.apply(lambda x: f"<{x.speaker}> {x.text}", axis=1)
    #df['speaker_updated'] = (df.speaker != df.speaker.shift())*df.speaker
    #df['text_updated'] = df.apply(lambda x: f"<{x.speaker_updated}> {x.text}" if x.speaker_updated != '' else x.text, axis=1)
    df['nb_tok'] = df.text_updated.apply(get_number_tokens)
    df['nb_tok_cs'] = df.nb_tok.cumsum()
    # 2. while df is not empty, cut, add (start_index, text) to memory, and loop
    nb_splits = np.ceil( df.nb_tok.cumsum().iloc[-1] / max_nb_tokens ) 
    max_nb_tokens = df.nb_tok.cumsum().iloc[-1] // nb_splits
    print(f"Max number of tokens (speakers everywhere): {df.nb_tok.cumsum().iloc[-1]}; Number of splits: {int(nb_splits)}, Max number of tokens: {max_nb_tokens}")
    while df.shape[0] > 0:
        idx_change = df[df['nb_tok_cs'] <= max_nb_tokens].index[-1] + 1
        #t = df[df['nb_tok_cs'] <= max_nb_tokens] # include the next one too
        t = df.loc[:idx_change]
        t['speaker_updated'] = (t.speaker != t.speaker.shift())*t.speaker
        t['text_updated'] = t.apply(lambda x: f"<{x.speaker_updated}> {x.text}" if x.speaker_updated != '' else x.text, axis=1)
        d = {'idx_start': t.index[0], 'idx_stop': t.index[-1], 'text': ' '.join(t.text_updated.tolist())}
        d['len'] = get_number_tokens(d['text'])
        memory.append(d)
        # update df
        #df = df[df['nb_tok_cs'] >= max_nb_tokens]
        df = df.loc[(idx_change+1):]
        df['nb_tok_cs'] = df.nb_tok.cumsum()
    # 3. return memory
    return memory

### Parse response and match to text

In [304]:
def parse_chatgpt_res(text:str, mode:str='quotes_needed') -> int: # use with literal_eval
    """Match themes, quotes based on pattern: [1./ -] (theme)(:|\( eventual sentence) "<speaker> quote"
    """
    auth_modes = ['quotes_needed','all']
    if mode not in auth_modes:
        raise ValueError(f"Argument mode must be one of {auth_modes}")
    p = re.compile(r"""
^ # Beginning of line
(?:\d+\.|-)[ ] # Bullet or numbered point
(?P<theme>[^(:\n]+) # Theme of the conversation (any character that isn't `:` or `(`)
(
  (?::[ ]|[ ]\() # Beginning of context (match `: ` or ` (`)
  (?P<context>
    .*?[ ]? # Context is anything, maybe followed by a space
    (?:"(?P<quote>(.*?))")? # Quote must be between `"`
  )
  \)?
)?
\.?
$ # End of line
""", re.VERBOSE | re.MULTILINE)

    themes = []
    quotes = []
    for m in p.finditer(text):
        theme = m.group("theme")
        context = m.group("context")
        quote = m.group("quote")
        if mode == 'quotes_needed':
            if quote is not None:
                themes.append((theme, quote))
        else:
            themes.append(theme)
            quotes.append(quote)
    if len(quotes) == 0: quotes = None
    return themes, quotes

def clean_sent(s):
    # select sentence only - form <speaker> (need to check if accurate) ... [text] ...
    if s[0] == "<" and s[3] == ">":
        speaker = s[1:3]
        s = s[5:]
    else: speaker = None
    if s[0:3] == "...":
        s = s[4:]
    if s[-3:] == "...":
        s = s[:-3]
    return s, speaker

def match_sentence_loc(s, mem, df):
    s, speaker = clean_sent(s) #literal_eval(s)) needed for older version
    # check if matching original sentences
    #if s in mem['text']:
    # method 1: match (consecutive) parts of the sentence to the extract - issue: sole (common) words matching the extract
    t = df.loc[mem['idx_start']:(mem['idx_stop']+1)]
    t = t[t.speaker == speaker]
    t['ratio'] = t.text.apply(lambda x: len(x) if x in s else 0)
    sent_start = t.groupby([((t.ratio > 0) != (t.ratio > 0).shift()).cumsum()]).agg({'start': 'min', 'ratio': 'sum'}).reset_index(drop=True).sort_values('ratio', ascending=False).iloc[0]
    # issue: partial sentence => fall back to difflib
    if sent_start.ratio > 5: # usually long sentences
        return sent_start.start
    # method 2: use difflib.SequenceMatcher() .set_seq1(s) df.apply(sm.set_seq2(x); sm.ratio()) but issue if the sentence is split()
    sm = SequenceMatcher()
    sm.set_seq1(s)

    def apply_seqmatch(x):
        sm.set_seq2(x)
        return sm.ratio()
    t['ratio'] = t.text.apply(apply_seqmatch)
    sent_start = t.sort_values('ratio', ascending=False).iloc[0]
    return sent_start.start

def match_themes_loc(res:str, mem, df, theme_col:str='theme'):
    themes, quotes = parse_chatgpt_res(res)
    if quotes is None:
        for theme, sent in themes:
            sent_start = match_sentence_loc(sent, mem, df)
            df.loc[df.start == sent_start, theme_col] = theme
    else:
        for i, sent in enumerate(quotes):
            sent_start = match_sentence_loc(sent, mem, df)
            df.loc[df.start == sent_start, theme_col] = f"theme{i}"

#### Old

In [None]:
def parse_chatgpt_res(s:str): # use with literal_eval
    """ OLD, updated format :/
    Format: 
    ----------
    ```
    Thèmes de la conversation:
    - [liste ]
    
    Phrases introduisant les thèmes:
    ```

    Difficultés:
    ----------
    * Nombre différent de thèmes et de phrases (dans ce cas les quotes ne sont pas associées à un thème)
    * Variations dans la formulation précise des intitulés (`introduisant chaque thème`, `introduisant ces thèmes`)
    * Phrases des quotes coupées ==> comment matcher ?
    """
    themes, quotes = s.split('\n\n')
    themes = themes.split('\n- ')[1:]
    quotes = quotes.split('\n- ')[1:]
    if (len(themes) == len(quotes)) and len(quotes[0].split(':')) >= 2:
        return [(t, ': '.join(q.split(': ')[1:])) for t, q in zip(themes, quotes)], None
    else:
        print("Skipping answser, different number of themes and quotes")
        return themes, quotes

## Calls

### Random tests

In [19]:
s = "Who won the world series in 2020?"
call_api(s)

('The Los Angeles Dodgers won the World Series in 2020.', 31)

### Apply on one file

In [293]:
file_selector = widgets.Dropdown(options = transcr_files, description="Which file?")
file_selector

Dropdown(description='Which file?', options=('bkt-221116-CGLS.eaf', 'bkt-221117-TFGG.eaf', 'bkt-221118-GDNF.ea…

In [318]:
file = file_selector.value
dial, fconv = get_conv_df(file)

ann_overlap(fconv)
fconv['is_feedback'] = fconv.overlap + fconv.text.apply(is_feedback)
fconv = fconv[~fconv.is_feedback].reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['overlap'] = False
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fconv['is_feedback'] = fconv.overlap + fconv.text.apply(is_feedback)


In [148]:
#res, nb_tok = call_api(t)
#res, res2, nb_tok, nb_tok2 # print(res)

{'total': 1868, 'response': 240}

In [319]:
mem = create_texts(fconv.copy(deep=True))
l_speakers = fconv.speaker.unique().tolist()

Max number of tokens (speakers everywhere): 3758; Number of splits: 2, Max number of tokens: 1879.0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  t['speaker_updated'] = (t.speaker != t.speaker.shift())*t.speaker
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  t['text_updated'] = t.apply(lambda x: f"<{x.speaker_updated}> {x.text}" if x.speaker_updated != '' else x.text, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['nb_tok_cs'] = d

In [321]:
calls = {}
for i in tqdm(range(len(mem))):
    p = create_prompt(mem[i]['text'], speakers = l_speakers)
    res, nb_tok = call_api(p)
    # currently res, res2
    calls[i] = {'split': mem[i], 'prompt': p, 'response': res, 'nb_tokens': nb_tok}
    time.sleep(10)

100%|██████████| 2/2 [00:31<00:00, 15.61s/it]


In [322]:
calls

{0: {'split': {'idx_start': 0,
   'idx_stop': 186,
   'text': "<GG> une <TF> je ris du coup <GG> la <TF> on est mort <GG> bon <TF> i <GG> gardons <TF> hu non mais <GG> je sais pas si on était loin <TF> en vrai il nous restait deux trois trucs à faire <GG> ça ne fai que troi <TF> tout le monde l'a fait ni en afrique <GG> il peut en avoir jusqu'à onze e donc je crois qu'il faut qu'on parle <TF> j'ai la mongols <GG> je te lise au passage <TF> oui je veux bien <GG> oh voilà on part de cette espérance vous me mettrez à coup de te l'ordinateur vous avez quinze minutes de discussion libre avec l'autre participant euh trois passagers sont dans une mogonfière qui perd rapidement de latitude et qui est en passe de s'écraser le seul moyen pour les passagers de survivre est de sacrifier l'un d'eux parmi ces passagers se trouve un scientifique dont les recherches pourraient amené au développement d'une thérapie révolutionnaire contre le cancer une maîtresse d'école primaire enceinte et son mari il 

In [328]:
parse_chatgpt_res(calls[0]['response'], 'all')

(['Laughing together',
  'Possibly a failed project or task',
  'The moral dilemma of sacrificing one person to save the others in a dangerous situation',
  'Judging the values of a city',
  'Personal opinions on who should be sacrificed',
  'Hypothetical scenarios related to the moral dilemma',
  'Humor as a coping mechanism for the difficult conversation'],
 ['je ris du coup',
  'il nous restait deux trois trucs à faire',
  'trois passagers sont dans une mongolfière',
  "quand on juge les valeurs d'une ville à Clermont",
  None,
  'je voudrais être où je suis dans une mongolfière',
  'ils font un jeu ce qui permet on se rugit comme ça", "le jeu de la mort'])

In [316]:
fconv['theme'] = None
for i in range(len(calls)):
    res = calls[i]['response']
    memo = calls[i]['split']
    match_themes_loc(res, memo, fconv)
fconv['theme_filled'] = fconv.theme.ffill()

**Saving**

In [330]:
mem_path = os.path.expanduser('~/Downloads/chatgpt-calls.json')
if os.path.exists(mem_path):
    with open(mem_path, 'r') as f:
        mem_global = json.load(f)
else:
    mem_global = {}

mem_global[file] = calls
with open(mem_path, 'w') as f:
    json.dump(mem_global, f, indent=4)

In [317]:
csv_path = os.path.expanduser('~/Downloads/chatgpt-themes-brainkt.csv')
fcols = fconv.columns.tolist()
fconv['file'] = file.replace('-','_')[5:-4]
fconv = fconv[['file']+fcols] # reordering columns
if os.path.exists(csv_path):
    fdf = pd.read_csv(csv_path, na_values=[''])
    fdf = pd.concat([fdf, fconv], axis=0).reset_index(drop=True)
else:
    fdf = fconv
fdf.to_csv(csv_path, index=False)

## Notes
**Difficulties**: various ways to answer depending on days/number of calls?

23/04/25:
```
-------- prompt 1
Thèmes de la conversation: 
[4 themes]

Phrases introduisant ces thèmes:
[6 sentences]

-------- prompt 2
Thèmes de la conversation: 
- Choix difficile à faire dans une situation hypothétique (sauter d'une montgolfière)
- Orientation professionnelle et projets d'avenir
- Sujet de recherche en doctorat et expériences à mettre en place

Phrases introduisant chaque thème:
- Choix difficile: "<CG> Après s'ils sont pas très haut, qu'il y en a un qui est un petit peu adroit de ses jambes, il saute et il se réceptionne..."
- Orientation professionnelle: "<CG> Et du coup, tu fais quoi, toi? T'es en... en licence? <LS> Alors euh... j'ai une licence en management et je me réoriente, j'avais commencé un master en achat en école de commerce et je me réoriente dans le sport et euh... j'ai passé de concours pour être moniteur de sport dans l'armée de l'air..."
- Sujet de recherche: "<CG> euh... Mon sujet c'est euh... la... l'adaptation du langage à l'adolescence..."
```

23/04/26:
```
-------- prompt 1
Thèmes de la conversation :
1. Le dilemme de la mongolfière qui perd de l'altitude (début de la conversation avec la lecture du dilemme)
2. Les différentes solutions envisageables pour résoudre le dilemme (proposition d'essayer de poser la mongolfière, balance pour peser les passagers, etc.)
3. Le choix difficile à faire entre la vie du scientifique et celle du couple (discussion sur l'utilité de chaque passager, proposition de sacrifier personne, etc.)
4. La réflexion sur la situation elle-même (contexte, possibilité d'autres solutions, etc.)

-------- prompt 2
Les différents thèmes de la conversation sont :

- Choix de la personne à jeter d'une montgolfière (débutant avec la phrase "<LS> c'est un scientifique merde").
- Orientation professionnelle et études, expériences professionnelles passées (débutant avec la phrase "<CG> Et du coup, tu fais quoi, toi? T'es en... en licence?").
- Sujet de recherche en doctorat et expériences à mettre en place (débutant avec la phrase "<CG> euh... Mon sujet c'est euh... la... l'adaptation du langage à l'adolescence").
```

Variations in: the way to deliver the information. However, the _content_ hasn't changed much: same number of themes (and inability to give sentences) for prompt1, 3 themes with about the same sentences for prompt2