# Restricted question-based resolution of anaphora

In [1]:
import sys
# Add the parent directory to the module search path; presumably this is where
# the project-local `llm_library` package imported below lives — TODO confirm.
sys.path.append("../")

In [2]:
import configparser
from ast import literal_eval
from math import isnan
from pathlib import Path

import openai
from pandas import DataFrame
from pandas import read_csv
from tqdm.auto import tqdm

from llm_library.openai import ask_openai
from llm_library.openai import configure_azure_model

## I. Set up Azure API 

In [3]:
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so compare that list against the expected path.
config_path = '../model_configurations/azure.ini'
config = configparser.ConfigParser()
status = config.read(config_path)
assert status == [config_path]

In [4]:
# Build the model configuration from the parsed ini file and report which model is used.
gpt_conf = configure_azure_model(config)
print(f"Going to use model {gpt_conf.model_name}")

# Smoke test: send one trivial question and show the reply.
# The request is issued only after the "Q:" line is printed, as before.
test_question = 'What is time?'
print("Test")
print(f"Q: {test_question}")
test_answer = ask_openai(gpt_conf, prompt=test_question, stop='.')
print(f"A: {test_answer}")

Going to use model gpt-35-turbo
Test
Q: What is time?
A: Time is a fundamental concept that refers to the continued progress of events, measured in terms of seconds, minutes, hours, days, months, or years


## II. Clean the input file

There are markers in the sentence texts that should be removed before further analysis.

In [5]:
# Load the annotated examples and derive the columns used for prompting:
#   * possible_antecedents: parsed from its textual form into a Python object
#   * sentence: pronoun_sentence with the '***' / '**' markers stripped
tbl = (
    read_csv('input_data/anaphora_resolution.csv')
    # SECURITY: this column used to be parsed with eval(), which executes
    # arbitrary code found in the CSV. literal_eval() accepts only Python
    # literals (lists, strings, numbers, ...) and raises on anything else.
    # Assumes every cell is a plain list literal — TODO confirm against the data.
    .assign(possible_antecedents=lambda df: df['possible_antecedents'].map(literal_eval))
    # Raw string so that '\*' reaches the regex engine without relying on
    # Python's deprecated handling of invalid escape sequences. '***' is
    # listed first so triple markers are removed as a unit.
    .assign(sentence=lambda df: df['pronoun_sentence'].str.replace(r'\*\*\*|\*\*', '', regex=True))
)
# After cleaning, exactly three sentences are expected to still contain a '*'.
assert sum(tbl['sentence'].str.find('*') != -1) == 3, 'These are validated sentences'
tbl.head()

Unnamed: 0,index,label,pronoun,pronoun_lemma,pronoun_sentence,question,question_fix,possible_antecedents,correct_antecedent,antecendent_in_list,sentence
0,0,1,neile,['see'],***Inimeste*** kaasamine ettevõtte tuleviku pl...,kellele tagasiside andmine ja nende arendamine ?,kellele tagasiside andmine ?,"[Inimeste, kaasamine, ettevõtte, tuleviku, pla...",Inimeste,1,Inimeste kaasamine ettevõtte tuleviku planeeri...
1,1,1,nende,['tema'],***Inimeste*** kaasamine ettevõtte tuleviku pl...,kelle arendamine ?,,"[Inimeste, kaasamine, ettevõtte, tuleviku, pla...",Inimeste,1,Inimeste kaasamine ettevõtte tuleviku planeeri...
2,2,1,kes,"['kes', 'kes']","1 ) Eesti kodaniku ***abikaasat*** , **kes** ...",kes taotleb elamisluba käesoleva seaduse § 12 ...,,"[kodaniku, abikaasat, elamisluba, seaduse, lõi...",abikaasat,1,"1 ) Eesti kodaniku abikaasat , kes taotleb e..."
3,3,1,millega,"['mis', 'mis']",Viimasel etapil hoidis tiirudes kõik märgid ma...,millega Eesti teenis 10 MK punkti ?,,"[etapil, tiirudes, märgid, kohta, punkti, Viim...",kohta,1,Viimasel etapil hoidis tiirudes kõik märgid ma...
4,4,1,nad,['tema'],"Õnneks ***mehed*** siiski päästeti , sest **n...",kes taipasid kiiresti telefonist SIM-kaardi ee...,,"[Õnneks, mehed, telefonist, SIM-kaardi, kõne, ...",mehed,1,"Õnneks mehed siiski päästeti , sest nad taip..."


## III. Strictly restricted anaphora resolution with ChatGPT

In [6]:
# Prompt template for the strictly restricted setting: the model is told to
# answer in JSON and to copy the answer exactly from the supplied list.
# Placeholders: {sentence}, {question}, {variants}.
# NOTE(review): the sentence is wrapped in a typographic opening quote (‘) but
# a plain ASCII closing quote ('). Left unchanged here because altering the
# prompt text could change the model's responses.
prompt = "\n".join([
    "Answer the following question in JSON format based on the provided list of possible answers.",
    "The key value should be 'answer'. The entity value should be exactly as it is written in the list.",
    "Sentence: ‘{sentence}'",
    "The question: '{question}'",
    "Possible answers list:",
    "\t{variants}",
])

### Constructed question

In [7]:
# Ask the model once per row, using the automatically constructed question.
columns = ['pronoun_sentence', 'question', 'possible_antecedents', 'sentence']
selected = tbl[columns]
# `result` gets a fresh 0..n-1 index, with the old index kept in an 'index' column.
result = selected.reset_index(names='index').assign(result=None)
for row_label, row in tqdm(selected.iterrows(), total=len(tbl)):
    # Row values arrive in `columns` order; the marked-up sentence is not needed.
    _, question, variants, sentence = row
    filled_prompt = prompt.format(question=question, sentence=sentence, variants=variants)
    # Writing by tbl's row label relies on tbl's labels matching result's new index.
    result.loc[row_label, 'result'] = ask_openai(gpt_conf, filled_prompt)

  0%|          | 0/1052 [00:00<?, ?it/s]

The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766
The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766


In [8]:
# Save the inputs and model answers of the strictly restricted run
# that used the automatically constructed question.
export_columns = ['pronoun_sentence', 'question', 'possible_antecedents', 'result']
output_path = Path(f"output_data/{gpt_conf.model_name}/anaphora_resolution_strictly_restricted_answer_and_constructed_question.csv")
# to_csv() does not create missing directories, and the folder name depends on
# the configured model, so create it first rather than lose the collected answers.
output_path.parent.mkdir(parents=True, exist_ok=True)
result[export_columns].to_csv(output_path)

### Manually corrected question

In [9]:
# Ask the model once per row that has a manually corrected question.
columns = ['pronoun_sentence', 'question_fix', 'possible_antecedents', 'sentence']
selected = tbl[columns]
# `result` gets a fresh 0..n-1 index, with the old index kept in an 'index' column.
result = selected.reset_index(names='index').assign(result=None)
for row_label, row in tqdm(selected.iterrows(), total=len(tbl)):
    # Row values arrive in `columns` order; the marked-up sentence is not needed.
    _, question, variants, sentence = row
    # Rows whose question_fix is not a string (e.g. a missing value) are not
    # sent to the model and keep result=None.
    if isinstance(question, str):
        filled_prompt = prompt.format(question=question, sentence=sentence, variants=variants)
        # Writing by tbl's row label relies on tbl's labels matching result's new index.
        result.loc[row_label, 'result'] = ask_openai(gpt_conf, filled_prompt)

  0%|          | 0/1052 [00:00<?, ?it/s]

In [10]:
# Save the inputs and model answers of the strictly restricted run
# that used the manually corrected question.
export_columns = ['pronoun_sentence', 'question_fix', 'possible_antecedents', 'result']
output_path = Path(f"output_data/{gpt_conf.model_name}/anaphora_resolution_strictly_restricted_answer_and_manually_constructed_question.csv")
# to_csv() does not create missing directories, and the folder name depends on
# the configured model, so create it first rather than lose the collected answers.
output_path.parent.mkdir(parents=True, exist_ok=True)
result[export_columns].to_csv(output_path)

## IV. Guided anaphora resolution with ChatGPT

In [11]:
# Prompt template for the guided setting: like the strict prompt, but the model
# may answer 'MISSING_VARIANT' when the correct entity is not in the list.
# Placeholders: {sentence}, {question}, {variants}.
# NOTE(review): the sentence is wrapped in a typographic opening quote (‘) but
# a plain ASCII closing quote ('). Left unchanged here because altering the
# prompt text could change the model's responses.
prompt = "\n".join([
    "Answer the following question in JSON format based on the provided list of possible answers. The key value should be 'answer'.",
    "If the correct entity from the question is not present in the list, then the answer should be 'MISSING_VARIANT'.",
    "If the correct answer is found in the list, select it exactly as it is written.",
    "Sentence: ‘{sentence}'",
    "The question: '{question}'",
    "Possible answers list:",
    "\t{variants}",
])

### Constructed question

In [12]:
# Ask the model once per row with the guided prompt and the constructed question.
columns = ['pronoun_sentence', 'question', 'possible_antecedents', 'sentence']
selected = tbl[columns]
# `result` gets a fresh 0..n-1 index, with the old index kept in an 'index' column.
result = selected.reset_index(names='index').assign(result=None)
for row_label, row in tqdm(selected.iterrows(), total=len(tbl)):
    # Row values arrive in `columns` order; the marked-up sentence is not needed.
    _, question, variants, sentence = row
    filled_prompt = prompt.format(question=question, sentence=sentence, variants=variants)
    # Writing by tbl's row label relies on tbl's labels matching result's new index.
    result.loc[row_label, 'result'] = ask_openai(gpt_conf, filled_prompt)

  0%|          | 0/1052 [00:00<?, ?it/s]

The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766
The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766


In [13]:
# Save the inputs and model answers of the guided run
# that used the automatically constructed question.
export_columns = ['pronoun_sentence', 'question', 'possible_antecedents', 'result']
output_path = Path(f"output_data/{gpt_conf.model_name}/anaphora_resolution_guided_answer_and_constructed_question.csv")
# to_csv() does not create missing directories, and the folder name depends on
# the configured model, so create it first rather than lose the collected answers.
output_path.parent.mkdir(parents=True, exist_ok=True)
result[export_columns].to_csv(output_path)

### Manually corrected question

In [14]:
# Ask the model once per row that has a manually corrected question (guided prompt).
columns = ['pronoun_sentence', 'question_fix', 'possible_antecedents', 'sentence']
selected = tbl[columns]
# `result` gets a fresh 0..n-1 index, with the old index kept in an 'index' column.
result = selected.reset_index(names='index').assign(result=None)
for row_label, row in tqdm(selected.iterrows(), total=len(tbl)):
    # Row values arrive in `columns` order; the marked-up sentence is not needed.
    _, question, variants, sentence = row
    # Rows whose question_fix is not a string (e.g. a missing value) are not
    # sent to the model and keep result=None.
    if isinstance(question, str):
        filled_prompt = prompt.format(question=question, sentence=sentence, variants=variants)
        # Writing by tbl's row label relies on tbl's labels matching result's new index.
        result.loc[row_label, 'result'] = ask_openai(gpt_conf, filled_prompt)

  0%|          | 0/1052 [00:00<?, ?it/s]

In [15]:
# Save the inputs and model answers of the guided run
# that used the manually corrected question.
export_columns = ['pronoun_sentence', 'question_fix', 'possible_antecedents', 'result']
output_path = Path(f"output_data/{gpt_conf.model_name}/anaphora_resolution_guided_answer_and_manually_constructed_question.csv")
# to_csv() does not create missing directories, and the folder name depends on
# the configured model, so create it first rather than lose the collected answers.
output_path.parent.mkdir(parents=True, exist_ok=True)
result[export_columns].to_csv(output_path)