In [1]:
# Imports
import time
import os
import csv
import numpy as np
import pandas as pd
import openai
import ray
from   sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# File locations
dir = '/Users/Federica/Documents/PhD/generative_patient'
out_dir = os.path.join(dir, 'output')
fig_dir = os.path.join(dir, 'figures')
os.makedirs(fig_dir, exist_ok=True)
os.makedirs(out_dir, exist_ok=True)

# # Plotting options
# figsize_std = (12,8)
# sns.set(context='talk', rc={'figure.figsize':figsize_std})
# sns.set_style('whitegrid', {'grid.color': '.9'})
# plt.rc('figure', figsize=figsize_std)
# plt.rcParams['figure.figsize'] = figsize_std

## Data

In [None]:
# Read texts and gold labels from Google Sheets
ayers = pd.read_csv(os.path.join(dir, 'data', 'ayers2023.csv'))
print(len(ayers))
ayers[:1]

## Test

In [2]:
openai.api_type = 'azure'
openai.api_version = '2023-05-15'
openai.api_key = open(os.path.join('/Users/Federica/Documents/PhD/apikeys', 'endoqa-key.txt')).read().strip()
openai.api_base = 'https://endoqa-cornell.openai.azure.com/' # your endpoint should look like the following https://YOUR_RESOURCE_NAME.openai.azure.com/

## Check Title

In [4]:
# Read csv with posts without and with title
df = pd.read_csv(os.path.join(dir, 'data', 'general_title_check.csv'))
print(len(df))
df[:1]

10


Unnamed: 0,id,reddit_id,question_with_title,question_without_title
0,N19,xy6ur0,"First T/C seizure, then first migraine less th...","So, on September 17 I (21F) had an unprovoked,..."


In [5]:
# send API request to OpenAI's API to receive generated answers to questions

def get_response(_prompt, model='gpt4', _max_tokens=500):

    attempts = 0
    max_attempts = 3
    system__prompt = ''''''  ## for few-shot
    response = 'None'

    while attempts < max_attempts:
        try:
            response = openai.ChatCompletion.create(
                engine=model,
                max_tokens=_max_tokens,
                messages = [
                    #{'role': 'system', 'content': system__prompt},
                    {'role': 'user', 'content': _prompt}
                ]
            )
            _answer = response['choices'][0]['message']['content'].strip().lower()
        except Exception as e:
            print(f'Error {e}. Sleeping 3 seconds ...')
            time.sleep(3)
            if attempts == max_attempts-1:
                _answer = f'Error {e}'
        attempts += 1
        if model == 'gpt4':
            time.sleep(2)
        else:
            time.sleep(0.5)

    return _answer

# Check if adding title to content of post changes answers

In [11]:
r_id = []
r_title = []
r_notitle = []
for index, row in df.iterrows():

    start = time.time()

    id = row['id']
    
    n_tokens = 250
    r_id.append(f'response_{id}')
    r_title.append(get_response(row['question_with_title'], _max_tokens=n_tokens))
    r_notitle.append(get_response(row['question_without_title'], _max_tokens=n_tokens))

    end = time.time()  
    print(id, end-start)

df['response_id'] = r_id
df['response_with_title'] = r_title
df['response_without_title'] = r_notitle
df.head()

N19 196.54555487632751
N36 230.16460990905762
N38 348.6210079193115
N55 269.67299580574036
N61 185.44123077392578
N63 71.6702389717102
N68 344.6697030067444
N71 194.7001781463623
N96 282.9023599624634
N92 168.0666229724884


Unnamed: 0,id,reddit_id,question_with_title,question_without_title,response_id,response_with_title,response_without_title
0,N19,xy6ur0,"First T/C seizure, then first migraine less th...","So, on September 17 I (21F) had an unprovoked,...",response_N19,i'm sorry to hear about what you're going thro...,i'm really sorry to hear about your experience...
1,N36,ye84b6,"Stool problem! 21M, 103KG, white guy, currentl...","21M, 103KG, white guy, currently on head pain ...",response_N36,changes in frequency and consistency of bowel ...,there could be a few potential reasons why you...
2,N38,y2yo5s,hello Im 21. 48kg. 156 cm. Asian.\nMy bf was d...,Im 21. 48kg. 156 cm. Asian.\nMy bf was diagnos...,response_N38,"i'm an ai language model, but i can provide so...",recurrent utis can be really frustrating to de...
3,N55,y4knve,Nerve pain and muscle twitching 21f with brach...,"21f with brachial plexus injury from birth, I ...",response_N55,"i'm not a doctor, but i can tell you that the ...",neuropathic pain is a common symptom of brachi...
4,N61,y19mau,Is it possible to lose eyesight and it mean no...,"22F, 5‰Ûª6‰Û�,207lbs, caucausian. One time eve...",response_N61,"i'm an ai and not a doctor, but this sounds l...",i'm an ai language model and not a doctor but ...


In [12]:
output_path = os.path.join(out_dir,  f'general_title_checked.csv')
df.to_csv(output_path)

# Check if responses are consistent over multiple API requests

In [None]:

def get_response(_prompt, model='gpt4', _max_tokens=500):

    attempts = 0
    max_attempts = 3
    system__prompt = ''''''  ## for few-shot
    response = 'None'

    while attempts < max_attempts:
        try:
            response = openai.ChatCompletion.create(
                engine=model,
                max_tokens=_max_tokens,
                messages = [
                    #{'role': 'system', 'content': system__prompt},
                    {'role': 'user', 'content': _prompt}
                ]
            )
            _answer = response['choices'][0]['message']['content'].strip().lower()
        except Exception as e:
            print(f'Error {e}. Sleeping 3 seconds ...')
            time.sleep(3)
        attempts += 1
        if model == 'gpt4':
            time.sleep(2)
        else:
            time.sleep(0.5)

    return _answer

In [None]:
n_responses = 5
output_path = os.path.join(out_dir,  f'replication_ayers.csv')
if not os.path.exists(output_path):
    with open(output_path, 'w') as csvfile:
        csvwriter = csv.writer(csvfile)
        row = ['id', 'question']
        row.extend([f'r{n}' for n in range(1,n_responses+1)])
        csvwriter.writerow(row)

start = time.time()
for index, row in ayers[:1].iterrows():

  id = row['postID']
  question = row['Question']
  answers = []

  for n in range(1,n_responses+1):
    n_tokens = 500
    answers.append(get_response(question, _max_tokens=n_tokens))

  with open(output_path, 'a') as csvfile:
      csvwriter = csv.writer(csvfile)
      row = [id, question]
      row.extend(answers)
      csvwriter.writerow(row)

end = time.time()  
print(id, end-start)