In [None]:
!pip install numpy
!pip install scipy
!pip install matplotlib
!pip install scikit-learn
!pip install nltk
!pip install pytest
!pip install pandas
!pip install torch
!pip install torchvision
!pip install transformers
!pip install datasets
!pip install streamlit
!pip install openai
!pip install plotly
!pip install statistics

In [None]:
import os
import statistics

import openai
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from transformers import pipeline

# Completion engine used by every prompting cell below.
model = "text-davinci-003"

# Credentials must never be committed with the notebook: read the key from the
# environment instead. The key that used to be hard-coded here has been exposed
# and should be revoked.
openai.api_key = os.environ.get("OPENAI_API_KEY")
if not openai.api_key:
  raise RuntimeError("Set the OPENAI_API_KEY environment variable before running this notebook.")

Create the dataset with garden path sentences.

In [None]:
gardenPathDataset = pd.read_csv("GP_BERT.csv")

# Keep only the items whose code starts with 'GP'; each kept item becomes a
# [code, context, question, answer] list.
gardenPathSentences = [
  [gardenPathDataset[field][row] for field in ('Code', 'Context', 'Question', 'Answer')]
  for row in range(len(gardenPathDataset['Code']))
  if gardenPathDataset['Code'][row][:2] == 'GP'
]

Prompt the OpenAI model as selected above on different questions.

In [None]:
# Baseline prompting: the context sentence followed directly by the question.

for sentence in gardenPathSentences:
  prompt = sentence[1] + ' ' + sentence[2]
  response = openai.Completion.create(engine=model, prompt=prompt, max_tokens=50)
  # Overwrite the output slot instead of appending: re-running this cell, or running
  # it after another prompting cell, then always leaves exactly five fields per row
  # (code, context, question, answer, output), which is what the save cell expects.
  sentence[4:] = [response.choices[0].text]

In [None]:
# Chain of thought prompting: one worked question/answer exemplar is prepended to each item.
# NOTE(review): the exemplar is formatted as "Q: ... A: ...", but the item is appended
# without a space after "Q:" or before "A:". The string is left untouched so results stay
# comparable with earlier runs -- confirm whether the spacing should be made consistent.

for sentence in gardenPathSentences:
  prompt = 'Q: The driver worried about the dispatcher handed a pink slip. Who was handed a pink slip? A: The dispatcher. Q:' + sentence[1] + ' ' + sentence[2] + 'A:'
  response = openai.Completion.create(engine=model, prompt=prompt, max_tokens=50)
  # Overwrite the output slot instead of appending: re-running this cell, or running
  # it after another prompting cell, then always leaves exactly five fields per row
  # (code, context, question, answer, output), which is what the save cell expects.
  sentence[4:] = [response.choices[0].text]

In [None]:
# Context prompting.
# One-off probe: send a single hand-written sentence/question pair and display the
# raw API response object.

prompt = "The child read a story hugged the nanny. What did the child do?"
response = openai.Completion.create(
  engine=model,
  prompt=prompt,
  max_tokens=50,
)
response

<OpenAIObject text_completion id=cmpl-7PmL5hqhTNE8PelWIx60WhRU6dbjd at 0x7f5f1f4d1cb0> JSON: {
  "id": "cmpl-7PmL5hqhTNE8PelWIx60WhRU6dbjd",
  "object": "text_completion",
  "created": 1686378371,
  "model": "text-davinci-003",
  "choices": [
    {
      "text": "\n\nThe child gave the nanny a hug after reading the story.",
      "index": 0,
      "logprobs": null,
      "finish_reason": "stop"
    }
  ],
  "usage": {
    "prompt_tokens": 16,
    "completion_tokens": 15,
    "total_tokens": 31
  }
}

In [None]:
# Save output into a .csv file.

# Every row must carry the four dataset fields plus at least one model output.
if any(len(sentence) < 5 for sentence in gardenPathSentences):
  raise ValueError("Run one of the prompting cells before saving: some rows have no model output.")

# If several prompting cells appended outputs, keep the four dataset fields and the
# most recent output so the frame always has exactly the five columns named below.
GP_GPT3 = pd.DataFrame(
  [sentence[:4] + [sentence[-1]] for sentence in gardenPathSentences],
  columns=['Type', 'Context', 'Question', 'Answer', 'Output'],
)

# The file is written without a header row and without the index.
GP_GPT3.to_csv('GP_GPT3 CoT for PE.csv', index=False, header=False)

df = GP_GPT3

In [None]:
# Alternatively: Read in an old file.

# Result files are saved by this notebook with header=False. Reading them with the
# default header handling would silently consume the first data row as column names,
# so the column names are supplied explicitly instead.
GP_GPT3 = pd.read_csv(
  'GP_GPT3 CoT for AM.csv',
  header=None,
  names=['Type', 'Context', 'Question', 'Answer', 'Output'],
)

# Tolerate files that do start with a header row: drop it rather than score it as data.
if len(GP_GPT3) and GP_GPT3.iloc[0].tolist() == list(GP_GPT3.columns):
  GP_GPT3 = GP_GPT3.iloc[1:].reset_index(drop=True)

df = GP_GPT3

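Function to plot accuracy per question type as a bar chart, with bars colored by structure and patterned by ambiguity.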

In [None]:
def stacked_plot(answers_df, title="CoT w/ Ambiguous Argument Evaluation of GPT-3", stride=4):
    """Show a bar chart of accuracy ('Correct') per question type.

    Rows are ordered by question type (in the fixed order listed below) and then by
    ascending accuracy. The ordered rows are dealt out into `stride` bar traces:
    trace ``k`` holds rows ``k, k + stride, k + 2 * stride, ...``. Bars are colored
    by 'Structure' and patterned by 'Ambiguity'. The bar mode is left at plotly's
    default; the figure is displayed, not returned.

    Parameters
    ----------
    answers_df : pandas.DataFrame
        Needs the columns 'Correct', 'Question Type', 'Ambiguity' and 'Structure'.
        The frame passed in is not modified.
    title : str, optional
        Figure title. Defaults to the title that used to be hard-coded.
    stride : int, optional
        Number of bar traces, i.e. how many rows are expected per question type.
        Defaults to 4, the value that used to be hard-coded.
        NOTE(review): the table printed in this notebook has two rows per question
        type, so ``stride=2`` may be the intended value for that data -- confirm.

    Raises
    ------
    ValueError
        If `stride` is smaller than 1.
    """
    if stride < 1:
        raise ValueError("stride must be a positive integer")

    colors = {"Garden Path": "#DDAA33", "Local Coherence": "#BB5566"}
    patterns = {"Ambiguous": "", "Unambiguous": "."}
    question_order = ["Agent Matrix", "Patient Matrix", "Ambiguous Argument", "Matrix Action", "Embedded Action"]
    plot_columns = ['Correct', 'Question Type', 'Ambiguity', 'Structure']

    # Work on a new frame (assign returns a copy) so the caller's data is left untouched.
    ordered = answers_df.assign(
        **{'Question Type': pd.Categorical(answers_df['Question Type'], question_order)}
    ).sort_values(['Question Type', 'Correct'], ascending=[True, True])
    print(ordered)

    fig = go.Figure()
    for offset in range(stride):
        rows = ordered.iloc[offset::stride][plot_columns]
        fig.add_trace(go.Bar(
            x=rows['Question Type'],
            y=rows['Correct'],
            marker_color=rows['Structure'].map(colors),
            marker_pattern={"shape": rows['Ambiguity'].map(patterns)},
        ))

    fig.update_layout(
        font_family="Times New Roman",
        font_color="black",
        font_size=24,
        title=title,
    )

    fig.update_xaxes(
        title_text='Question Type',
        title_font_family="Times New Roman",
        title_font_color="black",
        title_font_size=24,
    )

    fig.update_yaxes(
        title_text='Accuracy',
        title_font_family="Times New Roman",
        title_font_color="black",
        title_font_size=24,
    )

    fig.show()

Extract the answers from the model's outputs and score them.

In [None]:
def _normalize_answer(text):
  """Lower-case `text` and trim surrounding whitespace and full stops."""
  return text.strip().strip(".").strip().lower()


# Pin the checkpoint that the pipeline otherwise selects by default (see the warning
# printed below this cell), so that scores stay reproducible across library versions.
QAer = pipeline(
  'question-answering',
  model='distilbert-base-cased-distilled-squad',
  revision='626af31',
)
scores = {v: [] for v in df.Type.unique()}   # QA confidence per item, keyed by item type
answers = {v: [] for v in df.Type.unique()}  # 1/0 correctness per item, keyed by item type
out = []                                     # one detailed record per item

for index, row in df.iterrows():
  model_output = row['Output']
  if isinstance(model_output, str) and model_output.strip():
    QA_input = {
        'question': row['Question'],
        'context': model_output
    }
    res = QAer(QA_input)
  else:
    # A missing or blank output (e.g. an empty completion read back from CSV as NaN)
    # cannot be passed to the QA pipeline; count it as an incorrect answer.
    res = {'answer': '', 'score': 0.0}
  scores[row['Type']].append(res['score'])

  # Gold answers are newline-separated alternatives. Compare case-insensitively on
  # both sides; previously only the predicted answer was lower-cased.
  predicted = _normalize_answer(res['answer'])
  gold = {_normalize_answer(alternative) for alternative in str(row['Answer']).split('\n')}
  correct = int(bool(predicted) and predicted in gold)
  answers[row['Type']].append(correct)
  out.append([row['Type'], row['Context'], row['Question'], row['Answer'], res['answer'], res['score'], correct])

# Mean confidence and mean accuracy per item type.
final = {v: statistics.mean(scores[v]) for v in scores.keys()}
final_ans = {v: statistics.mean(answers[v]) for v in answers.keys()}
answers_df = pd.DataFrame(final_ans.items(), columns=['Type', 'Correct']).sort_values(by=['Correct'])
scores_df = pd.DataFrame(final.items(), columns=['Type', 'Score']).sort_values(by=['Score'])

# Item types look like "<structure>-<ambiguity>-<question type>", e.g. "GP-A-AM".
names = {"AM": "Agent Matrix", "PM": "Patient Matrix", "PE": "Ambiguous Argument", "A1": "Matrix Action", "A2": "Embedded Action", "U": "Unambiguous", "A": "Ambiguous", "GP": "Garden Path", "LC": "Local Coherence"}
# expand=False makes str.extract return a Series rather than a one-column DataFrame.
answers_df['Question Type'] = answers_df['Type'].str.extract(r'-([A-Z0-9]*)$', expand=False).map(names)
answers_df['Ambiguity'] = answers_df['Type'].str.extract(r'-([A-Z0-9]*)-', expand=False).map(names)
answers_df['Structure'] = answers_df['Type'].str.extract(r'^([A-Z0-9]*)-', expand=False).map(names)
answers_df['Question Type'] = pd.Categorical(answers_df['Question Type'], ["Agent Matrix", "Patient Matrix", "Ambiguous Argument", "Matrix Action", "Embedded Action"])

No model was supplied, defaulted to distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [None]:
stacked_plot(answers_df)

      Type  Correct       Question Type    Ambiguity    Structure
0  GP-A-AM    0.850        Agent Matrix    Ambiguous  Garden Path
4  GP-U-AM    0.925        Agent Matrix  Unambiguous  Garden Path
5  GP-U-PM    0.900      Patient Matrix  Unambiguous  Garden Path
1  GP-A-PM    0.925      Patient Matrix    Ambiguous  Garden Path
2  GP-A-PE    0.750  Ambiguous Argument    Ambiguous  Garden Path
6  GP-U-PE    0.950  Ambiguous Argument  Unambiguous  Garden Path
3  GP-A-A1    0.175       Matrix Action    Ambiguous  Garden Path
7  GP-U-A1    0.525       Matrix Action  Unambiguous  Garden Path
8  GP-A-A2    0.425     Embedded Action    Ambiguous  Garden Path
9  GP-U-A2    0.550     Embedded Action  Unambiguous  Garden Path
