# Connections Analysis Notebook

The purpose of this notebook is to evaluate the performance of GPT-3.5 on the Connections game.

By: Elsie Wang

Date: 03/18/24

## Overhead

In [1]:
# Imports
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from tqdm import tqdm
from json import load
from openai import OpenAI
import csv
from collections import defaultdict

from transformers import AutoTokenizer

In [2]:
from json import load

# Load the API credentials from the local SECRETS.json file. The context
# manager closes the file handle as soon as the JSON has been parsed
# (the bare `load(open(...))` form left it open).
with open('./SECRETS.json', 'r') as secrets_file:
    KEY = load(secrets_file)

# Client used by every chat-completion request in this notebook.
client = OpenAI(api_key=KEY['OpenAIKey'])

### Load Data

In [49]:
# Parse data/answers.csv into column lists, then build one DataFrame row per
# (game, category) group.
answers_dict = defaultdict(list)
with open("data/answers.csv", newline='') as csvfile:
    for row in csv.reader(csvfile):
        # Rows with exactly four fields are skipped; every other row is
        # treated as a data row.
        if len(row) != 4:
            answers_dict['Game_ID'].append(row[0])

            # Fields 1-4 hold the four words of the group; drop the quote
            # characters, the list brackets and any surrounding whitespace.
            group_words = [
                row[i].replace("'", "").strip("[]\'").strip()
                for i in (1, 2, 3, 4)
            ]
            answers_dict['Words'].append(group_words)

            # The colour is always the last field of the row.
            answers_dict['Color'].append(row[-1])

            # An eight-field row means the category text was split across
            # fields 5 and 6, so the two pieces are re-joined.
            category = f"{row[5]}, {row[6]}" if len(row) == 8 else row[5]
            answers_dict['Category'].append(category)

answers_df = pd.DataFrame(answers_dict)
answers_df.head()

Unnamed: 0,Game_ID,Words,Color,Category
0,1,"[HAIL, RAIN, SLEET, SNOW]",yellow,WET WEATHER
1,1,"[BUCKS, HEAT, JAZZ, NETS]",green,NBA TEAMS
2,1,"[OPTION, RETURN, SHIFT, TAB]",blue,KEYBOARD KEYS
3,1,"[KAYAK, LEVEL, MOM, RACECAR]",purple,PALINDROMES
4,2,"[BOOT, LOAFER, PUMP, SNEAKER]",yellow,FOOTWEAR


In [242]:
# Parse data/prompt.csv into one row per game: the game id plus the list of
# words offered in that game.
prompts_dict = defaultdict(list)
with open("data/prompt.csv", newline='') as csvfile:
    for row in csv.reader(csvfile):
        # Rows with exactly two fields are skipped; every other row is
        # treated as a data row.
        if len(row) == 2:
            continue

        prompts_dict['Game_ID'].append(row[0])

        # Every field after the id is one word; drop the quote characters,
        # the list brackets and any surrounding whitespace.
        cleaned_words = [
            cell.replace("'", "").strip("[]").strip()
            for cell in row[1:]
        ]
        prompts_dict['Options'].append(cleaned_words)

prompts_df = pd.DataFrame(prompts_dict)
prompts_df.head()

Unnamed: 0,Game_ID,Options
0,1,"[HAIL, LEVEL, RETURN, OPTION, SNOW, NETS, TAB,..."
1,2,"[MILE, LOAFER, LEAGUE, TIME, BOOT, ESSENCE, SN..."
2,3,"[POM, TENOR, WOLF, PEKE, SCARF, KING, PIT, GOB..."
3,4,"[REEBOK, DUST, SPIDER, ADIDAS, CABARET, SWEEP,..."
4,5,"[HULU, LOW, KETCHUP, RELISH, GREEN, GLUM, SCAR..."


## Analysis

### Test By Game

In [16]:
def run_eval(words, game_id):
    """Ask GPT-3.5 to solve one full Connections game.

    Parameters
    ----------
    words
        The game's words; interpolated into the prompt exactly as given.
    game_id
        Identifier of the game. Not used by the function body.

    Returns
    -------
    tuple
        ``(answer, prob, response)``: the text of the first choice, the
        value ``calc_prob`` computes for the response, and the raw response
        object returned by the client.

    Both the answer and its probability are also printed.
    """
    # New York Times instructions
    nyt_rules = "Find four groups of four items that share something in common.\nCategories will always be more specific than \"5-LETTER-WORDS,\" \"NAMES\" or \"VERBS.\"\n"
    request_text = f"{nyt_rules}The words are:\n{words}\nNo word is used twice. Format the category words in csv. Do not give a reason or number it."

    # Parameters for the completion request; logprobs are requested because
    # calc_prob reads them from the response.
    request_kwargs = dict(
        model="gpt-3.5-turbo",
        max_tokens=200,
        temperature=0.7,
        stop=None,
        messages=[{"role": "user", "content": request_text}],
        logprobs=True,
    )
    completion = client.chat.completions.create(**request_kwargs)

    # Text of the first choice and the probability computed for it
    reply = completion.choices[0].message.content
    likelihood = calc_prob(completion)

    print(reply)
    print(likelihood)
    return reply, likelihood, completion

In [17]:
# NOTE: run_eval calls calc_prob, which is defined in a later cell of this
# notebook; that cell has to be executed before this one.

# prompts_df is built with only the 'Game_ID' and 'Options' columns, so the
# word list of each game lives under 'Options' ('Words' would raise KeyError).
prompts = prompts_df['Options']

# Smoke-test the whole-game prompt on the first game, with lower-cased words.
answer, prob, response = run_eval(str(prompts[0]).lower(), 1)

InternalServerError: Error code: 500 - {'error': {'message': 'The server had an error processing your request. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if you keep seeing this error. (Please include the request ID req_6e7106a65c242c4e481045d7d1da39cb in your email.)', 'type': 'server_error', 'param': None, 'code': None}}

### Test By Category

GPT-3.5 is tested on whether it can find, among a set of options, the word that has the most in common with three given words. One example is given below:

**Prompt**:

Find the word that shares the most in common with ['BLUE', 'GREEN', 'PURPLE'] among the follwoing words and ONLY the following words:

['HULU', 'YELLOW', 'KETCHUP', 'RELISH', 'SCARF', 'LETTUCE', 'PRIME', 'MUSTARD', 'TOMATO', 'MAYO', 'PEACOCK', 'NETFLIX', 'TARTAR'].

DO NOT REPEAT THE THREE WORDS GIVEN. Give a one word answer. E.g. 'BLUE'

**Response**:

'YELLOW'

In [261]:
def calc_prob(response):
    """ Returns the probability of the response generated by GPT

    Adds up the ``logprob`` of every entry in
    ``response.choices[0].logprobs.content`` and exponentiates the total.
    """
    token_entries = response.choices[0].logprobs.content
    total_logprob = sum(entry.logprob for entry in token_entries)
    return np.exp(total_logprob)

In [262]:
def _normalize_answer(text):
    """Strip surrounding whitespace, quotes and periods, then upper-case, for comparison only."""
    if not isinstance(text, str):
        return text
    return text.strip().strip("'\"`.").strip().upper()


def run_eval_category(words, options, correct_answer, n_samples=10):
    """ Prompts GPT to complete a single category (e.g. RED, BLUE, YELLOW, ____).

    The same prompt is sent ``n_samples`` times and the most frequent reply
    is taken as the answer.

    Parameters
    ----------
    words
        The three known words of the category; interpolated into the prompt.
    options
        The candidate words GPT may choose from; interpolated into the prompt.
    correct_answer
        The expected fourth word of the category.
    n_samples : int, default 10
        Number of completions to request. Must be at least 1.

    Returns
    -------
    tuple
        ``(answer, prob, correct)``:
        * ``answer`` is the most frequent raw reply; ties go to the reply that
          was seen first, so the result does not depend on set/hash ordering.
        * ``prob`` is the mean ``calc_prob`` value over the samples whose
          reply equals ``answer``.
        * ``correct`` is whether ``answer`` matches ``correct_answer``,
          ignoring surrounding whitespace/quotes/periods and letter case
          (the prompt's own example, ``'BLUE'``, is quoted).

    Raises
    ------
    ValueError
        If ``n_samples`` is smaller than 1.
    """
    if n_samples < 1:
        raise ValueError("n_samples must be at least 1")

    prompt = f"Find the word that shares the most in common with {words} among the follwoing words and ONLY the following words:\n{options}.\nDO NOT REPEAT THE THREE WORDS GIVEN. Give a one word answer. E.g. 'BLUE'"
    answers = []
    probs = []
    # Prompt GPT n_samples times and collect every reply with its probability
    for _ in range(n_samples):
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            max_tokens=200,
            temperature=0.7,
            stop=None,
            messages=[{"role": "user", "content": prompt}],
            logprobs=True
        )
        answers.append(response.choices[0].message.content)
        probs.append(calc_prob(response))

    # Most common answer. dict.fromkeys keeps first-seen order and max()
    # returns the first maximal item, so ties resolve deterministically
    # (iterating a set of strings does not guarantee a stable order).
    answer = max(dict.fromkeys(answers), key=answers.count)

    # Average the probabilities of the samples that produced that answer
    prob = np.mean([p for reply, p in zip(answers, probs) if reply == answer])

    # Whether GPT guessed correctly, tolerant of formatting differences
    correct = _normalize_answer(answer) == _normalize_answer(correct_answer)

    return answer, prob, correct

In [263]:
# Combine answers and prompts df
def filter_options(row):
    """ Returns the entries of row['Options'] that do not appear in
    row['Words'], keeping their original order
    """
    remaining = []
    for candidate in row['Options']:
        if candidate not in row['Words']:
            remaining.append(candidate)
    return remaining

# Apply the function to create the new options column
# Give every category row the word list of its game (left join on Game_ID),
# then derive the columns used for prompting:
#   Options        - the game's words minus this category's own words
#   Prompt         - the first three words of the category
#   Correct_Answer - the last word of the category
merged = (
    pd.merge(answers_df, prompts_df, on='Game_ID', how='left')
    .assign(Options=lambda frame: frame.apply(filter_options, axis=1))
    .assign(
        Prompt=lambda frame: frame['Words'].apply(lambda group: group[:3]),
        Correct_Answer=lambda frame: frame['Words'].apply(lambda group: group[-1]),
    )
)
merged.head()

Unnamed: 0,Game_ID,Words,Color,Category,Options,Prompt,Correct_Answer
0,1,"[HAIL, RAIN, SLEET, SNOW]",yellow,WET WEATHER,"[LEVEL, RETURN, OPTION, NETS, TAB, KAYAK, HEAT...","[HAIL, RAIN, SLEET]",SNOW
1,1,"[BUCKS, HEAT, JAZZ, NETS]",green,NBA TEAMS,"[HAIL, LEVEL, RETURN, OPTION, SNOW, TAB, KAYAK...","[BUCKS, HEAT, JAZZ]",NETS
2,1,"[OPTION, RETURN, SHIFT, TAB]",blue,KEYBOARD KEYS,"[HAIL, LEVEL, SNOW, NETS, KAYAK, HEAT, JAZZ, S...","[OPTION, RETURN, SHIFT]",TAB
3,1,"[KAYAK, LEVEL, MOM, RACECAR]",purple,PALINDROMES,"[HAIL, RETURN, OPTION, SNOW, NETS, TAB, HEAT, ...","[KAYAK, LEVEL, MOM]",RACECAR
4,2,"[BOOT, LOAFER, PUMP, SNEAKER]",yellow,FOOTWEAR,"[MILE, LEAGUE, TIME, ESSENCE, US, PEOPLE, SEA,...","[BOOT, LOAFER, PUMP]",SNEAKER


In [283]:
# Per-row outcomes, collected as parallel lists in the row order of `merged`
responses, probs, correct_ = [], [], []

# Tests GPT on all categories
category_inputs = zip(merged['Prompt'], merged['Options'], merged['Correct_Answer'])
for prompt, options, correct_answer in tqdm(category_inputs, total=len(merged)):
    answer, prob, correct = run_eval_category(prompt, options, correct_answer)

    # Append immediately so the lists hold every row finished so far
    responses.append(answer)
    probs.append(prob)
    correct_.append(correct)

100%|███████████████████████████████████████████| 40/40 [56:47<00:00, 85.20s/it]


In [305]:
# Add results to column
# Attach the evaluation outcomes to the merged frame, one new column each
result_columns = {
    'Response': responses,
    'Probabilities': probs,
    'Correct': correct_,
}
for column_name, column_values in result_columns.items():
    merged[column_name] = column_values

# Write to csv
merged.to_csv('data/results.csv', index=False)

merged.head()

Unnamed: 0,Game_ID,Words,Color,Category,Options,Prompt,Correct_Answer,Response,Probabilities,Correct
0,1,"[HAIL, RAIN, SLEET, SNOW]",yellow,WET WEATHER,"[LEVEL, RETURN, OPTION, NETS, TAB, KAYAK, HEAT...","[HAIL, RAIN, SLEET]",SNOW,HEAT,0.657348,False
1,1,"[BUCKS, HEAT, JAZZ, NETS]",green,NBA TEAMS,"[HAIL, LEVEL, RETURN, OPTION, SNOW, TAB, KAYAK...","[BUCKS, HEAT, JAZZ]",NETS,HEAT,0.810663,False
2,1,"[OPTION, RETURN, SHIFT, TAB]",blue,KEYBOARD KEYS,"[HAIL, LEVEL, SNOW, NETS, KAYAK, HEAT, JAZZ, S...","[OPTION, RETURN, SHIFT]",TAB,RETURN,0.725834,False
3,1,"[KAYAK, LEVEL, MOM, RACECAR]",purple,PALINDROMES,"[HAIL, RETURN, OPTION, SNOW, NETS, TAB, HEAT, ...","[KAYAK, LEVEL, MOM]",RACECAR,LEVEL,0.860018,False
4,2,"[BOOT, LOAFER, PUMP, SNEAKER]",yellow,FOOTWEAR,"[MILE, LEAGUE, TIME, ESSENCE, US, PEOPLE, SEA,...","[BOOT, LOAFER, PUMP]",SNEAKER,FOOT,0.976352,False
