In [1]:
import pandas as pd
import numpy as np
from time import sleep
from openai import OpenAI
from private_data import API_KEY

In [2]:
# Main query function
client = OpenAI(api_key=API_KEY)
def ask_query(query):
    my_query = [ {"role": "system", "content": "Give direct answer without any explanation."},
        {"role": "user", "content": query} ] 
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo", messages=my_query
    )
    return str(completion.choices[0].message.content)

In [3]:
def country_analogy_testing(df1, df2, equiv_func):
    result = []

    for i in range(min(len(df1), len(df2))):
        entity_1 = df1.iloc[i]['Entity']
        target_1 = df1.iloc[i]['Target']
        
        entity_2 = df2.iloc[i]['Entity']
        target_2 = df2.iloc[i]['Target']
        
        prompt = f"{entity_1}:{target_1}::{entity_2}:X; what is X?"
        answer = target_2
        response = ask_query(prompt)
        status = equiv_func(response, answer)
        
        result.append({'Prompt': prompt, 'Answer': answer, 'GPT-reply': response, 'Status': status})
        print(f"Prompt: {prompt} \t|\t Answer: {answer} \t|\t Response: {response} \t|\t Status: {status}")
    
    return result

In [12]:
# Loading Dataset 
athlete_df = pd.read_csv('../data/Athletes.csv')
athlete_df = athlete_df[['Name', 'Nation' , 'Sport']]
# # # Shuffle and resize the data
athlete_df = athlete_df.sample(frac=1).reset_index(drop=True).head(100)

athlete_df.head(10)

Unnamed: 0,Name,Nation,Sport
0,Johan Santana,Venezuela,Baseball
1,Cristiano Ronaldo,Portugal,Soccer
2,Adam Scott,Australia,Golf
3,Serena Williams,United States,Tennis
4,Novak Djokovic,Serbia,Tennis
5,Matthew Stafford,United States,Football
6,Jeff Gordon,United States,Racing
7,Joe Haden,United States,Football
8,Mesut Özil,Germany,Soccer
9,Derrick Rose,United States,Basketball


In [16]:
# Preprocessing 

def normalize(entity):
    entity = entity.lower()
    
    return entity
    

def are_equivalent(response, answer):
    response = normalize(response)
    answer = normalize(answer)

    if answer in response:
        return True
    if response in answer:
        return True
        
    return False

def does_gpt_know(person, country):
    # check if deathdate is nan
    if pd.isnull(country):
        return False
    
    query = f"Which country does {person} play for?"
    response = ask_query(query)
    print(f"Person: {person} \t|\t Country: {country} \t|\t Response: {response} \t|\t Status: {are_equivalent(response, country)}")
    return are_equivalent(response, country)

def preprocess(df):
    correct_answers = []
    for index, row in df.iterrows():
        if does_gpt_know(row['Name'], row['Nation']):
            correct_answers.append(row)
        if len(correct_answers) == 20:
            break
    new_df = pd.DataFrame(correct_answers)
    return new_df

In [17]:
known_df = preprocess(athlete_df)
known_df

Person: Johan Santana 	|	 Country: Venezuela 	|	 Response: Venezuela. 	|	 Status: True
Person: Cristiano Ronaldo 	|	 Country: Portugal 	|	 Response: Portugal. 	|	 Status: True
Person: Adam Scott 	|	 Country: Australia 	|	 Response: Australia. 	|	 Status: True
Person: Serena Williams 	|	 Country: United States 	|	 Response: United States. 	|	 Status: True
Person: Novak Djokovic 	|	 Country: Serbia 	|	 Response: Serbia. 	|	 Status: True
Person: Matthew Stafford 	|	 Country: United States 	|	 Response: Los Angeles Rams 	|	 Status: False
Person: Jeff Gordon 	|	 Country: United States 	|	 Response: United States 	|	 Status: True
Person: Joe Haden 	|	 Country: United States 	|	 Response: United States of America 	|	 Status: True
Person: Mesut Özil 	|	 Country: Germany 	|	 Response: Germany. 	|	 Status: True
Person: Derrick Rose 	|	 Country: United States 	|	 Response: United States. 	|	 Status: True
Person: Carlos Dunlap 	|	 Country: United States 	|	 Response: United States. 	|	 Status: Tru

Unnamed: 0,Name,Nation,Sport
0,Johan Santana,Venezuela,Baseball
1,Cristiano Ronaldo,Portugal,Soccer
2,Adam Scott,Australia,Golf
3,Serena Williams,United States,Tennis
4,Novak Djokovic,Serbia,Tennis
6,Jeff Gordon,United States,Racing
7,Joe Haden,United States,Football
8,Mesut Özil,Germany,Soccer
9,Derrick Rose,United States,Basketball
10,Carlos Dunlap,United States,Football


In [20]:
# divide into 2 parts for analogy testing
known_df = known_df.sample(frac=1).reset_index(drop=True)
known_df.rename(columns={'Name': 'Entity', 'Nation': 'Target'}, inplace=True)
n = len(known_df)
df1 = known_df.head(n//2)
df2 = known_df.tail(n-n//2)

In [21]:
result = country_analogy_testing(df1, df2, are_equivalent)

Prompt: Johan Santana:(Venezuela)::Lionel Messi:X; what is X? 	|	 Answer: Argentina 	|	 Response: Argentina 	|	 Status: True
Prompt: Sergio Agüero:(Argentina)::Carlos Dunlap:X; what is X? 	|	 Answer: United States 	|	 Response: X is the United States. 	|	 Status: True
Prompt: Dirk Nowitzki:(Germany)::Joe Haden:X; what is X? 	|	 Answer: United States 	|	 Response: United States 	|	 Status: True
Prompt: Steven Gerrard:(England)::Novak Djokovic:X; what is X? 	|	 Answer: Serbia 	|	 Response: Serbia 	|	 Status: True
Prompt: Serena Williams:(United States)::Derrick Rose:X; what is X? 	|	 Answer: United States 	|	 Response: Chicago Bulls 	|	 Status: False
Prompt: Matt Ryan:(United States)::Cristiano Ronaldo:X; what is X? 	|	 Answer: Portugal 	|	 Response: Portugal 	|	 Status: True
Prompt: Matt Cain:(United States)::Mesut Özil:X; what is X? 	|	 Answer: Germany 	|	 Response: Germany 	|	 Status: True
Prompt: Robinson Cano:(Dominican Republic)::Jeff Gordon:X; what is X? 	|	 Answer: United States 

In [22]:
result_df = pd.DataFrame(columns=['Prompt', 'Answer', 'GPT-reply', 'Status'], data=result)
result_df.to_csv('../results/result: athlete-country.csv')
result_df


Unnamed: 0,Prompt,Answer,GPT-reply,Status
0,Johan Santana:(Venezuela)::Lionel Messi:X; wha...,Argentina,Argentina,True
1,Sergio Agüero:(Argentina)::Carlos Dunlap:X; wh...,United States,X is the United States.,True
2,Dirk Nowitzki:(Germany)::Joe Haden:X; what is X?,United States,United States,True
3,Steven Gerrard:(England)::Novak Djokovic:X; wh...,Serbia,Serbia,True
4,Serena Williams:(United States)::Derrick Rose:...,United States,Chicago Bulls,False
5,Matt Ryan:(United States)::Cristiano Ronaldo:X...,Portugal,Portugal,True
6,Matt Cain:(United States)::Mesut Özil:X; what ...,Germany,Germany,True
7,Robinson Cano:(Dominican Republic)::Jeff Gordo...,United States,USA,False
8,Manny Pacquiao:(Philippines)::Zack Greinke:X; ...,United States,United States,True
9,Adam Scott:(Australia)::Joe Mauer:X; what is X?,United States,United States,True


NameError: name 'result_df' is not defined

In [6]:

# Your data
data = {
    'name': ['Kwon Ri-se', 'Toots Thielemans', 'Joe Santos', 'Alan Coren', 'Ed Cassidy', 'Frederick Chiluba', 'Gail Dolgin', 'Mark Strand', 'Kitty Wells', 'Ãˆve Curie', 'Helmut Schmidt', 'Tommy Makem'],
    'death_month': ['September', 'August', 'March', 'October', 'December', 'June', 'October', 'November', 'July', 'October', 'November', 'August'],
    'death_year': [2014, 2016, 2016, 2007, 2012, 2011, 2010, 2014, 2012, 2007, 2015, 2007],
    'death_date': ['September 2014', 'August 2016', 'March 2016', 'October 2007', 'December 2012', 'June 2011', 'October 2010', 'November 2014', 'July 2012', 'October 2007', 'November 2015', 'August 2007']
}

# Create a DataFrame
df = pd.DataFrame(data)

# Display the DataFrame
df


Unnamed: 0,name,death_month,death_year,death_date
0,Kwon Ri-se,September,2014,September 2014
1,Toots Thielemans,August,2016,August 2016
2,Joe Santos,March,2016,March 2016
3,Alan Coren,October,2007,October 2007
4,Ed Cassidy,December,2012,December 2012
5,Frederick Chiluba,June,2011,June 2011
6,Gail Dolgin,October,2010,October 2010
7,Mark Strand,November,2014,November 2014
8,Kitty Wells,July,2012,July 2012
9,Ãˆve Curie,October,2007,October 2007


In [25]:
df.rename(columns={'name': 'Entity', 'death_date': 'Target'}, inplace=True)
li1 = []
li2 = []
for i1, row1 in df.iterrows():
    df_rand = df.sample(frac=1).reset_index(drop=True)
    c = 0
    for i2, row2 in df_rand.iterrows():
        if row1['Entity'] != row2['Entity']:
            li1.append(row1)
            li2.append(row2)
            c += 1
        if c == 5: break

df1 = pd.DataFrame(li1)
df2 = pd.DataFrame(li2)


In [26]:
print(df1.head(5), df2.head(5), sep='\n\n')
print(len(df1), len(df2))

       Entity death_month  death_year          Target
0  Kwon Ri-se   September        2014  September 2014
0  Kwon Ri-se   September        2014  September 2014
0  Kwon Ri-se   September        2014  September 2014
0  Kwon Ri-se   September        2014  September 2014
0  Kwon Ri-se   September        2014  September 2014

              Entity death_month  death_year         Target
0  Frederick Chiluba        June        2011      June 2011
1        Gail Dolgin     October        2010   October 2010
2         Ãˆve Curie     October        2007   October 2007
4         Alan Coren     October        2007   October 2007
5         Ed Cassidy    December        2012  December 2012
60 60


In [27]:
result = deathdate_analogy_testing(df1, df2, are_equivalent)

Prompt: Kwon Ri-se:(September 2014)::Frederick Chiluba:X; what is X? 	|	 Answer: June 2011 	|	 Response: Levy Mwanawasa 	|	 Status: False
Prompt: Kwon Ri-se:(September 2014)::Gail Dolgin:X; what is X? 	|	 Answer: October 2010 	|	 Response: November 2010 	|	 Status: False
Prompt: Kwon Ri-se:(September 2014)::Ãˆve Curie:X; what is X? 	|	 Answer: October 2007 	|	 Response: October 2012 	|	 Status: False
Prompt: Kwon Ri-se:(September 2014)::Alan Coren:X; what is X? 	|	 Answer: October 2007 	|	 Response: September 2014 	|	 Status: False
Prompt: Kwon Ri-se:(September 2014)::Ed Cassidy:X; what is X? 	|	 Answer: December 2012 	|	 Response: ? 	|	 Status: False
Prompt: Toots Thielemans:(August 2016)::Kwon Ri-se:X; what is X? 	|	 Answer: September 2014 	|	 Response: Hana Medical Rehabilitation Hospital. 	|	 Status: False
Prompt: Toots Thielemans:(August 2016)::Mark Strand:X; what is X? 	|	 Answer: November 2014 	|	 Response: November 2014 	|	 Status: True
Prompt: Toots Thielemans:(August 2016)::G

In [28]:
result_df = pd.DataFrame(columns=['Prompt', 'Answer', 'GPT-reply', 'Status'], data=result)
result_df.to_csv('../results/result: person-deathdate (analogy+date+own knowledge).csv')
result_df

Unnamed: 0,Prompt,Answer,GPT-reply,Status
0,Kwon Ri-se:(September 2014)::Frederick Chiluba...,June 2011,Levy Mwanawasa,False
1,Kwon Ri-se:(September 2014)::Gail Dolgin:X; wh...,October 2010,November 2010,False
2,Kwon Ri-se:(September 2014)::Ãˆve Curie:X; wha...,October 2007,October 2012,False
3,Kwon Ri-se:(September 2014)::Alan Coren:X; wha...,October 2007,September 2014,False
4,Kwon Ri-se:(September 2014)::Ed Cassidy:X; wha...,December 2012,?,False
5,Toots Thielemans:(August 2016)::Kwon Ri-se:X; ...,September 2014,Hana Medical Rehabilitation Hospital.,False
6,Toots Thielemans:(August 2016)::Mark Strand:X;...,November 2014,November 2014,True
7,Toots Thielemans:(August 2016)::Gail Dolgin:X;...,October 2010,X is the missing date or timeframe.,False
8,Toots Thielemans:(August 2016)::Frederick Chil...,June 2011,Former President of Zambia,False
9,Toots Thielemans:(August 2016)::Helmut Schmidt...,November 2015,November 2015,True


In [30]:
false_count = len(result_df[result_df['Status'] == False])
true_count = len(result_df[result_df['Status'] == True])
print(f"False:{false_count}, True:{true_count}")


False:50, True:10
