## Importing Libraries & API key

In [1]:
import pandas as pd
import numpy as np
from time import sleep
from openai import OpenAI
from private_data import API_KEY

## Requesting function

In [2]:
## demo request
# Create the API client first; the query helper defined in a later cell reads
# this notebook-level `client`.
client = OpenAI(api_key=API_KEY)

# One system instruction plus one user question, in chat-message format.
my_query = [
    {"role": "system", "content": "Give direct answer only in 1 or 2 words."},
    {"role": "user", "content": "What continent is Portugal in?"},
]
completion = client.chat.completions.create(messages=my_query, model="gpt-3.5-turbo")

# Show the text of the first returned choice.
print(str(completion.choices[0].message.content))

Europe


In [26]:
# Main query function
def ask_query(query, system_prompt="Give direct answer only in 1 or 2 words.",
              model="gpt-3.5-turbo", api_client=None):
    """Send one question to the chat-completions API and return the reply text.

    Parameters
    ----------
    query : str
        Text sent as the "user" message.
    system_prompt : str, optional
        Text sent as the "system" message. The default is the short-answer
        instruction this notebook uses for its word-level analogy tests.
    model : str, optional
        Model name forwarded to the API.
    api_client : object, optional
        Any object exposing ``chat.completions.create(model=..., messages=...)``.
        When omitted, the notebook-level ``client`` is used.

    Returns
    -------
    str
        ``str()`` of the first choice's message content.
    """
    if api_client is None:
        # Resolved at call time, so the function keeps working with whatever
        # `client` the notebook has created.
        api_client = client

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": query},
    ]
    completion = api_client.chat.completions.create(model=model, messages=messages)
    return str(completion.choices[0].message.content)

# Country - Continent Analogy Test

## Loading the Dataset

In [11]:
# Read the country/continent table; the path is relative to the notebook's working directory.
country_continent_df = pd.read_csv('../data/Countries-Continents.csv')
# Bare last expression so the notebook renders the first five rows.
country_continent_df.head()

Unnamed: 0,Continent,Country
0,Africa,Algeria
1,Africa,Angola
2,Africa,Benin
3,Africa,Botswana
4,Africa,Burkina


In [33]:
# Shuffle and resize the data
# A fixed random_state makes the 20-country sample (and every API call derived
# from it) reproducible across kernel restarts; 42 is an arbitrary seed.
# NOTE: this cell overwrites its own input, so re-running it without reloading
# the CSV reshuffles the already-truncated 20 rows.
country_continent_df = (
    country_continent_df
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
    .head(20)
)
country_continent_df

Unnamed: 0,Continent,Country
0,North America,Barbados
1,North America,Mexico
2,Africa,Djibouti
3,Europe,Iceland
4,Europe,Armenia
5,Africa,Cape Verde
6,North America,Nicaragua
7,Asia,Maldives
8,Africa,Tunisia
9,Europe,Ukraine


## Preprocessing the data

In [42]:
def are_equivalent(response, answer):
    """Return True when `answer` occurs inside `response`, ignoring letter case."""
    haystack = response.lower()
    needle = answer.lower()
    return haystack.find(needle) != -1

def does_gpt_know(country, continent):
    """Ask the model for `country`'s continent and report whether the reply matches.

    Prints one log line with the country, the expected continent and the raw
    reply, then returns the result of `are_equivalent(reply, continent)`.
    """
    reply = ask_query(f"Which continent does {country} belong to?")
    print(f"Country: {country} \t|\t Continent: {continent} \t|\t Response: {reply}")
    verdict = are_equivalent(reply, continent)
    return verdict

def preprocess(df):
    """Keep only the rows of `df` whose 'Country'/'Continent' pair passes `does_gpt_know`."""

    def _row_is_known(row):
        # One knowledge check per row, made through does_gpt_know.
        return does_gpt_know(row['Country'], row['Continent'])

    known_mask = df.apply(_row_is_known, axis=1)
    return df[known_mask]

In [37]:
# Keep only the countries for which preprocess()'s knowledge check passed.
known_df = preprocess(country_continent_df)
# Bare last expression so the notebook renders the filtered table.
known_df

#Country: Barbados #Continent: North America #Response: North America
#Country: Mexico #Continent: North America #Response: North America
#Country: Djibouti #Continent: Africa #Response: Africa
#Country: Iceland #Continent: Europe #Response: Europe
#Country: Armenia #Continent: Europe #Response: Europe
#Country: Cape Verde #Continent: Africa #Response: Africa
#Country: Nicaragua #Continent: North America #Response: North America
#Country: Maldives #Continent: Asia #Response: Asia
#Country: Tunisia #Continent: Africa #Response: Africa
#Country: Ukraine #Continent: Europe #Response: Europe
#Country: Bahrain #Continent: Asia #Response: Asia
#Country: Macedonia #Continent: Europe #Response: Europe
#Country: Marshall Islands #Continent: Oceania #Response: Oceania
#Country: San Marino #Continent: Europe #Response: Europe
#Country: Suriname #Continent: South America #Response: South America
#Country: Bhutan #Continent: Asia #Response: Asia.
#Country: Colombia #Continent: South America #Respon

Unnamed: 0,Continent,Country
0,North America,Barbados
1,North America,Mexico
2,Africa,Djibouti
3,Europe,Iceland
4,Europe,Armenia
5,Africa,Cape Verde
6,North America,Nicaragua
7,Asia,Maldives
8,Africa,Tunisia
9,Europe,Ukraine


In [39]:
# divide into 2 parts for analogy testing
# df1 takes the first floor(n/2) rows, df2 the remaining rows, so df2 is one
# row longer when n is odd.
n = len(known_df)
df1 = known_df.iloc[:n // 2]
df2 = known_df.iloc[n // 2:]

## Analogy Testing

In [49]:
def execute_analogy_testing(df1, df2):
    """Run country:continent analogy prompts pairing row i of `df1` with row i of `df2`.

    For each pair the df1 row supplies the worked example and the df2 row
    supplies the question; the df2 continent is the expected answer. Only the
    first min(len(df1), len(df2)) rows are used. Each outcome is printed and
    collected.

    Returns a list of dicts with keys 'Prompt', 'Answer', 'GPT-reply', 'Status'.
    """
    records = []
    pair_count = min(len(df1), len(df2))

    for idx in range(pair_count):
        example_row = df1.iloc[idx]
        question_row = df2.iloc[idx]

        prompt = (
            f"{example_row['Country']}:{example_row['Continent']}"
            f"::{question_row['Country']}:X; What is X?"
        )
        answer = question_row['Continent']
        response = ask_query(prompt)
        status = are_equivalent(response, answer)

        records.append(dict([
            ('Prompt', prompt),
            ('Answer', answer),
            ('GPT-reply', response),
            ('Status', status),
        ]))
        print(f"Prompt: {prompt} \t|\t Answer: {answer} \t|\t Response: {response} \t|\t Status: {status}")

    return records

In [50]:
# Pair row i of df1 with row i of df2 and collect one result dict per analogy prompt.
result = execute_analogy_testing(df1, df2)

Prompt: Barbados:North America::Bahrain:X; What is X? 	|	 Answer: Asia 	|	 Response: Asia 	|	 Status: True
Prompt: Mexico:North America::Macedonia:X; What is X? 	|	 Answer: Europe 	|	 Response: Europe 	|	 Status: True
Prompt: Djibouti:Africa::Marshall Islands:X; What is X? 	|	 Answer: Oceania 	|	 Response: Oceania 	|	 Status: True
Prompt: Iceland:Europe::San Marino:X; What is X? 	|	 Answer: Europe 	|	 Response: Europe 	|	 Status: True
Prompt: Armenia:Europe::Suriname:X; What is X? 	|	 Answer: South America 	|	 Response: South America 	|	 Status: True
Prompt: Cape Verde:Africa::Bhutan:X; What is X? 	|	 Answer: Asia 	|	 Response: Asia 	|	 Status: True
Prompt: Nicaragua:North America::Colombia:X; What is X? 	|	 Answer: South America 	|	 Response: South America 	|	 Status: True
Prompt: Maldives:Asia::Kuwait:X; What is X? 	|	 Answer: Asia 	|	 Response: Asia 	|	 Status: True
Prompt: Tunisia:Africa::Korea, North:X; What is X? 	|	 Answer: Asia 	|	 Response: Asia 	|	 Status: True
Prompt: Ukrain

In [53]:
# Tabulate the per-prompt records with a fixed column order, write them to the
# results folder, then display the table.
# NOTE(review): the file name contains a colon and a space, which some
# filesystems reject -- confirm the target platform.
result_df = pd.DataFrame(result, columns=['Prompt', 'Answer', 'GPT-reply', 'Status'])
result_df.to_csv(path_or_buf='../results/result: country-continent.csv')
result_df

Unnamed: 0,Prompt,Answer,GPT-reply,Status
0,Barbados:North America::Bahrain:X; What is X?,Asia,Asia,True
1,Mexico:North America::Macedonia:X; What is X?,Europe,Europe,True
2,Djibouti:Africa::Marshall Islands:X; What is X?,Oceania,Oceania,True
3,Iceland:Europe::San Marino:X; What is X?,Europe,Europe,True
4,Armenia:Europe::Suriname:X; What is X?,South America,South America,True
5,Cape Verde:Africa::Bhutan:X; What is X?,Asia,Asia,True
6,Nicaragua:North America::Colombia:X; What is X?,South America,South America,True
7,Maldives:Asia::Kuwait:X; What is X?,Asia,Asia,True
8,"Tunisia:Africa::Korea, North:X; What is X?",Asia,Asia,True
9,Ukraine:Europe::Monaco:X; What is X?,Europe,Europe,True


# Famous Person - Occupation Analogy Test

In [16]:
# Loading Dataset
person_occupation_df = pd.read_csv('../data/pantheon.tsv', sep='\t')
person_occupation_df = (
    person_occupation_df[['name', 'countryName', 'occupation']]
    # The occupation check lower-cases this value, which would fail on a
    # missing (NaN) entry, so drop such rows up front.
    .dropna(subset=['occupation'])
    # Shuffle and resize the data; the fixed random_state (arbitrary seed)
    # keeps the 100-person sample reproducible across kernel restarts.
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
    .head(100)
)

person_occupation_df

Unnamed: 0,name,countryName,occupation
count,100,99,100
unique,100,42,32
top,Henry Hudson,UNITED STATES,POLITICIAN
freq,1,13,21


In [27]:
# Preprocessing 

def normalize_occupation(occupation):
    """Map an occupation label or model reply to a canonical lower-case form.

    Rules, applied in order:
      * lower-case the text and remove every occurrence of 'player';
      * anything mentioning 'actor' or 'actress' becomes 'actor';
      * anything mentioning 'artist' becomes 'painter';
      * otherwise collapse runs of whitespace and trim surrounding whitespace
        and punctuation, so that 'TENNIS PLAYER' -> 'tennis' and
        'Painter.' -> 'painter'.

    Parameters
    ----------
    occupation : str
        Dataset label or raw model reply.

    Returns
    -------
    str
        The normalized occupation.
    """
    occupation = occupation.lower().replace('player', '')

    if 'actor' in occupation or 'actress' in occupation:
        return 'actor'
    if 'artist' in occupation:
        return 'painter'

    # Removing 'player' can leave stray or doubled spaces ("tennis "), and
    # replies often end with a full stop; neither should affect matching.
    occupation = ' '.join(occupation.split())
    return occupation.strip(" \t\r\n.,;:!?\"'")
    

def are_equivalent(response, answer):
    """Decide whether a model reply and the expected occupation refer to the same thing.

    Both strings are passed through `normalize_occupation`; they are treated
    as equivalent when either normalized string contains the other.

    Parameters
    ----------
    response : str
        Raw model reply.
    answer : str
        Expected occupation from the dataset.

    Returns
    -------
    bool
        True on a containment match, False otherwise (including when either
        normalized string is blank).
    """
    response = normalize_occupation(response)
    answer = normalize_occupation(answer)

    # An empty string is a substring of every string, so without this guard a
    # blank reply would be scored as correct.
    if not response.strip() or not answer.strip():
        return False

    return answer in response or response in answer

def does_gpt_know(person, occupation):
    """Ask the model for `person`'s occupation and report whether the reply matches.

    Prints one log line with the person, the expected occupation and the raw
    reply, then returns the result of `are_equivalent(reply, occupation)`.
    """
    reply = ask_query(f"what is the occupation of {person}?")
    print(f"Person: {person} \t|\t Occupation: {occupation} \t|\t Response: {reply}")
    verdict = are_equivalent(reply, occupation)
    return verdict

def preprocess(df, max_known=20, is_known=None):
    """Collect the first rows of `df` whose occupation the model already knows.

    Rows are visited in order and checking stops as soon as `max_known` rows
    have been accepted, so no further queries are made after that.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'name' and 'occupation' columns.
    max_known : int or None, optional
        Maximum number of accepted rows (default 20). ``None`` means no limit.
    is_known : callable, optional
        ``is_known(name, occupation) -> bool``. Defaults to the notebook's
        `does_gpt_know`; pass a stub to exercise the function offline.

    Returns
    -------
    pandas.DataFrame
        Accepted rows with their original index labels. The columns of `df`
        are preserved even when no row is accepted, so later column renames
        and lookups still work.
    """
    if is_known is None:
        # Resolved at call time so the notebook's current definition is used.
        is_known = does_gpt_know

    correct_answers = []
    for _, row in df.iterrows():
        if max_known is not None and len(correct_answers) >= max_known:
            break
        if is_known(row['name'], row['occupation']):
            correct_answers.append(row)

    return pd.DataFrame(correct_answers, columns=df.columns)

    

In [19]:
# Keep the sampled people for whom preprocess()'s occupation check passed.
known_df = preprocess(person_occupation_df)
# Bare last expression so the notebook renders the filtered table.
known_df

Person: Henry Hudson 	|	 Occupation: EXPLORER 	|	 Response: Explorer
Person: Arthur Honegger 	|	 Occupation: COMPOSER 	|	 Response: composer
Person: Pope Clement I 	|	 Occupation: RELIGIOUS FIGURE 	|	 Response: Pope.
Person: Ananda 	|	 Occupation: RELIGIOUS FIGURE 	|	 Response: unknown
Person: Karl Briullov 	|	 Occupation: PAINTER 	|	 Response: Painter.
Person: Jonathan Edwards 	|	 Occupation: ATHLETE 	|	 Response: theologian
Person: Brian P. Schmidt 	|	 Occupation: ASTRONOMER 	|	 Response: Astronomer
Person: Fedor Emelianenko 	|	 Occupation: MARTIAL ARTS 	|	 Response: MMA fighter.
Person: Teti 	|	 Occupation: POLITICIAN 	|	 Response: unknown
Person: Terence 	|	 Occupation: WRITER 	|	 Response: Unknown
Person: Jeb Bush 	|	 Occupation: POLITICIAN 	|	 Response: politician
Person: Mohammad Ali Rajai 	|	 Occupation: POLITICIAN 	|	 Response: politician
Person: Sigismund, Holy Roman Emperor 	|	 Occupation: POLITICIAN 	|	 Response: Emperor
Person: Alexander Graham Bell 	|	 Occupation: INVENTO

Unnamed: 0,name,countryName,occupation
0,Henry Hudson,UNITED KINGDOM,EXPLORER
1,Arthur Honegger,FRANCE,COMPOSER
4,Karl Briullov,Russia,PAINTER
6,Brian P. Schmidt,UNITED STATES,ASTRONOMER
10,Jeb Bush,United States,POLITICIAN
11,Mohammad Ali Rajai,Iran,POLITICIAN
13,Alexander Graham Bell,UNITED KINGDOM,INVENTOR
16,Lucas Grabeel,UNITED STATES,ACTOR
17,Tommy Haas,Germany,TENNIS PLAYER
19,Alizé Cornet,FRANCE,TENNIS PLAYER


## Global Analogy Tester

In [5]:
def global_analogy_testing(df1, df2, equiv_func,
                           prompt_template="{entity_1}:{target_1}::{entity_2}:X; What is X?",
                           query_func=None):
    """Run "A:B::C:X" analogy prompts pairing row i of `df1` with row i of `df2`.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames with 'Entity' and 'Target' columns. The df1 row supplies the
        worked example, the df2 row supplies the question, and the df2 target
        is the expected answer. Only the first min(len(df1), len(df2)) rows
        are used.
    equiv_func : callable
        ``equiv_func(response, answer) -> bool`` deciding whether a reply is correct.
    prompt_template : str, optional
        ``str.format`` template with the fields ``entity_1``, ``target_1`` and
        ``entity_2``. The default reproduces the original prompt wording;
        other tests can pass their own wording instead of copying this function.
    query_func : callable, optional
        ``query_func(prompt) -> str``. Defaults to the notebook's `ask_query`,
        resolved at call time.

    Returns
    -------
    list of dict
        One dict per pair with keys 'Prompt', 'Answer', 'GPT-reply', 'Status'.
        Each outcome is also printed.
    """
    if query_func is None:
        query_func = ask_query

    result = []
    for i in range(min(len(df1), len(df2))):
        example_row = df1.iloc[i]
        question_row = df2.iloc[i]

        prompt = prompt_template.format(
            entity_1=example_row['Entity'],
            target_1=example_row['Target'],
            entity_2=question_row['Entity'],
        )
        answer = question_row['Target']
        response = query_func(prompt)
        status = equiv_func(response, answer)

        result.append({'Prompt': prompt, 'Answer': answer, 'GPT-reply': response, 'Status': status})
        print(f"Prompt: {prompt} \t|\t Answer: {answer} \t|\t Response: {response} \t|\t Status: {status}")

    return result

In [23]:
# divide into 2 parts for analogy testing
# The shuffle is seeded (arbitrary seed 42) so the df1/df2 pairing, and with it
# every analogy prompt, is reproducible. The rename is chained instead of
# applied in place so the cell builds its result in a single expression.
known_df = (
    known_df
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
    .rename(columns={'name': 'Entity', 'occupation': 'Target'})
)
n = len(known_df)
# df1 takes the first floor(n/2) rows; df2 takes the remaining rows.
df1 = known_df.head(n//2)
df2 = known_df.tail(n-n//2)

In [28]:
# Pair row i of df1 with row i of df2; replies are scored with are_equivalent.
result = global_analogy_testing(df1, df2, are_equivalent)

Prompt: Pete Conrad:ASTRONAUT::Alizé Cornet:X; What is X? 	|	 Answer: TENNIS PLAYER 	|	 Response: TENNIS 	|	 Status: True
Prompt: Kajsa Bergqvist:ATHLETE::Karl Briullov:X; What is X? 	|	 Answer: PAINTER 	|	 Response: ARTIST 	|	 Status: True
Prompt: Mohammad Ali Rajai:POLITICIAN::Robert Wagner:X; What is X? 	|	 Answer: ACTOR 	|	 Response: ACTOR 	|	 Status: True
Prompt: Alexander Graham Bell:INVENTOR::Jeb Bush:X; What is X? 	|	 Answer: POLITICIAN 	|	 Response: Politician. 	|	 Status: True
Prompt: Tommy Haas:TENNIS PLAYER::Koji Murofushi:X; What is X? 	|	 Answer: ATHLETE 	|	 Response: HAMMER THROWER 	|	 Status: False
Prompt: Roman Jakobson:LINGUIST::Joschka Fischer:X; What is X? 	|	 Answer: POLITICIAN 	|	 Response: Politician 	|	 Status: True
Prompt: Brian P. Schmidt:ASTRONOMER::Arthur Honegger:X; What is X? 	|	 Answer: COMPOSER 	|	 Response: COMPOSER 	|	 Status: True
Prompt: Henry Hudson:EXPLORER::Lucas Grabeel:X; What is X? 	|	 Answer: ACTOR 	|	 Response: ACTOR 	|	 Status: True
Prompt: 

In [29]:
# Tabulate the per-prompt records with a fixed column order, write them to the
# results folder, then display the table.
# NOTE(review): the file name contains a colon and a space, which some
# filesystems reject -- confirm the target platform.
result_df = pd.DataFrame(result, columns=['Prompt', 'Answer', 'GPT-reply', 'Status'])
result_df.to_csv(path_or_buf='../results/result: person-occupation.csv')
result_df

Unnamed: 0,Prompt,Answer,GPT-reply,Status
0,Pete Conrad:ASTRONAUT::Alizé Cornet:X; What is X?,TENNIS PLAYER,TENNIS,True
1,Kajsa Bergqvist:ATHLETE::Karl Briullov:X; What...,PAINTER,ARTIST,True
2,Mohammad Ali Rajai:POLITICIAN::Robert Wagner:X...,ACTOR,ACTOR,True
3,Alexander Graham Bell:INVENTOR::Jeb Bush:X; Wh...,POLITICIAN,Politician.,True
4,Tommy Haas:TENNIS PLAYER::Koji Murofushi:X; Wh...,ATHLETE,HAMMER THROWER,False
5,Roman Jakobson:LINGUIST::Joschka Fischer:X; Wh...,POLITICIAN,Politician,True
6,Brian P. Schmidt:ASTRONOMER::Arthur Honegger:X...,COMPOSER,COMPOSER,True
7,Henry Hudson:EXPLORER::Lucas Grabeel:X; What i...,ACTOR,ACTOR,True
8,Apuleius:WRITER::Farinelli:X; What is X?,SINGER,Singer,True
9,Antisthenes:PHILOSOPHER::Robert C. Merton:X; W...,ECONOMIST,ECONOMIST,True


# Famous Person - Birthdate Analogy Test

In [34]:
# Special query function and analogy testing
def ask_query(query,
              system_prompt="Give direct answer only a date(YYYY-MM-DD). Analogically estimate it using your own knowledge base",
              model="gpt-3.5-turbo", api_client=None):
    """Send one question to the chat-completions API and return the reply text.

    This definition uses a date-oriented system instruction by default.

    Parameters
    ----------
    query : str
        Text sent as the "user" message.
    system_prompt : str, optional
        Text sent as the "system" message; the default asks for a
        YYYY-MM-DD date.
    model : str, optional
        Model name forwarded to the API.
    api_client : object, optional
        Any object exposing ``chat.completions.create(model=..., messages=...)``.
        When omitted, the notebook-level ``client`` is used.

    Returns
    -------
    str
        ``str()`` of the first choice's message content.
    """
    if api_client is None:
        # Resolved at call time, so the function keeps working with whatever
        # `client` the notebook has created.
        api_client = client

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": query},
    ]
    completion = api_client.chat.completions.create(model=model, messages=messages)
    return str(completion.choices[0].message.content)

def birthdate_analogy_testing(df1, df2, equiv_func):
    """Run birthdate analogy prompts pairing row i of `df1` with row i of `df2`.

    Both frames need 'Entity' and 'Target' columns. The df1 row is the worked
    example (its target appears in parentheses in the prompt), the df2 row is
    the question and its target is the expected answer. Only the first
    min(len(df1), len(df2)) rows are used. `equiv_func(response, answer)`
    decides whether a reply counts as correct. Each outcome is printed.

    Returns a list of dicts with keys 'Prompt', 'Answer', 'GPT-reply', 'Status'.
    """
    records = []
    pair_count = min(len(df1), len(df2))

    for idx in range(pair_count):
        example_row = df1.iloc[idx]
        question_row = df2.iloc[idx]

        prompt = (
            f"{example_row['Entity']}:({example_row['Target']})"
            f"::{question_row['Entity']}:X; Estimate the date X?"
        )
        answer = question_row['Target']
        response = ask_query(prompt)
        status = equiv_func(response, answer)

        records.append(dict([
            ('Prompt', prompt),
            ('Answer', answer),
            ('GPT-reply', response),
            ('Status', status),
        ]))
        print(f"Prompt: {prompt} \t|\t Answer: {answer} \t|\t Response: {response} \t|\t Status: {status}")

    return records

In [30]:
# Loading Dataset
person_birthdate_df = pd.read_csv('../data/famous-birthdates.csv', sep=' ')
person_birthdate_df = (
    person_birthdate_df[['firstname', 'lastname', 'birthDate']]
    # .assign returns a new frame, so the derived full-name column is never
    # written onto a column subset of the raw table.
    .assign(name=lambda frame: frame['firstname'] + ' ' + frame['lastname'])
    # Shuffle and resize the data; the fixed random_state (arbitrary seed)
    # keeps the 100-person sample reproducible across kernel restarts.
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
    .head(100)
)

person_birthdate_df.head(10)

Unnamed: 0,firstname,lastname,birthDate,name
0,Daniel K,Inouye,1924-09-07,Daniel K Inouye
1,Virgil,Thomson,1896-11-25,Virgil Thomson
2,Jessie,Armstead,1970-10-26,Jessie Armstead
3,Baltasar,Garzon,1955-10-26,Baltasar Garzon
4,Susan,Cheever,1943-07-31,Susan Cheever
5,Andreas,Papandreou,1919-02-05,Andreas Papandreou
6,Rich,Kotite,1942-10-13,Rich Kotite
7,Hugh,Grant,1960-09-09,Hugh Grant
8,Walker,Evans,1903-11-03,Walker Evans
9,Sally,Hemings,1773-01-01,Sally Hemings


In [31]:
# Preprocessing 

def normalize(entity):
    """Return `entity` converted to lower case."""
    return entity.lower()
    

def are_equivalent(response, answer):
    """Decide whether a model reply and the expected birthdate match.

    Both strings are passed through `normalize`; they are treated as
    equivalent when either normalized string contains the other.

    Parameters
    ----------
    response : str
        Raw model reply.
    answer : str
        Expected birthdate string from the dataset.

    Returns
    -------
    bool
        True on a containment match, False otherwise (including when either
        normalized string is blank).
    """
    response = normalize(response)
    answer = normalize(answer)

    # An empty string is a substring of every string, so without this guard a
    # blank reply would be scored as correct.
    if not response.strip() or not answer.strip():
        return False

    # NOTE(review): `response in answer` also accepts a partial reply such as
    # a bare year -- confirm that this leniency is intended for dates.
    return answer in response or response in answer

def does_gpt_know(person, birthdate):
    """Ask the model for `person`'s birthdate and report whether the reply matches.

    A missing (null) `birthdate` returns False without querying the model.
    Otherwise one log line is printed with the person, the expected date and
    the raw reply, and the result of `are_equivalent(reply, birthdate)` is
    returned.
    """
    if not pd.isnull(birthdate):
        reply = ask_query(f"what is the birthdate(YYYY-MM-DD) of {person}?")
        print(f"Person: {person} \t|\t Birth Date: {birthdate} \t|\t Response: {reply}")
        return are_equivalent(reply, birthdate)

    # Nothing to compare against when the dataset has no birthdate.
    return False

def preprocess(df, max_known=20, is_known=None):
    """Collect the first rows of `df` whose birthdate the model already knows.

    Rows are visited in order and checking stops as soon as `max_known` rows
    have been accepted, so no further queries are made after that.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'name' and 'birthDate' columns.
    max_known : int or None, optional
        Maximum number of accepted rows (default 20). ``None`` means no limit.
    is_known : callable, optional
        ``is_known(name, birthdate) -> bool``. Defaults to the notebook's
        `does_gpt_know`; pass a stub to exercise the function offline.

    Returns
    -------
    pandas.DataFrame
        Accepted rows with their original index labels. The columns of `df`
        are preserved even when no row is accepted, so later column renames
        and lookups still work.
    """
    if is_known is None:
        # Resolved at call time so the notebook's current definition is used.
        is_known = does_gpt_know

    correct_answers = []
    for _, row in df.iterrows():
        if max_known is not None and len(correct_answers) >= max_known:
            break
        if is_known(row['name'], row['birthDate']):
            correct_answers.append(row)

    return pd.DataFrame(correct_answers, columns=df.columns)

In [21]:
# Keep the sampled people for whom preprocess()'s birthdate check passed.
known_df = preprocess(person_birthdate_df)
# Bare last expression so the notebook renders the filtered table.
known_df

Person: Roy Rogers 	|	 Birth Date: 1911-11-05 	|	 Response: 1911-11-05
Person: William H Ginsburg 	|	 Birth Date: 1943-03-25 	|	 Response: Unknown
Person: Tubby Smith 	|	 Birth Date: 1951-06-30 	|	 Response: 1951-06-30
Person: Jorge Posada 	|	 Birth Date: 1971-08-17 	|	 Response: 1971-08-17
Person: David Cay Johnston 	|	 Birth Date: 1948-12-24 	|	 Response: 1948-01-24
Person: Eileen Atkins 	|	 Birth Date: 1934-06-16 	|	 Response: 1934-06-16
Person: Jim Furyk 	|	 Birth Date: 1970-05-12 	|	 Response: 1970-05-12
Person: D J R Bruckner 	|	 Birth Date: 1933-11-26 	|	 Response: Unknown
Person: Jay Garner 	|	 Birth Date: 1938-04-15 	|	 Response: Unknown
Person: Ethan Hawke 	|	 Birth Date: 1970-11-06 	|	 Response: 1970-11-06
Person: Bill Parcells 	|	 Birth Date: 1941-08-22 	|	 Response: 1941-08-22
Person: George F Will 	|	 Birth Date: 1941-05-04 	|	 Response: 1941-05-04
Person: Ronald Steel 	|	 Birth Date: 1931-03-25 	|	 Response: unknown
Person: Etienne Tshisekedi 	|	 Birth Date: 1932-12-14 	

Unnamed: 0,firstname,lastname,birthDate,name
0,Roy,Rogers,1911-11-05,Roy Rogers
2,Tubby,Smith,1951-06-30,Tubby Smith
3,Jorge,Posada,1971-08-17,Jorge Posada
6,Eileen,Atkins,1934-06-16,Eileen Atkins
7,Jim,Furyk,1970-05-12,Jim Furyk
10,Ethan,Hawke,1970-11-06,Ethan Hawke
11,Bill,Parcells,1941-08-22,Bill Parcells
12,George F,Will,1941-05-04,George F Will
14,Etienne,Tshisekedi,1932-12-14,Etienne Tshisekedi
15,Charlie,Parker,1920-08-29,Charlie Parker


In [23]:
# divide into 2 parts for analogy testing
# The shuffle is seeded (arbitrary seed 42) so the df1/df2 pairing, and with it
# every analogy prompt, is reproducible. The rename is chained instead of
# applied in place so the cell builds its result in a single expression.
known_df = (
    known_df
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
    .rename(columns={'name': 'Entity', 'birthDate': 'Target'})
)
n = len(known_df)
# df1 takes the first floor(n/2) rows; df2 takes the remaining rows.
df1 = known_df.head(n//2)
df2 = known_df.tail(n-n//2)

In [35]:
# Pair row i of df1 with row i of df2; replies are scored with are_equivalent.
result = birthdate_analogy_testing(df1, df2, are_equivalent)

Prompt: Tubby Smith:(1951-06-30)::Jim Furyk:X; Estimate the date X? 	|	 Answer: 1970-05-12 	|	 Response: X: 1970-05-12 	|	 Status: True
Prompt: Eileen Atkins:(1934-06-16)::Bill Parcells:X; Estimate the date X? 	|	 Answer: 1941-08-22 	|	 Response: X would be 1941-08-22. 	|	 Status: True
Prompt: Willie Nelson:(1933-04-29)::Shaun Ellis:X; Estimate the date X? 	|	 Answer: 1977-06-24 	|	 Response: Estimate the date X: 1970-01-01 	|	 Status: False
Prompt: Ethan Hawke:(1970-11-06)::Adam Sandler:X; Estimate the date X? 	|	 Answer: 1966-09-09 	|	 Response: Adam Sandler's birthdate is September 9th, 1966. (1966-09-09) 	|	 Status: True
Prompt: Tilda Swinton:(1960-11-05)::Ringo Starr:X; Estimate the date X? 	|	 Answer: 1940-07-07 	|	 Response: Ringo Starr's birthdate is July 7th, 1940. 	|	 Status: False
Prompt: Charles Kuralt:(1934-09-10)::Roy Rogers:X; Estimate the date X? 	|	 Answer: 1911-11-05 	|	 Response: Roy Rogers was born on November 5, 1911. Therefore, X would be estimated to be 1911-11-0

In [36]:
# Tabulate the per-prompt records with a fixed column order, write them to the
# results folder, then display the table.
# NOTE(review): the file name contains a colon, spaces and parentheses, which
# some filesystems or shells handle poorly -- confirm the target platform.
result_df = pd.DataFrame(result, columns=['Prompt', 'Answer', 'GPT-reply', 'Status'])
result_df.to_csv(path_or_buf='../results/result: person-birthdate (analogy+date+own knowledge).csv')
result_df

Unnamed: 0,Prompt,Answer,GPT-reply,Status
0,Tubby Smith:(1951-06-30)::Jim Furyk:X; Estimat...,1970-05-12,X: 1970-05-12,True
1,Eileen Atkins:(1934-06-16)::Bill Parcells:X; E...,1941-08-22,X would be 1941-08-22.,True
2,Willie Nelson:(1933-04-29)::Shaun Ellis:X; Est...,1977-06-24,Estimate the date X: 1970-01-01,False
3,Ethan Hawke:(1970-11-06)::Adam Sandler:X; Esti...,1966-09-09,"Adam Sandler's birthdate is September 9th, 196...",True
4,Tilda Swinton:(1960-11-05)::Ringo Starr:X; Est...,1940-07-07,"Ringo Starr's birthdate is July 7th, 1940.",False
5,Charles Kuralt:(1934-09-10)::Roy Rogers:X; Est...,1911-11-05,"Roy Rogers was born on November 5, 1911. There...",True
6,Etienne Tshisekedi:(1932-12-14)::Jorge Posada:...,1971-08-17,Jorge Posada was born on 1971-08-17.,True
7,George F Will:(1941-05-04)::Kate Hudson:X; Est...,1979-04-19,Based on the information that George F. Will w...,False
8,Charlie Parker:(1920-08-29)::Elliott Abrams:X;...,1948-01-24,"Elliott Abrams was born on 1948-01-24, therefo...",True
9,Tim Wakefield:(1966-08-02)::Norman Lear:X; Est...,1922-07-27,"Norman Lear was born on July 27, 1922.",False
