# In-context rule learning: can an LLM infer a simple sentence rule from labelled examples?

In [1]:
from openai import OpenAI
import nltk
from nltk.tokenize.treebank import TreebankWordDetokenizer
import random
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# OpenAI API client, constructed with the library defaults (no explicit arguments).
client = OpenAI()

# Bound method used throughout to turn a list of tokens back into one string.
_detokenizer = TreebankWordDetokenizer()
detokenize = _detokenizer.detokenize

In [3]:
# Collect every sentence (a list of tokens) from every file of the MASC tagged corpus.
files = nltk.corpus.masc_tagged.fileids()
corpus = []
for fileid in files:
    corpus.extend(nltk.corpus.masc_tagged.sents(fileid))

# Keep sentences of 5 to 10 tokens; the length is measured before punctuation is removed.
corpus = [sent for sent in corpus if 5 <= len(sent) <= 10]

# Drop every token that occurs as a substring of the punctuation string.
corpus = [[tok for tok in sent if tok not in ":.!?'[]\"\'“”"] for sent in corpus]

In [4]:
'Our filtered corpus contains {} sentences'.format(len(corpus))

'Our filtered corpus contains 8767 sentences'

Some useful functions for categorizing sentences and for quickly getting a feel for what the matching sentences look like.

In [5]:
def tags_present_in_sentence(s):
    """Return the part-of-speech tag of every token in `s`, in token order.

    `s` is a list of tokens. The whole list is tagged with `nltk.pos_tag`
    and only the tag of each (token, tag) pair is kept.
    """
    return [pair[1] for pair in nltk.pos_tag(s)]

def display_sampling(corpus, condition, k = 5, seed = None):
    """Print up to `k` distinct entries of `corpus` that satisfy `condition`.

    Parameters
    ----------
    corpus : iterable of sentences, each a list of tokens.
    condition : callable taking a sentence and returning a truthy/falsy value.
    k : maximum number of sentences to print; fewer are printed when fewer
        sentences satisfy `condition` (nothing is printed when none do).
    seed : seed for the sampling; `None` gives a different sample on each call.
    """
    matching = [s for s in corpus if condition(s)]
    # A private generator leaves the module-level `random` state untouched, and
    # sampling without replacement never prints the same corpus entry twice.
    rng = random.Random(seed)
    for s in rng.sample(matching, min(k, len(matching))):
        print(detokenize(s))
        
def num_satisfying(corpus, condition):
    """Count the sentences in `corpus` for which `condition` returns a truthy value."""
    return sum(1 for s in corpus if condition(s))

In [6]:
#conditions

def is_true(s):
    """Trivial condition: returns True for every sentence, ignoring `s`."""
    return True

def ends_in_VBD(s):
    """Ends in a verb in the past tense.

    Returns True when the last token of `s` (a list of tokens) is tagged
    `VBD` by `nltk.pos_tag`, which is run on the whole list. An empty list
    has no last token, so it returns False instead of raising IndexError.
    """
    if not s:
        # Sentences can become empty once punctuation tokens are stripped.
        return False
    return nltk.pos_tag(s)[-1][1] == 'VBD'

def contains_two(s):
    """Return True when `s` contains 'two' or 'Two' (no other casing is matched)."""
    return any(form in s for form in ('two', 'Two'))

def contains_CD(s):
    """Contains a numerical reference, i.e. at least one token of `s` is tagged `CD`."""
    return any(tag == "CD" for tag in tags_present_in_sentence(s))

def contains_sparrow(s):
    """Return True when `s` contains 'SPARROW', 'Sparrow' or 'sparrow'."""
    return any(form in s for form in ('SPARROW', 'Sparrow', 'sparrow'))

def contains_ship(s):
    """Return True when `s` contains 'ship' or 'Ship' (no other casing is matched)."""
    return any(form in s for form in ('ship', 'Ship'))

def contains_president(s):
    """Return True when `s` contains 'President' or 'president'."""
    return any(form in s for form in ('President', 'president'))

In [7]:
num_satisfying(corpus, contains_president)

27

In [8]:
display_sampling(corpus, contains_president, k = 10)

I supported the president
Vice President Gore, on the environment
Thank you, Mr. President
Waking up with a new president
Ralph Price 1st Vice President Cash Management Operations
Waking up with a new president
Waking up with a new president
I supported the president
Waking up with a new president
Mr. President, new question, two minutes


In [9]:
# Conditions that each become one boolean column of the dataset.
conditions = [
    ends_in_VBD,
    contains_two,
    contains_CD,
    contains_sparrow,
    contains_ship,
    contains_president,
]

The above is helpful for playing around and getting a sense of what our sentences look like. Now we build a DataFrame containing the sentences and the result of each condition.

In [10]:
def add_condition_column(df, cond):
    """Add to `df`, in place, a column holding `cond` evaluated on each sentence.

    The column is named after the function (`cond.__name__`) and is computed
    by applying `cond` to every entry of the `sentences` column. Nothing is
    returned; `df` itself is modified.
    """
    column_name = cond.__name__
    df[column_name] = df['sentences'].apply(cond)


In [11]:
def df_from_corpus_and_conditions(corpus, conditions):
    """Build a DataFrame with a `sentences` column plus one column per condition.

    `corpus` is a list of sentences (token lists) and `conditions` an iterable
    of functions. Each function contributes a column, added through
    `add_condition_column`.
    """
    sentences = pd.Series(corpus, name = 'sentences')
    frame = pd.DataFrame(sentences)
    for condition in conditions:
        add_condition_column(frame, condition)
    return frame
    

In [15]:
df_from_corpus_and_conditions(corpus, conditions).head()

Unnamed: 0,sentences,ends_in_VBD,contains_two,contains_CD,contains_sparrow,contains_ship,contains_president
0,"[Only, the, subjects, tonight, and, the, quest...",False,False,False,False,False,False
1,"[The, format, tonight, is, that, of, a, conver...",False,False,False,False,False,False
2,"[Good, evening, ,, Governor, Bush, ,, Vice, Pr...",False,False,False,False,False,True
3,"[Governor, Bush, ,, the, first, question, goes...",False,False,False,False,False,False
4,"[I, have, ,, I, have]",False,False,False,False,False,False


Now that we have a dataset (and a suite of tools for adding to the dataset), let's focus on getting the LLM to learn the condition in context and evaluate the LLM's learning. 

In [175]:
def split_train_test(df, cond, train_k = 5, test_k = 10, seed = None):
    """Build balanced, non-overlapping train and test sets for one condition.

    Parameters
    ----------
    df : DataFrame with a boolean column named `cond.__name__`.
    cond : condition function; only its name is used, to select the column.
    train_k : number of satisfying rows to put in the train set.
    test_k : number of satisfying rows wanted in the test set. When there are
        not enough satisfying rows, all rows left after taking `train_k` are used.
    seed : random state for every random step; `None` means not reproducible.

    Returns
    -------
    (train, test) : each holds as many non-satisfying rows as satisfying ones.
        `train` lists the satisfying rows first; `test` is shuffled.

    Raises
    ------
    ValueError : when there are too few satisfying rows for `train_k`, or too
        few non-satisfying rows to match the satisfying ones.
    """
    column = cond.__name__
    satisfying = df.loc[df[column] == True]
    not_satisfying = df.loc[df[column] == False]

    try:
        pos_train, pos_test = train_test_split(
            satisfying, train_size = train_k, test_size = test_k, random_state = seed
        )
    except ValueError:
        # Not enough satisfying rows for test_k: use whatever remains after train_k.
        pos_train, pos_test = train_test_split(
            satisfying, train_size = train_k, random_state = seed
        )

    # Draw all negatives in one sample and slice it, so that no non-satisfying
    # row can land in both train and test.
    n_train, n_test = pos_train.shape[0], pos_test.shape[0]
    negatives = not_satisfying.sample(n_train + n_test, random_state = seed)
    train = pd.concat([pos_train, negatives.iloc[:n_train]])
    test = pd.concat([pos_test, negatives.iloc[n_train:]])

    # Shuffle so the test set is not a run of True followed by a run of False;
    # seeded so that a given `seed` reproduces the whole split.
    test = test.sample(frac = 1, random_state = seed)

    return train, test

In [191]:
def single_test_prompt(train, test_s, cond):
    """Build the prompt asking whether `test_s` follows the rule shown by `train`.

    The prompt lists every train sentence, quoted and followed by its
    True/False label from the `cond.__name__` column, then asks for a
    'Yes'/'No' answer about the detokenized test sentence `test_s`.
    """
    header = "Each of the following sentences is labelled True/False according to whether or not it follows a simple rule a child might guess:\n\n"

    examples = [
        '"{}"; {}\n'.format(detokenize(sentence), label)
        for sentence, label in zip(train['sentences'], train[cond.__name__])
    ]

    question = (
        "\nYour job is to learn the rule, thinking carefully about the previous examples. Does the sentence:\n\n"
        + '"' + detokenize(test_s) + '"'
        + "\n\nfollow the rule? Give a 'Yes' or 'No' answer, with no explanation."
    )
    return header + "".join(examples) + question

In [134]:
def get_response(p, model = "gpt-3.5-turbo-instruct"):
    """Send prompt `p` to the completions endpoint and return the generated text.

    Parameters
    ----------
    p : the prompt string.
    model : name of the completion model; defaults to the model this
        notebook was written against.

    Returns
    -------
    The `text` of the first choice in the response, exactly as returned
    (any leading whitespace or newlines are kept).
    """
    response = client.completions.create(model = model, prompt = p)
    return response.choices[0].text

In [135]:
def evaluate_icl(train, test, cond, verbose = True):
    """Measure how often the LLM labels the test sentences correctly.

    For each sentence in `test`, a prompt is built from `train` with
    `single_test_prompt` and sent with `get_response`. The reply counts as a
    True label when it contains "yes" in any casing, and as False otherwise.

    Parameters
    ----------
    train, test : DataFrames with a `sentences` column and a boolean column
        named `cond.__name__`.
    cond : condition function; only its name is used, to select the label column.
    verbose : when True, print each sentence the LLM labels incorrectly.

    Returns
    -------
    The fraction of test sentences labelled correctly. It is also printed,
    regardless of `verbose`.

    Raises
    ------
    ValueError : when `test` has no rows, since accuracy is then undefined.
    """
    if len(test) == 0:
        raise ValueError("test is empty, so accuracy is undefined")

    outcomes = []
    # Iterate sentences and labels together rather than looking labels up by position.
    for test_s, actual_label in zip(test['sentences'], test[cond.__name__]):
        resp = get_response(single_test_prompt(train, test_s, cond))
        llm_label = 'yes' in resp.lower()
        is_correct = bool(llm_label == actual_label)
        if verbose and not is_correct:
            print('On test sentence: '+ detokenize(test_s) + ' -- the LLM incorrectly evaluates: ' + str(llm_label))
        outcomes.append(is_correct)

    accuracy = sum(outcomes)/len(outcomes)
    print('LLM learning accuracy is: ' + str(accuracy))
    return accuracy

### Putting it all together now:

In [177]:
df = df_from_corpus_and_conditions(corpus, conditions)    

In [178]:
df.head()

Unnamed: 0,sentences,ends_in_VBD,contains_two,contains_CD,contains_sparrow,contains_ship,contains_president
0,"[Only, the, subjects, tonight, and, the, quest...",False,False,False,False,False,False
1,"[The, format, tonight, is, that, of, a, conver...",False,False,False,False,False,False
2,"[Good, evening, ,, Governor, Bush, ,, Vice, Pr...",False,False,False,False,False,True
3,"[Governor, Bush, ,, the, first, question, goes...",False,False,False,False,False,False
4,"[I, have, ,, I, have]",False,False,False,False,False,False


In [216]:
test_cond = contains_two

In [217]:
train, test = split_train_test(df, test_cond, test_k = 20)

In [218]:
p = single_test_prompt(train, test.sentences.iloc[1], test_cond)

In [219]:
print(p)

Each of the following sentences is labelled True/False according to whether or not it follows a simple rule a child might guess:

"Mr. President, new question, two minutes"; True
"Waiter Two teas"; True
"six nine four eight seven three two two six two"; True
"seven six four nine nine one two two six"; True
"The candidate is allowed two minutes to answer"; True
"Wyvern closes his eyes, and falls silent"; False
"The know-how is the propagating organization"; False
"Let Mr. Thompson speak"; False
"Compare prices before you buy any significant item"; False
"I expect your things are outside"; False

Your job is to learn the rule, thinking carefully about the previous examples. Does the sentence:

"I went into the room and sat down"

follow the rule? Give a 'Yes' or 'No' answer, with no explanation.


In [220]:
# One-off request with the prompt `p`, keeping the raw response object for inspection.
response = client.completions.create(model = "gpt-3.5-turbo-instruct", prompt = p)


In [221]:
print(response.choices[0].text)



Yes


In [222]:
evaluate_icl(train, test, test_cond)

On test sentence: I went into the room and sat down -- the LLM incorrectly evaluates: True
On test sentence: Or Cantonese, as the case may be -- the LLM incorrectly evaluates: True
On test sentence: He was being chased by two thugs -- the LLM incorrectly evaluates: False
On test sentence: uh I first trained up two -- the LLM incorrectly evaluates: False
On test sentence: Um, I didn't really have a favorite -- the LLM incorrectly evaluates: True
On test sentence: Huge difference between the two -- the LLM incorrectly evaluates: False
On test sentence: Agree with the previous two comments -- the LLM incorrectly evaluates: False
On test sentence: There are two theories to arguing with women -- the LLM incorrectly evaluates: False
On test sentence: We paid down the debt for two years -- the LLM incorrectly evaluates: False
On test sentence: six two seven four two seven three eight O _ -- the LLM incorrectly evaluates: False
On test sentence: for the two weeks that he's gone -- the LLM inco

0.65

In [296]:
evaluate_icl(train, test, contains_sparrow)

On test sentence: JACK SPARROW Good doggie -- the LLM incorrectly evaluates: False
On test sentence: JACK SPARROW Technic - -- the LLM incorrectly evaluates: False
On test sentence: JACK SPARROW What did the bird say -- the LLM incorrectly evaluates: False
On test sentence: JACK SPARROW As you were, gents -- the LLM incorrectly evaluates: False
On test sentence: JACK SPARROW Oh bugger -- the LLM incorrectly evaluates: False
On test sentence: JACK SPARROW Hey -- the LLM incorrectly evaluates: False
On test sentence: JACK SPARROW Wup -- the LLM incorrectly evaluates: False
On test sentence: JACK SPARROW Save me -- the LLM incorrectly evaluates: False
On test sentence: JACK SPARROW Pirate -- the LLM incorrectly evaluates: False
LLM learning accuracy is: 0.55


0.55

In [271]:
evaluate_icl(train, test, contains_sparrow)

On test sentence: JACK SPARROW Pirate -- the LLM incorrectly evaluatesFalse
On test sentence: JACK SPARROW You're a diamond, mate -- the LLM incorrectly evaluatesFalse


0.5

In [269]:
# List every test sentence next to its position in `test`.
for i, s in enumerate(test.sentences):
    print('{}:{}'.format(i, detokenize(s)))

0:three O _ three eight three three eight seven four
1:How's that
2:JACK SPARROW Pirate
3:JACK SPARROW You're a diamond, mate
