# In-context rule learning: can an LLM infer a language-based secret rule from labeled sentences?

In [59]:
from openai import OpenAI
import nltk
from nltk.tokenize.treebank import TreebankWordDetokenizer
import random
import pandas as pd
from sklearn.model_selection import train_test_split

In [60]:
# API client used by get_response to request completions.
# NOTE(review): no key is passed here, so credentials presumably come from the environment — confirm.
client = OpenAI()
# Bound method that turns a tokenized sentence back into a single string; used when building prompts.
detokenize = TreebankWordDetokenizer().detokenize

In [122]:
# Only the UDHR translations whose file id mentions 'Latin1' are used.
latin1_fileids = [f for f in nltk.corpus.udhr.fileids() if 'Latin1' in f]

# Tokenize each file once and keep only sentences longer than 10 tokens.
# The column name is the file id minus its last 7 characters (the '-Latin1' suffix).
long_sentences = [
    (f[:-7], [s for s in nltk.corpus.udhr.sents(f) if len(s) > 10])
    for f in latin1_fileids
]

# Truncate to the shortest *filtered* column. Counting unfiltered sentences here
# would leave NaN padding in the shorter columns after the concat, and those NaNs
# could later be sampled as if they were sentences.
min_num_sentences = min(len(sents) for _, sents in long_sentences)
corpus = pd.concat(
    [pd.Series(sents, name = lang) for lang, sents in long_sentences],
    axis = 1,
).head(min_num_sentences)

In [124]:
# One entry per language, i.e. per column of the corpus.
languages = pd.Series(list(corpus.columns))

Now that we have a dataset (and a suite of tools for adding to it), let's focus on getting the LLM to learn the condition in context — here, the hidden rule that a sentence is written in a given target language — and on evaluating how well it has learned it.

In [125]:
def build_training_sentences_true(corpus, lang, train_k = 3, seed = None):
    """Draw the positive ('Yes') training examples for a target language.

    Parameters
    ----------
    corpus : DataFrame with one column per language.
    lang : name of the target-language column.
    train_k : number of sentences to draw.
    seed : forwarded to pandas as ``random_state``.

    Returns
    -------
    Series of ``train_k`` entries from ``corpus[lang]``; the sampled rows keep
    their corpus index labels.
    """
    target_column = corpus[lang]
    return target_column.sample(n = train_k, random_state = seed)

def build_training_sentences_false(corpus, lang, train_k = 3, seed = None):
    """Draw the negative ('No') training examples: sentences from any other language.

    Parameters
    ----------
    corpus : DataFrame with one column per language.
    lang : name of the target-language column, which is excluded from the pool.
    train_k : number of sentences to draw.
    seed : forwarded to pandas as ``random_state``.

    Returns
    -------
    Series of ``train_k`` sentences with a fresh 0..train_k-1 index.

    Notes
    -----
    All examples come from a single ``sample`` call. Re-seeding a separate draw
    per example with the same ``seed`` would return the same sentence every time.
    """
    sample_space_false = corpus.loc[:, corpus.columns != lang]
    # Flatten every other-language cell into one pool; NaN padding is not a sentence.
    candidates = pd.Series(sample_space_false.to_numpy().ravel(), dtype = object).dropna()
    training_sentences_false = candidates.sample(
        train_k,
        # Distinct sentences whenever the pool is large enough; only fall back to
        # repeats when more examples are requested than exist.
        replace = train_k > len(candidates),
        random_state = seed,
    )
    return training_sentences_false.reset_index(drop = True)

In [126]:
def build_test_sentences_true(corpus, lang, training_sentences_true, test_k = 10, seed = None):
    """Draw positive test examples that were not used for training.

    Rows whose index label appears in ``training_sentences_true.index`` are
    removed from the ``lang`` column before ``test_k`` entries are sampled
    (``seed`` is forwarded to pandas as ``random_state``).

    Returns a Series that keeps the corpus index labels of the sampled rows.
    """
    used_for_training = corpus.index.isin(training_sentences_true.index)
    held_out = corpus.loc[~used_for_training, lang]
    return held_out.sample(n = test_k, random_state = seed)
    
def build_test_sentences_false(corpus, lang, training_sentences_false, test_k = 10, seed = None):
    """Draw negative test examples: other-language sentences not used for training.

    Parameters
    ----------
    corpus : DataFrame with one column per language.
    lang : name of the target-language column, which is excluded from the pool.
    training_sentences_false : Series of negative training sentences to hold out.
    test_k : number of sentences to draw.
    seed : forwarded to pandas as ``random_state``.

    Returns
    -------
    Series of ``test_k`` sentences with a fresh 0..test_k-1 index.
    """
    sample_space_false = corpus.loc[:, corpus.columns != lang]
    # Flatten every other-language cell into one pool; NaN padding is not a sentence.
    candidates = pd.Series(sample_space_false.to_numpy().ravel(), dtype = object).dropna()

    # The negative training sample is built as a new Series, so its index is just
    # 0..k-1 and says nothing about corpus rows. Hold the training sentences out
    # by comparing contents (as tuples, because token lists are unhashable).
    used_for_training = {tuple(s) for s in training_sentences_false.dropna()}
    unseen = pd.Series(
        [tuple(s) not in used_for_training for s in candidates],
        index = candidates.index,
        dtype = bool,
    )
    candidates = candidates[unseen]

    # A single sample() call, so a fixed seed still yields different sentences.
    test_sentences_false = candidates.sample(
        test_k,
        # Only allow repeats when more examples are requested than exist.
        replace = test_k > len(candidates),
        random_state = seed,
    )
    return test_sentences_false.reset_index(drop = True)
    

In [127]:
def build_train_test_data(corpus, lang, train_k = 3, test_k = 10, seed = None):
    """Build all four example sets for one target language.

    The same ``seed`` is forwarded to every builder.

    Returns
    -------
    tuple ``(train_true, train_false, test_true, test_false)``.
    """
    # Training sets first: each test builder takes its matching training sample.
    positives_train = build_training_sentences_true(corpus, lang, train_k = train_k, seed = seed)
    negatives_train = build_training_sentences_false(corpus, lang, train_k = train_k, seed = seed)

    positives_test = build_test_sentences_true(corpus, lang, positives_train, test_k = test_k, seed = seed)
    negatives_test = build_test_sentences_false(corpus, lang, negatives_train, test_k = test_k, seed = seed)

    return (positives_train, negatives_train, positives_test, negatives_test)

In [128]:
def prompt(train_sentences_true, train_sentences_false, max_chars = 50):
    """Assemble the few-shot prompt asking the model to explain the secret rule.

    Parameters
    ----------
    train_sentences_true : iterable of tokenized sentences, each listed as 'Yes'.
    train_sentences_false : iterable of tokenized sentences, each listed as 'No'.
    max_chars : each detokenized sentence is cut to this many characters
        (default 50); ``None`` keeps the full sentence.

    Returns
    -------
    str : header, one quoted line per example ('Yes' examples first), then the
    instruction to explain the rule.
    """
    header = "Each of the following sentences is labeled 'Yes' if it follows a secret rule and labeled 'No' otherwise:\n\n"
    instruction = "\nYour job is to learn the secret rule, thinking carefully about the previous examples. Explain the rule."

    def example_line(s, label):
        # Quote the (possibly truncated) sentence text and attach its label.
        return "\"" + detokenize(s)[:max_chars] + "\": " + label + ";\n"

    lines = [example_line(s, "Yes") for s in train_sentences_true]
    lines += [example_line(s, "No") for s in train_sentences_false]

    return header + "".join(lines) + instruction

In [129]:
def get_response(p, model = "gpt-3.5-turbo-instruct", max_tokens = 500):
    """Request a completion for prompt ``p`` and return its text.

    Parameters
    ----------
    p : prompt string.
    model : model name passed to the completions request.
    max_tokens : token limit passed to the completions request.

    Returns
    -------
    The ``text`` of the first choice in the response.
    """
    response = client.completions.create(
        model = model,
        prompt = p,
        max_tokens = max_tokens,
    )
    return response.choices[0].text

In [130]:
# Target language for the worked example: its sentences become the 'Yes' examples.
example_language = 'Norwegian'

In [117]:
# Use the shared helper rather than repeating its four calls by hand
# (same defaults: train_k = 3, test_k = 10, no seed).
train_true, train_false, test_true, test_false = build_train_test_data(corpus, example_language)

In [118]:
# Build the few-shot prompt from the labeled training examples.
p = prompt(train_true, train_false)

In [119]:
# print() so the prompt's line breaks are rendered rather than shown as escapes.
print(p)

Each of the following sentences is labeled 'Yes' if it follows a secret rule and labeled 'No' otherwise:

"Enhver som er anklaget for en straffbar handling h": Yes;
"Heller ikke skal det kunne idømmes strengere straf": Yes;
"VERDENSERKLÆRINGEN OM MENNESKERETTIGHETENE INNLEDN": Yes;
"Tikina e 7.": No;
"Inqaku lesi - 5 Akukho namnye oyakuphathwa gadalal": No;
"Akakchíl tsalap (art . 6).": No;

Your job is to learn the secret rule, thinking carefully about the previous examples. Explain the rule.


In [120]:
# Send the prompt to the model and keep the returned completion text.
response = get_response(p)

In [121]:
# Display the completion text returned for the prompt.
print(response)



The secret rule is that all sentences must contain a complete word or phrase followed by a single character that is a punctuation mark, either a period or a question mark. Additionally, the entire sentence must be written in all uppercase letters.


In [338]:
# Build all four example sets in one call, this time with English as the target
# language (default sizes, no seed, so the draw differs between runs).
train_true, train_false, test_true, test_false = build_train_test_data(corpus, 'English')

In [345]:
# Inspect the negative training examples drawn for English.
train_false

0     [Ca, chiquítimoi, sipuacë, ', itima, ', icën, .]
1    [Heg, jalankeun, eta, hak, -, hak, teh, boh, k...
2    [Aretikele, 6, Mongwe, le, mongwe, o, na, le, ...
dtype: object

In [370]:
# NOTE(review): accuracy_table is not assigned in any cell visible here — confirm the
# cell that computes it is part of the notebook, otherwise this fails on a fresh kernel.
accuracy_table

Unnamed: 0,Accuracy
Norwegian,0.8
NigerianPidginEnglish,0.775
Vlach,0.75
Campa_Pajonalino,0.725
Cakchiquel,0.675
Tojol-abal,0.65
Quechua,0.625
Tzotzil,0.625
Hmong_Miao-Sichuan-Guizhou-Yunnan,0.625
IrishGaelic_Gaeilge,0.625
