In [1]:
from fp_dataset_artifacts.utils import init_openai

init_openai()

In [2]:
from datasets import list_datasets, load_dataset, list_metrics, load_metric

data = load_dataset('snli')
data

Reusing dataset snli (/home/x/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    test: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 10000
    })
    train: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 550152
    })
    validation: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 10000
    })
})

In [3]:
import openai

In [4]:
# Reference: https://beta.openai.com/docs/guides/classifications

In [12]:
"""
Create the train.jsonl file

{"text": "good film, but very glum.", "label": "Positive", "metadata": {"source":"example.com"}}
{"text": "i sympathize with the plight of these families, but the movie doesn't do a very good job conveying the issue at hand.", "label": "Negative", "metadata": {"source":"example.com"}}

... (more rows)
"""

def build_jsonl(data, path='snli_train.jsonl'):
    """Write every item of ``data`` to ``path`` as one JSON object per line.

    Args:
        data: Iterable of JSON-serialisable records.
        path: Destination file. Defaults to the file name this cell
            originally hard-coded; an existing file is overwritten.
    """
    import json

    # json.dumps escapes non-ASCII characters by default, so pinning utf-8
    # does not change the bytes written; it only removes the dependency on
    # the platform's default encoding.
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(json.dumps(x) + '\n' for x in data)

build_jsonl(data['train'])

In [13]:
# Upload the examples
# Open the file in a context manager so the handle is closed as soon as the
# upload call returns or raises (a bare open() here was never closed).
with open("snli_train.jsonl") as train_file:
    upload_response = openai.File.create(file=train_file, purpose="classifications")
upload_response

InvalidRequestError: Expected file to have the JSONL format with 'text' key and (optional) 'metadata' key.

In [21]:
# The Classifications endpoint expects each JSONL row to look like {'text': ..., 'label': ..., 'metadata': {...}}
int2label = data['train'].features['label'].int2str

def format_jsonl(x):
    """Reshape one SNLI row into the record layout used for the upload file.

    The hypothesis becomes ``text``, the premise is kept under ``metadata``,
    and the integer label is converted to its string name via ``int2label``.
    """
    record = {'text': x['hypothesis']}
    record['metadata'] = {'premise': x['premise']}
    record['label'] = int2label(x['label'])
    return record
    
format_jsonl(data['train'][0])

{'text': 'A person is training his horse for a competition.',
 'metadata': {'premise': 'A person on a horse jumps over a broken down airplane.'},
 'label': 'neutral'}

In [23]:
# Build the examples with the correct formatting. Only the hypothesis is used as the text, since models
# are known to perform fairly well on SNLI without the premise because of dataset artifacts.
def build_jsonl(data, path='snli_train.jsonl'):
    """Write the labelled rows of ``data`` to ``path`` as classification JSONL.

    Rows whose ``label`` is negative are skipped (presumably the dataset's
    marker for examples without a gold label -- confirm). Every other row is
    converted with ``format_jsonl`` and written as one JSON object per line.

    Args:
        data: Iterable of rows exposing ``label``, ``hypothesis`` and
            ``premise`` keys.
        path: Destination file. Defaults to the file name this cell
            originally hard-coded; an existing file is overwritten.
    """
    import json

    # json.dumps escapes non-ASCII characters by default, so pinning utf-8
    # only removes the dependency on the platform's default encoding.
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(
            json.dumps(format_jsonl(x)) + '\n'
            for x in data
            if x['label'] >= 0
        )

build_jsonl(data['train'])

In [24]:
# Open the file in a context manager so the handle is closed as soon as the
# upload call returns or raises (a bare open() here was never closed).
with open("snli_train.jsonl") as train_file:
    upload_response = openai.File.create(file=train_file, purpose="classifications")
upload_response

<File file id=file-PbSzGW2gSR0pOTYlfiDsgIrf at 0x7f32be57eea0> JSON: {
  "bytes": 92713950,
  "created_at": 1638407457,
  "filename": "snli_train.jsonl",
  "id": "file-PbSzGW2gSR0pOTYlfiDsgIrf",
  "object": "file",
  "purpose": "classifications",
  "status": "uploaded",
  "status_details": null
}

In [25]:
# Let's try with a test set
data['test'][0]

{'premise': 'This church choir sings to the masses as they sing joyous songs from the book at a church.',
 'hypothesis': 'The church has cracks in the ceiling.',
 'label': 1}

In [27]:
# Query the uploaded examples file with the hypothesis of the first test row.
# The `file` value is the id returned by the File.create upload.
openai.Classification.create(
    file="file-PbSzGW2gSR0pOTYlfiDsgIrf",
    query="The church has cracks in the ceiling.",
    search_model="ada", 
    model="curie", 
    max_examples=3
)

InvalidRequestError: File is still processing.  Check back later.

In [28]:
# It seems to take some time for the uploaded file to be processed before it can be queried

In [30]:
# Same request as before, repeated after waiting for the uploaded file to
# finish processing.
openai.Classification.create(
    file="file-PbSzGW2gSR0pOTYlfiDsgIrf",
    query="The church has cracks in the ceiling.",
    search_model="ada", 
    model="curie", 
    max_examples=3
)

<OpenAIObject classification at 0x7f32be568720> JSON: {
  "completion": "cmpl-4AUy9WeoDihb7mRGCFiuWhyV93F0i",
  "file": "file-PbSzGW2gSR0pOTYlfiDsgIrf",
  "label": "Entailment",
  "model": "curie:2020-05-03",
  "object": "classification",
  "search_model": "ada:2020-05-03",
  "selected_examples": [
    {
      "document": 1,
      "label": "Entailment",
      "object": "search_result",
      "score": 77.784,
      "text": "Man paints a ceiling."
    },
    {
      "document": 0,
      "label": "Entailment",
      "object": "search_result",
      "score": 58.753,
      "text": "The market has a ceiling."
    },
    {
      "document": 2,
      "label": "Entailment",
      "object": "search_result",
      "score": 78.084,
      "text": "This is a ceiling in a cafeteria."
    }
  ],
    {
      "code": "wrong_labels",
      "message": "Two class labels should be provided at minimum. Only 1 label is found."
    }
  ]
}

In [33]:
# The classification model predicts entailment even though it should be neutral
int2label(1)

'neutral'

In [44]:
# Let's gather some stats on 100 test data points
test = data['test'][:100]
texts = test['hypothesis']
# Negative label ids are mapped to 'neutral' instead of being looked up.
# NOTE(review): this counts such rows as gold 'neutral' in the accuracy below -- confirm intended.
labels = [
    int2label(label_id) if label_id >= 0 else 'neutral'
    for label_id in test['label']
]

In [45]:
# TODO: Use yield instead of return
def make_classification(
    x,
    file_id="file-PbSzGW2gSR0pOTYlfiDsgIrf",
    search_model="ada",
    model="curie",
    max_examples=3,
):
    """Send one classification request for the query text ``x``.

    Args:
        x: Query text to classify.
        file_id: Id of the uploaded examples file, forwarded as ``file``.
            Defaults to the id this function originally hard-coded.
        search_model: Forwarded unchanged as ``search_model``.
        model: Forwarded unchanged as ``model``.
        max_examples: Forwarded unchanged as ``max_examples``.

    Returns:
        Whatever ``openai.Classification.create`` returns for the request.
    """
    return openai.Classification.create(
        file=file_id,
        query=x,
        search_model=search_model,
        model=model,
        max_examples=max_examples,
    )


# One request per hypothesis, issued sequentially. Appending inside the loop
# keeps whatever was collected so far if a request raises part-way through,
# and results[i] lines up with texts[i].
results = []

for text in texts:
    results.append(make_classification(text))

In [48]:
# Keep only the predicted label from each classification response.
outputs = [result['label'] for result in results]
outputs

['Entailment',
 'Contradiction',
 'Entailment',
 'Entailment',
 'Neutral',
 'Contradiction',
 'Neutral',
 'Neutral',
 'Entailment',
 'Contradiction',
 'Neutral',
 'Contradiction',
 'Contradiction',
 'Contradiction',
 'Contradiction',
 'Contradiction',
 'Neutral',
 'Neutral',
 'Contradiction',
 'Contradiction',
 'Contradiction',
 'Entailment',
 'Entailment',
 'Contradiction',
 'Neutral',
 'Contradiction',
 'Entailment',
 'Entailment',
 'Neutral',
 'Contradiction',
 'Neutral',
 'Contradiction',
 'Contradiction',
 'Contradiction',
 'Contradiction',
 'Contradiction',
 'Entailment',
 'Contradiction',
 'Contradiction',
 'Contradiction',
 'Neutral',
 'Entailment',
 'Contradiction',
 'Contradiction',
 'Contradiction',
 'Contradiction',
 'Neutral',
 'Contradiction',
 'Entailment',
 'Neutral',
 'Contradiction',
 'Entailment',
 'Contradiction',
 'Neutral',
 'Contradiction',
 'Contradiction',
 'Contradiction',
 'Entailment',
 'Contradiction',
 'Contradiction',
 'Contradiction',
 'Contradiction',
 

In [53]:
# The API returns capitalised labels (e.g. 'Entailment') while int2label
# yields lowercase names (e.g. 'neutral'), hence capitalize() before comparing.
# `accuracy` is a list of per-example booleans; the sum is the number of
# correct predictions.
accuracy = [label.capitalize() == output for label, output in zip(labels, outputs)]
sum(accuracy)

44

In [54]:
# 44% is better than the ~33% chance level for three classes, but it's very low compared to fine-tuned BERT-based models.
# It's actually surprising that it reaches 44% at all based on the hypothesis alone.

In [55]:
# Save the outputs

def build_jsonl(data, path='snli_preds.jsonl'):
    """Write every item of ``data`` to ``path`` as one JSON object per line.

    Args:
        data: Iterable of JSON-serialisable records (here, the collected
            classification responses).
        path: Destination file. Defaults to the file name this cell
            originally hard-coded; an existing file is overwritten.
    """
    import json

    # json.dumps escapes non-ASCII characters by default, so pinning utf-8
    # only removes the dependency on the platform's default encoding.
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(json.dumps(x) + '\n' for x in data)
            
build_jsonl(results)