# Libraries

In [48]:
# using conda library 'ontogpt_fork'; naming irrelevant as this was previously going to be used to work on a different project.

import pandas as pd
import re
import numpy as np
import os
import random
from tqdm import tqdm # progress bar tracking

import json


pd.set_option('display.max_rows', None)
pd.set_option('display.width', 50)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 50)

# pre-processing pipeline
import pprint
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# NER & RE
import spacy

from spacy.tokens import DocBin
from spacy.training import Example
from spacy.util import minibatch

from itertools import combinations

import torch
from transformers import BertTokenizer, BertModel, BertForSequenceClassification

import torch
import torch.nn as nn
import torch.nn.functional as F

import json
import jsonschema
from jsonschema import validate

# prompts for OpenAI

import openai_secret_manager
import openai

import tiktoken
from typing import List, Tuple

from datasets import load_dataset


# Import & explore dataset

In [2]:
aylien = pd.read_pickle('./datasets/Aylien_68628.pickle')

In [3]:
# rename id col.
aylien.rename(columns={'id': 'article_id'}, inplace=True)

In [4]:
print(len(aylien))
aylien.info()

68628
<class 'pandas.core.frame.DataFrame'>
Int64Index: 68628 entries, 0 to 68627
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype                  
---  ------        --------------  -----                  
 0   article_id    68628 non-null  int64                  
 1   title         68628 non-null  object                 
 2   published_at  68628 non-null  datetime64[ns, tzutc()]
 3   source        68628 non-null  object                 
 4   body          68628 non-null  object                 
dtypes: datetime64[ns, tzutc()](1), int64(1), object(3)
memory usage: 3.1+ MB


In [5]:
aylien.head(3)

Unnamed: 0,article_id,title,published_at,source,body
0,5594565918,"Alcohol Is Creating, Not Fixing, Your Anxiety",2023-04-30 04:56:45+00:00,Medium,Opponent process theory and the self-regulatin...
1,5594560310,Petrol Subsidy Removal: Buhari hands over toug...,2023-04-30 04:55:40+00:00,Latest Nigerian News,"Nigeria oil resources, especially petrol, seem..."
2,5594561296,Ria Atayde on why losing weight is difficult f...,2023-04-30 04:53:55+00:00,Vietnam Explorer News Channel,Ria Atayde clapped back at body shamers as she...


In [6]:
# Isolate 100 articles to work with:

# Set the seed for reproducibility
seed_value = 2354

# Set the seed
aylien_1000 = aylien.sample(n=1000, random_state=seed_value)

print(len(aylien_1000))
aylien_1000.head()

1000


Unnamed: 0,article_id,title,published_at,source,body
29776,5537090048,Tonix Pharmaceuticals Fast Forwards Its Fibrom...,2023-04-13 16:14:55+00:00,Benzinga,by\n\nTonix Pharmaceuticals Holding Corp TNXP ...
23329,5550152910,Here's why you feel breathless when you have l...,2023-04-17 17:51:26+00:00,AOL UK,People who suffer from long COVID tend to feel...
2745,5587977912,Doctor Warns That Oral Sex Is Causing An Epide...,2023-04-28 04:20:23+00:00,The Hollywood Unlocked,Doctors say oral sex is a factor in the rise o...
4143,5585965760,How an app that tracks your coughs could save ...,2023-04-27 15:20:13+00:00,Digital Trend,“Our feeling is that cough tracking is for eve...
42658,5488961237,'Lessons not learned' over Aberdeen hospital p...,2023-04-06 13:27:43+00:00,BBC UK,We probably give little thought to how taps or...


# Load CoNLL-2003 dataset to compare NER (benchmark)
* CoNLL-2003 is an annotated dataset used to assess NER.

* load_dataset produces datafile split into train, test, validation.
* accessible by subsetting via conll2003["train"], for example, and converting to pandas df.
* furthermore, re ner_tags:
In the Hugging Face datasets library's CoNLL-2003 dataset, the integer values in the "ner_tags" column correspond to the following named entity tags:


    * 0: "O" (Outside) - Indicates that a token does not belong to any named entity.
    * 1: "B-PER" (Beginning of a Person entity) - Marks the beginning of a person's name.
    * 2: "I-PER" (Inside of a Person entity) - Marks the continuation of a person's name.
    * 3: "B-ORG" (Beginning of an Organization entity) - Marks the beginning of an organization's name.
    * 4: "I-ORG" (Inside of an Organization entity) - Marks the continuation of an organization's name.
    * 5: "B-LOC" (Beginning of a Location entity) - Marks the beginning of a location name.
    * 6: "I-LOC" (Inside of a Location entity) - Marks the continuation of a location name.
    * 7: "B-MISC" (Beginning of a Miscellaneous entity) - Marks the beginning of a miscellaneous entity.
    * 8: "I-MISC" (Inside of a Miscellaneous entity) - Marks the continuation of a miscellaneous entity.

In [7]:
conll2003 = load_dataset("conll2003")

Found cached dataset conll2003 (C:/Users/David/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98)


  0%|          | 0/3 [00:00<?, ?it/s]

In [8]:
train_subset = conll2003["train"].select(range(len(conll2003["train"])))
conll2003_train = train_subset.to_pandas()
print(len(conll2003_train))
# 14,041 records.

14041


In [9]:
test_subset = conll2003["test"].select(range(len(conll2003["test"])))
conll2003_test = test_subset.to_pandas()
print(len(conll2003_test))
# 3453 records.

3453


In [10]:
# Rename the 'id' column to 'sent_id'
conll2003_test = conll2003_test.rename(columns={'id': 'sent_id'})

conll2003_test.head()

Unnamed: 0,sent_id,tokens,pos_tags,chunk_tags,ner_tags
0,0,"[SOCCER, -, JAPAN, GET, LUCKY, WIN, ,, CHINA, ...","[21, 8, 22, 37, 22, 22, 6, 22, 15, 12, 21, 7]","[11, 0, 11, 21, 11, 12, 0, 11, 13, 11, 12, 0]","[0, 0, 5, 0, 0, 0, 0, 1, 0, 0, 0, 0]"
1,1,"[Nadim, Ladki]","[22, 22]","[11, 12]","[1, 2]"
2,2,"[AL-AIN, ,, United, Arab, Emirates, 1996-12-06]","[22, 6, 22, 22, 23, 11]","[11, 0, 11, 12, 12, 12]","[5, 0, 5, 6, 6, 0]"
3,3,"[Japan, began, the, defence, of, their, Asian,...","[22, 38, 12, 21, 15, 29, 16, 22, 21, 15, 12, 1...","[11, 21, 11, 12, 13, 11, 12, 12, 12, 13, 11, 1...","[5, 0, 0, 0, 0, 0, 7, 8, 0, 0, 0, 0, 0, 0, 0, ..."
4,4,"[But, China, saw, their, luck, desert, them, i...","[10, 22, 38, 29, 21, 37, 28, 15, 12, 21, 21, 1...","[0, 11, 21, 11, 12, 21, 11, 13, 11, 12, 12, 13...","[0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


### Re-map list of ints in ner_tags to the actual tags.

In [11]:
# Create string of input data from tokens:

# Convert comma-split list to a single string
conll2003_test['input_text'] = conll2003_test['tokens'].apply(lambda x: ' '.join(x))

In [12]:
conll2003_test.head()

Unnamed: 0,sent_id,tokens,pos_tags,chunk_tags,ner_tags,input_text
0,0,"[SOCCER, -, JAPAN, GET, LUCKY, WIN, ,, CHINA, ...","[21, 8, 22, 37, 22, 22, 6, 22, 15, 12, 21, 7]","[11, 0, 11, 21, 11, 12, 0, 11, 13, 11, 12, 0]","[0, 0, 5, 0, 0, 0, 0, 1, 0, 0, 0, 0]","SOCCER - JAPAN GET LUCKY WIN , CHINA IN SURPRI..."
1,1,"[Nadim, Ladki]","[22, 22]","[11, 12]","[1, 2]",Nadim Ladki
2,2,"[AL-AIN, ,, United, Arab, Emirates, 1996-12-06]","[22, 6, 22, 22, 23, 11]","[11, 0, 11, 12, 12, 12]","[5, 0, 5, 6, 6, 0]","AL-AIN , United Arab Emirates 1996-12-06"
3,3,"[Japan, began, the, defence, of, their, Asian,...","[22, 38, 12, 21, 15, 29, 16, 22, 21, 15, 12, 1...","[11, 21, 11, 12, 13, 11, 12, 12, 12, 13, 11, 1...","[5, 0, 0, 0, 0, 0, 7, 8, 0, 0, 0, 0, 0, 0, 0, ...",Japan began the defence of their Asian Cup tit...
4,4,"[But, China, saw, their, luck, desert, them, i...","[10, 22, 38, 29, 21, 37, 28, 15, 12, 21, 21, 1...","[0, 11, 21, 11, 12, 21, 11, 13, 11, 12, 12, 13...","[0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",But China saw their luck desert them in the se...


In [13]:
# Define the mapping dictionary
tag_mapping = {
    0: "O",
    1: "B-PER",
    2: "I-PER",
    3: "B-ORG",
    4: "I-ORG",
    5: "B-LOC",
    6: "I-LOC",
    7: "B-MISC",
    8: "I-MISC"
}

# Map the integers to the corresponding tags
conll2003_test['mapped_ner_tags'] = conll2003_test['ner_tags'].map(lambda tags: [tag_mapping[tag] for tag in tags])

In [14]:
conll2003_test.head()

Unnamed: 0,sent_id,tokens,pos_tags,chunk_tags,ner_tags,input_text,mapped_ner_tags
0,0,"[SOCCER, -, JAPAN, GET, LUCKY, WIN, ,, CHINA, ...","[21, 8, 22, 37, 22, 22, 6, 22, 15, 12, 21, 7]","[11, 0, 11, 21, 11, 12, 0, 11, 13, 11, 12, 0]","[0, 0, 5, 0, 0, 0, 0, 1, 0, 0, 0, 0]","SOCCER - JAPAN GET LUCKY WIN , CHINA IN SURPRI...","[O, O, B-LOC, O, O, O, O, B-PER, O, O, O, O]"
1,1,"[Nadim, Ladki]","[22, 22]","[11, 12]","[1, 2]",Nadim Ladki,"[B-PER, I-PER]"
2,2,"[AL-AIN, ,, United, Arab, Emirates, 1996-12-06]","[22, 6, 22, 22, 23, 11]","[11, 0, 11, 12, 12, 12]","[5, 0, 5, 6, 6, 0]","AL-AIN , United Arab Emirates 1996-12-06","[B-LOC, O, B-LOC, I-LOC, I-LOC, O]"
3,3,"[Japan, began, the, defence, of, their, Asian,...","[22, 38, 12, 21, 15, 29, 16, 22, 21, 15, 12, 1...","[11, 21, 11, 12, 13, 11, 12, 12, 12, 13, 11, 1...","[5, 0, 0, 0, 0, 0, 7, 8, 0, 0, 0, 0, 0, 0, 0, ...",Japan began the defence of their Asian Cup tit...,"[B-LOC, O, O, O, O, O, B-MISC, I-MISC, O, O, O..."
4,4,"[But, China, saw, their, luck, desert, them, i...","[10, 22, 38, 29, 21, 37, 28, 15, 12, 21, 21, 1...","[0, 11, 21, 11, 12, 21, 11, 13, 11, 12, 12, 13...","[0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",But China saw their luck desert them in the se...,"[O, B-LOC, O, O, O, O, O, O, O, O, O, O, O, O,..."


In [15]:
# create new col being comma-separated list of any detected orgs.
def extract_organizations(row):
    tokens = row['tokens']
    tags = row['mapped_ner_tags']
    orgs = []
    org = ''
    
    for i, tag in enumerate(tags):
        if tag == 'B-ORG':
            org = tokens[i]
        elif tag == 'I-ORG':
            if org:
                org += ' ' + tokens[i]
        else:
            if org:
                orgs.append(org)
                org = ''
    
    if org:
        orgs.append(org)
    
    return ', '.join(orgs)

# Apply the function to create the 'orgs' column
conll2003_test['orgs'] = conll2003_test.apply(extract_organizations, axis=1)

In [16]:
conll2003_test.head()

Unnamed: 0,sent_id,tokens,pos_tags,chunk_tags,ner_tags,input_text,mapped_ner_tags,orgs
0,0,"[SOCCER, -, JAPAN, GET, LUCKY, WIN, ,, CHINA, ...","[21, 8, 22, 37, 22, 22, 6, 22, 15, 12, 21, 7]","[11, 0, 11, 21, 11, 12, 0, 11, 13, 11, 12, 0]","[0, 0, 5, 0, 0, 0, 0, 1, 0, 0, 0, 0]","SOCCER - JAPAN GET LUCKY WIN , CHINA IN SURPRI...","[O, O, B-LOC, O, O, O, O, B-PER, O, O, O, O]",
1,1,"[Nadim, Ladki]","[22, 22]","[11, 12]","[1, 2]",Nadim Ladki,"[B-PER, I-PER]",
2,2,"[AL-AIN, ,, United, Arab, Emirates, 1996-12-06]","[22, 6, 22, 22, 23, 11]","[11, 0, 11, 12, 12, 12]","[5, 0, 5, 6, 6, 0]","AL-AIN , United Arab Emirates 1996-12-06","[B-LOC, O, B-LOC, I-LOC, I-LOC, O]",
3,3,"[Japan, began, the, defence, of, their, Asian,...","[22, 38, 12, 21, 15, 29, 16, 22, 21, 15, 12, 1...","[11, 21, 11, 12, 13, 11, 12, 12, 12, 13, 11, 1...","[5, 0, 0, 0, 0, 0, 7, 8, 0, 0, 0, 0, 0, 0, 0, ...",Japan began the defence of their Asian Cup tit...,"[B-LOC, O, O, O, O, O, B-MISC, I-MISC, O, O, O...",
4,4,"[But, China, saw, their, luck, desert, them, i...","[10, 22, 38, 29, 21, 37, 28, 15, 12, 21, 21, 1...","[0, 11, 21, 11, 12, 21, 11, 13, 11, 12, 12, 13...","[0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",But China saw their luck desert them in the se...,"[O, B-LOC, O, O, O, O, O, O, O, O, O, O, O, O,...",


In [17]:
# Sort the DataFrame in descending order based on the length of the list in 'tokens' column and extract the top 5 rows
top_5_rows = conll2003_test.sort_values(by='tokens', key=lambda x: x.apply(len), ascending=False)[:5]

for i in top_5_rows['tokens']:
    print(i)

['Barbarians' '-' '15' '-' 'Tim' 'Stimpson' '(' 'England' ')' ';' '14' '-'
 'Nigel' 'Walker' '(' 'Wales' ')' ',' '13' '-' 'Allan' 'Bateman' '('
 'Wales' ')' ',' '12' '-' 'Gregor' 'Townsend' '(' 'Scotland' ')' ',' '11'
 '-' 'Tony' 'Underwood' '(' 'England' ')' ';' '10' '-' 'Rob' 'Andrew' '('
 'England' ')' ',' '9' '-' 'Rob' 'Howley' '(' 'Wales' ')' ';' '8' '-'
 'Scott' 'Quinnell' '(' 'Wales' ')' ',' '7' '-' 'Neil' 'Back' '('
 'England' ')' ',' '6' '-' 'Dale' 'McIntosh' '(' 'Pontypridd' ')' ',' '5'
 '-' 'Ian' 'Jones' '(' 'New' 'Zealand' ')' ',' '4' '-' 'Craig' 'Quinnell'
 '(' 'Wales' ')' ',' '3' '-' 'Darren' 'Garforth' '(' 'Leicester' ')' ','
 '2' '-' 'Norm' 'Hewitt' '(' 'New' 'Zealand' ')' ',' '1' '-' 'Nick'
 'Popplewell' '(' 'Ireland' ')' '.']
['Australia' '-' '15' '-' 'Matthew' 'Burke' ';' '14' '-' 'Joe' 'Roff' ','
 '13' '-' 'Daniel' 'Herbert' ',' '12' '-' 'Tim' 'Horan' '(' 'captain' ')'
 ',' '11' '-' 'David' 'Campese' ';' '10' '-' 'Pat' 'Howard' ',' '9' '-'
 'Sam' 'Payne' ';' '8' '-'

In [18]:
conll2003_test_orgs = conll2003_test[conll2003_test['orgs'].str.strip().ne('')]

# Reset the index
conll2003_test_orgs = conll2003_test_orgs.reset_index(drop=True)

print(len(conll2003_test_orgs))
conll2003_test_orgs.head()

# i.e. 1229 rows in test_conll2003 contain an org (at least 1 B-ORG)

1229


Unnamed: 0,sent_id,tokens,pos_tags,chunk_tags,ner_tags,input_text,mapped_ner_tags,orgs
0,19,"[Japan, ,, co-hosts, of, the, World, Cup, in, ...","[22, 6, 42, 15, 12, 22, 22, 15, 11, 10, 38, 16...","[11, 0, 21, 13, 11, 12, 12, 13, 11, 0, 21, 11,...","[5, 0, 0, 0, 0, 7, 8, 0, 0, 0, 0, 0, 0, 0, 0, ...","Japan , co-hosts of the World Cup in 2002 and ...","[B-LOC, O, O, O, O, B-MISC, I-MISC, O, O, O, O...",FIFA
1,22,"[RUGBY, UNION, -, CUTTITTA, BACK, FOR, ITALY, ...","[22, 21, 8, 22, 22, 15, 16, 22, 22, 22, 7]","[11, 12, 0, 11, 12, 13, 11, 12, 12, 12, 0]","[3, 4, 0, 1, 0, 0, 5, 0, 0, 0, 0]",RUGBY UNION - CUTTITTA BACK FOR ITALY AFTER A ...,"[B-ORG, I-ORG, O, B-PER, O, O, B-LOC, O, O, O, O]",RUGBY UNION
2,141,"[Plymouth, 4, Exeter, 1]","[21, 11, 22, 11]","[11, 12, 12, 12]","[3, 0, 3, 0]",Plymouth 4 Exeter 1,"[B-ORG, O, B-ORG, O]","Plymouth, Exeter"
3,144,"[Dutch, forward, Reggie, Blinker, had, his, in...","[16, 16, 22, 22, 38, 29, 16, 21, 40, 15, 22, 1...","[11, 12, 12, 12, 21, 11, 12, 12, 21, 13, 11, 1...","[7, 0, 1, 2, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, ...",Dutch forward Reggie Blinker had his indefinit...,"[B-MISC, O, B-PER, I-PER, O, O, O, O, O, O, B-...","FIFA, Sheffield Wednesday, Liverpool"
4,145,"[Blinker, missed, his, club, 's, last, two, ga...","[21, 38, 29, 21, 27, 16, 11, 24, 15, 22, 38, 1...","[11, 21, 11, 12, 11, 12, 12, 12, 13, 11, 21, 1...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, ...",Blinker missed his club 's last two games afte...,"[B-PER, O, O, O, O, O, O, O, O, B-ORG, O, O, O...","FIFA, Wednesday, Udinese, Feyenoord"


In [19]:
index_sample = 4

# Print data in 'tokens' column at index 4
print(conll2003_test_orgs.at[index_sample, 'tokens'])
print('\n')
# Print data in 'orgs' column at index 4
print(conll2003_test_orgs.at[index_sample, 'mapped_ner_tags']) # 3-4 indicating org
print('\n')
# Print data in 'orgs' column at index 4
print(conll2003_test_orgs.at[index_sample, 'orgs'])

['Blinker' 'missed' 'his' 'club' "'s" 'last' 'two' 'games' 'after' 'FIFA'
 'slapped' 'a' 'worldwide' 'ban' 'on' 'him' 'for' 'appearing' 'to' 'sign'
 'contracts' 'for' 'both' 'Wednesday' 'and' 'Udinese' 'while' 'he' 'was'
 'playing' 'for' 'Feyenoord' '.']


['B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O']


FIFA, Wednesday, Udinese, Feyenoord


* Initially thought above was an error (Wednesday) however the full sentence possibly implies that it is actually an org.

In [20]:
# Find the row where 'mapped_ner_tags' contains 'I-ORG'
index_sample = conll2003_test_orgs[conll2003_test_orgs['mapped_ner_tags'
                                                      ].apply(lambda tags: 'I-ORG' in tags)
                                  ].index[1] # change index here for new example.

# Print data in 'tokens' column at the identified index
print(conll2003_test_orgs.at[index_sample, 'tokens'])
print('\n')

# Print data in 'mapped_ner_tags' column at the identified index
print(conll2003_test_orgs.at[index_sample, 'mapped_ner_tags'])
print('\n')

# Print data in 'orgs' column at the identified index
print(conll2003_test_orgs.at[index_sample, 'orgs'])

['Dutch' 'forward' 'Reggie' 'Blinker' 'had' 'his' 'indefinite'
 'suspension' 'lifted' 'by' 'FIFA' 'on' 'Friday' 'and' 'was' 'set' 'to'
 'make' 'his' 'Sheffield' 'Wednesday' 'comeback' 'against' 'Liverpool'
 'on' 'Saturday' '.']


['B-MISC', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'O', 'O', 'B-ORG', 'O', 'O', 'O']


FIFA, Sheffield Wednesday, Liverpool


* Above example confirms previous commentary.

# Run prompt generation to detect only orgs.
* Attempting to use similar prompt format as used in main KG_construction, for consistency.
* Aim is to benchmark NER.
    * Despite the main research being KGC (triple extraction) -- the triples are centres on the main organisation detected in the Aylien article.
    * Hence it makes sense to benchmark against NER, focussing on orgs.
    * The conll-2003 dataset is also based on news articles.
    
We will work from the full test dataset, also feeding in sentences that were not annotated with orgs. This way we can see if the prompt results pick up any orgs not previously detected.

# Prompt generation (reduced version)

In [21]:
print(len(conll2003_test))
conll2003_test.head()

3453


Unnamed: 0,sent_id,tokens,pos_tags,chunk_tags,ner_tags,input_text,mapped_ner_tags,orgs
0,0,"[SOCCER, -, JAPAN, GET, LUCKY, WIN, ,, CHINA, ...","[21, 8, 22, 37, 22, 22, 6, 22, 15, 12, 21, 7]","[11, 0, 11, 21, 11, 12, 0, 11, 13, 11, 12, 0]","[0, 0, 5, 0, 0, 0, 0, 1, 0, 0, 0, 0]","SOCCER - JAPAN GET LUCKY WIN , CHINA IN SURPRI...","[O, O, B-LOC, O, O, O, O, B-PER, O, O, O, O]",
1,1,"[Nadim, Ladki]","[22, 22]","[11, 12]","[1, 2]",Nadim Ladki,"[B-PER, I-PER]",
2,2,"[AL-AIN, ,, United, Arab, Emirates, 1996-12-06]","[22, 6, 22, 22, 23, 11]","[11, 0, 11, 12, 12, 12]","[5, 0, 5, 6, 6, 0]","AL-AIN , United Arab Emirates 1996-12-06","[B-LOC, O, B-LOC, I-LOC, I-LOC, O]",
3,3,"[Japan, began, the, defence, of, their, Asian,...","[22, 38, 12, 21, 15, 29, 16, 22, 21, 15, 12, 1...","[11, 21, 11, 12, 13, 11, 12, 12, 12, 13, 11, 1...","[5, 0, 0, 0, 0, 0, 7, 8, 0, 0, 0, 0, 0, 0, 0, ...",Japan began the defence of their Asian Cup tit...,"[B-LOC, O, O, O, O, O, B-MISC, I-MISC, O, O, O...",
4,4,"[But, China, saw, their, luck, desert, them, i...","[10, 22, 38, 29, 21, 37, 28, 15, 12, 21, 21, 1...","[0, 11, 21, 11, 12, 21, 11, 13, 11, 12, 12, 13...","[0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",But China saw their luck desert them in the se...,"[O, B-LOC, O, O, O, O, O, O, O, O, O, O, O, O,...",


#### Previous prompt used in main

In [22]:
# Create a function to generate the prompt based on the row values
def generate_prompt(row):

    input_text = row['input_text']
    sent_id = row['sent_id']
    
    prompt = f'''
    For any organisations detected in this article (if any), give me information as follows and nothing else.
    The article id should always be present for reference.
    Include any government organisations discussed, if they are the main organisation discussed.
    ###
    <
    article_id: {sent_id}
    orgs: <comma-separated list of any organisations discussed/NA>.
    >
    ###
    The article is: {input_text}.
    ###
    '''
    
    #return prompt.replace('\n', '')
    return prompt

# Add a new 'prompt' column by applying the generate_prompt function to each row
conll2003_test['prompt'] = conll2003_test.apply(generate_prompt, axis=1)

In [23]:
print(len(conll2003_test))
conll2003_test.head()

3453


Unnamed: 0,sent_id,tokens,pos_tags,chunk_tags,ner_tags,input_text,mapped_ner_tags,orgs,prompt
0,0,"[SOCCER, -, JAPAN, GET, LUCKY, WIN, ,, CHINA, ...","[21, 8, 22, 37, 22, 22, 6, 22, 15, 12, 21, 7]","[11, 0, 11, 21, 11, 12, 0, 11, 13, 11, 12, 0]","[0, 0, 5, 0, 0, 0, 0, 1, 0, 0, 0, 0]","SOCCER - JAPAN GET LUCKY WIN , CHINA IN SURPRI...","[O, O, B-LOC, O, O, O, O, B-PER, O, O, O, O]",,\n For any organisations detected in this a...
1,1,"[Nadim, Ladki]","[22, 22]","[11, 12]","[1, 2]",Nadim Ladki,"[B-PER, I-PER]",,\n For any organisations detected in this a...
2,2,"[AL-AIN, ,, United, Arab, Emirates, 1996-12-06]","[22, 6, 22, 22, 23, 11]","[11, 0, 11, 12, 12, 12]","[5, 0, 5, 6, 6, 0]","AL-AIN , United Arab Emirates 1996-12-06","[B-LOC, O, B-LOC, I-LOC, I-LOC, O]",,\n For any organisations detected in this a...
3,3,"[Japan, began, the, defence, of, their, Asian,...","[22, 38, 12, 21, 15, 29, 16, 22, 21, 15, 12, 1...","[11, 21, 11, 12, 13, 11, 12, 12, 12, 13, 11, 1...","[5, 0, 0, 0, 0, 0, 7, 8, 0, 0, 0, 0, 0, 0, 0, ...",Japan began the defence of their Asian Cup tit...,"[B-LOC, O, O, O, O, O, B-MISC, I-MISC, O, O, O...",,\n For any organisations detected in this a...
4,4,"[But, China, saw, their, luck, desert, them, i...","[10, 22, 38, 29, 21, 37, 28, 15, 12, 21, 21, 1...","[0, 11, 21, 11, 12, 21, 11, 13, 11, 12, 12, 13...","[0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",But China saw their luck desert them in the se...,"[O, B-LOC, O, O, O, O, O, O, O, O, O, O, O, O,...",,\n For any organisations detected in this a...


In [24]:
all_prompts = conll2003_test['prompt'].tolist()

In [25]:
# observe single prompt as example.

all_prompts[0]

'\n    For any organisations detected in this article (if any), give me information as follows and nothing else.\n    The article id should always be present for reference.\n    Include any government organisations discussed, if they are the main organisation discussed.\n    ###\n    <\n    article_id: 0\n    orgs: <comma-separated list of any organisations discussed/NA>.\n    >\n    ###\n    The article is: SOCCER - JAPAN GET LUCKY WIN , CHINA IN SURPRISE DEFEAT ..\n    ###\n    '

# Exploring prompts

## Investigate average length of tokens

* Important for pricing forecast and LLM API restrictions

### with tiktoken

In [26]:
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
#encoding = tiktoken.get_encoding("cl100k_base")

In [27]:
# fns to count strings in list of prompts:

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

def count_tokens_in_list(prompt_list: list, encoding_name: str) -> list:
    """Returns a list of integers representing the number of tokens in each string in the input list."""
    token_counts = []
    for prompt in prompt_list:
        num_tokens = num_tokens_from_string(prompt, encoding_name)
        token_counts.append(num_tokens)
    return token_counts


In [28]:
encoding_name = "cl100k_base" # used for gpt-3.5-turbo
token_counts = count_tokens_in_list(all_prompts, encoding_name)

In [29]:
min_tokens = min(token_counts)
max_tokens = max(token_counts)
total_tokens = sum(i for i in token_counts if isinstance(i, int))
average_tokens = total_tokens / len(all_prompts)

print(f"Number of tokens on the smallest prompt: {min_tokens}")
print(f"Number of tokens on the largest prompt: {max_tokens}")
print(f"Total number of tokens for all prompts: {total_tokens}")
print(f"Average number of tokens in all_prompts: {average_tokens}")

Number of tokens on the smallest prompt: 87
Number of tokens on the largest prompt: 242
Total number of tokens for all prompts: 366751
Average number of tokens in all_prompts: 106.21227917752678


In [30]:
# Estimating max response tokens if prompt works correctly (update to reflect prompt used)

#num_tokens_from_string("facing risk: yes. type of risk: thisis some text for a risk type.", "cl100k_base")

num_tokens_from_string(
    
    '''
    article_id: {article_id}
    org: <main organisation discussed/NA, org2, org, org, org>.
    '''
                       ,"cl100k_base" )

29

### Truncate tokens of long prompts

* gpt-3.5-turbo has max tokens of 4,096 tokens
* This includes prompt and response tokens combined.
* response tokens should be short due to the attempt at prompt restrictions;
    * i.e. Provide answers only in the format of <facing risk: <'yes'/'no'>. type of risk: < risk type >.> and nothing else.
* so a generous estimate of response tokens would be 100, providing gpt-3.5-turbo successfully adheres to above prompting.
* Therefore truncate prompt tokens to 3500 to be safe.

In [31]:
def truncate_prompt(prompt: str, encoding_name: str, max_tokens: int) -> str:
    """Truncates a text string to the specified number of tokens."""
    encoding = tiktoken.get_encoding(encoding_name)
    tokens = encoding.encode(prompt)[:max_tokens]
    return encoding.decode(tokens)


def count_tokens_for_truncating(prompt_list: list, encoding_name: str, max_tokens: int) -> list:
    """Returns a list of strings with a maximum of max_tokens tokens."""
    token_counts = []
    truncated_prompts = []
    for prompt in prompt_list:
        num_tokens = num_tokens_from_string(prompt, encoding_name)
        if num_tokens > max_tokens:
            truncated_prompt = truncate_prompt(prompt, encoding_name, max_tokens)
            token_counts.append(max_tokens)
        else:
            truncated_prompt = prompt
            token_counts.append(num_tokens)
        truncated_prompts.append(truncated_prompt)
    return truncated_prompts, token_counts


In [32]:
encoding_name = "cl100k_base" # used for gpt-3.5-turbo
# encoding_name = "r50k_base" # used for GPT-3 models. todo check this.
max_tokens = 3500 # Update this based on model to be used in 'Generating responses' section to correspond to token limitations.
#all_prompts = # your list of prompts here

truncated_prompts, token_counts = count_tokens_for_truncating(all_prompts, encoding_name, max_tokens)

In [33]:
# Compare output to previous token counts.

token_counts_truncated_prompts = count_tokens_in_list(truncated_prompts, encoding_name)
min_tokens_truncated_prompts = min(token_counts_truncated_prompts)
max_tokens_truncated_prompts = max(token_counts_truncated_prompts)
total_tokens_truncated_prompts = sum(i for i in token_counts_truncated_prompts if isinstance(i, int))
average_tokens_truncated_prompts = total_tokens_truncated_prompts / len(truncated_prompts)

print(f"Number of tokens on the smallest prompt: {min_tokens_truncated_prompts}")
print(f"Number of tokens on the largest prompt: {max_tokens_truncated_prompts}")
print(f"Total number of tokens for all prompts: {total_tokens_truncated_prompts}")
print(f"Average number of tokens in all_prompts: {average_tokens_truncated_prompts}")

# Compare output to previous output (prior to truncation).

Number of tokens on the smallest prompt: 87
Number of tokens on the largest prompt: 242
Total number of tokens for all prompts: 366751
Average number of tokens in all_prompts: 106.21227917752678


In [34]:
print("Total number of truncated prompts: ",len(truncated_prompts))

print(truncated_prompts[:1])

Total number of truncated prompts:  3453
['\n    For any organisations detected in this article (if any), give me information as follows and nothing else.\n    The article id should always be present for reference.\n    Include any government organisations discussed, if they are the main organisation discussed.\n    ###\n    <\n    article_id: 0\n    orgs: <comma-separated list of any organisations discussed/NA>.\n    >\n    ###\n    The article is: SOCCER - JAPAN GET LUCKY WIN , CHINA IN SURPRISE DEFEAT ..\n    ###\n    ']


# Feed prompts into LLM and populate output schema directly

In [35]:
responses = []

In [36]:
# additional code to handle possible rate limits.
import openai
import time

prompts = truncated_prompts
GPT_MODEL = "gpt-3.5-turbo"
openai.api_key = os.getenv('OPENAI_KEY')


for idx, query in enumerate(tqdm(prompts, desc="Processing prompts")):
    try:
        response = None

        while response is None:
            try:
                response = openai.ChatCompletion.create(
                    messages=[
                        {"role": "system", "content": "You answer questions in the specified format about the article I give you."},
                        {"role": "user", "content": query},
                    ],
                    model=GPT_MODEL,
                    temperature=0.5,
                    max_tokens=400, # max tokens in response.
                    n=1,
                )
            except openai.errors.TooManyRequestsError as e:
                # Sleep for the recommended duration
                time.sleep(e.response['Retry-After'])
                continue

        response_content = response["choices"][0]["message"]["content"]
        responses.append(response_content)

    except Exception as e:
        print(f"Error generating response for prompt at index {idx}: {e}")


Processing prompts:  96%|█████████▋| 3327/3453 [2:38:25<3:23:01, 96.68s/it]

Error generating response for prompt at index 3326: module 'openai' has no attribute 'errors'


Processing prompts: 100%|██████████| 3453/3453 [2:44:45<00:00,  2.86s/it]  


In [37]:
print(len(responses))

3452


In [46]:
responses[:4]

['### \n<article_id: 0\norgs: Japan, China>\n###',
 'Sorry, I am an AI language model and I do not have access to a specific article or its content unless it is provided to me. Can you please provide me with the article you want me to analyze?',
 'Sorry, I cannot provide the answer as the article content is not given.',
 '<article_id: 3\norgs: Japan, Syria\n>']

In [43]:
# export responses data to txt file:
# Specify the output file path
output_file = './benchmarking/results/benchmarking_org_ner_responses.txt'

# Write the list to the JSON file
with open(output_file, 'w') as file:
    json.dump(responses, file)

In [47]:
data = []

for response in responses:
    # Remove < and > characters and trailing /
    response = response.replace('<', '').replace('>', '').replace('/', '').replace('### ', '').replace('#', '')

    item = {}
    lines = response.split('\n')
    for line in lines:
        if ': ' in line:
            key, value = line.split(': ', 1)
            key = key.lower().strip()  # Convert key to lowercase and remove leading/trailing spaces
            if key == 'article_id':
                # Remove commas from value before converting to integer
                value = value.replace(',', '')
                item[key] = int(value)
            elif value.strip() in ["NA", "NA."]:
                item[key] = ""
            else:
                item[key] = value.strip()
    data.append(item)

# Now 'data' is a list of dictionaries with lowercase keys

# Specify the path and filename for the output JSON file
output_file = "./benchmarking/results/benchmarking_org_ner_responses_output.json"

# Write the populated_schemas list to the JSON file
with open(output_file, "w") as file:
    json.dump(data, file, indent=4)

print(f"Populated schemas saved to {output_file}")

Populated schemas saved to ./benchmarking/results/benchmarking_org_ner_responses_output.json


In [50]:
# validate the json file created above to the following schema.


# Define the schema
schema = {
    "$schema": "http://json-schema.org/draft-07/schema#",
    "type": "object",
    "properties": {
        "article_id": {
            "type": "integer",
            "description": "ID of the article"
        },
        "orgs": {
            "type": "string",
            "description": "Main organisation discussed or 'NA' if not applicable"

    },
    "required": ["article_id", "orgs"]
}
}

# Load your JSON data
with open('./benchmarking/results/benchmarking_org_ner_responses_output.json', 'r') as f:
    data = json.load(f)

# For each object in data, validate against the schema
for idx, item in enumerate(data):
    try:
        validate(instance=item, schema=schema)
    except jsonschema.exceptions.ValidationError as ve:
        print(f"Validation error for item at index {idx}: {ve}")


SchemaError: ['article_id', 'orgs'] is not of type 'object', 'boolean'

Failed validating 'type' in metaschema['properties']['properties']['additionalProperties']:
    {'$id': 'http://json-schema.org/draft-07/schema#',
     '$schema': 'http://json-schema.org/draft-07/schema#',
     'default': True,
     'definitions': {'nonNegativeInteger': {'minimum': 0,
                                            'type': 'integer'},
                     'nonNegativeIntegerDefault0': {'allOf': [{'$ref': '#/definitions/nonNegativeInteger'},
                                                              {'default': 0}]},
                     'schemaArray': {'items': {'$ref': '#'},
                                     'minItems': 1,
                                     'type': 'array'},
                     'simpleTypes': {'enum': ['array',
                                              'boolean',
                                              'integer',
                                              'null',
                                              'number',
                                              'object',
                                              'string']},
                     'stringArray': {'default': [],
                                     'items': {'type': 'string'},
                                     'type': 'array',
                                     'uniqueItems': True}},
     'properties': {'$comment': {'type': 'string'},
                    '$id': {'format': 'uri-reference', 'type': 'string'},
                    '$ref': {'format': 'uri-reference', 'type': 'string'},
                    '$schema': {'format': 'uri', 'type': 'string'},
                    'additionalItems': {'$ref': '#'},
                    'additionalProperties': {'$ref': '#'},
                    'allOf': {'$ref': '#/definitions/schemaArray'},
                    'anyOf': {'$ref': '#/definitions/schemaArray'},
                    'const': True,
                    'contains': {'$ref': '#'},
                    'contentEncoding': {'type': 'string'},
                    'contentMediaType': {'type': 'string'},
                    'default': True,
                    'definitions': {'additionalProperties': {'$ref': '#'},
                                    'default': {},
                                    'type': 'object'},
                    'dependencies': {'additionalProperties': {'anyOf': [{'$ref': '#'},
                                                                        {'$ref': '#/definitions/stringArray'}]},
                                     'type': 'object'},
                    'description': {'type': 'string'},
                    'else': {'$ref': '#'},
                    'enum': {'items': True, 'type': 'array'},
                    'examples': {'items': True, 'type': 'array'},
                    'exclusiveMaximum': {'type': 'number'},
                    'exclusiveMinimum': {'type': 'number'},
                    'format': {'type': 'string'},
                    'if': {'$ref': '#'},
                    'items': {'anyOf': [{'$ref': '#'},
                                        {'$ref': '#/definitions/schemaArray'}],
                              'default': True},
                    'maxItems': {'$ref': '#/definitions/nonNegativeInteger'},
                    'maxLength': {'$ref': '#/definitions/nonNegativeInteger'},
                    'maxProperties': {'$ref': '#/definitions/nonNegativeInteger'},
                    'maximum': {'type': 'number'},
                    'minItems': {'$ref': '#/definitions/nonNegativeIntegerDefault0'},
                    'minLength': {'$ref': '#/definitions/nonNegativeIntegerDefault0'},
                    'minProperties': {'$ref': '#/definitions/nonNegativeIntegerDefault0'},
                    'minimum': {'type': 'number'},
                    'multipleOf': {'exclusiveMinimum': 0, 'type': 'number'},
                    'not': {'$ref': '#'},
                    'oneOf': {'$ref': '#/definitions/schemaArray'},
                    'pattern': {'format': 'regex', 'type': 'string'},
                    'patternProperties': {'additionalProperties': {'$ref': '#'},
                                          'default': {},
                                          'propertyNames': {'format': 'regex'},
                                          'type': 'object'},
                    'properties': {'additionalProperties': {'$ref': '#'},
                                   'default': {},
                                   'type': 'object'},
                    'propertyNames': {'$ref': '#'},
                    'readOnly': {'default': False, 'type': 'boolean'},
                    'required': {'$ref': '#/definitions/stringArray'},
                    'then': {'$ref': '#'},
                    'title': {'type': 'string'},
                    'type': {'anyOf': [{'$ref': '#/definitions/simpleTypes'},
                                       {'items': {'$ref': '#/definitions/simpleTypes'},
                                        'minItems': 1,
                                        'type': 'array',
                                        'uniqueItems': True}]},
                    'uniqueItems': {'default': False, 'type': 'boolean'}},
     'title': 'Core schema meta-schema',
     'type': ['object', 'boolean']}

On schema['properties']['required']:
    ['article_id', 'orgs']