# Libraries

In [1]:
# using conda library 'ontogpt_fork'; naming irrelevant as this was previously going to be used to work on a different project.

import pandas as pd
import re
import numpy as np
import os
import random
from tqdm import tqdm # progress bar tracking

import json


pd.set_option('display.max_rows', None)
pd.set_option('display.width', 50)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 50)

# pre-processing pipeline
import pprint
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# NER & RE
import spacy

from spacy.tokens import DocBin
from spacy.training import Example
from spacy.util import minibatch

from itertools import combinations

import torch
from transformers import BertTokenizer, BertModel, BertForSequenceClassification

import torch
import torch.nn as nn
import torch.nn.functional as F

# prompts for OpenAI
import json
import openai_secret_manager
import openai

import tiktoken
from typing import List, Tuple

# Import & explore dataset

In [2]:
aylien = pd.read_pickle('./datasets/Aylien_68628.pickle')

In [3]:
# rename id col.
aylien.rename(columns={'id': 'article_id'}, inplace=True)

In [4]:
print(len(aylien))
aylien.info()

68628
<class 'pandas.core.frame.DataFrame'>
Int64Index: 68628 entries, 0 to 68627
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype                  
---  ------        --------------  -----                  
 0   article_id    68628 non-null  int64                  
 1   title         68628 non-null  object                 
 2   published_at  68628 non-null  datetime64[ns, tzutc()]
 3   source        68628 non-null  object                 
 4   body          68628 non-null  object                 
dtypes: datetime64[ns, tzutc()](1), int64(1), object(3)
memory usage: 3.1+ MB


In [5]:
aylien.head(3)

Unnamed: 0,article_id,title,published_at,source,body
0,5594565918,"Alcohol Is Creating, Not Fixing, Your Anxiety",2023-04-30 04:56:45+00:00,Medium,Opponent process theory and the self-regulatin...
1,5594560310,Petrol Subsidy Removal: Buhari hands over toug...,2023-04-30 04:55:40+00:00,Latest Nigerian News,"Nigeria oil resources, especially petrol, seem..."
2,5594561296,Ria Atayde on why losing weight is difficult f...,2023-04-30 04:53:55+00:00,Vietnam Explorer News Channel,Ria Atayde clapped back at body shamers as she...


In [6]:
# Isolate 100 articles to work with:

# Set the seed for reproducibility
seed_value = 52

# Set the seed
aylien_100 = aylien.sample(n=100, random_state=seed_value)

print(len(aylien_100))
aylien_100.head()

100


Unnamed: 0,article_id,title,published_at,source,body
26897,5541797339,5 warning signs of poor blood circulation,2023-04-14 20:34:46+00:00,Newsfounded,"Heart attack, stroke or kidney failure: poor b..."
60244,5451482089,Sultana's Death In Custody: She had head injur...,2023-03-28 01:22:10+00:00,The Daily Star - Bangladesh,"Sultana Jasmine, a union-level land employee i..."
7247,5580830636,MAGA-Obsessed GOP Rep. Ronny Jackson Mocked Af...,2023-04-26 09:03:19+00:00,Vietnam Explorer News Channel,"Rep. Ronny Jackson (R-Tex.), already known for..."
52028,5467485064,Helping families cope with mental illness,2023-03-31 21:46:30+00:00,WOAI News 4 Sanantonio,SAN ANTONIO - One in five U.S. adults experien...
18981,5557299621,"As states cut Medicaid, some fear they'll mist...",2023-04-19 14:08:22+00:00,Vietnam Explorer News Channel,"Tonya Moore, 49, relies on doctor visits and p..."


In [7]:
aylien_100.head()

Unnamed: 0,article_id,title,published_at,source,body
26897,5541797339,5 warning signs of poor blood circulation,2023-04-14 20:34:46+00:00,Newsfounded,"Heart attack, stroke or kidney failure: poor b..."
60244,5451482089,Sultana's Death In Custody: She had head injur...,2023-03-28 01:22:10+00:00,The Daily Star - Bangladesh,"Sultana Jasmine, a union-level land employee i..."
7247,5580830636,MAGA-Obsessed GOP Rep. Ronny Jackson Mocked Af...,2023-04-26 09:03:19+00:00,Vietnam Explorer News Channel,"Rep. Ronny Jackson (R-Tex.), already known for..."
52028,5467485064,Helping families cope with mental illness,2023-03-31 21:46:30+00:00,WOAI News 4 Sanantonio,SAN ANTONIO - One in five U.S. adults experien...
18981,5557299621,"As states cut Medicaid, some fear they'll mist...",2023-04-19 14:08:22+00:00,Vietnam Explorer News Channel,"Tonya Moore, 49, relies on doctor visits and p..."


# Prompt generation

## Investigating potential prompts

1. "give me an answer only in cypher language that i can paste into neo4j to create a graph. What is the single main event and relation described by the article regarding IIC Padmalochan Manigarahi?  The article is: Mayurbhanj: One person was killed, while two others sustained critical injuries when they were brutally attacked by a youth on suspicion of practising black magic on Sunday night.\nThe incident has been reported from Kainal village in Jualibhanga panchayat under Morada police station limits in Odisha's Mayurbhanj district.\n  The deceased has been identified as Gauri Tudu (70) and the injured as Kuni Tudu (42) and Kalicharan Tudu (47).\n  According to reports, a youth named Durga Charan Tudu suddenly attacked the three while they were sitting in their home last night. Durga Charan Tudu allegedly attacked the three with a sharp weapon, which left septuagenarian Gauri Tudu dead on the spot, while two members of the family were severely injured and dropped unconscious. He bore a grudge against the Tudu family suspecting that they practised witchcraft and were responsible for several ills in the village.\n  Soon after the gruesome incident, the three were rushed to the nearby community health center where doctors declared Gauri Tudu brought dead. Kuni Tudu and Kalicharan Tudu were shifted to the PRM Hospital in Baripada as their condition deteriorated. Currently, Kuni Tudu and Kalicharan Tudu are undergoing treatment in the hospital and their condition is said to be critical.\n  On being informed, Morada police IIC Padmalochan Manigarahi and Chitrada outpost officer Niranjan Saren reached the spot with their teams. They sent Gauri's body for post-mortem and detained Durga Charan Tudu for interrogation. An investigation into the incident is underway."

* Notes on above:
    * only sometimes produces cypher query.
    * and if it does, difficult to get in a clean plaintext format that can be piped out.
    * cypher query often not accurate, if produced.
    * response from CGPT too large/cumbersome. defeats purpose.
    

In [8]:
# Create a function to generate the prompt based on the row values
def generate_prompt(row):

    body = row['body']
    article_id = row['article_id']
    
    prompt = f'''
    For the main organisation discussed in this article (if any), give me information as follows and nothing else.
    The article id should always be present for reference.
    Include any government organisations discussed, if they are the main organisation discussed.
    ###
    <
    article_id: {article_id}
    org: <main organisation discussed/NA>.
    location: <location_of_main_org/NA>
    risk_type: <risk_type_faced_by_main_org/NA>.
    items_sold: <comma_separated_list_of_items_sold_by_main_org/NA>.
    service_provided: <comma_separated_list_of_services_provided_by_main_org/NA>.
    business_relations: <comma_separated_list_of_orgs_with_business_relation_to_main_org/NA>.
    >
    ###
    The article is: {body}.
    ###
    '''
    
    #return prompt.replace('\n', '')
    return prompt

# Add a new 'prompt' column by applying the generate_prompt function to each row
aylien_100['prompt'] = aylien_100.apply(generate_prompt, axis=1)

In [9]:
aylien_100.head()

Unnamed: 0,article_id,title,published_at,source,body,prompt
26897,5541797339,5 warning signs of poor blood circulation,2023-04-14 20:34:46+00:00,Newsfounded,"Heart attack, stroke or kidney failure: poor b...",\n For the main organisation discussed in t...
60244,5451482089,Sultana's Death In Custody: She had head injur...,2023-03-28 01:22:10+00:00,The Daily Star - Bangladesh,"Sultana Jasmine, a union-level land employee i...",\n For the main organisation discussed in t...
7247,5580830636,MAGA-Obsessed GOP Rep. Ronny Jackson Mocked Af...,2023-04-26 09:03:19+00:00,Vietnam Explorer News Channel,"Rep. Ronny Jackson (R-Tex.), already known for...",\n For the main organisation discussed in t...
52028,5467485064,Helping families cope with mental illness,2023-03-31 21:46:30+00:00,WOAI News 4 Sanantonio,SAN ANTONIO - One in five U.S. adults experien...,\n For the main organisation discussed in t...
18981,5557299621,"As states cut Medicaid, some fear they'll mist...",2023-04-19 14:08:22+00:00,Vietnam Explorer News Channel,"Tonya Moore, 49, relies on doctor visits and p...",\n For the main organisation discussed in t...


In [10]:
all_prompts = aylien_100['prompt'].tolist()

In [11]:
# observe single prompt as example.

all_prompts[0]



# Investigate average length of tokens

* Important for pricing forecast and LLM API restrictions

### with nltk

In [12]:
total_tokens = sum(len(nltk.word_tokenize(prompt)) for prompt in all_prompts)
average_tokens = total_tokens / len(all_prompts)
max_tokens = max(len(nltk.word_tokenize(prompt)) for prompt in all_prompts)
min_tokens = min(len(nltk.word_tokenize(prompt)) for prompt in all_prompts)

print(f"Number of tokens on the smallest prompt: {min_tokens}")
print(f"Number of tokens on the largest prompt: {max_tokens}")
print(f"Total number of tokens for all prompts: {total_tokens}")
print(f"Average number of tokens in all_prompts: {average_tokens}")

Number of tokens on the smallest prompt: 110
Number of tokens on the largest prompt: 21715
Total number of tokens for all prompts: 102247
Average number of tokens in all_prompts: 1022.47


### with tiktoken

In [13]:
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
#encoding = tiktoken.get_encoding("cl100k_base")

In [14]:
# fns to count strings in list of prompts:

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

def count_tokens_in_list(prompt_list: list, encoding_name: str) -> list:
    """Returns a list of integers representing the number of tokens in each string in the input list."""
    token_counts = []
    for prompt in prompt_list:
        num_tokens = num_tokens_from_string(prompt, encoding_name)
        token_counts.append(num_tokens)
    return token_counts


In [15]:
encoding_name = "cl100k_base" # used for gpt-3.5-turbo
token_counts = count_tokens_in_list(all_prompts, encoding_name)

In [16]:
min_tokens = min(token_counts)
max_tokens = max(token_counts)
total_tokens = sum(i for i in token_counts if isinstance(i, int))
average_tokens = total_tokens / len(all_prompts)

print(f"Number of tokens on the smallest prompt: {min_tokens}")
print(f"Number of tokens on the largest prompt: {max_tokens}")
print(f"Total number of tokens for all prompts: {total_tokens}")
print(f"Average number of tokens in all_prompts: {average_tokens}")

Number of tokens on the smallest prompt: 180
Number of tokens on the largest prompt: 29939
Total number of tokens for all prompts: 128837
Average number of tokens in all_prompts: 1288.37


In [17]:
# Estimating max response tokens if prompt works correctly (update to reflect prompt used)

#num_tokens_from_string("facing risk: yes. type of risk: thisis some text for a risk type.", "cl100k_base")

num_tokens_from_string(
    
    '''
    article_id: {article_id}
    org: <main organisation discussed/NA>.
    location: <location_of_main_org/NA>
    risk_type: <risk_type_faced_by_main_org/NA>.
    items_sold: <comma_separated_list_of_items_sold_by_main_org/NA>.
    service_provided: <comma_separated_list_of_services_provided_by_main_org/NA>.
    business_relations: <comma_separated_list_of_orgs_with_business_relation_to_main_org/NA>.
    '''
                       ,"cl100k_base" )

105

### Truncate tokens of long prompts

* gpt-3.5-turbo has max tokens of 4,096 tokens
* This includes prompt and response tokens combined.
* response tokens should be short due to the attempt at prompt restrictions;
    * i.e. Provide answers only in the format of <facing risk: <'yes'/'no'>. type of risk: < risk type >.> and nothing else.
* so a generous estimate of response tokens would be 100, providing gpt-3.5-turbo successfully adheres to above prompting.
* Therefore truncate prompt tokens to 3500 to be safe.

In [18]:
def truncate_prompt(prompt: str, encoding_name: str, max_tokens: int) -> str:
    """Truncates a text string to the specified number of tokens."""
    encoding = tiktoken.get_encoding(encoding_name)
    tokens = encoding.encode(prompt)[:max_tokens]
    return encoding.decode(tokens)


def count_tokens_for_truncating(prompt_list: list, encoding_name: str, max_tokens: int) -> list:
    """Returns a list of strings with a maximum of max_tokens tokens."""
    token_counts = []
    truncated_prompts = []
    for prompt in prompt_list:
        num_tokens = num_tokens_from_string(prompt, encoding_name)
        if num_tokens > max_tokens:
            truncated_prompt = truncate_prompt(prompt, encoding_name, max_tokens)
            token_counts.append(max_tokens)
        else:
            truncated_prompt = prompt
            token_counts.append(num_tokens)
        truncated_prompts.append(truncated_prompt)
    return truncated_prompts, token_counts


In [19]:
encoding_name = "cl100k_base" # used for gpt-3.5-turbo
# encoding_name = "r50k_base" # used for GPT-3 models. todo check this.
max_tokens = 3500 # Update this based on model to be used in 'Generating responses' section to correspond to token limitations.
#all_prompts = # your list of prompts here

truncated_prompts, token_counts = count_tokens_for_truncating(all_prompts, encoding_name, max_tokens)

In [20]:
# Compare output to previous token counts.

token_counts_truncated_prompts = count_tokens_in_list(truncated_prompts, encoding_name)
min_tokens_truncated_prompts = min(token_counts_truncated_prompts)
max_tokens_truncated_prompts = max(token_counts_truncated_prompts)
total_tokens_truncated_prompts = sum(i for i in token_counts_truncated_prompts if isinstance(i, int))
average_tokens_truncated_prompts = total_tokens_truncated_prompts / len(truncated_prompts)

print(f"Number of tokens on the smallest prompt: {min_tokens_truncated_prompts}")
print(f"Number of tokens on the largest prompt: {max_tokens_truncated_prompts}")
print(f"Total number of tokens for all prompts: {total_tokens_truncated_prompts}")
print(f"Average number of tokens in all_prompts: {average_tokens_truncated_prompts}")

# Compare output to previous output (prior to truncation).

Number of tokens on the smallest prompt: 180
Number of tokens on the largest prompt: 3500
Total number of tokens for all prompts: 90654
Average number of tokens in all_prompts: 906.54


In [21]:
print("Total number of prompts: ",len(truncated_prompts))

print(truncated_prompts[:1])

Total number of prompts:  100


### GPT-3.5-turbo

* Initially (prior v1 notebook) was testing  GPT-3 models with aim to reduce pricing.
* After reviewing pricing documentation, determined not necessary.

In [22]:
print(truncated_prompts[50])


    For the main organisation discussed in this article (if any), give me information as follows and nothing else.
    The article id should always be present for reference.
    Include any government organisations discussed, if they are the main organisation discussed.
    ###
    <
    article_id: 5526656871
    org: <main organisation discussed/NA>.
    location: <location_of_main_org/NA>
    risk_type: <risk_type_faced_by_main_org/NA>.
    items_sold: <comma_separated_list_of_items_sold_by_main_org/NA>.
    service_provided: <comma_separated_list_of_services_provided_by_main_org/NA>.
    business_relations: <comma_separated_list_of_orgs_with_business_relation_to_main_org/NA>.
    >
    ###
    The article is: Gonoshasthaya Kendra founder and valiant freedom fighter Dr Zafrullah Chowdhury passed away at the Gonoshasthaya Nagar Hospital in the capital last night.
 He was 81.
  Dr Zafrullah, the recipient of Swadhinata Padak, the highest civilian award of Bangladesh, was taken to the

In [23]:
# Structure to test for GPT-3.5-turbo (Chat completion model)

openai.api_key = os.getenv('OPENAI_KEY')
# if no key, gives error: cannot convert int to str.

#query = "Is NHS facing any business risks based on this article? Provide answers only in the format of <facing risk: <'yes'/'no'>. type of risk: <risk type>.> and nothing else. The article is: Infected Blood: This is the worst treatment disaster in the NHS"
# query = "Answer with yes or no. Is NHS facing any business risk based on this text? 'Infected Blood: This is the worst treatment disaster in the NHS'"
#query = "what is 2+1?"
#query = "Based on the article 'Infected Blood: This is the worst treatment disaster in the NHS,' is the NHS facing any business risks? Please answer 'Y' for yes or 'N' for no."
query = truncated_prompts[50]

gpt_model = "gpt-3.5-turbo"

try:
    response = openai.ChatCompletion.create(
        messages=[
        {'role': 'system', 'content': 'You answer questions in the specified format about the article I give you.'}, # content tokens are included in pricing & TPM limit.
        {'role': 'user', 'content': query},
    ]
        ,model = gpt_model
        ,temperature = 0
        ,max_tokens = 400 # limit response tokens.
        ,n = 1 # number of responses generated. Affects pricing.
    )
except Exception as e:
    print(f"Error generating response for prompt {i+1}: {e}")

    
response_content = response['choices'][0]['message']['content']

print("Response:", response_content, '\n')
print("Number of tokens:", num_tokens_from_string(response_content,"cl100k_base"))

Response: <article_id: 5526656871
org: Gonoshasthaya Kendra.
location: Bangladesh.
risk_type: N/A.
items_sold: N/A.
service_provided: N/A.
business_relations: N/A.
> 

Number of tokens: 49


# Feed prompts into LLM and populate output schema directly
* Above was populating to a dataframe.
* May be more beneficial to populate a nested schema.

### Simple output schema to test end-to-end process

In [24]:
reduced_trunc_prompts = truncated_prompts[:20]

len(reduced_trunc_prompts)

#print(reduced_trunc_prompts[:6])

20

In [25]:
# additional code to handle possible rate limits.
import openai
import time

prompts = reduced_trunc_prompts
GPT_MODEL = "gpt-3.5-turbo"
openai.api_key = os.getenv('OPENAI_KEY')

responses = []

for query in tqdm(prompts, desc="Processing prompts"):
    try:
        response = None

        while response is None:
            try:
                response = openai.ChatCompletion.create(
                    messages=[
                        {"role": "system", "content": "You answer questions in the specified format about the article I give you."},
                        {"role": "user", "content": query},
                    ],
                    model=GPT_MODEL,
                    temperature=0.5,
                    max_tokens=400, # max tokens in response.
                    n=1,
                )
            except openai.errors.TooManyRequestsError as e:
                # Sleep for the recommended duration
                time.sleep(e.response['Retry-After'])
                continue

        response_content = response["choices"][0]["message"]["content"]
        responses.append(response_content)

    except Exception as e:
        print(f"Error generating response for prompt {query}: {e}")

Processing prompts:  90%|█████████ | 18/20 [02:10<00:25, 12.73s/it]

Error generating response for prompt 
    For the main organisation discussed in this article (if any), give me information as follows and nothing else.
    The article id should always be present for reference.
    Include any government organisations discussed, if they are the main organisation discussed.
    ###
    <
    article_id: 5586472947
    org: <main organisation discussed/NA>.
    location: <location_of_main_org/NA>
    risk_type: <risk_type_faced_by_main_org/NA>.
    items_sold: <comma_separated_list_of_items_sold_by_main_org/NA>.
    service_provided: <comma_separated_list_of_services_provided_by_main_org/NA>.
    business_relations: <comma_separated_list_of_orgs_with_business_relation_to_main_org/NA>.
    >
    ###
    The article is: Present at the launch were Engr. Bien Mateo, Senior Vice President of SM Supermalls, Arch.
Jaime Silva from the Philippine Foundation for the Rehabilitation of the Disabled, Inc., Marlo Lucas from the Resources for the Blind Inc. , and Jay

Processing prompts: 100%|██████████| 20/20 [02:19<00:00,  6.99s/it]


In [37]:
print(len(responses))
responses

19


['<article_id: 5541797339\norg: NA\nlocation: NA\nrisk_type: Poor blood circulation leading to serious consequences such as heart attack, stroke, kidney failure, and even amputation.\nitems_sold: NA\nservice_provided: NA\nbusiness_relations: NA>',
 '<article_id: 5451482089\norg: Rapid Action Battalion (RAB), Naogaon Municipality-Chandipur Union Land Office.\nlocation: Rajshahi Medical College (RMC), Naogaon, Bogura.\nrisk_type: Death of a union-level land employee in custody.\nitems_sold: NA.\nservice_provided: NA.\nbusiness_relations: NA.>',
 '<article_id: 5580830636\norg: NA\nlocation: NA\nrisk_type: NA\nitems_sold: NA\nservice_provided: NA\nbusiness_relations: NA>',
 '<article_id: 5467485064\norg: NAMI (The National Alliance on Mental Health).\nlocation: U.S.\nrisk_type: NA.\nitems_sold: NA.\nservice_provided: Advocacy and classes to help families cope with loved ones struggling with mental illness.\nbusiness_relations: Sinclair Cares Mental Health Support and Hope (partner organiza