In [1]:
import os

ACCESS_TOKEN_PATH = os.path.pardir + "/api_keys/openai.key"

In [2]:
from openai import OpenAI
import tiktoken
import logging


class GPTCommunicator():

    def __init__(
            self, api_key_path: str, model_name: str = "gpt-3.5-turbo",
        ):

        # init client with api key file
        with open(api_key_path) as f:
            self.client = OpenAI(api_key=f.readline().strip())
        
        # context window limits; found at https://platform.openai.com/docs/models
        model_max_tokens = { 
            #"gpt-3.5-turbo-instruct": 4096,
            "gpt-3.5-turbo": 16385,
            "gpt-4": 8192,
            "gpt-4-32k": 32768,
        }

        # check for valid model name input
        if model_name not in model_max_tokens.keys():
            raise ValueError(f"Invalid model name; valid args include: {model_max_tokens.keys()}")
        self.model_name = model_name

        # set model attributes
        self.max_prompt_tokens = model_max_tokens[model_name] -  200 # buffer for response tokens
        self.system_role = "You are a helpful AI assistant."
        self.total_tokens_used = 0
        
        
    def post_prompt(self, text: str):

        try:
            response = self.client.chat.completions.create(
                model = self.model_name,
                messages = [
                    {"role": "system", "content": str(self.system_role)},
                    {"role": "user", "content": str(text)}
                ]
            )
            self.last_response = response
            self.total_tokens_used += int(response.usage.total_tokens)

        except Exception as e:
            logging.error(f"Failed to post prompt: {e}")
            return None
        
        return response.choices[0].message.content
    
    def count_tokens(self, text: str) -> int:

        encoding = tiktoken.encoding_for_model(self.model_name)
        num_tokens = len(encoding.encode(text))

        return num_tokens



In [3]:
gpt = GPTCommunicator(ACCESS_TOKEN_PATH)
response = gpt.post_prompt("Hello")
response

'Hello! How can I assist you today?'

In [4]:
gpt.total_tokens_used

28

In [5]:
gpt.last_response.usage

CompletionUsage(completion_tokens=9, prompt_tokens=19, total_tokens=28)

In [6]:
gpt.count_tokens(response)

9

In [7]:
gpt.post_prompt("Describe Europe in 3 sentences.")

'Europe is a continent located in the Northern Hemisphere, bordered by the Arctic Ocean to the north, the Atlantic Ocean to the west, and the Mediterranean Sea to the south. It is home to diverse cultures, languages, and histories, with 44 countries recognized by the United Nations. Europe has a rich heritage of art, architecture, music, and literature, and is a popular destination for tourists from around the world.'

In [8]:
gpt.total_tokens_used

136

In [9]:
gpt.last_response.usage

CompletionUsage(completion_tokens=83, prompt_tokens=25, total_tokens=108)

In [10]:
from datasets import load_dataset

wikitext = load_dataset("wikitext", "wikitext-2-raw-v1")
wikitext

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 36718
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})

In [11]:
wikitext["train"]["text"][:10]

['',
 ' = Valkyria Chronicles III = \n',
 '',
 ' Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " Calamaty Raven " . \n',
 " The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making the game more f

In [12]:
import numpy as np 

np.max([int(t.count(" = ") / 2) for t in wikitext["train"]["text"]])

3

In [13]:
def classify_string_type(text):
    if text == '':
        return "empty"
    
    title_delimiter = " = "
    header_delimiter = " = = "
    subheader_delimiter = " = = = "
    
    if len(text.split(subheader_delimiter)) > 1:
        return "subheader"
    
    elif len(text.split(header_delimiter)) > 1:
        return "header"
    
    elif len(text.split(title_delimiter)) > 1:
        return "title"
    
    else:
        return "content"

In [14]:
text = wikitext["train"]["text"][1]
print(text)
classify_string_type(text)

 = Valkyria Chronicles III = 



'title'

In [15]:
import pandas as pd

text_list = wikitext["train"]["text"]

df = pd.DataFrame()
df["text"] = text_list
df["text_type"] = list(map(lambda t: classify_string_type(t), text_list))
df.head(25)

Unnamed: 0,text,text_type
0,,empty
1,= Valkyria Chronicles III = \n,title
2,,empty
3,Senjō no Valkyria 3 : Unrecorded Chronicles (...,content
4,"The game began development in 2010 , carrying...",content
5,"It met with positive sales in Japan , and was...",content
6,,empty
7,= = Gameplay = = \n,header
8,,empty
9,"As with previous Valkyira Chronicles games , ...",content


In [16]:
df.text_type.value_counts()

text_type
content      17532
empty        12951
header        2922
subheader     2660
title          653
Name: count, dtype: int64

In [17]:
title_idx = df.index[df['text_type']=="title"].tolist()
df.iloc[title_idx[0]:title_idx[1]]

Unnamed: 0,text,text_type
1,= Valkyria Chronicles III = \n,title
2,,empty
3,Senjō no Valkyria 3 : Unrecorded Chronicles (...,content
4,"The game began development in 2010 , carrying...",content
5,"It met with positive sales in Japan , and was...",content
6,,empty
7,= = Gameplay = = \n,header
8,,empty
9,"As with previous Valkyira Chronicles games , ...",content
10,"The game 's battle system , the BliTZ system ...",content


In [18]:
passage = "\n".join(df.iloc[title_idx[0]:title_idx[1]]["text"])
print(passage[:5000])

 = Valkyria Chronicles III = 


 Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " Calamaty Raven " . 

 The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making the game more forgiving for series n

In [19]:
print(passage[-5000:])

 " article based on the game 's PSN demo , felt that Valkyria Chronicles III provided a " profound feeling of closure " for the Valkyria Chronicles series . He praised its gameplay despite annoying limitations to aspects such as special abilities , and positively noted its shift in story to a tone similar to the first game . 

 PlayStation Official Magazine - UK praised the story 's blurring of Gallia 's moral standing , art style , and most points about its gameplay , positively noting the latter for both its continued quality and the tweaks to balance and content . Its one major criticism were multiple difficulty spikes , something that had affected the previous games . Heath Hindman of gaming website PlayStation Lifestyle praised the addition of non @-@ linear elements and improvements or removal of mechanics from Valkyria Chronicles II in addition to praising the returning gameplay style of previous games . He also positively noted the story 's serious tone . Points criticized in t

In [20]:
from collections import Counter

def segment_into_passages(text_list):
    
    #df = pd.DataFrame()
    #df["text"] = text_list
    #df["text_type"] = list(map(lambda t: classify_string_type(t), text_list))
    #title_idx = df.index[df['text_type']=="title"].tolist()

    text_type = list(map(lambda t: classify_string_type(t), text_list))

    type_counts = Counter(text_type) #dict storing counts of titles, headers, etc.

    title_idx = np.array([i for i,v in enumerate(text_type) if v == "title"])
    title_idx = np.append(title_idx, len(text_list)) #append for last passage
    title_idx_pairs = np.column_stack((title_idx[:-1], title_idx[1:]))

    passages = []

    for idx_pair in title_idx_pairs:
        start_i, end_i = idx_pair[0], idx_pair[1]
        passage = "\n".join(text_list[start_i:end_i])
        passages.append(passage)

    assert len(passages) == type_counts["title"], "Passage count should match number of titles"

    return passages

passages = segment_into_passages(text_list)
print(passages[0][:5000])

 = Valkyria Chronicles III = 


 Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " Calamaty Raven " . 

 The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making the game more forgiving for series n

In [21]:
print(passages[0][-5000:])

 " article based on the game 's PSN demo , felt that Valkyria Chronicles III provided a " profound feeling of closure " for the Valkyria Chronicles series . He praised its gameplay despite annoying limitations to aspects such as special abilities , and positively noted its shift in story to a tone similar to the first game . 

 PlayStation Official Magazine - UK praised the story 's blurring of Gallia 's moral standing , art style , and most points about its gameplay , positively noting the latter for both its continued quality and the tweaks to balance and content . Its one major criticism were multiple difficulty spikes , something that had affected the previous games . Heath Hindman of gaming website PlayStation Lifestyle praised the addition of non @-@ linear elements and improvements or removal of mechanics from Valkyria Chronicles II in addition to praising the returning gameplay style of previous games . He also positively noted the story 's serious tone . Points criticized in t

In [22]:
print(passages[-1][-5000:])

ck hands of London 's Big Ben that it stopped , leading to unsuccessful attempts to disrupt the roosts with netting , repellent chemical on the ledges and broadcasts of common starling alarm calls . An entire episode of The Goon Show in 1954 was a parody of the futile efforts to disrupt the large common starling roosts in central London . 

 Where it is introduced , the common starling is unprotected by legislation , and extensive control plans may be initiated . Common starlings can be prevented from using nest boxes by ensuring that the access holes are smaller than the 1 @.@ 5 in ( 38 mm ) diameter they need , and the removal of perches discourages them from visiting bird feeders . 

 Western Australia banned the import of common starlings in 1895 . New flocks arriving from the east are routinely shot , while the less cautious juveniles are trapped and netted . New methods are being developed , such as tagging one bird and tracking it back to establish where other members of the flo

In [23]:
text_list[-10:]

[' Western Australia banned the import of common starlings in 1895 . New flocks arriving from the east are routinely shot , while the less cautious juveniles are trapped and netted . New methods are being developed , such as tagging one bird and tracking it back to establish where other members of the flock roost . Another technique is to analyse the DNA of Australian common starling populations to track where the migration from eastern to western Australia is occurring so that better preventive strategies can be used . By 2009 , only 300 common starlings were left in Western Australia , and the state committed a further A $ 400 @,@ 000 in that year to continue the eradication programme . \n',
 ' In the United States , common starlings are exempt from the Migratory Bird Treaty Act , which prohibits the taking or killing of migratory birds . No permit is required to remove nests and eggs or kill juveniles or adults . Research was undertaken in 1966 to identify a suitable avicide that wo

In [24]:
gpt.count_tokens(passages[0])

4486

In [25]:
passage_token_counts = list(map(lambda p: gpt.count_tokens(p), passages))
passage_token_counts[:4]

[4486, 4638, 3913, 832]

In [26]:
print("Passage token counts\n")
print(f"MEAN: {np.mean(passage_token_counts)}")
print(f"STD: {np.std(passage_token_counts)}")
print(f"MIN: {np.min(passage_token_counts)}")
print(f"MAX: {np.max(passage_token_counts)}")

Passage token counts

MEAN: 3732.914241960184
STD: 3143.9339231795843
MIN: 7
MAX: 20498


In [27]:
# limit to smaller for now

gpt.max_prompt_tokens = 2048

print(f"{len([n for n in passage_token_counts if n > gpt.max_prompt_tokens])} / {len(passages)} passages greater than limit")

418 / 653 passages greater than limit


In [28]:
# elimite passages greater than our model's max token limit

valid_idx = [i for i,v in enumerate(passage_token_counts) if v <= gpt.max_prompt_tokens]
valid_passages = [v for i,v in enumerate(passages) if i in valid_idx]

# double check these passages are below the limit
print(f"largest passage after trim is {np.max(list(map(lambda p: gpt.count_tokens(p), valid_passages)))} tokens")

largest passage after trim is 2025 tokens
