In [26]:
from io import StringIO
from pdfminer.high_level import extract_text_to_fp
from pdfminer.layout import LAParams
import nltk
import re


In [4]:
output_string = StringIO()
with open('/app/PenaltyBoxIII-OperatingManual.pdf', 'rb') as fin:
    extract_text_to_fp(fin, output_string, laparams=LAParams(),
                       output_type='text', codec=None) #can be 'html', 'xml', 'text', 'tag'
    
# Get the extracted text
extracted_text = output_string.getvalue()

# tokenize the text
tokens = nltk.word_tokenize(extracted_text)

print(tokens[0:50])

['Penalty', 'Box', 'III', 'Nordhavn', '55', 'Yacht', 'Operating', 'Manual', '(', 'May', '2023', ')', 'Penalty', 'Box', 'III', 'Operating', 'Manual', '(', 'May', '2023', ')', 'Table', 'of', 'Contents', '1.0', 'Introduction', '................................................................................................................................................', '1', '1.1', 'Objectives', '.......................................................................................................................................................', '2', '1.2', 'Disclaimer', '.......................................................................................................................................................', '2', 'Important', 'Boat', 'Numbers', 'and', 'Information', '.....................................................................................................', '3', '1.3', '2.0', 'General', 'Description', '........................................................

In [22]:
def group_words(words, group_size=100):
    grouped_words = {}
    group = []
    key_counter = 1
    word_count = 0

    for word in words:
        group.append(word)
        word_count += 1

        # Check if we have reached the group size or if the word ends with a period
        if word_count >= group_size and word.endswith('.'):
            # Join the words in the group with a space and add to the dictionary
            grouped_words[f"group_{key_counter}"] = " ".join(group)
            key_counter += 1
            group = []
            word_count = 0
    
    # Add any remaining words as the last group
    if group:
        grouped_words[f"group_{key_counter}"] = " ".join(group)
    
    return grouped_words

In [23]:
grouped_dict = group_words(tokens)

group_1: Penalty Box III Nordhavn 55 Yacht Operating Manual ( May 2023 ) Penalty Box III Operating Manual ( May 2023 ) Table of Contents 1.0 Introduction ................................................................................................................................................ 1 1.1 Objectives ....................................................................................................................................................... 2 1.2 Disclaimer ....................................................................................................................................................... 2 Important Boat Numbers and Information ..................................................................................................... 3 1.3 2.0 General Description ..................................................................................................................................... 4 2.1 Exterior .........................................

In [68]:
def clean_text(text):
    # Remove multiple periods and other special characters
    text = re.sub(r'\.{2,}', '', text)
    # Replace multiple spaces with a single space
    text = re.sub(r'\s{2,}', ' ', text)
    text = re.sub(r'\s+([,.?!:;)’”])', r'\1', text)    
    text = re.sub(r'\(\s+', '(', text)
    text = re.sub(r"‘ ", "'", text)
    text = re.sub(r'“ ', '“', text)

# Strip leading and trailing whitespace

    text = text.strip()
    return text

def clean_groups(grouped_dict):
    cleaned_dict = {}
    for key, value in grouped_dict.items():
        cleaned_dict[key] = clean_text(value)
    return cleaned_dict


In [69]:
cleaned_dict = clean_groups(grouped_dict)
for key, value in cleaned_dict.items():
    print(f"{key}: {value}\n")

group_1: Penalty Box III Nordhavn 55 Yacht Operating Manual (May 2023) Penalty Box III Operating Manual (May 2023) Table of Contents 1.0 Introduction 1 1.1 Objectives 2 1.2 Disclaimer 2 Important Boat Numbers and Information 3 1.3 2.0 General Description 4 2.1 Exterior 4 Interior 5 2.2 2.3 Flybridge 8 2.4 Sundeck 8 3.0 Safety Equipment 9 4.0 Domestic Systems 10 4.2.1 4.2.2 4.2.3 4.2.4 4.2.5 4.1 Freshwater 10 4.1.1 Hot Water 10 4.1.2 Water fill 10 4.1.3 Watermaker 11 4.2 Galley Appliances

group_2: 11 Refrigerator and Freezer 12 Stove 12 Barbecue 13 Ice Maker 13 Vacuum Cleaner 13 4.3 Marine Toilet 13 4.3.1 Waste Pump Out 14 4.4 Showers 15 4.5 Furnace 15 4.6 Air Conditioner 16 4.7 Entertainment Center 16 5.0 Electrical Systems (Power Management) 17 5.1.1 5.1.2 5.1.3 5.1 120-volt AC System 17 Shore power (50-amp, 240-volt) 17 AC Distribution Panel 19 Reverse Polarity 19 5.2 Generator 20 Starting the Generator 20 Stopping the Generator 20 5.3 12-volt DC Electrical System

group_3: 21 12-vo

In [42]:
cleaned_dict['group_86']

'This points toward the filter being used. If the RPM of one of the engines begins decrease, you will need to switch to the backup fuel filter. To switch fuel filters: 1. Slow the engines down to idle and shift the engines out of gear, 2. Have a crewmember take watch as you enter the engine room, 3. Turn the valve handle at the front of the Racor filter so that it points toward the backup filter. The valve handle has a pointed end that points to the filter that is active.'

# Get embeddings from openai

In [78]:
from openai import OpenAI
import os

# Get the API key from the environment
api_key = os.getenv("OPENAI_API_KEY")


client = OpenAI()

client.embeddings.create(
  model="text-embedding-ada-002",
  input=cleaned_dict['group_86'],
  encoding_format="float"
)


CreateEmbeddingResponse(data=[Embedding(embedding=[-0.0031266944, -0.011334888, -0.0055681313, -0.008031086, 0.00422741, 0.010474178, -0.012083043, -0.0155722285, 0.006650639, 0.011685792, 0.025119489, 0.03514345, 0.0076139723, 0.0015054147, 0.0023537106, -0.0036613275, 0.01872375, -0.014671794, 0.0058462066, -0.01683019, -0.01628728, 0.0110568125, -0.0032226965, -0.025966957, -0.0083886115, -0.0031647643, 0.021358848, -0.0039956803, -0.019703636, -0.018273534, 0.011387855, -0.014353993, 0.001190097, -0.023821803, -0.0036381546, -0.021716373, -0.001542657, -0.03037644, -0.013691909, -0.005879311, 0.0274103, 0.006081247, -0.009937889, -0.009097042, 0.00096747105, 0.007064442, 0.00767356, -0.0006724296, -0.019915504, 0.024841413, 0.002936345, -0.0054721287, -0.015333879, -0.026655525, 0.010189481, -0.008329024, -0.028602052, 0.0063063554, 0.009169871, -0.010229207, 0.0008317437, -0.0061342134, -0.01464531, 0.011990352, -0.004475692, -0.0017594898, 0.021332365, 0.0095803635, -0.01612838, 