In [1]:
import os
import re
from pathlib import Path

import openai
import PyPDF2
import tiktoken
from tqdm import tqdm

## Step 1: Convert the PDF files to text files

In [None]:
def convert_pdf_to_text(pdf_path, output_folder):
    """
    Convert a PDF file to a plain-text file by extracting the text of every page.

    The PDF is opened with PyPDF2, the text of each page is extracted once and
    concatenated in page order (without any separator), and the result is written
    to `<output_folder>/<pdf file stem>.txt`. The output folder is created if it
    does not exist yet.

    Parameters:
    pdf_path: Path of the PDF file to convert (str or pathlib.Path).
    output_folder: Folder in which the text file is saved (str or pathlib.Path).

    Returns:
    None: The function only prints the path of the generated text file.
    """
    pdf_file_path = Path(pdf_path)
    output_folder_path = Path(output_folder)

    with pdf_file_path.open('rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)

        # Extract each page exactly once; pages for which the extractor
        # returns None are skipped.
        page_texts = []
        for page in pdf_reader.pages:
            page_text = page.extract_text()
            if page_text is not None:
                page_texts.append(page_text)

    text = ''.join(page_texts)

    # Make sure the destination exists so the write below cannot fail on a
    # missing folder.
    output_folder_path.mkdir(parents=True, exist_ok=True)

    # Same file name as the PDF, with the extension replaced by .txt
    output_file = output_folder_path / f'{pdf_file_path.stem}.txt'
    with output_file.open('w') as txt_file:
        txt_file.write(text)

    print(f'PDF converted to text and saved at: {output_file}')

# Placeholders to fill in before running this cell. Note that Path('') is the
# current working directory, so leaving them empty converts the PDFs found there.
pdf_folder_path = ''  # Folder that contains the source PDF files
output_folder_path = ''  # Folder that receives the generated .txt files

# Convert every file directly inside the PDF folder whose name ends in .pdf
# (the glob is not recursive).
pdf_folder = Path(pdf_folder_path)
for pdf_file in pdf_folder.glob('*.pdf'):
    convert_pdf_to_text(pdf_file, output_folder_path)


## Step 2: Clean the text files (remove citation markers and running headers/footers)

In [None]:
def clean_text_file(input_file, output_file):
    """
    Clean an extracted guideline text file and save the result to a new file.

    The following clean-up steps are applied, in this order:
    1. Numeric citation markers in square brackets are removed, e.g. "[95]",
       "[88, 89]", "[107 –109]" or "[1, 2, 3]".
    2. Words that were hyphenated across a line break are joined again.
    3. The 4-digit number in front of the running header "Roeb E et al." is dropped.
    4. A fixed list of running headers/footers is removed verbatim.

    Parameters:
    input_file (str): The path to the text file that needs to be cleaned.
    output_file (str): The path to the file where the cleaned text will be saved.

    Returns:
    None: The function only prints the path of the output file after saving.
    """
    with open(input_file, 'r') as file:
        text = file.read()

    # Citation markers: one number, optionally followed by any count of further
    # numbers separated by a comma, a hyphen or an en dash (with optional spaces).
    text = re.sub(r'\[\d+(?:\s*[,\-–]\s*\d+)*\]', '', text)

    # Connect words separated by a dash on different lines
    text = re.sub(r'(\w+)-\n(\w+)', r'\1\2', text)

    # Remove the 4-digit number before "Roeb E et al." (dots escaped so that
    # only the literal abbreviation matches)
    text = re.sub(r'\d{4} Roeb E et al\.', 'Roeb E et al.', text)

    # Running headers/footers to strip. The order matters: longer phrases must
    # come before shorter phrases that are contained in them.
    phrases_to_remove = [
        "Roeb E et al. Aktualisierte S2k-Leitlinie nicht-alkoholische …Z Gastroenterol 2022; 60: 1346 –1421 | © 2022. Thieme. All rights reserved. Leitlinie",
        "Heruntergeladen von: Nadine Fischer. Urheberrechtlich geschützt.",
        "Roeb E et al. Aktualisierte S2k-Leitlinie nicht-alkoholische …Z Gastroenterol 2022; 60: 1346 –1421 | © 2022. Thieme.  All rights reserved. Heruntergeladen von: Nadine Fischer. Urheberrechtlich geschützt",
        "Roeb E et al. Aktualisierte S2k-Leitlinie nicht-alkoholische …Z Gastroenterol 2022; 60: 1346 –1421 | © 2022. Thieme. All rights reserved.Leitlinie",
        "Morbus Fabry – Leitlinien für Diagnostik und Therapie in der Neurologie",
        "Leitlinien für Diagnostik und Therapie in der Neurologie © DGN 2023",
        "Leitlinien für Diagnostik und Therapie in der Neurologie",
    ]
    for phrase in phrases_to_remove:
        text = text.replace(phrase, '')

    # Write the cleaned text to the output file
    with open(output_file, 'w') as file:
        file.write(text)

    print(f"Text file cleaned and saved at: {output_file}")


# Placeholders to fill in before running this cell: the text file produced in
# step 1 and the path the cleaned copy should be written to.
input_file_path = ''
output_file_path = ''

# Clean that single file (clean_text_file prints the output path when done)
clean_text_file(input_file_path, output_file_path)

## Step 3: Fix the spacing issue in the files

In [2]:
# Read the key from the environment instead of pasting it into the notebook,
# so it cannot end up in version control or a shared copy. Falls back to an
# empty string when OPENAI_API_KEY is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

### Create blocks

In [3]:
def num_tokens_from_messages(messages, model="gpt-3.5-turbo-0613"):
    """
    Estimate the number of prompt tokens a list of chat messages consumes.

    Every value of every message is encoded with the tokenizer that tiktoken
    associates with `model`. A fixed overhead of 3 tokens is added per message,
    1 extra token for each "name" key, and 3 tokens once per request because
    every reply is primed with <|start|>assistant<|message|>.

    If tiktoken does not know the model, the 'cl100k_base' encoding is used
    and a warning is printed.

    Parameters:
    messages (list of dicts): Chat messages; each one maps keys such as
                              'role', 'content' and optionally 'name' to strings.
    model (str): The model name which defines the encoding scheme to be used.
                 Default is 'gpt-3.5-turbo-0613'.

    Returns:
    int: The estimated total number of tokens for the whole request.

    Raises:
    TypeError: Re-raised from the tokenizer when a message value cannot be
               encoded; the offending value is printed first.
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        print("Warning: model not found. Using cl100k_base encoding.")
        encoding = tiktoken.get_encoding("cl100k_base")

    tokens_per_message = 3
    tokens_per_name = 1
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            try:
                num_tokens += len(encoding.encode(value))
            except TypeError:
                # Show which value could not be tokenized before propagating
                print(f"{value=}")
                raise
            if key == "name":
                num_tokens += tokens_per_name
    # Added once per request, not once per message:
    # every reply is primed with <|start|>assistant<|message|>
    num_tokens += 3
    return num_tokens

In [4]:
def num_tokens_from_string(string, model="gpt-3.5-turbo-0613") -> int:
    """Count the tokens `string` is split into by the tokenizer tiktoken maps to `model`."""
    token_ids = tiktoken.encoding_for_model(model).encode(string)
    return len(token_ids)

In [5]:
def split_text_into_blocks(text: str, model="gpt-3.5-turbo-0613", max_tokens=2000, reserved_tokens=50):
    """
    Split a text into blocks whose token count stays within a budget.

    The text is cut at every full stop. Each fragment is stripped and gets ". "
    appended again; consecutive sentences are then packed greedily into blocks.
    A new block is started whenever adding the next sentence would push the
    block's count (sentence tokens plus `reserved_tokens`) above `max_tokens`.

    Edge cases:
    - The fragment after the last full stop gets no period appended, and is
      dropped if it is empty, so no punctuation is invented at the end.
    - A single sentence that exceeds the budget on its own becomes an oversized
      block by itself; no empty block is emitted in front of it.
    - An empty text yields an empty list.

    Parameters:
    text (str): The input text to be split into blocks.
    model (str): Model name passed to num_tokens_from_string for counting
                 tokens. Default is 'gpt-3.5-turbo-0613'.
    max_tokens (int): Token budget per block. Default is 2000.
    reserved_tokens (int): Head-room counted against every block before any
                           sentence is added (presumably for the instruction
                           prompt and message framing). Default is 50.

    Returns:
    list of str: The blocks, in original order.
    """
    fragments = text.split('.')
    last_index = len(fragments) - 1

    text_blocks = []
    current_block = []
    tokens_in_block = reserved_tokens

    for index, fragment in enumerate(fragments):
        sentence = fragment.strip()
        if index < last_index:
            # This fragment was terminated by a full stop: put it back
            sentence += '. '
        elif not sentence:
            # Nothing follows the final full stop
            continue

        sentence_tokens = num_tokens_from_string(sentence, model)
        if current_block and tokens_in_block + sentence_tokens > max_tokens:
            # Budget would be exceeded: close the current block, start a new one
            text_blocks.append("".join(current_block))
            current_block = []
            tokens_in_block = reserved_tokens

        current_block.append(sentence)
        tokens_in_block += sentence_tokens

    # Don't forget the last block
    if current_block:
        text_blocks.append("".join(current_block))

    return text_blocks


### Get files

In [6]:
# File names of all step-1 text files. Sorted, because glob order is arbitrary
# and the processing order below should be reproducible between runs.
step1 = sorted(f.name for f in Path('text_step1').glob("*.txt"))

In [1]:
# File names currently present in the step-2 output folder
step2 = [path.name for path in Path('text_step2').glob("*.txt")]

# Step-2 files that also exist in step 1; stop here if there are any
already_corrected = [
    path.name
    for path in Path('text_step2').glob("*.txt")
    if path.name in step1
]
assert not already_corrected, f"{already_corrected} already corrected"

In [8]:
# Step-1 files that do not have a corrected counterpart yet
step1_filtered = [name for name in step1 if name not in already_corrected]
step1_filtered

['051-001l_S3_Funktionelle_Koerperbeschwerden_2018-11.txt',
 '065-002l_S2k_Venenthrombose-Lungenembolie_2023-03.txt']

### Construct the prompts and call the API in a loop

In [9]:
# System instruction that is sent together with every text block
prompt = "Your task is to correct the spacing errors in the following German text. Please ensure that you do not add, delete, or rearrange any words. Only adjust the spaces between words as necessary."

In [10]:
# Collects the names of files the processing loop skips because they would
# need too many API calls
skipped = []

In [11]:
# Show the files that are about to be processed
step1_filtered

['051-001l_S3_Funktionelle_Koerperbeschwerden_2018-11.txt',
 '065-002l_S2k_Venenthrombose-Lungenembolie_2023-03.txt']

In [12]:
for f in step1_filtered:
    # Read the file and flatten it to a single line before splitting into blocks
    with open(f"text_step1/{f}", "r") as fh:
        text = ' '.join(fh.readlines()).replace('\n', ' ')
    blocks = split_text_into_blocks(text)

    # Build one chat request (system instruction + text block) per block
    prompts = []
    for block in blocks:
        _prompt = [
            {"role": "system", "content": prompt},
            {"role": "user", "content": block},
        ]
        num_token = num_tokens_from_messages(_prompt)
        # The completion may use up to 2048 tokens as well (max_tokens below),
        # so the prompt has to stay under the same limit.
        assert num_token < 2048, f"{f}: prompt has {num_token} tokens"
        prompts.append(_prompt)

    print(f"{f=} results in {len(prompts)} API calls.")
    if len(prompts) > 70:
        # Too many calls for one file: remember it and move on
        skipped.append(f)
        continue
    # Rough estimate in cents; presumably 4k tokens per call at
    # $0.0015 (input) + $0.002 (output) per 1k tokens - TODO confirm pricing
    expected_cost = len(prompts) * (4 * (0.0015 + 0.002)) * 100
    print(f"Expected cost: {expected_cost:.2f} cent")

    # Make sure the working and output folders exist before writing to them
    parts_dir = Path('parts')
    parts_dir.mkdir(exist_ok=True)
    Path('text_step2').mkdir(exist_ok=True)

    # Call the API block by block. Every answer is cached in a part file so an
    # interrupted run can resume without paying for finished blocks again.
    stem, suffix = Path(f).stem, Path(f).suffix
    completions = []
    part_files = []
    # Wrap the list (not the enumerate object) so tqdm knows the total
    for i, p in enumerate(tqdm(prompts)):
        part_number = str(i + 1).zfill(3)
        part_file = parts_dir / f'{stem}_{part_number}{suffix}'
        if part_file.is_file():
            with open(part_file, 'r') as fh:
                content = fh.read()
        else:
            completion = openai.ChatCompletion.create(
                model="gpt-3.5-turbo-0613",
                messages=p,
                max_tokens=2048  # probably 4096
            )
            content = completion.choices[0].message['content']
            with open(part_file, 'w') as fh:
                fh.write(content)

        part_files.append(part_file)
        completions.append(content)

    # Write the corrected text to step 2, one block per line
    full_text = '\n'.join(completions)
    with open(f'text_step2/{f}', 'w') as fh:
        fh.write(full_text)

    # Delete only the part files that belong to this file; cached parts of
    # other files stay available for a later run.
    for part_file in part_files:
        part_file.unlink()
    print('done')

f='051-001l_S3_Funktionelle_Koerperbeschwerden_2018-11.txt' results in 65 API calls.
Expected cost: 91.00 cent


65it [02:18,  2.14s/it]

done
f='065-002l_S2k_Venenthrombose-Lungenembolie_2023-03.txt' results in 84 API calls.



