In [None]:
import os
import openai

In [None]:
# Azure OpenAI connection settings. Both placeholders below must be filled in
# before any request is made.
os.environ.update(OPENAI_API_BASE="", OPENAI_API_KEY="")

openai.api_type = 'azure'
openai.api_version = '2023-05-15'  # this may change in the future
# The endpoint should look like https://YOUR_RESOURCE_NAME.openai.azure.com/
openai.api_base = os.environ.get("OPENAI_API_BASE")
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [None]:
# Load every .txt file found in texts/med.
# `texts` receives the file contents and `names` the file names without the
# ".txt" extension; both lists share the same order.

texts = []
names = []
for filename in os.listdir("texts/med"):
    if not filename.endswith(".txt"):
        continue
    source_path = os.path.join("texts/med", filename)
    with open(source_path, "r", encoding="utf-8") as handle:
        content = handle.read()
    texts.append(content)
    names.append(filename[:-len(".txt")])

In [None]:
# Output folder: directory where the cleaning loop writes one .txt file per cleaned page.
output_folder = "clean_texts/med"

In [None]:
# Clean every loaded page with the chat model and save the answer in
# output_folder under the page's original name.

# open(..., "w") does not create missing directories, so make sure it exists.
os.makedirs(output_folder, exist_ok=True)

# Pair names and texts by position. Looking the name up with
# names[texts.index(text)] returns the FIRST page with the same content, so
# pages with identical text were mislabelled and overwrote a single output file
# (and every lookup rescanned the whole list).
for text_name, text in zip(names, texts):
    # An empty string splits into zero words, so this one check also covers it.
    if len(text.split()) < 20:
        print(text_name, "is empty or too short")
        continue
    # Last "_"-separated token of the name, keeping only what precedes "page".
    # NOTE(review): presumably names end in something like "_<n>page" -- confirm.
    page = text_name.split('_')[-1].split('page')[0]
    prompt = f"""The following text in triple backticks is the page number {page} of a medical text. This text could have some words separated, misspelled words, typos or strange characters, or wrong uppercase words, fix this without modify any sentence or paragraph. Keep the following in mind:
    1. Don't put de triple backticks in the answer or any other character or extra text. Just give fixed text.
    2. If you find something that could be the number of the page, delete it.
    3. If you can organize something, like tables or information, do it.
    4. Delete text as copyright, for example: "Certara USA, Inc. 2020. All rights reserved".
    5. The vignettes should be "-".
    6. Delete sponsor contact information.
    7. If you find tables, charts, figures, delete them.
    8. Delete "Clinical Trial Results" titles and similar.
    Text: ```{text}```
    Remember just fix the text and organize information without adding or deleting any sentence or paragraph. Just delete figures, tables and similar. Just save paragraphs, sentences and lists.
    """
    response = openai.ChatCompletion.create(
            engine="gpt-35-turbo-rfmanrique",
            messages=[{'role': 'user', 'content': prompt}],
            ).choices[0].message["content"]

    # Save the model's answer as <output_folder>/<text_name>.txt
    with open(os.path.join(output_folder, text_name + ".txt"), "w", encoding="utf-8") as f:
        f.write(response)

In [None]:
import os
import json

# Define the folder path where the .txt files are located
folder_path = "clean_texts_sections"

# List all .txt files in the folder
txt_files = [file for file in os.listdir(folder_path) if file.endswith(".txt")]

# Copy each .txt file to a sibling file with a .json extension
for txt_file in txt_files:
    # Construct the full file paths. Only the trailing ".txt" is swapped:
    # str.replace would also rewrite any ".txt" appearing earlier in the name.
    txt_file_path = os.path.join(folder_path, txt_file)
    json_file_path = os.path.join(folder_path, txt_file[:-len(".txt")] + ".json")

    # Read the content from the .txt file. The handle gets its own name so the
    # loop variable keeps holding the file name used in the message below
    # (previously the handle shadowed it and the message showed a file object).
    with open(txt_file_path, "r", encoding="utf-8") as source:
        content = source.read()

    # Write the content to the new .json file. It is copied verbatim: nothing
    # here parses or validates it as JSON.
    with open(json_file_path, "w", encoding="utf-8") as target:
        target.write(content)

    print(f"Converted {txt_file} to {json_file_path}")

print("Conversion complete.")

In [None]:
# Build a dict mapping each .json file name in clean_texts_sections (without
# the ".json" extension) to the value stored under its "sections" key.
sections = {}
for json_name in os.listdir("clean_texts_sections"):
    if not json_name.endswith(".json"):
        continue
    json_path = os.path.join("clean_texts_sections", json_name)
    with open(json_path, "r", encoding="utf-8") as handle:
        parsed = json.load(handle)
    sections[json_name[:-len(".json")]] = parsed["sections"]

In [None]:
# Scratch check: dropping the last 9 characters removes the "_sections" suffix from this name.
'1000-MT_MED_9011_sections'[:-9]

In [None]:
# Scratch check: display the loaded text whose name is '1000-MT_MED_9011_sections'
# (names.index raises ValueError when that name is not in `names`).
texts[names.index('1000-MT_MED_9011_sections')]

In [None]:
# Clean every section of every document with the chat model and write one
# output file per section into the clean_texts folder.

# open(..., "w") does not create missing directories, so make sure it exists.
os.makedirs("clean_texts", exist_ok=True)

for section_key, doc_sections in sections.items():
    # These values depend only on the document, so compute them once per key.
    text_name = section_key.split('.')[0]
    page = text_name.split('_')[-1].split('page')[0]
    # Output files are named "<base>_sections_<i>.json"; strip an existing
    # "_sections" suffix from the key so it is not repeated in the file name.
    if text_name.endswith("_sections"):
        base_name = text_name[:-len("_sections")]
    else:
        base_name = text_name

    # NOTE(review): assumes each entry of the "sections" value is the text of
    # one section -- confirm against the JSON files.
    for i, section in enumerate(doc_sections):
        # The prompt must carry the current section. It used to interpolate
        # `text`, a leftover from the page loop, so every request sent the same
        # stale page and the output name was derived from that page as well.
        prompt = f"""The following text in triple backticks is the page number {page} of a medical text. This text could have some words separated, misspelled words, typos or strange characters, or wrong uppercase words, fix this without modify any sentence or paragraph. Keep the following in mind:
        1. Don't put de triple backticks in the answer or any other character or extra text. Just give fixed text.
        2. If you find something that could be the number of the page, delete it.
        3. If you can organize something, like tables or information, do it.
        4. Delete text as copyright, for example: "United States, NCT02949128 | Protocol, ALXN1210-aHUS-311 © Certara USA, Inc. 2020. All rights reserved"
        Text: ```{section}```
        Remember just fix the text.
        """

        response = openai.ChatCompletion.create(
                engine="gpt-35-turbo-16k-rfmanrique",
                messages=[{'role': 'user', 'content': prompt}],
                ).choices[0].message["content"]

        # One file per section, numbered in iteration order.
        with open(os.path.join("clean_texts", f"{base_name}_sections_{i}.json"), "w", encoding="utf-8") as f:
            f.write(response)