In [None]:
#Imports
import io
import os
import json
import pandas as pd
import numpy as np
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import tqdm as tq
import openai

from openai import OpenAI
# Credentials are read from the environment so that no secret is ever stored in
# the notebook source or its saved outputs. Export OPENAI_API_KEY (and, if your
# account needs it, OPENAI_ORG_ID) before starting the kernel.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    # Stop here with a clear message instead of failing later on the first API call.
    raise RuntimeError("Set the OPENAI_API_KEY environment variable before running this notebook.")

# Shared client used by every API call in this notebook.
client = OpenAI(
    api_key=OPENAI_API_KEY,
    organization=os.environ.get("OPENAI_ORG_ID"),  # None when the variable is unset
)


In [None]:
### Loading csv file into memory

# Get the current working directory
current_directory = os.getcwd()

# File name
file_name = "kw_grouped_postprocess.csv"

# File path
file_path = os.path.join(current_directory, file_name)

# Fail loudly when the input is missing. Merely printing a message would leave
# 'kw_hit_grouped_df' undefined (or stale from an earlier run of this kernel),
# and the problem would only surface in a later cell.
if not os.path.exists(file_path):
    raise FileNotFoundError(f"Error: File '{file_name}' not found in the current directory.")

# Load the CSV file into the DataFrame used by the rest of the notebook
kw_hit_grouped_df = pd.read_csv(file_path)
print("File loaded successfully.")

In [None]:
### This function sends raw text to GPT chat completion for cleanup.  Temperature set to 0.0 and max tokens = 2500

def OCR_clean(text, model="gpt-3.5-turbo", max_tokens=2500, temperature=0.0):
    """
    Send raw text to an OpenAI chat-completion model for OCR cleanup.

    Parameters
    ----------
    text : str
        The poorly scanned text to clean. It is interpolated verbatim into the
        user prompt.
    model : str, optional
        Chat model name passed to the API (default "gpt-3.5-turbo").
    max_tokens : int, optional
        Upper bound on the length of the completion (default 2500).
    temperature : float, optional
        Sampling temperature passed to the API (default 0.0).

    Returns
    -------
    str
        The model's reply with surrounding whitespace stripped. An empty string
        is returned, without calling the API, when `text` is None, NaN or
        blank, and also when the reply carries no text content.

    Notes
    -----
    Uses the module-level `client`. Exceptions raised by the API call are not
    caught here and propagate to the caller.
    """
    # Missing values (None, or the float NaN pandas uses for empty cells) and
    # blank strings have nothing to clean; sending them would only spend an API
    # call on the literal text 'nan' or ''.
    is_missing = text is None or (isinstance(text, float) and text != text)
    if is_missing or not str(text).strip():
        return ""

    # Define the prompt for the OpenAI model
    prompt = (f" Clean up the supplied text string, focusing on correcting poor OCR and eliminating unwanted line breaks: '{text}'. "
              "Return a text string with cleaned text.")

    # Call the OpenAI chat-completions endpoint
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "system", "content": "Clean up poorly scanned text."},
                  {"role": "user", "content": prompt}],
        max_tokens=max_tokens,
        temperature=temperature,
    )

    # The message content may be None; treat that as an empty reply rather than
    # failing on .strip().
    content = response.choices[0].message.content
    return (content or "").strip()

# Example usage:
# text_input = "The mission to explore Mars has captured the imagination of people globally."
# result = OCR_clean(text_input)
# print(result)

In [None]:
# Main Execution Loop

# Function to process each row
def process_row(article):
    """Clean one article's text by delegating to OCR_clean and return the result."""
    cleaned = OCR_clean(article)
    return cleaned

# Adding a new column 'GPT_clean' to the dataframe
def parallel_ocr_clean(df, max_workers=5):
    """
    Clean every value of df['article'] in a thread pool and store the results
    in df['GPT_clean'].

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an 'article' column. It is modified in place.
    max_workers : int, optional
        Number of worker threads submitting process_row calls (default 5).

    Returns
    -------
    pandas.DataFrame
        The same `df` object, returned for convenience.

    Notes
    -----
    A row whose processing raises gets None in 'GPT_clean', and the exception
    is printed together with the row's index label. Results are written by
    index label, so the index is assumed to be unique -- TODO confirm for
    frames not loaded with a default index.
    """
    # Make sure the target column exists with object dtype before any result is
    # written. Otherwise it is never created for an empty frame, and its dtype
    # would depend on whichever value happens to be written first.
    if 'GPT_clean' in df.columns:
        df['GPT_clean'] = df['GPT_clean'].astype(object)
    else:
        df['GPT_clean'] = pd.Series([None] * len(df), index=df.index, dtype=object)

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Map each future back to the index label of the row it belongs to.
        future_to_index = {
            executor.submit(process_row, article): idx
            for idx, article in df['article'].items()
        }

        # Create a tqdm progress bar
        with tqdm(total=len(df), desc="Processing", unit="row") as pbar:
            # Results are consumed in completion order, not submission order.
            for future in as_completed(future_to_index):
                idx = future_to_index[future]
                try:
                    df.at[idx, 'GPT_clean'] = future.result()
                except Exception as e:
                    # Best effort: record the failure and keep going with the other rows.
                    df.at[idx, 'GPT_clean'] = None
                    print(f"Exception for index {idx}: {e}")
                pbar.update(1)  # Update the progress bar

    return df

# Kick off the threaded cleanup of every article in kw_hit_grouped_df
parallel_ocr_clean(kw_hit_grouped_df)

# Reminder only -- this cell does not write anything to disk
print("Finished!  Don't forget to save to disk!")

In [None]:
# DataFrame.info is a method: call it to get the column/dtype/non-null summary.
# It prints the summary itself, so no print() wrapper is needed.
kw_hit_grouped_df.info()

In [None]:
# Write the dataframe to a CSV in the working directory, leaving the index out of the file
kw_hit_grouped_df.to_csv(path_or_buf="AS_explor_cleaned.csv", index=False)