#### **1. Load packages and env, set up client, paths etc**

In [None]:
from dotenv import load_dotenv
from openai import OpenAI
import os 
import fitz
import ast
import time
from langchain.schema import Document
import json
import pandas as pd
import matplotlib.pyplot as plt

## Load environment variables from the .env file.
## This runs first because the client and every os.getenv call below read
## from the process environment.
load_dotenv()

## Set up client
client = OpenAI() # defaults to getting the key using os.environ.get("OPENAI_API_KEY")
MODEL = "gpt-4o-mini"  # model name passed to every chat completion request in this file

## Folder paths set here (os.getenv returns None when a variable is not set)
pdfFolder = os.getenv('PDF_AVSLAG_PATH')  # input folder with the PDF files
txtFolder = os.getenv('TXT_AVSLAG_PATH')  # output folder for the extracted .txt files

# For plot outputs: file paths the two bar charts are saved to
deniedPosPlotPath = os.getenv('DENIED_POS_PLOT')
deniedNegPlotPath = os.getenv('DENIED_NEG_PLOT')

commentsPath = os.getenv('JSON_AVSLAG_COMMENTS')  # JSON file that receives the summarized comments
catsPath = os.getenv('JSON_AVSLAG_CATS')  # JSON file holding the categorized comments per file
imgPath = os.getenv('PLOT_PATH')  # NOTE(review): not referenced in the visible code - confirm it is still needed


#### **2. Helper functions**

In [None]:
def writeText(filePath, content):
    """
    FUNCTION: Creates (or overwrites) the .txt-file at filePath and writes content to it.
              The file is always encoded as UTF-8, so non-ASCII characters are
              written the same way on every platform instead of depending on
              the locale's default encoding.
    INPUT: Path to write to, and the string to write
    OUTPUT: none
    """
    with open(filePath, 'w', encoding='utf-8') as file:
        file.write(content)


def cleanJSONresponse(jsonResponse):
    """
    FUNCTION: Turns the text of an openai API-response into a parsed JSON object.
              Everything before the first '{' and after the last '}' is dropped,
              which removes code fences or explanatory text around the JSON.
    INPUT: content part of Response from openAI API client (a string)
    OUTPUT: the parsed JSON found between the first '{' and the last '}',
            or None (after printing the reason) when no valid JSON is found
    """
    try:
        # str.index / str.rindex raise ValueError when a brace is missing
        start = jsonResponse.index('{')
        end = jsonResponse.rindex('}') + 1
        return json.loads(jsonResponse[start:end])
    except (ValueError, json.JSONDecodeError) as e:
        print(f"Failed to clean and parse JSON: {e}")
        return None


def readText(filePath):
    """
    FUNCTION: Returns the content of the .txt file at filePath.
              The file is decoded as UTF-8, matching what writeText produces.
    INPUT: Filepath
    OUTPUT: Content of the .txt-file as a string
    """
    with open(filePath, 'r', encoding='utf-8') as file:
        return file.read()

def loadOnePDF(filePath):
    """
    FUNCTION: Loads one pdf into a single Document
    INPUT: Path of pdf-file
    OUTPUT: Document whose page_content is the text of all pages, in page order,
            and whose metadata stores the file path under "source"
    """
    with fitz.open(filePath) as pdf:
        pageTexts = [page.get_text() for page in pdf]
    return Document(page_content="".join(pageTexts), metadata={"source": filePath})

def loadPDFs(folderPath):
    """
    FUNCTION: Loads all pdf-files in a folder to documents
    INPUT: Path of folder containing pdf-files
    OUTPUT: Document list, one Document per pdf-file, ordered by filename

    The extension check is case-insensitive (".PDF" is accepted) and entries
    that are not regular files are skipped, so a sub-folder whose name happens
    to end in ".pdf" is never passed to the PDF loader.
    """
    docs = []
    # sorted() makes the processing order reproducible; os.listdir gives no ordering guarantee
    for filename in sorted(os.listdir(folderPath)):
        filePath = os.path.join(folderPath, filename)
        if filename.lower().endswith(".pdf") and os.path.isfile(filePath):
            docs.append(loadOnePDF(filePath))
    return docs

def PDFtoTXT(pdfFolder, txtFolder, verbose = False):
    """
    FUNCTION: Takes a folder of pdfs and fills txtFolder with .txt files - one per pdf, holding its text.
    INPUT: path to the folder of pdfs, path to the output folder (created if missing),
           and verbose (print one line per converted file when True)
    OUTPUT: none; for each pdf "<name>.pdf" a file "<name>.txt" is written to txtFolder
    """
    # The destination has to exist before any file is written into it
    os.makedirs(txtFolder, exist_ok=True)

    for pdfDoc in loadPDFs(pdfFolder):
        # Same base name as the pdf, with the extension swapped for .txt
        sourceName = os.path.basename(pdfDoc.metadata['source'])
        stem, _extension = os.path.splitext(sourceName)
        targetName = stem + '.txt'

        writeText(os.path.join(txtFolder, targetName), pdfDoc.page_content)
        if verbose:
            print(f"Converted {sourceName} to {targetName}")


def hasComments(filepath):
    """
    FUNCTION: Asks GPT to identify whether a txt document has comments or not, i.e. separates sifted from non-sifted documents.
    INPUT: path to txtFile
    OUTPUT: True if the model answers "true", otherwise False

    The answer is normalized before comparison: surrounding whitespace, quotes,
    backticks and trailing punctuation are removed, so replies such as
    '"True"' or 'True.' are still recognized. A missing (None) reply counts as False.
    """
    fileContent = readText(filepath)

    messages = [
        {"role": "system", "content": "You are a helpful assistant that determines whether comments are present for any of the graded categories in an application review."},
        {"role": "user", "content": f"""
    Please check the following text and determine if there are comments for any of the categories that have been graded. Return "True" if there are comments for any or all of the categories. If there are no comments, return "False".
    {fileContent}
    """}
    ]

    response = client.chat.completions.create(
        model=MODEL,
        messages=messages,
        temperature=0.7
    )

    # message.content can be None; treat that as "no answer"
    answer = response.choices[0].message.content or ""
    # The prompt shows the expected answers in quotes, so the model may echo them
    normalized = answer.strip(' \t\r\n"\'`.!').lower()
    return normalized == 'true'

def getComments(filepath):
    """
    FUNCTION: Asks GPT to summarize the positive and negative comments of one txt document.
    INPUT: path to txtFile
    OUTPUT: parsed JSON as returned by cleanJSONresponse
            (None when the reply contains no valid JSON)
    """
    fileContent = readText(filepath)

    systemPrompt = "You are method in a script. You extract and summarizes the essence of comments from research proposal reviews, identifying both positive and negative remarks. Your output is in JSON format with two keys: 'positive' and 'negative', each containing a list of short comments."
    userPrompt = f"""
        Extract and summarize the comments from the following text. Identify both positive and negative remarks and present them as two separate lists of short comments in JSON format with keys 'positive' and 'negative'.

        {fileContent}
        """

    completion = client.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": systemPrompt},
            {"role": "user", "content": userPrompt},
        ],
        temperature=0.7
    )

    # The reply may wrap the JSON in extra text; cleanJSONresponse extracts and parses it
    replyText = completion.choices[0].message.content.strip()
    return cleanJSONresponse(replyText)

def extractJSON(response):
    """
    FUNCTION: Returns the text of a chat completion with any surrounding markdown code fence removed.
    INPUT: chat completion response (the text is read from response.choices[0].message.content)
    OUTPUT: the stripped content as a string (not yet parsed); "" when the content is None

    Whitespace is stripped BEFORE the fence check, so a reply that starts with a
    newline or ends with a newline after the closing ``` is still unwrapped.
    Both "```json" and a bare "```" opening fence are handled.
    """
    content = response.choices[0].message.content
    if content is None:
        return ""

    content = content.strip()

    # Remove the opening fence and the optional "json" language tag
    if content.startswith('```'):
        content = content[3:]
        if content[:4].lower() == 'json':
            content = content[4:]
    # Remove the closing fence
    if content.endswith('```'):
        content = content[:-3]

    return content.strip()

# for each file, input comments and receive true/false for that file based on predefined categories
def categorizeComments(comments, positiveCat, negativeCat):
    """
    FUNCTION: Asks GPT which of the predefined categories are present in the comments of one file.
    INPUT: the comments of one file, the list of positive category names, the list of negative category names
    OUTPUT: the model's reply as a string with code fences removed by extractJSON (not parsed here)
    """
    # Number the categories starting at 1, one per line
    numberedPositive = '\n'.join(f"{number}. {label}" for number, label in enumerate(positiveCat, start=1))
    numberedNegative = '\n'.join(f"{number}. {label}" for number, label in enumerate(negativeCat, start=1))

    systemPrompt = f"""You are a helpful assistant that summarizes the essence of comments from research proposal reviews and matches them to predefined categories. The categories are as follows:

    Positive remarks:
    {numberedPositive}

    Negative remarks:
    {numberedNegative}
    """

    userPrompt = f"""
        Please summarize the comments from the following text. For each comment, identify which (if any) of the predefined categories it matches best - if it does not match any of the predefined categories, identify it as "OTHER". Return a JSON format of all categories, with a true or false value depending on if the category is present in the comments or not. Do not include repeats of categories. Categorize them under 'Positive remarks' and 'Negative remarks'.

        {comments}
        """

    completion = client.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": systemPrompt},
            {"role": "user", "content": userPrompt},
        ],
        temperature=0.7
    )

    return extractJSON(completion)

# stores comments from all files
def processDirectory(dirpath):
    """
    FUNCTION: Collects the summarized comments of every .txt file in dirpath that contains comments.
    INPUT: path to the folder of .txt files
    OUTPUT: dict mapping filename -> comments as returned by getComments

    Files are visited in filename order so runs are reproducible. A file whose
    summary could not be parsed (getComments returned None) is reported and
    left out, so later steps never receive a None entry.
    """
    commentsDict = {}

    for filename in sorted(os.listdir(dirpath)):
        if not filename.endswith('.txt'):
            continue

        filepath = os.path.join(dirpath, filename)
        if not hasComments(filepath):
            continue

        comments = getComments(filepath)
        if comments is None:
            print(f"Skipping {filename}: could not parse the comment summary returned by the model")
            continue
        commentsDict[filename] = comments

    return commentsDict

# turns the model's category reply into a plain list of category names
def _parseCategoryList(rawText):
    """
    FUNCTION: Parses a reply that should contain a comma separated list of category names.
    INPUT: the reply text
    OUTPUT: list of non-empty category name strings

    Accepts a Python literal (list or tuple, with or without brackets), optionally
    wrapped in a markdown code fence, and falls back to splitting on commas when
    the names are not valid Python literals (e.g. unquoted names).
    """
    text = (rawText or "").strip()

    # Drop a surrounding markdown code fence and its optional language tag
    if text.startswith('```'):
        text = text.strip('`').strip()
        for tag in ('python', 'json'):
            if text.lower().startswith(tag):
                text = text[len(tag):].strip()
                break

    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError):
        parsed = None

    if isinstance(parsed, (list, tuple, set)):
        names = [str(item).strip() for item in parsed]
        return [name for name in names if name]

    # A single quoted string may itself hold the comma separated names
    if isinstance(parsed, str):
        text = parsed

    parts = (part.strip().strip('\'"').strip() for part in text.strip('[]()').split(','))
    return [part for part in parts if part]


# identifies categories based on comments
def findCategories(commentsSummary):
    """
    FUNCTION: Asks GPT for the 10 most frequent positive and negative categories across all comments.
    INPUT: dict mapping filename -> {'positive': [...], 'negative': [...]}
    OUTPUT: {'positive': list of category names, 'negative': list of category names}

    Entries that are not dicts (e.g. None from a failed parse) are ignored
    instead of raising.
    """
    # Extract all positive and negative comments
    commentsPositive = []
    commentsNegative = []

    for comments in commentsSummary.values():
        if not isinstance(comments, dict):
            continue
        # "or []" also covers keys that are present but hold null
        commentsPositive.extend(comments.get('positive') or [])
        commentsNegative.extend(comments.get('negative') or [])

    # Create messages for OpenAI client
    msgsPos = [
        {"role": "system", "content": "You are a method in a script. Your input is a list of positive remarks as JSON. Your output should be a list (python) of 10 category names that best represent these remarks. Make sure that the categories include descriptive words. Don't just write ''Work plan'' but rather ''Clear work plan'' or ''well structured plan'' etc. Identify the 10 most frequent categories for the following positive remarks and return only the category names in the following format: ''category 1, category 2 ,...,category 9, category 10'' so that if inserted in between two brackets, it works as a list."},
        {"role": "user", "content": f"{commentsPositive}"}
    ]

    msgsNeg = [
        {"role": "system", "content": "You are a method in a script. Your input is a list of negative remarks as JSON. Your output should be a list (python) of 10 category names that best represent these remarks. Make sure that the categories include descriptive words. Don't just write ''Work plan'' but rather ''Lacking work plan'' or ''poorly structured plan'' etc Identify the 10 most frequent categories for the following negative remarks and return only the category names in the following format: ''category 1, category 2 ,...,category 9, category 10'' so that if inserted in between two brackets, it works as a list."},
        {"role": "user", "content": f"{commentsNegative}"}
    ]

    # Send requests to OpenAI
    response_positive = client.chat.completions.create(
        model=MODEL,
        messages=msgsPos,
        temperature=0.7
    )

    response_negative = client.chat.completions.create(
        model=MODEL,
        messages=msgsNeg,
        temperature=0.7
    )

    # The reply format is not guaranteed to be a valid Python literal, so parse it leniently
    catPos = _parseCategoryList(response_positive.choices[0].message.content)
    catNeg = _parseCategoryList(response_negative.choices[0].message.content)

    return {'positive': catPos, 'negative': catNeg}

def saveJSON(JSONpath, data):
    """
    FUNCTION: Writes data as indented JSON to JSONpath (overwriting the file) and prints a confirmation.
    INPUT: path of the JSON file, and the JSON-serializable data to store
    OUTPUT: none
    """
    with open(JSONpath, 'w') as target:
        json.dump(data, target, indent=4)

    confirmation = f"JSON data has been written to {JSONpath}"
    print(confirmation)

def loadExistingData(JSONpath):
    """
    FUNCTION: Loads previously stored data from a JSON file.
    INPUT: path of the JSON file
    OUTPUT: the parsed content of the file, or an empty dict when the file does not exist
    """
    # Nothing stored yet: start from an empty dict
    if not os.path.exists(JSONpath):
        return {}

    with open(JSONpath, 'r') as storedFile:
        return json.load(storedFile)

# takes json, returns dataframe
def jsonDF(JSONpath):
    """
    FUNCTION: Flattens the nested JSON of categorized comments into a long-format dataframe.
    INPUT: path of a JSON file shaped as {file name: {remark type: {category: value}}}
    OUTPUT: dataframe with one row per (file, remark type, category) and the
            columns 'file_name', 'remark_type', 'category' and 'value'
    """
    with open(JSONpath, 'r') as handle:
        nested = json.load(handle)

    # Walk the three nesting levels in order and emit one row per innermost entry
    rows = [
        {
            'file_name': fileName,
            'remark_type': remarkType,
            'category': categoryName,
            'value': flag
        }
        for fileName, byRemarkType in nested.items()
        for remarkType, byCategory in byRemarkType.items()
        for categoryName, flag in byCategory.items()
    ]

    return pd.DataFrame(rows)

def _plotRemarkShare(df, remarkType, categories, savePath, barColor, title, yLabel):
    """
    FUNCTION: Draws a horizontal bar chart with, per category, the percentage of files in which it is flagged.
    INPUT: dataframe with the columns 'file_name', 'remark_type', 'category' and 'value',
           the remark type to select, the categories to include, the path to save
           the image to, and the bar color / title / y-axis label of the chart
    OUTPUT: none; the chart is saved to savePath
    """
    # Percentages are relative to ALL files in the dataframe, not only the flagged ones
    totalFiles = df['file_name'].nunique()

    selected = df[(df['value'] == True) & (df['remark_type'] == remarkType) & (df['category'].isin(categories))]
    share = selected.groupby('category')['file_name'].nunique() / totalFiles * 100
    share = share.sort_values(ascending=True)  # Sorting in ascending order

    fig = plt.figure(figsize=(10, 8))
    try:
        share.plot(kind='barh', color=barColor)
        plt.xlabel('Percentage of Files (%)')
        plt.title(title)
        plt.ylabel(yLabel)
        plt.grid(axis='x', linestyle='--', alpha=0.7)
        plt.tight_layout()
        plt.savefig(savePath)  # Save the plot to the specified path
    finally:
        # Close the figure even when plotting or saving fails, so it does not stay in memory
        plt.close(fig)


def plotPosRemarks(df, posCat, savePath):
    """
    FUNCTION: Saves a bar chart of how often each positive category occurs.
    INPUT: dataframe of categorized comments, list of positive categories, path to save the image to
    OUTPUT: none
    """
    _plotRemarkShare(df, 'Positive remarks', posCat, savePath,
                     'skyblue', 'Summary of Positive Remarks', 'Positive Categories')


def plotNegRemarks(df, negCat, savePath):
    """
    FUNCTION: Saves a bar chart of how often each negative category occurs.
    INPUT: dataframe of categorized comments, list of negative categories, path to save the image to
    OUTPUT: none
    """
    _plotRemarkShare(df, 'Negative remarks', negCat, savePath,
                     'salmon', 'Summary of Negative Remarks', 'Negative Categories')

# Function to calculate time difference in a readable format
def formatTimeDifference(start, end):
    """
    FUNCTION: Formats the time between start and end as "HH:MM:SS".
    INPUT: start and end in seconds
    OUTPUT: string; fractions of a second are dropped, negative differences
            are shown as "00:00:00", and the hour field keeps counting past 23
            (e.g. "25:00:05") instead of wrapping around after a day
    """
    totalSeconds = max(0, int(end - start))
    hours, remainder = divmod(totalSeconds, 60 * 60)
    minutes, seconds = divmod(remainder, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}"

# Function to print progress
def printProgress(current, total, startTime):
    """
    FUNCTION: Prints how many files are done and an estimate of the remaining time.
    INPUT: number of files done, total number of files, and the start time (as given by time.time())
    OUTPUT: none
    """
    secondsSoFar = time.time() - startTime

    # Average time per finished file times the number of files left; no estimate before the first file
    if current > 0:
        secondsLeft = (secondsSoFar / current) * (total - current)
    else:
        secondsLeft = 0

    estimate = formatTimeDifference(0, secondsLeft)
    print(f"Processed {current}/{total} files. Estimated time remaining: {estimate}")

#### **Main** 
This cell may take a while to run; observed run times have varied from 6 to 30 minutes. One example run produced the following output:

Durations:
1. Converting PDFs to TXTs: 00:00:00
2. Finding and summarizing comments: 00:04:12
3. Finding categories: 00:00:05
4. Categorizing comments: 00:03:28
Total time taken: 00:07:46
Total documents processed: 28

Percentage of total time each step took:
1. Converting PDFs to TXTs: 0.07%
2. Finding and summarizing comments: 54.06%
3. Finding categories: 1.12%
4. Categorizing comments: 44.75%

In [None]:
## Time management
startTime = time.time()
pdfToTxtStart = startTime

# Step 1: convert every PDF in pdfFolder to a .txt file in txtFolder
PDFtoTXT(pdfFolder, txtFolder, verbose=False)
print('PDFtoTXT done')

## Time management
pdfToTxtEnd = time.time()
commentsSummaryStart = pdfToTxtEnd

# Step 2: summarize the comments of every .txt file that has comments, and store them
commentsSummary = processDirectory(txtFolder)
with open(commentsPath, 'w') as outfile:
    json.dump(commentsSummary, outfile, indent=4)
print('Comments have been summarized and saved')

## Time management
commentsSummaryEnd = time.time()
categoriesStart = commentsSummaryEnd

# Step 3: derive the positive and negative categories from all comments
categories = findCategories(commentsSummary)
print('Categories have been identified')

## Time management
categoriesEnd = time.time()

# Load existing data once
existingData = loadExistingData(catsPath)

# Step 4: update the existing data with new categorized comments
totalFiles = len(commentsSummary)
categorizationStart = categoriesEnd

for i, (fileName, comments) in enumerate(commentsSummary.items(), 1):
    categorizedComments = categorizeComments(comments, categories['positive'], categories['negative'])
    try:
        existingData[fileName] = json.loads(categorizedComments)  # categorizedComments is expected to be a JSON string
    except json.JSONDecodeError as e:
        # One unusable reply should not abort the run and discard the files already categorized
        print(f"Skipping {fileName}: categorization response was not valid JSON ({e})")
    # Report AFTER the file is handled so the count and the time estimate include it
    printProgress(i, totalFiles, categorizationStart)

## Time management
categorizationEnd = time.time()

# Save the updated data to the JSON file once
saveJSON(catsPath, existingData)

## Time management and prints
pdfToTxtDuration = pdfToTxtEnd - pdfToTxtStart
commentsSummaryDuration = commentsSummaryEnd - commentsSummaryStart
categoriesDuration = categoriesEnd - categoriesStart
categorizationDuration = categorizationEnd - categorizationStart
totalDuration = categorizationEnd - startTime

# Print the durations
print("\nDurations:")
print(f"1. Converting PDFs to TXTs: {formatTimeDifference(0, pdfToTxtDuration)}")
print(f"2. Finding and summarizing comments: {formatTimeDifference(0, commentsSummaryDuration)}")
print(f"3. Finding categories: {formatTimeDifference(0, categoriesDuration)}")
print(f"4. Categorizing comments: {formatTimeDifference(0, categorizationDuration)}")
print(f"Total time taken: {formatTimeDifference(0, totalDuration)}")
print(f"Total documents processed: {totalFiles}")

# Calculate and print the percentage of time each step took
# (totalPercentage is the duration that corresponds to one percent of the run)
totalPercentage = totalDuration / 100
print("\nPercentage of total time each step took:")
print(f"1. Converting PDFs to TXTs: {pdfToTxtDuration / totalPercentage:.2f}%")
print(f"2. Finding and summarizing comments: {commentsSummaryDuration / totalPercentage:.2f}%")
print(f"3. Finding categories: {categoriesDuration / totalPercentage:.2f}%")
print(f"4. Categorizing comments: {categorizationDuration / totalPercentage:.2f}%")

# Step 5: plot how often each category occurs, based on everything stored in catsPath
df = jsonDF(catsPath)

plotPosRemarks(df, categories['positive'],deniedPosPlotPath)
plotNegRemarks(df, categories['negative'],deniedNegPlotPath)