#### **1. Load packages and environment variables, set up the client, and set paths**

In [1]:
from langchain_community.document_loaders import PyPDFLoader, PyPDFDirectoryLoader
from dotenv import load_dotenv
from openai import OpenAI
import os 
import fitz  # PyMuPDF
from langchain.schema import Document
import json
import pandas as pd

## Pull the variables defined in the .env file into the process environment
load_dotenv()

## OpenAI client; with no arguments the key is read via os.environ.get("OPENAI_API_KEY")
client = OpenAI()
MODEL = "gpt-4o-mini"

## Input/output folders, read from the environment (a value is None when its variable is unset)
pdfAvslagFolder = os.environ.get('PDF_AVSLAG_PATH')
txtAvslagFolder = os.environ.get('TXT_AVSLAG_PATH')
pdfBeviljadeFolder = os.environ.get('PDF_BEV_PATH')
txtBeviljadeFolder = os.environ.get('TXT_BEV_PATH')

#### **2. Helper Functions**

In [2]:
def writeText(filePath, content):
    ## FUNCTION: Creates (or overwrites) a .txt-file at filePath and writes content to it
    ## INPUT: Path to write to, and the string content to write
    ## OUTPUT: none
    ## NOTE: The file is always written as UTF-8. Without an explicit encoding, open()
    ##       falls back to the platform's locale encoding, so non-ASCII characters
    ##       could be stored differently (or fail to encode) from machine to machine.
    with open(filePath, 'w', encoding='utf-8') as file:
        file.write(content)

def readText(filePath):
    ## FUNCTION: Returns content from a .txt file at filePath
    ## INPUT: Filepath
    ## OUTPUT : Content of the .txt-file as a string
    ## NOTE: The file is always decoded as UTF-8. Without an explicit encoding, open()
    ##       uses the platform's locale encoding, which can mis-decode or reject
    ##       non-ASCII characters on machines whose default is not UTF-8.

    with open(filePath, 'r', encoding='utf-8') as file:
        content = file.read()
    return content

# Load every PDF found directly inside a folder
def loadPDFs(folderPath):
    ## FUNCTION: Calls loadOnePDF on each entry of folderPath whose name ends with ".pdf"
    ## INPUT: Path to a folder
    ## OUTPUT: List with one loadOnePDF result per matching file, in os.listdir order
    ## NOTE(review): loadOnePDF is not defined in the visible part of this notebook -
    ##               confirm it exists before running.
    pdfPaths = (
        os.path.join(folderPath, name)
        for name in os.listdir(folderPath)
        if name.endswith(".pdf")
    )
    return [loadOnePDF(path) for path in pdfPaths]

def PDFtoTXT(pdfFolder, txtFolder, verbose = False):
    ## FUNCTION: For every PDF loaded from pdfFolder, writes a .txt file with that document's text into txtFolder.
    ## INPUT: Path to the folder of pdfs, path to the output folder, and a flag that turns on progress printing
    ## OUTPUT: none (the .txt files are written to txtFolder)

    # Create the output folder when it is missing
    os.makedirs(txtFolder, exist_ok=True)

    for doc in loadPDFs(pdfFolder):
        # Reuse the PDF's base name, swapping its extension for .txt
        sourceName = os.path.basename(doc.metadata['source'])
        stem, _ = os.path.splitext(sourceName)
        targetName = stem + '.txt'

        # Store the document text in the output folder
        writeText(os.path.join(txtFolder, targetName), doc.page_content)
        if verbose:
            print(f"Converted {sourceName} to {targetName}")

def extractGrades(fileContent):
    ## FUNCTION: Asks the OpenAI API to extract the categories and grades from a document.
    ## INPUT: The content of a .txt file (a string, that is)
    ## OUTPUT: The content part of a response from openai API client
    ## NOTE: This is an extraction task, so the request uses temperature 0 and JSON mode
    ##       to keep the output as repeatable as possible and to get a bare JSON object
    ##       back instead of JSON wrapped in prose or markdown.

    ## Construct messages
    messages = [
        {"role": "system", "content": "You are a helpful assistant that extracts categories and grades from text."},
        {"role": "user", "content": f"""
        Extract categories and grades from the following text and provide them in JSON format where categories are keys and grades are values. Each grade is an integer between 1 and 7:
        {fileContent}
        """}
    ]

    ## Generate response
    ## JSON mode requires the word "JSON" to appear in the messages; the user prompt above contains it.
    response = client.chat.completions.create(
        model=MODEL,
        messages=messages,
        temperature=0,
        response_format={"type": "json_object"}
    )
    return response.choices[0].message.content

def cleanJSONresponse(jsonResponse):
    ## FUNCTION: Turns an openai API-response to a usable json
    ## INPUT: content part of Response from openAI API client
    ## OUTPUT: The parsed JSON object found between the first '{' and the last '}',
    ##         or None when the input is not a string or no valid JSON can be parsed

    # Non-string input (e.g. None) has no .index/.rindex and would raise an
    # AttributeError that the handler below does not catch, so reject it up front.
    if not isinstance(jsonResponse, str):
        print(f"Failed to clean and parse JSON: expected a string, got {type(jsonResponse).__name__}")
        return None

    try:
        # Slice from the first to the last curly brace to drop any surrounding text
        start = jsonResponse.index('{')
        end = jsonResponse.rindex('}') + 1
        json_str = jsonResponse[start:end]
        return json.loads(json_str)
    except (ValueError, json.JSONDecodeError) as e:
        # ValueError: a brace is missing; JSONDecodeError: the slice is not valid JSON
        print(f"Failed to clean and parse JSON: {e}")
        return None
 
def txtFolder2json(txtFolder):
    ## FUNCTION: Extracts categories and grades from every .txt file in txtFolder and saves them in a
    ##           .json-file in the current working directory. The file is named after txtFolder with a
    ##           trailing 'txt' removed (e.g. folder 'avslagtxt' -> 'avslag.json').
    ## INPUT: Filepath to folder with .txt files
    ## OUTPUT: none

    folderName = os.path.basename(os.path.normpath(txtFolder))
    # Only strip the suffix when it is really there. Blindly dropping the last three
    # characters would mangle the name of any folder that does not end in 'txt'.
    if len(folderName) > 3 and folderName.lower().endswith('txt'):
        besked = folderName[:-3]
    else:
        besked = folderName

    # initialize a dictionary to store the extracted data from each file
    extractedData = {}

    # read content from each .txt file and extract categories and grades
    # (sorted, because os.listdir order is arbitrary and would make the JSON key order vary between runs)
    for filename in sorted(os.listdir(txtFolder)):
        if not filename.endswith('.txt'):
            continue

        filePath = os.path.join(txtFolder, filename)
        fileContent = readText(filePath)
        jsonSummary = extractGrades(fileContent)

        jsonCleaned = cleanJSONresponse(jsonSummary)
        if jsonCleaned is not None:
            extractedData[filename] = jsonCleaned
        else:
            # Keep going with the remaining files, but show what could not be parsed
            print(f"Failed to parse JSON for file {filename}")
            print("GPT Response:", jsonSummary)

    jsonName = besked + '.json'
    with open(jsonName, "w", encoding="utf-8") as json_file:
        # ensure_ascii=False keeps non-ASCII characters readable in the saved file
        json.dump(extractedData, json_file, indent=4, ensure_ascii=False)

#### **3. Generate .txt from PDF folders**

In [3]:
## Convert both PDF folders to .txt. Pass verbose=True to print the name of each processed document.
PDFtoTXT(pdfFolder=pdfAvslagFolder, txtFolder=txtAvslagFolder, verbose=False)  ## Avslag
PDFtoTXT(pdfFolder=pdfBeviljadeFolder, txtFolder=txtBeviljadeFolder, verbose=False)  ## Beviljade

#### **4. Generate and save .json from txtFolder**
This cell might take a minute or three to run depending on how many documents are in the folders. 

In [4]:
## One .json file is written per folder: Beviljade first, then Avslag
txtFolder2json(txtFolder=txtBeviljadeFolder)
txtFolder2json(txtFolder=txtAvslagFolder)

beviljade
avslag


### I HAVE NOT CLEANED BELOW THIS CELL //FILIP 24/7-15:00

Trying to make a data frame using the JSON and see what happens.

In [None]:
# Read beviljade.json back from disk and parse it

with open("beviljade.json", encoding="utf-8") as json_file:
    beviljade = json.loads(json_file.read())

In [None]:
# Build a DataFrame from the loaded JSON

# orient='index' turns each top-level key into a row; reset_index then moves those
# keys out of the index into a regular column, which is renamed to 'Filename'
dfbev = (
    pd.DataFrame.from_dict(beviljade, orient='index')
    .reset_index()
    .rename(columns={'index': 'Filename'})
)

# Display the DataFrame
print(dfbev)

In [None]:
#dfbev.info()
#dfbev.describe() # summary statistics per column
#dfbev.loc[0] # first row

In [None]:
# Save the DataFrame to an Excel workbook, leaving out the row index
dfbev.to_excel(excel_writer='output.xlsx', index=False)

Some plotting below.

In [None]:
## V1: Grouped Bar Plot
 
import json
import pandas as pd
import matplotlib.pyplot as plt
 
# Load the data.
# The encoding is given explicitly: txtFolder2json writes its JSON as UTF-8 with
# ensure_ascii=False, so reading with the platform default could garble or reject
# non-ASCII characters.
with open('avslag.json', 'r', encoding='utf-8') as file:
    data = json.load(file)
 
# Convert the data to a DataFrame; transposing puts one top-level JSON key on each row
df = pd.DataFrame(data).T
 
# Create a grouped bar plot (one group per row, one bar per column)
df.plot(kind='bar', figsize=(12, 8))
plt.title('Scores for Each Category by Document')
plt.xlabel('Documents')
plt.ylabel('Scores')
# Place the legend outside the axes so it does not cover the bars
plt.legend(title='Categories', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()
 