#### **1. Load packages and env, set up client, paths etc**

In [None]:
from langchain_community.document_loaders import PyPDFLoader, PyPDFDirectoryLoader
from dotenv import load_dotenv
from openai import OpenAI
import os 
import fitz # PyMuPDF
from langchain.schema import Document
import json
from collections import defaultdict
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.patches import Patch
import pandas as pd

## Load environment variables from the .env file
## (must run before the client and the folder paths below are read from the environment)
load_dotenv()

## Set up client
client = OpenAI() # defaults to getting the key using os.environ.get("OPENAI_API_KEY")
## Chat model used by extractGrades for every extraction request
MODEL = "gpt-4o-mini"

## Folder paths set here
## Each value is read from the environment; os.getenv returns None when the variable is not set.
## "Avslag" folders hold the denied applications and "Beviljade" folders the granted ones
## (pdf = source documents, txt = extracted plain text).
pdfAvslagFolder = os.getenv('PDF_AVSLAG_PATH')
txtAvslagFolder = os.getenv('TXT_AVSLAG_PATH')
pdfBeviljadeFolder = os.getenv('PDF_BEV_PATH')
txtBeviljadeFolder = os.getenv('TXT_BEV_PATH')

#### **2. Helper functions**

In [None]:
def writeText(filePath, content):
    ## FUNCTION: Creates (or overwrites) the .txt-file at filePath and writes content to it
    ## INPUT: Path to write to, and the string content to write
    ## OUTPUT: none

    # Always write UTF-8 instead of the platform's locale encoding, so text with
    # non-ASCII characters (e.g. å, ä, ö) is stored the same way on every machine.
    with open(filePath, 'w', encoding='utf-8') as file:
        file.write(content)

def readText(filePath):
    ## FUNCTION: Returns content from a .txt file at filePath
    ## INPUT: Filepath
    ## OUTPUT : Content of the .txt-file as a string

    # Read with the same fixed encoding that writeText uses, so a round trip
    # never depends on the locale of the machine running the notebook.
    with open(filePath, 'r', encoding='utf-8') as file:
        return file.read()

def loadOnePDF(filePath):
    ## FUNCTION: Reads one pdf and merges the text of all its pages into a single document
    ## INPUT: Path to pdf-file
    ## OUTPUT: Document whose page_content is the text of every page in page order,
    ##         with the pdf path stored under metadata["source"]

    with fitz.open(filePath) as pdf:
        pageTexts = [page.get_text() for page in pdf]
    return Document(page_content="".join(pageTexts), metadata={"source": filePath})

def loadPDFs(folderPath):
    ## FUNCTION: Take folderpath, load all pdfs to separate documents
    ## INPUT: Path to folder
    ## OUTPUT: List of documents (one per pdf-file, sorted by filename) with the
    ##         text content of each pdf

    docs = []
    # os.listdir returns names in arbitrary order; sorting makes every run
    # process the files in the same order.
    for filename in sorted(os.listdir(folderPath)):
        # Match the extension case-insensitively so files such as "SCAN.PDF"
        # are not silently skipped.
        if not filename.lower().endswith(".pdf"):
            continue
        filePath = os.path.join(folderPath, filename)
        # Skip directories that merely happen to be named like a pdf
        if not os.path.isfile(filePath):
            continue
        docs.append(loadOnePDF(filePath))
    return docs

def PDFtoTXT(pdfFolder, txtFolder, verbose = False):
    ## FUNCTION: Converts every pdf found in pdfFolder to a .txt file in txtFolder,
    ##           one .txt per pdf, holding that pdf's text content.
    ## INPUT: Path to the folder of pdfs, path to the folder that receives the .txt files,
    ##        and verbose (True prints one line per converted document)
    ## OUTPUT: none - the result is the .txt files written to txtFolder

    # Create the destination folder up front; no error if it is already there
    os.makedirs(txtFolder, exist_ok=True)

    for pdfDocument in loadPDFs(pdfFolder):
        # Same base name as the pdf, with the extension swapped for .txt
        sourceName = os.path.basename(pdfDocument.metadata['source'])
        stem, _extension = os.path.splitext(sourceName)
        targetName = stem + '.txt'

        writeText(os.path.join(txtFolder, targetName), pdfDocument.page_content)
        if not verbose:
            continue
        print(f"Converted {sourceName} to {targetName}")

def extractGrades(fileContent):
    ## FUNCTION: Asks the openai API to extract the categories and grades from a document.
    ## INPUT: The content of a .txt file (a string, that is)
    ## OUTPUT: The content part of the response from the openai API client, always a string
    ##         (an empty string when the API returned no text content)

    ## Construct messages
    messages = [
        {"role": "system", "content": "You are a helpful assistant that extracts categories and grades from text."},
        {"role": "user", "content": f"""
        Extract categories and grades from the following text and provide them in JSON format where categories are keys and grades are values. Each grade is an integer between 1 and 7:
        {fileContent}
        """}
    ]

    ## Generate response
    # temperature=0: this is data extraction, not creative writing, so the same
    # document should give (as far as the API allows) the same grades on every run.
    response = client.chat.completions.create(
        model=MODEL,
        messages=messages,
        temperature=0
    )

    # The message content can be None (e.g. when the model returns no text);
    # hand back an empty string so the JSON clean-up step reports a parse
    # failure instead of crashing on a None value.
    content = response.choices[0].message.content
    return content if content is not None else ""

def cleanJSONresponse(jsonResponse):
    ## FUNCTION: Turns an openai API-response to a usable json
    ## INPUT: content part of Response from openAI API client
    ## OUTPUT: The parsed JSON object (a dict) found in the response, or None when
    ##         the response is not a string or holds no parsable JSON object.
    ##         A message is printed whenever None is returned.

    # A missing response (None) would otherwise raise an uncaught AttributeError
    if not isinstance(jsonResponse, str):
        print(f"Failed to clean and parse JSON: expected a string, got {type(jsonResponse).__name__}")
        return None

    # Locate the outermost braces; anything outside them (markdown fences,
    # explanatory sentences) is ignored.
    start = jsonResponse.find('{')
    end = jsonResponse.rfind('}') + 1
    if start == -1 or end <= start:
        print("Failed to clean and parse JSON: no JSON object found in response")
        return None

    try:
        return json.loads(jsonResponse[start:end])
    except json.JSONDecodeError as e:
        outerError = e

    # The first-to-last-brace slice fails when text after the JSON object
    # contains a stray '}'. Fall back to decoding exactly one object starting
    # at the first '{' and ignoring whatever follows it.
    try:
        parsed, _ = json.JSONDecoder().raw_decode(jsonResponse[start:])
        return parsed
    except json.JSONDecodeError:
        print(f"Failed to clean and parse JSON: {outerError}")
        return None
 
def txtFolder2json(txtFolder):
    ## FUNCTION: Extracts categories and grades from every .txt file in txtFolder and
    ##           saves them to a .json-file in the current working directory. The file is
    ##           named after the folder, with a trailing 'txt' removed when present
    ##           (e.g. folder 'avslagtxt' -> 'avslag.json').
    ## INPUT: Filepath to folder with .txt files
    ## OUTPUT: none

    folderName = os.path.basename(os.path.normpath(txtFolder))
    # Only strip the 'txt' suffix when it is really there; blindly dropping the
    # last three characters would mangle any folder that is named differently.
    besked = folderName[:-3] if folderName.lower().endswith('txt') else folderName
    if not besked:
        # Folder is literally called 'txt': keep the name rather than writing '.json'
        besked = folderName

    # initialize a dictionary to store the extracted data from each file
    extractedData = {}

    # read content from each .txt file and extract categories and grades
    # (sorted, because os.listdir gives no ordering guarantee)
    for filename in sorted(os.listdir(txtFolder)):
        if not filename.endswith('.txt'):
            continue
        fileContent = readText(os.path.join(txtFolder, filename))
        jsonSummary = extractGrades(fileContent)

        jsonCleaned = cleanJSONresponse(jsonSummary)
        if jsonCleaned is not None:
            extractedData[filename] = jsonCleaned
        else:
            # Keep going: one unparsable answer should not lose the other files
            print(f"Failed to parse JSON for file {filename}")
            print("GPT Response:", jsonSummary)

    jsonName = besked + '.json'
    # ensure_ascii=False keeps characters such as å/ä/ö readable in the file,
    # which is why the file is written explicitly as UTF-8.
    with open(jsonName, "w", encoding="utf-8") as json_file:
        json.dump(extractedData, json_file, indent=4, ensure_ascii=False)

#### **3. Generate .txt from PDF folders**

In [None]:
## Convert both pdf folders to txt. Set "verbose = True" to print name of each document processed.
for pdfSourceFolder, txtTargetFolder in (
    (pdfAvslagFolder, txtAvslagFolder),        ## Avslag
    (pdfBeviljadeFolder, txtBeviljadeFolder),  ## Beviljade
):
    PDFtoTXT(pdfSourceFolder, txtTargetFolder, verbose = False)

#### **4. Generate and save .json from txtFolder**
This cell might take a minute or three to run depending on how many documents are in the folders. 

In [None]:
## Build one .json per outcome folder; the granted (Beviljade) folder is processed first
for outcomeTxtFolder in (txtBeviljadeFolder, txtAvslagFolder):
    txtFolder2json(outcomeTxtFolder)

#### **5. Visualize statistics based on extracted data**
With the above json-files, visualize success rate for each category depending on grade.

In [None]:
# Load both granted and denied data.
# The files are written as UTF-8 with ensure_ascii=False, so they must be read
# back as UTF-8 too; otherwise category names containing å/ä/ö are garbled (or
# fail to decode) on machines whose default encoding is not UTF-8.
with open('beviljade.json', 'r', encoding='utf-8') as file:
    grantedData = json.load(file)

with open('avslag.json', 'r', encoding='utf-8') as file:
    deniedData = json.load(file)

# Select category using the categories in the first document
if not grantedData:
    raise ValueError("beviljade.json contains no documents, so there is no category to plot")
firstDoc = next(iter(grantedData.values()))
if not firstDoc:
    raise ValueError("The first document in beviljade.json has no categories to plot")
category = list(firstDoc.keys())[0] # change index here to change categories
# print(f"Selected category: {category}")


def _toGrade(value):
    ## FUNCTION: Normalises a grade taken from the extracted JSON
    ## INPUT: The raw value stored for a category (produced by the language model,
    ##        so it may be an int, a float, a numeric string, None, ...)
    ## OUTPUT: The grade as an int between 1 and 7, or None if the value is not such a grade
    if isinstance(value, bool):
        # bool is a subclass of int; True/False are not grades
        return None
    try:
        number = float(value)
    except (TypeError, ValueError):
        return None
    if not number.is_integer():
        # also rejects nan and +/-inf
        return None
    grade = int(number)
    return grade if 1 <= grade <= 7 else None


# Dictionary to store counts for each grade (1-7)
gradesCount = {i: {'granted': 0, 'denied': 0} for i in range(1, 8)}

# Count granted and denied grades for chosen category.
# Documents that lack the category, or hold an unusable grade, are skipped.
for status, data in (('granted', grantedData), ('denied', deniedData)):
    for grades in data.values():
        grade = _toGrade(grades.get(category))
        if grade is not None:
            gradesCount[grade][status] += 1

# Filter out grades with zero counts
filteredGrades = [
    g for g in range(1, 8)
    if gradesCount[g]['granted'] > 0 or gradesCount[g]['denied'] > 0
]
filteredGrantedCounts = [gradesCount[g]['granted'] for g in filteredGrades]
filteredDeniedCounts = [gradesCount[g]['denied'] for g in filteredGrades]

# Calculate success rates in percentages.
# Every remaining grade has at least one application, so the total is never zero.
successRates = [
    (granted / (granted + denied)) * 100
    for granted, denied in zip(filteredGrantedCounts, filteredDeniedCounts)
]

# Plotting the stacked bar chart
fig, ax1 = plt.subplots(figsize=(14, 8))

bar_width = 0.35
index = np.arange(len(filteredGrades))

# Define colors
barColorsGranted = ['cornflowerblue'] * len(filteredGrades)
barColorsDenied = ['salmon'] * len(filteredGrades)

# Denied counts are stacked on top of the granted counts
bar1 = ax1.bar(index, filteredGrantedCounts, bar_width, color=barColorsGranted, label = 'Granted')
bar2 = ax1.bar(index, filteredDeniedCounts, bar_width, bottom=filteredGrantedCounts, color=barColorsDenied, label = 'Rejected')

ax1.set_xlabel(category, fontsize=16)
ax1.set_ylabel('Applications', fontsize=16)
ax1.set_xticks(index)
ax1.set_xticklabels(filteredGrades, fontsize=16)
ax1.legend(loc='upper left', fontsize=16)

# Add grid lines
ax1.grid(True, which='both', linestyle='--', linewidth=0.5)

# Add data labels, each centred vertically inside its own bar segment
for x, grantedCount, deniedCount in zip(index, filteredGrantedCounts, filteredDeniedCounts):
    ax1.text(x, grantedCount / 2, grantedCount, ha='center', va='center', color='black', fontsize=12, fontweight='bold')
    ax1.text(x, grantedCount + deniedCount / 2, deniedCount, ha='center', va='center', color='black', fontsize=12, fontweight='bold')

# Create a secondary y-axis for the success rate
ax2 = ax1.twinx()
ax2.plot(index, successRates, color='darkgreen', marker='o', linestyle='-', linewidth=2, markersize=8, label='Success rate')
ax2.set_ylabel('Success rate (%)', fontsize=14)
ax2.set_ylim(0, 100)
ax2.legend(loc='upper right', fontsize=16)

# Add data labels for success rate
for x, rate in zip(index, successRates):
    ax2.text(x, rate, f'{rate:.1f}%', ha='center', va='bottom', color='darkgreen', fontsize=12, fontweight='bold')

# Improve layout
fig.tight_layout()
plt.show()

#### **6. Output data to other formats**
Create a dataframe and an .xlsx file for user-friendly purposes.

In [None]:
# Load both granted and denied.
# The files are written as UTF-8 with ensure_ascii=False, so read them back as
# UTF-8 rather than with the platform's default encoding.
with open('beviljade.json', 'r', encoding='utf-8') as file:
    grantedData = json.load(file)

with open('avslag.json', 'r', encoding='utf-8') as file:
    deniedData = json.load(file)

# Combine data into a list of dictionaries with an additional 'Status' key
# (granted documents first, then denied ones)
combinedData = []

for status, data in (('Granted', grantedData), ('Denied', deniedData)):
    for document, grades in data.items():
        # Copy so the loaded data is not modified by the extra keys
        combinedEntry = grades.copy()
        combinedEntry['Document'] = document
        combinedEntry['Status'] = status
        combinedData.append(combinedEntry)

# Convert the list of dictionaries to a DataFrame
df = pd.DataFrame(combinedData)

# Set 'Document' as the index if desired
df.set_index('Document', inplace=True)

# Display the DataFrame
print(df)

# Output excel-file.
# 'Document' now lives in the index, so the index has to be written
# (the to_excel default); with index=False the spreadsheet would lose the
# document names and its rows could no longer be told apart.
df.to_excel('output.xlsx')