## **1. Load packages and env, set up client, paths etc**

In [None]:
import ast
import json
import os
from typing import Any, Callable, List, Optional, Tuple

import fitz # PyMuPDF
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from dotenv import load_dotenv
from langchain.schema import Document
from openai import OpenAI
from tqdm import tqdm

## Load environment variables from the .env file
## (must run before the os.getenv() lookups below)
load_dotenv()

## Set up client
# The OpenAI SDK is pointed at a server on localhost:11434 rather than the hosted API.
# NOTE(review): port and placeholder key suggest a local Ollama instance -- confirm.
MODEL = "llama3.1"
client = OpenAI(
    base_url = "http://localhost:11434/v1",
    api_key="ollama", # required, but unused
)

## Folder paths set here
# Every path comes from an environment variable; os.getenv() returns None when the
# variable is not set, so a missing entry only surfaces later when the path is used.
outFolder = os.getenv("OUT_DIR_PATH")
testFolder = os.getenv("TEST_DIR_PATH")

# "Avslag": rejected applications (source PDFs and the .txt files generated from them)
pdfAvslagFolder = os.getenv("PDF_AVSLAG_PATH")
txtAvslagFolder = os.getenv("TXT_AVSLAG_PATH")

# "Beviljade": granted applications (source PDFs and the .txt files generated from them)
pdfBeviljadeFolder = os.getenv("PDF_BEV_PATH")
txtBeviljadeFolder = os.getenv("TXT_BEV_PATH")

## **2. Helper functions**

In [None]:
def loadOnePDF(filePath: str) -> Document:
    """Read every page of a PDF and merge the text into a single Document.

    Args:
        filePath (str): Location of the PDF file.

    Returns:
        Document: The text of all pages concatenated in page order, with the
            file path stored under the "source" metadata key.
    """
    with fitz.open(filePath) as pdf:
        pageTexts = [page.get_text() for page in pdf]
    return Document(page_content="".join(pageTexts), metadata={"source": filePath})

def loadPDFs(folderPath: str) -> List[Document]:
    """Load each PDF found directly inside a folder as its own Document.

    Only entries whose name ends with ".pdf" are considered; sub-folders are
    not searched.

    Args:
        folderPath (str): Folder that holds the PDF files.

    Returns:
        List[Document]: One Document per PDF, in the order os.listdir reports
            the files.
    """
    pdfNames = [name for name in os.listdir(folderPath) if name.endswith(".pdf")]
    return [loadOnePDF(os.path.join(folderPath, name)) for name in pdfNames]

def PDFtoTXT(pdfFolder: str, txtFolder: str, verbose: bool = False) -> None:
    """Write the text of every PDF in a folder to a same-named .txt file.

    The output folder is created when missing. An existing .txt file with the
    same name is overwritten.

    Args:
        pdfFolder (str): Folder containing the source PDFs.
        txtFolder (str): Folder that receives one .txt file per PDF.
        verbose (bool, optional): Print one line per converted file.
            Defaults to False.
    """
    # Make sure the destination exists before anything is written.
    os.makedirs(txtFolder, exist_ok=True)

    for pdfDoc in loadPDFs(pdfFolder):
        # "report.pdf" -> "report.txt"
        sourceName = os.path.basename(pdfDoc.metadata['source'])
        stem, _ = os.path.splitext(sourceName)
        targetName = stem + '.txt'

        # NOTE(review): no explicit encoding is given, so the platform default applies.
        with open(os.path.join(txtFolder, targetName), 'w') as outFile:
            outFile.write(pdfDoc.page_content)

        if verbose:
            print(f"Converted {sourceName} to {targetName}")

def cleanJSONresponse(jsonResponse: str) -> Optional[dict]:
    """Extract the JSON object embedded in an LLM response.

    Newlines are removed, then the first JSON object starting at the first
    '{' is decoded. Anything before that brace (e.g. a markdown fence) and
    anything after the end of the object (closing fence, trailing remarks,
    even text containing more braces) is ignored.

    Args:
        jsonResponse (str): Content part of a response from the OpenAI API
            client. None is treated like an empty string.

    Returns:
        Optional[dict]: The decoded JSON object, or None when the response
            contains no '{' or the text following it is not valid JSON. The
            failure and the offending text are printed in that case.
    """
    # A missing response is handled like an empty one instead of raising.
    flattened = (jsonResponse or "").replace("\n", "")
    try:
        start = flattened.index('{')
        # raw_decode stops at the end of the first complete JSON value, so
        # trailing text cannot invalidate an otherwise well-formed object.
        parsed, _ = json.JSONDecoder().raw_decode(flattened[start:])
        return parsed
    except ValueError:  # also covers json.JSONDecodeError (a ValueError subclass)
        print("=> Failed to clean and parse JSON:")
        print(flattened)
        print("\n")
        return None

def write2json(file_name: str, json_txt: Any) -> None:
    """Write content to `<OUT_DIR_PATH>/<file_name>.json` as UTF-8.

    The output folder is taken from the OUT_DIR_PATH environment variable and
    is created when it does not exist yet. An existing file is overwritten.

    Args:
        file_name (str): JSON file name without the ".json" extension.
        json_txt (Any): JSON-serialisable content (e.g. a dict).
    """
    outDir = os.getenv('OUT_DIR_PATH')
    os.makedirs(outDir, exist_ok=True)
    jsonName = os.path.join(outDir, file_name + '.json')
    # Mode "w" already truncates an existing file, so no separate clearing step is needed.
    with open(jsonName, "w", encoding="utf-8") as json_file:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        json.dump(json_txt, json_file, indent=4, ensure_ascii=False)
 
def txtFolder2json(txtFolder: str, task_fn: Callable[..., Tuple[str, str]], **kwargs) -> None:
    """Run an LLM task on every .txt file in a folder and save the results as JSON.

    The output file is named after the folder and the task,
    `<folder name>_<task>.json`, and maps each .txt file name to the JSON
    object extracted from the LLM response. Files whose response cannot be
    parsed are left out.

    Args:
        txtFolder (str): Filepath to folder with .txt files.
        task_fn (Callable): Function called as `task_fn(fileContent, **kwargs)`
            that returns `(task name, raw LLM response)`, e.g. `extractGrades`
            or `getComments`.
        **kwargs: Extra keyword arguments forwarded to `task_fn`.
    """
    # Sort for a reproducible processing/output order and let the progress
    # bar count only the files that are actually processed.
    txtFiles = sorted(name for name in os.listdir(txtFolder) if name.endswith('.txt'))

    extractedData = {}
    task = None  # stays None when the folder holds no .txt files
    for filename in tqdm(txtFiles):
        # NOTE(review): no explicit encoding is given, so the platform default applies.
        with open(os.path.join(txtFolder, filename), 'r') as file:
            fileContent = file.read()

        # Ask the LLM; the task name it reports is used for the output file name.
        task, results = task_fn(fileContent, **kwargs)

        # Keep only responses that contain a parsable JSON object.
        jsonCleaned = cleanJSONresponse(results)
        if jsonCleaned is not None:
            extractedData[filename] = jsonCleaned

    folderName = os.path.basename(os.path.normpath(txtFolder))
    if task is None:
        # Without a processed file there is no task name to build the output name from.
        print(f"No .txt files found in {txtFolder}; no JSON file written.")
        return

    write2json(f"{folderName}_{task}", extractedData)
    print(f"Nr. of successfully processed files to {folderName}_{task}.json: {len(extractedData)}")


## **3. Generate .txt from PDF folders**

Convert the PDFs to .txt files. Set `verbose = True` to print the name of each processed document.

In [None]:
# Convert both PDF folders to .txt files (one .txt per PDF, same base name).
PDFtoTXT(pdfAvslagFolder, txtAvslagFolder, verbose = False) ## Avslag = rejected applications
PDFtoTXT(pdfBeviljadeFolder, txtBeviljadeFolder, verbose = False) ## Beviljade = granted applications

## **4. Extract Grades**

**Generate and save a .json file from txtFolder**

This cell might take a minute or three to run depending on how many documents are in the folders. 

In [None]:
def extractGrades(fileContent: str, categories: str) -> Tuple[str, str]:
    """Ask the LLM to extract the assessment criteria and their grades from a document.

    Args:
        fileContent (str): The content of a .txt file (a string, that is).
        categories (str): Assessment criteria, inserted into the system prompt.

    Returns:
        Tuple[str, str]: The task name "grades" and the stripped content of
            the LLM response. The content is "" when the model returned none.
    """
    messages = [
        {
            "role": "system",
            "content": f"""You are a script that outputs that only outputs valid JSON format where assessment criterias are keys and grades are int values.
                The output must be in valid JSON. Don't add explanation beyond the JSON. Don't make suggestions.
                Your task is to extract the assessment criteria and the corresponding grade from the text.
                The assessment criterias are: {categories}
            """
        },
        {
            "role": "user",
            "content": f"""Extract assessment criterias and grades from the following text and 
                provide them in JSON format where assessment criterias are keys and grades are int values.
                {fileContent}
            """
        }
    ]
    # Low temperature keeps the extraction close to deterministic.
    response = client.chat.completions.create(model=MODEL, messages=messages, temperature=0.1)
    # The API declares message.content as optional; fall back to "" so that
    # .strip() cannot fail and the caller sees an unparsable (empty) answer.
    content = response.choices[0].message.content or ""
    return "grades", content.strip()

## Specify the client data categories here
# Assessment criteria as they appear in Formas review documents (Swedish).
categories_Formas = """
    - Vetenskaplig frågeställning
    - Metod och genomförande
    - Vetenskaplig kompetens
    - Frågeställningens samhällsnytta
    - Slutbedömning
"""

# Assessment criteria used when the client is not Formas (English).
categories_VR = """
    - Scientific quality of the proposed research
    - Novelty and originality
    - Merits of the applicant 
    - Feasibility
    - Overall assessment of the application's scientific quality
"""

## Get the respective assessment criteria names.
# Only the exact value "Formas" selects the Formas list; any other value of
# CLIENT_NAME (including an unset variable) falls back to the VR list.
categories = categories_Formas if os.getenv("CLIENT_NAME") == "Formas" else categories_VR

# One LLM call per .txt file; each call below writes <folder name>_grades.json.
txtFolder2json(txtBeviljadeFolder, task_fn=extractGrades, categories=categories)
txtFolder2json(txtAvslagFolder, task_fn=extractGrades, categories=categories)

## **5. Extract Comments**

### Extract positive/negative comments into JSON

In [None]:
def getComments(fileContent: str) -> Tuple[str, str]:
    """Ask the LLM to summarise a review's written comments as JSON.

    The model is instructed to answer with a JSON object holding the keys
    'positive' and 'negative', each with a list of concise comments.

    Args:
        fileContent (str): The content of a .txt file (a string, that is).

    Returns:
        Tuple[str, str]: The task name "comments" and the stripped content of
            the LLM response. The content is "" when the model returned none.
    """
    messages = [
        {
            "role": "system",
            "content": """You are a script that only outputs valid JSON format with the keys: 'positive', 'negative' and the value as list of concise comments.
                The output must be in valid JSON. Don't add explanation beyond the JSON. Don't make suggestions.
                Your task is to summarize the detailed written comments from research proposal reviews.
                Ignore the grades and the other information.
                If there are no detailed written comments due to recieving the lowest grade, only output 'Low score' as value as negative comment, don't make something up.
            """
        },
        {
            "role": "user",
            "content": """Summarize the positive/negative detailed written comments from the following text: {BODY}. Ignore everything else.
                Output must be in valid JSON with the keys: 'positive', 'negative' and the value as list of concise comments.
                Output must include only valid JSON.
            """.format(BODY=fileContent)
        }
    ]
    response = client.chat.completions.create(model=MODEL, messages=messages, temperature=0.2)
    # The API declares message.content as optional; fall back to "" so that
    # .strip() cannot fail and the caller sees an unparsable (empty) answer.
    content = response.choices[0].message.content or ""
    return "comments", content.strip()

# Summarise the reviewer comments of the granted ("beviljade") and rejected ("avslag")
# applications; each call writes <folder name>_comments.json.
txtFolder2json(txtFolder=txtBeviljadeFolder, task_fn=getComments)
txtFolder2json(txtFolder=txtAvslagFolder, task_fn=getComments)

### Identify the top 10 descriptive categories

In [None]:
def _asCommentList(value: Any) -> list:
    """Coerce one 'positive'/'negative' entry into a list of comments."""
    if value is None:
        return []
    if isinstance(value, (list, tuple)):
        return list(value)
    # A bare value (e.g. the string "Low score") must remain ONE comment;
    # `list += str` would otherwise add it character by character.
    return [value]


def _parseCategoryList(rawContent: Optional[str]) -> list:
    """Parse the Python list literal in an LLM reply; print and return [] on failure."""
    text = (rawContent or "").strip()

    # Try the reply as-is first, then only the outermost [...] span, which
    # tolerates markdown fences or remarks around the list.
    candidates = [text]
    start, end = text.find("["), text.rfind("]")
    if start != -1 and end > start:
        candidates.append(text[start:end + 1])

    for candidate in candidates:
        try:
            # literal_eval only accepts literals, so the reply is never executed.
            parsed = ast.literal_eval(candidate)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            continue
        if isinstance(parsed, (list, tuple)):
            return list(parsed)

    print("=> Failed to parse category list:")
    print(text)
    print("\n")
    return []


def _requestCategories(messages: list) -> list:
    """Send one category prompt to the LLM and return the parsed list."""
    response = client.chat.completions.create(model=MODEL, messages=messages, temperature=0.4)
    return _parseCategoryList(response.choices[0].message.content)


def findCategories(comments_summary: dict) -> dict:
    """Find the top 10 positive/negative descriptive categories across all documents.

    All positive comments are pooled into one prompt and all negative
    comments into another; the LLM is asked for a Python list of categories
    for each.

    Args:
        comments_summary (dict): Maps each document to a dict with the keys
            'positive' and 'negative'. Each value is normally a list of
            comments; a single string counts as one comment and a missing
            key as no comments.

    Returns:
        dict: `{'positive': [...], 'negative': [...]}` with the categories
            returned by the LLM. A list is empty when the reply could not be
            parsed (the reply is printed in that case).
    """

    # Extract all positive and negative comments
    comments_positive = []
    comments_negative = []
    for comments in comments_summary.values():
        if not isinstance(comments, dict):
            continue  # entry without the expected structure: nothing to collect
        comments_positive += _asCommentList(comments.get("positive"))
        comments_negative += _asCommentList(comments.get("negative"))

    # Create messages for client
    messages_positive = [
        {
            "role": "system",
            "content": """You are a script that outputs a valid python list of strings representing the top 10 descriptive categories.
                The output must be a valid python list. Don't add explanation beyond the list.
            """
        },
        {
            "role": "user",
            "content": f"""Identify the top 10 descriptive categories from the positive comments from the following text: {comments_positive}.
                Output must include only a Python list with 10 descriptive category items of 3-5 words. Don't add explanation beyond the list. Don't make suggestions.
            """
        }
    ]
    messages_negative = [
        {
            "role": "system",
            "content": """You are a script that outputs a valid python list of strings representing the top 10 descriptive categories.
                The output must be a valid python list. Don't add explanation beyond the list.
            """
        },
        {
            "role": "user",
            "content": f"""Identify the top 10 descriptive categories for the negative comments from the following text: {comments_negative}.
                Output must include only a Python list with 10 descriptive category items of 3-5 words. Don't add explanation beyond the list. Don't make suggestions.
            """
        }
    ]

    ## Positive categories first, then negative (same request order as before)
    categories_positive = _requestCategories(messages_positive)
    categories_negative = _requestCategories(messages_negative)

    ## Add to dictionary
    categories = {'positive': categories_positive, 'negative': categories_negative}
    return categories

### BEVILJADE HERE

In [None]:
# write2json stores its files as UTF-8 (with non-ASCII characters unescaped),
# so the file is read back with the same encoding instead of the locale default.
# NOTE(review): the file name assumes the granted .txt folder is named "beviljade".
with open(os.path.join(outFolder, "beviljade_comments.json"), "r", encoding="utf-8") as f:
    beviljade_json = json.load(f)

categories = findCategories(beviljade_json)
beviljade_categories_pos = categories['positive']
beviljade_categories_neg = categories['negative']

print("Positive beviljade categories:")
for category in beviljade_categories_pos:
    print(f"- {category}")
print("\nNegative beviljade categories:")
for category in beviljade_categories_neg:
    print(f"- {category}")

### AVSLAG HERE

In [None]:
# write2json stores its files as UTF-8 (with non-ASCII characters unescaped),
# so the file is read back with the same encoding instead of the locale default.
# NOTE(review): the file name assumes the rejected .txt folder is named "avslag".
with open(os.path.join(outFolder, "avslag_comments.json"), "r", encoding="utf-8") as f:
    avslag_json = json.load(f)

categories = findCategories(avslag_json)
avslag_categories_pos = categories['positive']
avslag_categories_neg = categories['negative']

print("Positive rejection categories:")
for category in avslag_categories_pos:
    print(f"- {category}")

print("\nNegative rejection categories:")
for category in avslag_categories_neg:
    print(f"- {category}")

## **Visualize statistics based on extracted data**
With the above json-files, visualize success rate for each category depending on grade.

In [None]:
# Load both granted and denied data into pandas dataframes
# (orient="index": each top-level key, i.e. each document name, becomes one row)
grantedData = pd.read_json(path_or_buf=os.path.join(outFolder, 'beviljade_grades.json'), orient="index")
deniedData = pd.read_json(path_or_buf=os.path.join(outFolder, 'avslag_grades.json'), orient="index")
# The last column of the granted data is the grade analysed in the cells below.
# NOTE(review): presumably the overall assessment, which is listed last in the criteria -- confirm.
column = grantedData.columns[-1]

# Tag every row with its outcome before the two sets are combined.
grantedData.loc[:,"Result"] = "Granted"
deniedData.loc[:,"Result"] = "Rejected"

df_grades = pd.concat([grantedData, deniedData])
# Number of applications per (grade, result) combination; shown as the cell output.
df_grades.loc[:,[column, "Result"]].value_counts(sort=False)

In [None]:
# Share of granted applications per grade: {grade: fraction in [0, 1]}.
# The mean of the boolean "is granted" mask is that fraction directly. It does
# not depend on the column name value_counts() gives its result, which differs
# between pandas versions, and it is 0.0 when no application was granted.
granted_freq = {}
for grade, group in df_grades.loc[:,[column, "Result"]].groupby(column):
    granted_freq[int(grade)] = float((group["Result"] == "Granted").mean())
print(granted_freq)

In [None]:
df_granted_freq = pd.DataFrame(list(granted_freq.items()), columns=["Grade", "Granted%"])

# ax1 (left axis): number of applications per grade, split by result.
# ax2 (right axis, shares the x axis): success rate per grade.
fig, ax1 = plt.subplots()
ax2 = ax1.twinx()

# Plot bar and line plot.
sns.lineplot(data=df_granted_freq, x="Grade", y="Granted%", ax=ax2, color="red")
ax1 = sns.histplot(data=df_grades, x=column, hue="Result", ax=ax1,
                   discrete=True, multiple="layer", shrink=0.3, edgecolor=None, legend=True)

# Label each point of the success-rate line with its percentage. The labels are
# drawn on ax2 explicitly so they use the success-rate scale regardless of which
# axes happens to be the current one.
for x, y in zip(df_granted_freq["Grade"], df_granted_freq["Granted%"]):
    ax2.text(
        x = x, # x-coordinate of the label: the grade
        y = y, # y-coordinate of the label: the success rate itself (no offset)
        s = f"{y*100:.0f}%" # label text, formatted without decimals
    )

ax1.set_ylabel('Applications')
ax2.set_ylabel('Success rate')
ax2.set_ylim(0, 1.1)

sns.move_legend(ax1, "lower right", bbox_to_anchor=(.5, 1), ncol=2, title=None, frameon=False)
plt.show()

## **Output data to other formats**
Create a DataFrame and export it to .xlsx and .csv files for user-friendly purposes.

In [None]:
# Load both granted and denied
# (orient="index": each top-level key, i.e. each document name, becomes one row)
grantedData = pd.read_json(path_or_buf=os.path.join(outFolder, 'beviljade_grades.json'), orient="index")
deniedData = pd.read_json(path_or_buf=os.path.join(outFolder, 'avslag_grades.json'), orient="index")

# Combine data with an additional 'Status' key
grantedData.loc[:,"Status"] = "Granted"
deniedData.loc[:,"Status"] = "Rejected"

# Note: this rebinds df_grades; here the outcome column is "Status", not "Result".
df_grades = pd.concat([grantedData, deniedData])
df_grades.index.name = "Document"

# Display the DataFrame
print(df_grades)

# Output excel-file (.xlsx) and a CSV copy, both written to the output folder
# NOTE(review): to_excel needs an Excel writer engine such as openpyxl installed -- confirm.
df_grades.to_excel(os.path.join(outFolder,'output.xlsx'), index_label="Document")
df_grades.to_csv(os.path.join(outFolder,'grades.csv'))