In [None]:
import pandas as pd
from langchain_ollama import OllamaLLM
import pandas as pd
import ast
import re

# LLM client (the "mistral" model via langchain_ollama); askOllama() sends
# every prompt in this file through it.
model = OllamaLLM(model="mistral")

# Terms flagged for the note currently being processed. This is read as a
# global by replaceStigmatizingLanguage() and is reassigned by the driver
# loop before each call, so it must keep this exact name.
stigmatizingLanguageFound = ""

def askOllama(prompt):
    """Send ``prompt`` to the module-level ``model`` and return its reply.

    Thin wrapper around ``model.invoke`` so every LLM call in this file
    goes through one place.
    """
    return model.invoke(input=prompt)

def cleanOllamaOutput(output):
    """Pull the first bracketed list out of an LLM reply and parse it.

    The first (shortest) ``[...]`` span in ``output`` is taken, tidied up
    and handed to ``ast.literal_eval``.

    Raises:
        IndexError: when ``output`` contains no ``[...]`` span.
        ValueError / SyntaxError: whatever ``ast.literal_eval`` raises when
            the tidied span is not a valid Python literal.
    """
    bracketed_spans = re.findall(r"\[.*?\]", output, re.DOTALL)
    literal = bracketed_spans[0].replace("\n", "")

    # Backslash-escape apostrophes that sit between two word characters
    # (e.g. patient's) so they cannot terminate a single-quoted item early.
    literal = re.sub(r"(?<=\w)'(?=\w)", r"\'", literal)

    # Drop parenthesised asides the model sometimes adds next to an item.
    literal = re.sub(r"\([^()]*\)", "", literal)

    # Remove textual "\n" sequences and collapse doubled backslashes.
    literal = literal.replace("\\n", "").replace("\\\\", "\\")
    return ast.literal_eval(literal)

def group_by_second_index(data):
    """Group the first items of ``data``'s elements by their second items.

    Each element is indexed at positions 0 and 1. The result maps every
    distinct ``element[1]`` to the list of ``element[0]`` values that came
    with it, in the order they were encountered.
    """
    grouped = {}
    for pair in data:
        # setdefault creates the bucket the first time a key is seen.
        grouped.setdefault(pair[1], []).append(pair[0])
    return grouped

def replaceStigmatizingLanguage(df, rowNumber):
    """Rewrite the flagged fragments of one clinical note with the LLM.

    The note is read from ``df.iloc[rowNumber]['Completion']`` and the
    flagged terms from the module-level ``stigmatizingLanguageFound``.
    Only the text after the "**History of Present Illness:**" header is
    searched; it is split on "**" and "-" into fragments, and every fragment
    containing a flagged term (case-insensitive) is sent to the LLM together
    with the terms found in it.

    The LLM is re-asked, without limit, until its reply parses to a list
    whose first item is a string.

    Returns:
        tuple: ``(newClinicalNote, newDict)`` where ``newDict`` maps each
        original fragment to its rewritten text and ``newClinicalNote`` is
        the full original note with those fragments substituted.
    """
    # Read-only use of the module-level list of flagged terms.
    global stigmatizingLanguageFound
    ogClinicalNote = df.iloc[rowNumber]['Completion']

    # Everything up to and including the HPI header is excluded from the search.
    noteBody = re.sub(r'^.*?\*\*History of Present Illness:\*\*', '', ogClinicalNote, flags=re.DOTALL)
    sentences = [item for part in noteBody.split("**") for item in part.split("-")]

    allList = []
    for word in stigmatizingLanguageFound:
        # The LLM occasionally returns non-string or blank items. A blank
        # term would match every fragment (including the empty one, whose
        # replacement would corrupt the whole note), so skip those.
        if not isinstance(word, str) or not word.strip():
            continue
        lowered = word.lower()
        for sentence in sentences:
            if lowered in sentence.lower():
                allList.append([word, sentence])

    newDict = {}
    grouped_list = group_by_second_index(allList)
    for key, value in grouped_list.items():
        replacingPrompt = "You are a professional linguist whose job is to replace stigmatizing language in clinical notes. If you see labels such as diabetic or abuser, replace these labels with person first language such as \"person who has diabetes\" or\"person with a substance abuse disorder\". If you see words like challenging or uncooperative, replace them with more respectful alternatives. Here is the sentence: " + str(key) + " And here is are the stigmatizing words you must replace: " + str(value) + ". Return to me a JSON object containing only the corrected sentence in a list"
        while True:
            try:
                candidate = cleanOllamaOutput(askOllama(replacingPrompt))[0]
            except Exception:
                # Malformed or empty reply: ask again. KeyboardInterrupt and
                # SystemExit are deliberately not caught so the loop can be
                # stopped by the user.
                continue
            if isinstance(candidate, str):
                # Store the reply that was just validated rather than
                # issuing a second, unchecked LLM call.
                newDict[key] = candidate
                break

    newClinicalNote = ogClinicalNote
    for key, value in newDict.items():
        # Give the replacement the same trailing newline(s) as the fragment
        # it replaces; elif prevents a double-newline ending from receiving
        # a third newline.
        if key.endswith("\n\n"):
            value += "\n\n"
        elif key.endswith("\n"):
            value += "\n"
        newClinicalNote = newClinicalNote.replace(key, value)
    return newClinicalNote, newDict

def scanForStigmatizingLanguage(df, rowNumber):
    """Ask the LLM which stigmatizing terms appear in one clinical note.

    The note is read from ``df.iloc[rowNumber]['Completion']`` and appended
    to a fixed instruction prompt. The LLM is re-asked, without limit, until
    ``cleanOllamaOutput`` can parse its reply.

    Returns:
        The parsed value produced by ``cleanOllamaOutput`` for the first
        reply that parses successfully.
    """
    clinicalNote = df.iloc[rowNumber]['Completion']
    prompt = "You are a professional linguist researcher who is trying to identify stigmatizing language in clinical notes. Given this clinical note, return to me in a python-type list all forms of stigmatizing language (e.g. noncompliant, nonadherent, challenging, uncooperative, refused, contradicting themselves, frequent visitor to ED, narcotic dependence, obese, alcoholic, inconsistent responses etc...). Do not include any descriptions or explanations or comments. DO NOT INCLUDE STIGMATIZING LANGUAGE IF IT IS NOT FOUND IN THE NOTE, ONLY INCLUDE LANGUAGE THAT IS IN THE NOTE. Also do not rewrite the stigmatizing language in your own words. Here's the actual note you will have to analyze, and make sure you output the list of stigmatizing words in JSON output: "

    finalPrompt = prompt + clinicalNote

    while True:
        try:
            rawOutput = askOllama(finalPrompt)
            return cleanOllamaOutput(rawOutput)
        except Exception:
            # Catch Exception rather than everything, so KeyboardInterrupt
            # and SystemExit can still break out of this unbounded retry loop.
            # Note that any failure of the LLM call itself is reported with
            # this same message.
            print("incorrect llama output format, supposed to be a list")

# Notes to process; the note text is read from the 'Completion' column.
annotatedDf = pd.read_csv("/Users/sagewong/git/StigmatizingLanguageProject/Application/FinalAnnotatedData.csv")

# Results accumulated so far. Its row count is used as the index of the first
# note still to be processed, which makes the loop resumable.
ultimateDf = pd.read_csv("/Users/sagewong/git/StigmatizingLanguageProject/Application/ReplacingLanguage.csv")

for i in range(len(ultimateDf), len(annotatedDf)):
    print(f"Going through {i}th note")

    # Must be assigned to this exact global name: replaceStigmatizingLanguage
    # reads the flagged terms from it instead of taking them as a parameter.
    stigmatizingLanguageFound = scanForStigmatizingLanguage(annotatedDf, i)
    print(f"Stigmatizing Language Found: {stigmatizingLanguageFound}")

    updatedClinicalNote, outputDictionary = replaceStigmatizingLanguage(annotatedDf, i)
    print(f"Updated clinical note: {updatedClinicalNote}")
    print(f"Output dictionary: {outputDictionary}")

    new_row = pd.DataFrame(
        {
            'Stigmatizing Language Found': [stigmatizingLanguageFound],
            'Updated Clinical Notes': [updatedClinicalNote],
            'Output Dictionary': [outputDictionary],
        }
    )
    ultimateDf = pd.concat([ultimateDf, new_row], ignore_index=True)

    # Save after every note so an interrupted run loses at most one note.
    # NOTE(review): this writes to a path relative to the working directory
    # while the resume state above is read from an absolute path - confirm
    # both refer to the same file. The index column is also written, so a
    # reloaded file presumably gains an extra unnamed column - verify.
    ultimateDf.to_csv("ReplacingLanguage.csv")

Going through 47th note
Stigmatizing Language Found: ['noncompliant', 'challenging', 'refused', 'conflicting information', 'resistance', 'nonadherence', 'limited due to noncompliance']
Updated clinical note: **Admission date:** 09/15/2023

**Discharge date:** 09/22/2023

**Date of Birth:** 04/10/1964

**Sex:** M

**Service:** MEDICINE

**Allergies:** no known allergies

**Attending:** Dr. Smith

**Chief Complaint:** Difficulty breathing and persistent cough

**Major Surgical or Invasive Procedure:**
None performed due to patient's noncompliance.

**History of Present Illness:**
Patient presented with a one-Week history of difficulty breathing and persistent coughing. Became difficult to obtain accurate information regarding the timeline of symptoms.


**Past Medical History:**
- History of hypertension
- Previous smoker, claims to have quit five years ago, but noncompliance with cessation support

**Social History:**
Frequently mentions spending time at local fast-food establishments.
