# Relation Extraction

## Librerie utili

In [30]:
import json
import os
import time

import openai
import pandas as pd

## Importazione dei secrets del progetto

In [None]:
# Read the project's credentials file and pull out the OpenAI API key.
# The file is expected to be a JSON object with an "api_key" entry.
with open("secrets.json") as secrets_file:
    secrets = json.load(secrets_file)

api_key = secrets["api_key"]

## Lettura file CSV

In [31]:
# Load the NER output: a semicolon-delimited file whose first row holds the
# column names.
df = pd.read_csv(
    'ner_noteevents.csv',
    header=0,
    sep=';',
)

### Pulizia dataframe

In [32]:
# The three disease-like NER categories are collapsed into the single
# 'Disease or Syndrome' column.
disease_columns = [
    'Disease or Syndrome',
    'Mental or Behavioral Dysfunction',
    'Pathologic Function',
]


def _join_present(values):
    """Return the non-null *values* joined by ', ', or NaN when all are null."""
    present = [str(value) for value in values if pd.notnull(value)]
    return ', '.join(present) if present else float('nan')


# Only non-null cells are joined: casting the whole frame to str first would
# turn every missing cell into the literal text "nan" (e.g. "Asthma, nan, nan"),
# and a row with no disease at all would no longer look empty.
df['Disease or Syndrome'] = [
    _join_present(values)
    for values in df[disease_columns].itertuples(index=False, name=None)
]

# Remove the categories that are not used any further, including the two
# columns that were just merged into 'Disease or Syndrome'.
df = df.drop(
    columns=[
        'Body Part, Organ, or Organ Component',
        'Laboratory Procedure',
        'Therapeutic or Preventive Procedure',
        'Mental or Behavioral Dysfunction',
        'Pathologic Function',
    ]
)

# Keep the identifiers in their own frame; it shares df's index, so rows can
# be matched up again by index.
# NOTE: the name shadows the builtin id(); it is kept because the cell that
# merges the extracted relations reads this variable.
id = df[['Subject ID', 'Note ID']].copy()

# df now holds only the concept columns.
df = df.drop(columns=['Subject ID', 'Note ID'])

## Estrazione delle relazioni

### Per estrarre le relazioni usiamo il modello generativo di OpenAI `gpt-3.5-turbo`

In [None]:
openai.api_key = api_key

# Folder that receives one "<row number>.txt" file per dataframe row. It is
# created up front so that a missing folder cannot abort the run after an API
# call has already been made.
output_dir = 'Relation Extracted'
os.makedirs(output_dir, exist_ok=True)

# Prompt pieces that are identical for every row, built once outside the loop.
# NOTE(review): the wording (including the "Sindrome" spelling) is kept
# verbatim so that new completions stay comparable with earlier runs.
prompt_header = """Find the relation between Disease or Sindrome, Diagnostic Procedure, Sign or Symptom, Pharmacologic Substance and Laboratory or Test Result; given the following values:"""
prompt_instructions = """Generate at most 1 relation for each disease in the list, do not generate concepts that not are present in the list and left the cell blank when the concept are not present. The first row always be: "Disease or Syndrome,Diagnostic Procedure,Sign or Symptom,Laboratory or Test Result,Pharmacologic Substance". The output must be formatted as csv: Transient Ischemic Attack,Electrocardiography,Chest Pain,Partial pressure CO2 result,atorvastatin"""
system_message = {"role": "system", "content": "You are an expert medical assistant."}

# Maps each dataframe index to {column name: [cell value]} for the cells of
# that row that actually hold a value.
medical_concepts = {}

for index, row in df.iterrows():
    row_concepts = {}

    for column in df.columns:
        concept = row[column]

        # Null check first, then truthiness (skips empty strings): some
        # missing-value markers cannot be evaluated as booleans.
        if pd.notnull(concept) and concept:
            row_concepts[column] = [concept]

    medical_concepts[index] = row_concepts


# One chat completion per row; each answer is written to its own text file.
for index, row_concepts in medical_concepts.items():
    # One "Category: value, value" line per populated column. Values are
    # passed through str() so a non-string cell cannot break the join.
    category_lines = [
        f"{category}: " + ", ".join(str(concept) for concept in concepts)
        for category, concepts in row_concepts.items()
    ]
    prompt_text = "\n".join([prompt_header, *category_lines]) + "\n\n" + prompt_instructions

    message = [system_message, {"role": "user", "content": prompt_text}]

    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=message,
        max_tokens=600,
        stop=None,
        temperature=0.3,
        frequency_penalty=0.7
    )

    completion_text = completion['choices'][0]['message']['content']

    # Files are numbered from 1, i.e. dataframe index + 1.
    with open(f'{output_dir}/{index+1}.txt', 'w') as f:
        f.write(completion_text)

    print(completion_text)
    # Pause between requests (presumably to stay under the API rate limit).
    time.sleep(5)

### Unione delle relazioni estratte in un file di testo unico

In [46]:
# Header of the merged table: the two identifier columns followed by the five
# relation columns.
merged_lines = ["Subject ID,Note ID,Disease or Syndrome,Diagnostic Procedure,Sign or Symptom,Laboratory or Test Result,Pharmacologic Substance"]

# Walk the identifier columns directly so every value keeps its column's own
# formatting.
for idx, subject_id, note_id in zip(id.index, id['Subject ID'], id['Note ID']):
    # Read the completion saved for this row (files are numbered index + 1).
    with open(f'Relation Extracted/{idx+1}.txt', 'r') as f:
        text = f.read()

    # Blank lines are discarded before anything else: otherwise a trailing
    # newline would become a row holding only the two identifiers, and a
    # leading blank line would make the real header survive as data.
    content_lines = [line for line in text.splitlines() if line.strip()]

    # The first remaining line is assumed to be the per-file column header
    # and is dropped.
    relation_lines = content_lines[1:]

    # Prefix every relation with the identifiers of the note it came from.
    merged_lines.extend(
        f'{subject_id},{note_id},{line}' for line in relation_lines
    )

# Single join instead of repeated string concatenation; the text ends with a
# newline.
result = '\n'.join(merged_lines) + '\n'

### Salvataggio delle relazioni estratte in un CSV

In [47]:
# Persist the merged relations as a plain-text file.
merged_path = 'Relation Extracted/result.txt'
with open(merged_path, 'w') as out_file:
    out_file.write(result)

In [52]:
# Parse the merged text file. Rows with more fields than the header (for
# example a model answer containing an extra comma) cannot be parsed; they are
# still left out of the result, but 'warn' reports each one instead of
# discarding it silently.
data = pd.read_csv("Relation Extracted/result.txt", sep=',', on_bad_lines='warn')

# Write the parsed table back out as a CSV file without the index column.
data.to_csv('Relation Extracted/result.csv', sep=',', index=False)