# Summarization

In [1]:
import json
import openai
import pandas as pd
from pymongo import MongoClient

## Importo le secrets del progetto

In [None]:
# Load the project secrets (OpenAI API key and MongoDB connection string)
# from the local JSON file; only the file read happens inside the `with`.
with open("secrets.json") as secrets_file:
    secrets = json.load(secrets_file)

api_key = secrets["api_key"]
mongo_string = secrets["mongo_string"]

## Lettura file CSV

In [2]:
# Read the preprocessed notes: semicolon-separated, first row holds the column names.
df = pd.read_csv(
    'preprocessed_noteevents.csv',
    sep=';',
    header=0,
)

In [3]:
# Print the (rows, columns) shape, then show the first ten rows as the cell output.
print(df.shape)
df.head(n=10)

(51, 10)


Unnamed: 0,ROW_ID,SUBJECT_ID,CHARTDATE,TEXT,GENDER,DOB,DOD,EXPIRE_FLAG,LEMMATIZED TEXT,TOKEN_COUNT_COMP
0,722,27431,2148-03-07,Admission Date: [**2148-3-2**] D...,M,2070-10-04 00:00:00,2148-04-08 00:00:00,1,Chief Complaint Hypotension/hypoxia Major Surg...,1005.25
1,723,27431,2148-03-29,Admission Date: [**2148-3-20**] ...,M,2070-10-04 00:00:00,2148-04-08 00:00:00,1,Chief Complaint Hypoxia Major Surgical Invasiv...,1676.5
2,724,27431,2148-04-08,Admission Date: [**2148-4-8**] D...,M,2070-10-04 00:00:00,2148-04-08 00:00:00,1,Chief Complaint hypoxia s/p PEA arrest Major S...,379.75
3,1907,21323,2139-05-04,Admission Date: [**2139-4-26**] ...,M,2082-12-21 00:00:00,,0,Chief Complaint black stool Major Surgical Inv...,504.0
4,1905,21323,2135-08-06,Admission Date: [**2135-7-30**] Discharge...,M,2082-12-21 00:00:00,,0,HISTORY PRESENT ILLNESS Patient 52-year-old ma...,561.25
5,1906,21323,2138-05-06,Admission Date: [**2138-4-21**] Dischar...,M,2082-12-21 00:00:00,,0,HISTORY PRESENT ILLNESS Mr. Known firstname Kn...,350.0
6,5956,30155,2180-02-21,Admission Date: [**2180-2-21**] ...,F,2113-10-23 00:00:00,2180-02-21 00:00:00,1,Chief Complaint Called Emergency Department ev...,713.5
7,5955,30155,2173-02-06,Admission Date: [**2173-2-3**] Discharg...,F,2113-10-23 00:00:00,2180-02-21 00:00:00,1,CHIEF COMPLAINT Left facial pain trigeminal ne...,603.75
8,7321,20426,2155-07-13,Admission Date: [**2155-7-10**] Discharge...,F,2091-05-12 00:00:00,2161-09-11 00:00:00,1,HISTORY PRESENT ILLNESS patient 64-year-old wo...,619.5
9,7322,20426,2161-04-06,Admission Date: [**2161-4-1**] D...,F,2091-05-12 00:00:00,2161-09-11 00:00:00,1,Chief Complaint Melena Major Surgical Invasive...,446.0


## Creazione del summary

### Aggiungo la API key per usare i modelli di OpenAI

In [6]:
# Set the module-level OpenAI key (read from secrets.json) before the ChatCompletion calls below.
openai.api_key = api_key

### Summary delle note cliniche di ciascun paziente

Se nel prompt si sostituisce il limite "(max 100 words)" con "(max 125 words)", impostare il parametro `max_tokens` a 300.

In [None]:
# Build one GPT-generated summary per patient from all of that patient's notes.

# Group the lemmatized note texts by SUBJECT_ID (one list of notes per patient)
grouped_data = df.groupby('SUBJECT_ID')['LEMMATIZED TEXT'].apply(list)

# Collect one record per patient. The DataFrame is built once after the loop
# instead of being grown with pd.concat on every iteration (quadratic copying);
# if the loop is interrupted, the partial results remain available in this list.
summary_rows = []

for subject_id, lemmatized_texts in grouped_data.items():
    # Number the patient's notes (starting from 1) and chain them into one prompt fragment
    prompt_notes = "".join(
        f"\n Note {i}: {text}" for i, text in enumerate(lemmatized_texts, start=1)
    )

    prompt = f"""Given the following patient and his notes: "{prompt_notes}"
    \n"Generate a complete but concise (max 100 words) and informative summary that focuses only on the unique patient, that is always the same throughout the notes in input, and his medical history, current condition, and relevant details, starting from his current conditions backwards.
 
    FOR EXAMPLE: The patient currently is a age-year-old gender who presented with chief complaint and has a medical history of relevant medical conditions.
    The patient was involved in incident/accident description, which led to specific injuries/traumas.
    The postoperative course involved relevant procedures performed on anatomical locations involved.  
    The patient is currently prescribed medications for specific purposes.

    The summary must be accurate and avoid unnecessary repetition, avoid details of prescriptions, doses and other subjects besides the specific patient. 
    If there are dates, enter the year, month and the day. 
    Do not include doctors' names.
    
    It must be easy for a doctor to understand. The tone is formal. 
    The summary always starts with "The patient is a: " "
    """

    # GPT model used for the summarization: the system message sets the role,
    # the user message carries the instructions and the patient's notes.
    message = [{"role": "system", "content": "You are a formal medical assistant specialising in the summary of a patient's clinical notes."},
               {"role": "user", "content": prompt}]
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=message,
        max_tokens=256,
        stop=None,
        temperature=0.3,
        frequency_penalty=0.7
    )

    # Text of the first (and only requested) choice
    completion_text = completion['choices'][0]['message']['content']

    # Record the patient ID together with its summary
    summary_rows.append({'SUBJECT_ID': subject_id, 'SUMMARY': completion_text})

    # Progress output: patient ID followed by the generated summary
    print(subject_id)
    print(completion_text)
    print("------------------------------")

# Explicit columns keep the schema even when there are no patients to summarize
summary_df = pd.DataFrame(summary_rows, columns=['SUBJECT_ID', 'SUMMARY'])

In [69]:
# Display the per-patient summaries produced by the loop above.
summary_df

Unnamed: 0,SUBJECT_ID,SUMMARY
0,303,The patient is a 20-year-old male with a histo...
1,1197,The patient is a 72-year-old female with end-s...
2,1459,The patient is a 66-year-old male with a past ...
3,1590,The patient is a 55-year-old male with a histo...
4,3497,The patient is a 57-year-old male with a histo...
5,11448,The patient is a 48-year-old female with a his...
6,14432,The patient is a 39-year-old woman with a hist...
7,16651,The patient is an 80-year-old male with a hist...
8,17960,The patient is a former 28-weeker with chronic...
9,18089,The patient is a 75-year-old white female with...


In [70]:
# Persist the summaries as a semicolon-separated CSV without the row index.
# `index` is documented as a bool, so pass False explicitly instead of relying on None being falsy.
summary_df.to_csv('Summary.csv', sep=';', index=False)

### Clinical Trend Extraction

A partire dalle note cliniche lemmatizzate di ciascun paziente (le stesse usate per generare i summary), si estraggono i clinical trend dei pazienti, ancora una volta tramite il modello generativo "GPT-3.5-turbo".

In [None]:
# Ask the GPT model for a one-word clinical trend per patient.
# The prompt is built from the lemmatized notes in `df`, not from the generated summaries.

# Group the lemmatized note texts by SUBJECT_ID (one list of notes per patient)
grouped_data = df.groupby('SUBJECT_ID')['LEMMATIZED TEXT'].apply(list)

# One record per patient; the DataFrame is built once after the loop
rows = []

for subject_id, lemmatized_texts in grouped_data.items():
    # Number the patient's notes (starting from 1) and chain them into one prompt fragment
    prompt_notes = "".join(
        f"\n Note {i}: {text}" for i, text in enumerate(lemmatized_texts, start=1)
    )

    prompt = f"""Given the following patient and his notes: "{prompt_notes}"
    Identify in 1 word the clinical trend like this: extract a single word that accurately represents the clinical trend observed in the patient's notes, considering every detail and keyword.
    For the word, use the general indicators: 'Improvement', 'Stable', 'Worsening' to describe the trend.
    EXAMPLE: "Improvement"

    Otherwise
    Identify in 1 word: "Dead" if the patient from his notes is dead.
    """

    # GPT model used for the clinical trend extraction; max_tokens=5 caps the
    # reply length since a single word is requested.
    message = [{"role": "system", "content": "You are a clinical trend extractor of a patient's clinical notes."},
               {"role": "user", "content": prompt}]
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=message,
        max_tokens=5,
        stop=None,
        temperature=0.3,
        frequency_penalty=0.7
    )

    # Text of the first (and only requested) choice
    clinical_trend = completion['choices'][0]['message']['content']

    # Record the patient ID together with its clinical trend
    rows.append({'SUBJECT_ID': subject_id, 'CLINICAL TREND': clinical_trend})

    # Progress output: patient ID followed by the extracted trend
    print(subject_id)
    print(clinical_trend)
    print("------------------------------")

# Build the frame once, outside the loop; explicit columns keep the schema
# even when there are no patients.
clinical_df = pd.DataFrame(rows, columns=['SUBJECT_ID', 'CLINICAL TREND'])

In [9]:
# Display the per-patient clinical trends produced by the loop above.
clinical_df

Unnamed: 0,SUBJECT_ID,CLINICAL TREND
0,303,Improvement
1,1197,Stable
2,1459,Stable
3,1590,Stable
4,3497,Worsening
5,11448,Improvement
6,14432,Worsening
7,16651,Stable
8,17960,Improvement
9,18089,Stable


In [11]:
# Join summaries and clinical trends on SUBJECT_ID. The join is an inner join
# (pandas' default, spelled out here), so only patients present in both frames are kept.
# No empty frame is pre-created: pd.merge returns a new DataFrame anyway.
merged_df = pd.merge(summary_df, clinical_df, on="SUBJECT_ID", how="inner")

# Display the resulting frame
merged_df

Unnamed: 0,SUBJECT_ID,SUMMARY,CLINICAL TREND
0,303,The patient is a 20-year-old male with a histo...,Improvement
1,1197,The patient is a 72-year-old female with end-s...,Stable
2,1459,The patient is a 66-year-old male with a past ...,Stable
3,1590,The patient is a 55-year-old male with a histo...,Stable
4,3497,The patient is a 57-year-old male with a histo...,Worsening
5,11448,The patient is a 48-year-old female with a his...,Improvement
6,14432,The patient is a 39-year-old woman with a hist...,Worsening
7,16651,The patient is an 80-year-old male with a hist...,Stable
8,17960,The patient is a former 28-weeker with chronic...,Improvement
9,18089,The patient is a 75-year-old white female with...,Stable


## Salvataggio nella collezione documentale `summaries_db` del database `noteevents` su MongoDB

In [92]:
# Store the merged summaries/trends in MongoDB: database `noteevents`, collection `summaries_db`.
client = MongoClient(mongo_string)

db = client.noteevents
summaries_db = db.summaries_db

# Convert the frame to plain JSON-compatible records once (native Python types,
# so no numpy scalars are handed to the driver).
records = json.loads(merged_df.to_json(orient='records'))

# Keep only the patients that are not stored yet. Inserting the whole frame as soon
# as a single patient is missing would duplicate every patient already in the collection.
new_records = [
    record for record in records
    if summaries_db.find_one({'SUBJECT_ID': record['SUBJECT_ID']}) is None
]

# insert_many rejects an empty list, so skip the call when nothing is new
if new_records:
    summaries_db.insert_many(new_records)