In [1]:
import pandas as pd
import numpy as np

from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.summarizers.lex_rank import LexRankSummarizer 
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
from math import ceil

import en_core_web_lg

from collections import Counter
from string import punctuation



In [2]:
data_df = pd.read_csv('./datasets/mtsamples.csv').set_index(['index'])
data_df

Unnamed: 0_level_0,description,medical_specialty,sample_name,transcription,keywords
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,A 23-year-old white female presents with comp...,Allergy / Immunology,Allergic Rhinitis,"SUBJECTIVE:, This 23-year-old white female pr...","allergy / immunology, allergic rhinitis, aller..."
1,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 2,"PAST MEDICAL HISTORY:, He has difficulty climb...","bariatrics, laparoscopic gastric bypass, weigh..."
2,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 1,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...","bariatrics, laparoscopic gastric bypass, heart..."
3,2-D M-Mode. Doppler.,Cardiovascular / Pulmonary,2-D Echocardiogram - 1,"2-D M-MODE: , ,1. Left atrial enlargement wit...","cardiovascular / pulmonary, 2-d m-mode, dopple..."
4,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 2,1. The left ventricular cavity size and wall ...,"cardiovascular / pulmonary, 2-d, doppler, echo..."
...,...,...,...,...,...
4994,Patient having severe sinusitis about two to ...,Allergy / Immunology,Chronic Sinusitis,"HISTORY:, I had the pleasure of meeting and e...",
4995,This is a 14-month-old baby boy Caucasian who...,Allergy / Immunology,Kawasaki Disease - Discharge Summary,"ADMITTING DIAGNOSIS: , Kawasaki disease.,DISCH...","allergy / immunology, mucous membranes, conjun..."
4996,A female for a complete physical and follow u...,Allergy / Immunology,Followup on Asthma,"SUBJECTIVE: , This is a 42-year-old white fema...",
4997,Mother states he has been wheezing and coughing.,Allergy / Immunology,Asthma in a 5-year-old,"CHIEF COMPLAINT: , This 5-year-old male presen...",


In [3]:
#instantiating model

nlp = en_core_web_lg.load()

In [4]:
def summarize(text):

    ''' Function to summarize texts, where each word is given a fixed score according to its frequency in each text input 
    (e.g. data_df['transcription'][0]). 
    For each sentence in the text input, the sentence strength is then determined by adding up the scores of the words in the sentence. 
    We then pick the strongest sentences in each text input to make up the summary for that text input'''
    
    # we want the keywords in order to assign a score to each sentence

    keyword = []
    pos_tag = ['PROPN', 'ADJ', 'NOUN', 'VERB']
    doc = nlp(str(text).lower())
    for token in doc:
        # If in stop words or is punctuation, then dont add to keyword
        if(token.text in nlp.Defaults.stop_words or token.text in punctuation):
            continue
        # if word is in pos_tag categories, add to key words
        if(token.pos_ in pos_tag):
            keyword.append(token.text)
    
    # get dictionary of word frequencies
    freq_word = Counter(keyword)

    # we get the frequency of the highest occuring word

    max_freq = Counter(keyword).most_common(1)[0][1]

    # standard scale the frequency of each word in relation to the highest occuring word
    # update the dictionary of word frequencies with scaled fraction

    for w in freq_word:
        freq_word[w] = (freq_word[w]/max_freq)
    
    # we create a dictionary of the sum(word frequencies) of each sentence
    sent_strength={}
    for sent in doc.sents:
        for word in sent:
            if word.text in freq_word.keys():
                if sent in sent_strength.keys():
                    sent_strength[sent]+=freq_word[word.text]
                else:
                    sent_strength[sent]=freq_word[word.text] 
    summary = []
    
    # sort sentences according to sent strength from highest to lowest

    sorted_x = sorted(sent_strength.items(), key=lambda kv: kv[1], reverse=True)

    # This denominator of 5 is to ensure a limit on how many sentences you want to summarize the text into. 
    # We do a division as if there are more sentences, we should automatically summarize to more sentences rather than hard code a number

    max_sent = ceil(len(sorted_x)/5)

    min_sent_strength = sorted_x[max_sent-1][1]

    # obtain the sentences in chronological order for those that have the necessary sent_strength
        
    summary = [str(k).capitalize() for k,v in sent_strength.items() if v >= min_sent_strength]

    summary = ' '.join(summary)
            
    return  summary

In [5]:
def summarize_bulk(text_list):

    ''' Function to summarize texts, where each word is given a fixed score according to its frequency in the whole document. 
    For each sentence, the sentence strength is then determined by adding up the scores of the words in the sentence. 
    We then pick the strongest sentences in each text input to make up the summary of that text input'''
    
    # we want the keywords in order to assign a score to each sentence

    keyword = []
    pos_tag = ['PROPN', 'ADJ', 'NOUN', 'VERB']

    # doing for the whole dataset

    for t in text_list:
        doc = nlp(str(t).lower())
        for token in doc:
            # If in stop words or is punctuation, then dont add to keyword
            if(token.text in nlp.Defaults.stop_words or token.text in punctuation):
                continue
            # if word is in pos_tag categories, add to key words
            if(token.pos_ in pos_tag):
                keyword.append(token.text)
        
    # get dictionary of word frequencies
    freq_word = Counter(keyword)

    # we get the frequency of the highest occuring word according to document

    max_freq = Counter(keyword).most_common(1)[0][1]

    # standard scale the frequency of each word in relation to the highest occuring word
    # update the dictionary of word frequencies with scaled fraction

    for w in freq_word:
        freq_word[w] = (freq_word[w]/max_freq)
    
    # we create a dictionary of the sum(word frequencies) of each sentence

    summary_list = []

    for t in text_list:
        sent_strength={}

        doc = nlp(str(t).lower())
        for sent in doc.sents:
            for word in sent:
                if word.text in freq_word.keys():
                    if sent in sent_strength.keys():
                        sent_strength[sent]+=freq_word[word.text]
                    else:
                        sent_strength[sent]=freq_word[word.text] 
        summary = []
        
        # sort sentences according to sent strength from highest to lowest

        sorted_x = sorted(sent_strength.items(), key=lambda kv: kv[1], reverse=True)

        # This denominator of 5 is to ensure a limit on how many sentences you want to summarize the text into. 
        # We do a division as if there are more sentences, we should automatically summarize to more sentences rather than hard code a number

        max_sent = ceil(len(sorted_x)/5)

        min_sent_strength = sorted_x[max_sent-1][1]

        # obtain the sentences in chronological order for those that have the necessary sent_strength
        
        summary = [str(k).capitalize() for k,v in sent_strength.items() if v >= min_sent_strength]

        summary = ' '.join(summary)

        summary_list.append(summary)

    return  summary_list

In [6]:
data_df['word_freq_baseline_model'] = data_df['transcription'].apply(summarize)

In [7]:
summary_list = summarize_bulk(data_df['transcription'])
data_df['word_freq_bulk_model'] = summary_list

In [10]:
#Demo of baseline summary

clinical_notes = data_df.iloc[0]['transcription']

summary = summarize(clinical_notes)

print(f"Original: \n\n {data_df.iloc[0]['transcription']}")

print(f"\n Summarized: \n\n {summary}")

Original: 

 SUBJECTIVE:,  This 23-year-old white female presents with complaint of allergies.  She used to have allergies when she lived in Seattle but she thinks they are worse here.  In the past, she has tried Claritin, and Zyrtec.  Both worked for short time but then seemed to lose effectiveness.  She has used Allegra also.  She used that last summer and she began using it again two weeks ago.  It does not appear to be working very well.  She has used over-the-counter sprays but no prescription nasal sprays.  She does have asthma but doest not require daily medication for this and does not think it is flaring up.,MEDICATIONS: , Her only medication currently is Ortho Tri-Cyclen and the Allegra.,ALLERGIES: , She has no known medicine allergies.,OBJECTIVE:,Vitals:  Weight was 130 pounds and blood pressure 124/78.,HEENT:  Her throat was mildly erythematous without exudate.  Nasal mucosa was erythematous and swollen.  Only clear drainage was seen.  TMs were clear.,Neck:  Supple without 

In [11]:
# demo of baseline bulk summary

text_list = [data_df['transcription'][0],data_df['transcription'][1],data_df['transcription'][2]]

summary_list = summarize_bulk(text_list)

print(f"Original: \n\n {data_df.iloc[0]['transcription']}")

print(f"\n Summarized: \n\n {summary_list[0]}")

Original: 

 SUBJECTIVE:,  This 23-year-old white female presents with complaint of allergies.  She used to have allergies when she lived in Seattle but she thinks they are worse here.  In the past, she has tried Claritin, and Zyrtec.  Both worked for short time but then seemed to lose effectiveness.  She has used Allegra also.  She used that last summer and she began using it again two weeks ago.  It does not appear to be working very well.  She has used over-the-counter sprays but no prescription nasal sprays.  She does have asthma but doest not require daily medication for this and does not think it is flaring up.,MEDICATIONS: , Her only medication currently is Ortho Tri-Cyclen and the Allegra.,ALLERGIES: , She has no known medicine allergies.,OBJECTIVE:,Vitals:  Weight was 130 pounds and blood pressure 124/78.,HEENT:  Her throat was mildly erythematous without exudate.  Nasal mucosa was erythematous and swollen.  Only clear drainage was seen.  TMs were clear.,Neck:  Supple without 

In [12]:
# Instantiating textrank and lexrank summarizer objects

summarizer_text = TextRankSummarizer()
summarizer_lex = LexRankSummarizer()

In [69]:
def text_graph_summarizer(text):

      # form a parser document for the text rank summarizer to process

      parser=PlaintextParser.from_string(text,Tokenizer("english"))

      summary = ''.join([str(sentence).capitalize() for sentence in summarizer_text(parser.document, 5)])

      return summary

def lex_graph_summarizer(text):

      # form a parser document for the lex rank summarizer to process

      parser=PlaintextParser.from_string(text,Tokenizer("english"))

      summary = ''.join([str(sentence).capitalize() for sentence in summarizer_lex(parser.document, 5)])

      return summary

In [67]:
data_df['text_rank_model'] = data_df['transcription'].apply(text_graph_summarizer)

In [70]:
data_df['lex_rank_model'] = data_df['transcription'].apply(lex_graph_summarizer)

In [16]:
# original text
clinical_notes

'SUBJECTIVE:,  This 23-year-old white female presents with complaint of allergies.  She used to have allergies when she lived in Seattle but she thinks they are worse here.  In the past, she has tried Claritin, and Zyrtec.  Both worked for short time but then seemed to lose effectiveness.  She has used Allegra also.  She used that last summer and she began using it again two weeks ago.  It does not appear to be working very well.  She has used over-the-counter sprays but no prescription nasal sprays.  She does have asthma but doest not require daily medication for this and does not think it is flaring up.,MEDICATIONS: , Her only medication currently is Ortho Tri-Cyclen and the Allegra.,ALLERGIES: , She has no known medicine allergies.,OBJECTIVE:,Vitals:  Weight was 130 pounds and blood pressure 124/78.,HEENT:  Her throat was mildly erythematous without exudate.  Nasal mucosa was erythematous and swollen.  Only clear drainage was seen.  TMs were clear.,Neck:  Supple without adenopathy.,

In [74]:
#  demo of textrank graph summarizer
text_summary_graph = text_graph_summarizer(clinical_notes)

text_summary_graph

'She used to have allergies when she lived in seattle but she thinks they are worse here.She used that last summer and she began using it again two weeks ago.She has used over-the-counter sprays but no prescription nasal sprays.She does have asthma but doest not require daily medication for this and does not think it is flaring up.,medications: , her only medication currently is ortho tri-cyclen and the allegra.,allergies: , she has no known medicine allergies.,objective:,vitals:  weight was 130 pounds and blood pressure 124/78.,heent:  her throat was mildly erythematous without exudate.She does not think she has prescription coverage so that might be cheaper.,2.'

In [75]:
#  demo of lexrank graph summarizer

lex_summary_graph = lex_graph_summarizer(clinical_notes)

lex_summary_graph

'She has used allegra also.It does not appear to be working very well.She has used over-the-counter sprays but no prescription nasal sprays.She does have asthma but doest not require daily medication for this and does not think it is flaring up.,medications: , her only medication currently is ortho tri-cyclen and the allegra.,allergies: , she has no known medicine allergies.,objective:,vitals:  weight was 130 pounds and blood pressure 124/78.,heent:  her throat was mildly erythematous without exudate.She will try zyrtec instead of allegra again.'

In [114]:
# saving to csv

#data_df.to_csv('nlp_summarizer_130922.csv',header=True)
#pd.read_csv('nlp_summarizer_130922.csv')

Unnamed: 0,index,description,medical_specialty,sample_name,transcription,keywords,word_freq_baseline_model,word_freq_bulk_model,text_rank_model,lex_rank_model
0,0,A 23-year-old white female presents with comp...,Allergy / Immunology,Allergic Rhinitis,"SUBJECTIVE:, This 23-year-old white female pr...","allergy / immunology, allergic rhinitis, aller...","Subjective:, this 23-year-old white female pr...","Subjective:, this 23-year-old white female pr...",She used to have allergies when she lived in s...,She has used allegra also.It does not appear t...
1,1,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 2,"PAST MEDICAL HISTORY:, He has difficulty climb...","bariatrics, laparoscopic gastric bypass, weigh...","Past medical history:, he has difficulty climb...","Past medical history:, he has difficulty climb...","Past medical history:, he has difficulty climb...",Difficulty with snoring.He now smokes less tha...
2,2,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 1,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...","bariatrics, laparoscopic gastric bypass, heart...",his biggest weight loss is 25 pounds and it w...,"History of present illness: , i have seen abc ...",When he loses weight he always regains it and ...,"He is 5'9"".He drinks alcohol ten to twelve dri..."
3,3,2-D M-Mode. Doppler.,Cardiovascular / Pulmonary,2-D Echocardiogram - 1,"2-D M-MODE: , ,1. Left atrial enlargement wit...","cardiovascular / pulmonary, 2-d m-mode, dopple...",left atrial enlargement with left atrial diam...,left atrial enlargement with left atrial diam...,Left atrial enlargement with left atrial diame...,"2-d m-mode: , ,1.Left atrial enlargement with ..."
4,4,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 2,1. The left ventricular cavity size and wall ...,"cardiovascular / pulmonary, 2-d, doppler, echo...",the wall motion and left ventricular systolic...,the left ventricular cavity size and wall thi...,The wall motion and left ventricular systolic ...,The wall motion and left ventricular systolic ...
...,...,...,...,...,...,...,...,...,...,...
4994,4994,Patient having severe sinusitis about two to ...,Allergy / Immunology,Chronic Sinusitis,"HISTORY:, I had the pleasure of meeting and e...",,"History:, i had the pleasure of meeting and e...","History:, i had the pleasure of meeting and e...",She had a ct of her paranasal sinuses identify...,"History:, i had the pleasure of meeting and e..."
4995,4995,This is a 14-month-old baby boy Caucasian who...,Allergy / Immunology,Kawasaki Disease - Discharge Summary,"ADMITTING DIAGNOSIS: , Kawasaki disease.,DISCH...","allergy / immunology, mucous membranes, conjun...","Course:, this is a 14-month-old baby boy cauc...","Course:, this is a 14-month-old baby boy cauc...","Admitting diagnosis: , kawasaki disease.,disch...","When he was sent to the hospital, he had a fev..."
4996,4996,A female for a complete physical and follow u...,Allergy / Immunology,Followup on Asthma,"SUBJECTIVE: , This is a 42-year-old white fema...",,"Subjective: , this is a 42-year-old white fema...","Subjective: , this is a 42-year-old white fema...",She also notes that in the past she was on adv...,"No abdominal pain, no heartburn, no constipati..."
4997,4997,Mother states he has been wheezing and coughing.,Allergy / Immunology,Asthma in a 5-year-old,"CHIEF COMPLAINT: , This 5-year-old male presen...",,"Chief complaint: , this 5-year-old male presen...","Chief complaint: , this 5-year-old male presen...","He was evaluated at the clinic, given the brea...",Mother states he has been wheezing and coughin...


In [115]:
# filtering out top 5 medical specialties to evaluate summaries of specific clinical notes

top_5_medical_specialties = sorted(data_df['medical_specialty'].value_counts()[:5].index)
top_5_medical_specialties

[' Cardiovascular / Pulmonary',
 ' Consult - History and Phy.',
 ' Orthopedic',
 ' Radiology',
 ' Surgery']

In [116]:
#saving filtered summaries to df

summarized_specialty_filter = []

for specialty in top_5_medical_specialties:

    filtered_df = data_df[data_df['medical_specialty']==specialty][3:6]

    summarized_specialty_filter.append(filtered_df)

final_filtered_df = pd.concat(summarized_specialty_filter)

#final_filtered_df.to_csv('nlp_summarizer_top15_130922.csv',header=True)

In [117]:
pd.read_csv('nlp_summarizer_top15_130922.csv')

Unnamed: 0,index,description,medical_specialty,sample_name,transcription,keywords,word_freq_baseline_model,word_freq_bulk_model,text_rank_model,lex_rank_model
0,9,Echocardiogram and Doppler,Cardiovascular / Pulmonary,2-D Echocardiogram - 4,"DESCRIPTION:,1. Normal cardiac chambers size....","cardiovascular / pulmonary, ejection fraction,...",normal lv systolic function. aortic valve se...,normal cardiac chambers size. normal left ve...,Normal lv systolic function.Aortic valve seen ...,"Description:,1.Normal cardiac chambers size.,2..."
1,11,"Normal left ventricle, moderate biatrial enla...",Cardiovascular / Pulmonary,2-D Doppler,"2-D STUDY,1. Mild aortic stenosis, widely calc...","cardiovascular / pulmonary, 2-d study, doppler...",Mild left ventricular hypertrophy but normal s...,Mild left ventricular hypertrophy but normal s...,Mild left ventricular hypertrophy but normal s...,Mild left ventricular hypertrophy but normal s...
2,16,Neck exploration; tracheostomy; urgent flexib...,Cardiovascular / Pulmonary,Tracheostomy,"PREOPERATIVE DIAGNOSES,Airway obstruction seco...","cardiovascular / pulmonary, airway, laryngolog...","Preoperative diagnoses,airway obstruction seco...",",operation performed,neck exploration; tracheo...","Preoperative diagnoses,airway obstruction seco...","Preoperative diagnoses,airway obstruction seco..."
3,4084,"Well-woman check up for a middle-aged woman, ...",Consult - History and Phy.,Well-woman checkup,"CHIEF COMPLAINT:, The patient comes for her w...",,she has not had any problems with vasomotor s...,"Chief complaint:, the patient comes for her w...",She has not had any problems with vasomotor sy...,She has not had any problems with vasomotor sy...
4,4085,A woman with a remote history of ileojejunal ...,Consult - History and Phy.,Wound Care Consult,"TYPE OF CONSULTATION:, Wound care consult.,HI...",,",history of present illness:, the patient is ...",",history of present illness:, the patient is ...","Type of consultation:, wound care consult.,hi...",She subsequently was left with a large open ab...
5,4086,1-year well child check.,Consult - History and Phy.,Well-Child Check - 4,"SUBJECTIVE:, The patient presents with Mom an...","consult - history and phy., well child check, ...","Subjective:, the patient presents with mom an...","Subjective:, the patient presents with mom an...",The family has no concerns stating the patient...,The family has no concerns stating the patient...
6,2013,Excision of dorsal wrist ganglion. Made a tr...,Orthopedic,Wrist Ganglion Excision,"PREOPERATIVE DIAGNOSIS: , Wrist ganglion.,POST...","orthopedic, origin of stalk, extensor retinacu...",",title of procedure: , excision of dorsal wris...",",procedure: , after administering appropriate ...","Preoperative diagnosis: , wrist ganglion.,post...",The arm was exsanguinated with an esmarch and ...
7,2014,Unilateral transpedicular T11 vertebroplasty.,Orthopedic,Vertebroplasty,"PREOPERATIVE DIAGNOSIS:, T11 compression frac...","orthopedic, transpedicular, vertebroplasty, fl...",using ap and lateral fluoroscopic projections...,",summary: , the patient in the operating room ...",Starting from the left side local anesthetic w...,"Preoperative diagnosis:, t11 compression frac..."
8,2017,Trigger thumb release. Right trigger thumb. ...,Orthopedic,Trigger Thumb Release - 1,"PREOPERATIVE DIAGNOSIS: ,Right trigger thumb....","orthopedic, trigger thumb, trigger thumb relea...",",anesthesia:, monitored anesthesia care with ...","Preoperative diagnosis: ,right trigger thumb.","Preoperative diagnosis: ,right trigger thumb....","Preoperative diagnosis: ,right trigger thumb...."
9,1489,Whole body radionuclide bone scan due to pros...,Radiology,Whole Body Radionuclide Bone Scan,"INDICATION:, Prostate Cancer.,TECHNIQUE:, 3....","radiology, prostate cancer, technetium, whole ...",",findings:, there is a focus of abnormal incr...",",findings:, there is a focus of abnormal incr...","Indication:, prostate cancer.,technique:, 3....","Indication:, prostate cancer.,technique:, 3...."
