In [1]:
import ktrain
import os
import sys
import pandas as pd
import re
import string
import nltk as nltk
from nltk.corpus import stopwords
import numpy as np

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [2]:
pd.set_option('display.max_columns', None)
notebook_dir = os.getcwd()
root_dir = os.path.abspath(os.path.join(notebook_dir, '..'))
data_dir = os.path.join(root_dir, 'data')

In [3]:
df = pd.read_csv(os.path.join(data_dir, 'medical_samples.csv'))
df.head()

Unnamed: 0.1,Unnamed: 0,Pseudo_Patient_Name,description,medical_specialty,sample_name,transcription,keywords
0,0,James,A 23-year-old white female presents with comp...,Allergy / Immunology,Allergic Rhinitis,"SUBJECTIVE:, This 23-year-old white female pr...","allergy / immunology, allergic rhinitis, aller..."
1,1,Chester,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 2,"PAST MEDICAL HISTORY:, He has difficulty climb...","bariatrics, laparoscopic gastric bypass, weigh..."
2,2,Shannon,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 1,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...","bariatrics, laparoscopic gastric bypass, heart..."
3,3,Domingo,2-D M-Mode. Doppler.,Cardiovascular / Pulmonary,2-D Echocardiogram - 1,"2-D M-MODE: , ,1. Left atrial enlargement wit...","cardiovascular / pulmonary, 2-d m-mode, dopple..."
4,4,Eugene,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 2,1. The left ventricular cavity size and wall ...,"cardiovascular / pulmonary, 2-d, doppler, echo..."


* Find unique values for patients, medical specialty, and maybe sample name
* Separate keywords and find all unique values
* lowercase everything

In [4]:
# begin with string replaces
df['transcription'] = df['transcription'].str.replace('DIAGNOSES','DIAGNOSIS')
df['transcription'] = df['transcription'].str.replace('PREOP DIAGNOSIS','PREOPERATIVE DIAGNOSIS')
df['transcription'] = df['transcription'].str.replace('ALLERGIES TO MEDICATIONS','ALLERGIES')
df['transcription'] = df['transcription'].str.replace('OPERATIVE PROCEDURE','OPERATIVE PROCEDURES')
df['transcription'] = df['transcription'].str.replace('DESCRIPTION OF PROCEDURE','OPERATIVE PROCEDURES')
df['transcription'] = df['transcription'].str.replace('DESCRIPTION OF THE PROCEDURE','OPERATIVE PROCEDURES')
df['transcription'] = df['transcription'].str.replace('PROCEDURE NOTE','OPERATIVE PROCEDURES')
df['transcription'] = df['transcription'].str.replace('PROCEDURE IN DETAIL','OPERATIVE PROCEDURES')
df['transcription'] = df['transcription'].str.replace('DETAILS OF THE OPERATION','OPERATIVE PROCEDURES')
df['transcription'] = df['transcription'].str.replace('OPERATIVE PROCEDURE IN DETAIL','OPERATIVE PROCEDURES')
df['transcription'] = df['transcription'].str.replace('INDICATION FOR OPERATION','INDICATIONS FOR PROCEDURE')
df['transcription'] = df['transcription'].str.replace('PAST MEDICAL HISTORY','HISTORY')
df['transcription'] = df['transcription'].str.replace('CURRENT MEDICATIONS','MEDICATIONS')
df['transcription'] = df['transcription'].str.replace('REASON FOR VISIT','INDICATIONS')
df['transcription'] = df['transcription'].str.replace('REASON FOR EXAM','INDICATIONS')
df['transcription'] = df['transcription'].str.replace('INDICATIONS FOR PROCEDURE','INDICATIONS')
df['transcription'] = df['transcription'].str.replace('INDICATION FOR SURGERY','INDICATIONS')
df['transcription'] = df['transcription'].str.replace('HISTORY OF PRESENT ILLNESS', 'HISTORY')
df['transcription'] = df['transcription'].str.replace('ASSESSMENT & PLAN', 'PLAN')
df['transcription'] = df['transcription'].str.replace('RECOMMENDATIONS', 'PLAN')


def ts_div(pre, pattern):
    d  = df['transcription'].str.extract(pattern).fillna("")
    d  = d.add_prefix(pre)
    #for col in d.columns:
    #    d[col] = clean_text(col)
    d1 = df.join(d)

    return d1

df = ts_div('preop_',         '(PREOPERATIVE DIAGNOSIS:(.*?)\.,)[A-Z]') 
df = ts_div('complaint_',     '(CHIEF COMPLAINT:(.*?)\.,)[A-Z]')
df = ts_div('postop_',        '(POSTOPERATIVE DIAGNOSIS:(.*?)\.,)[A-Z]')
df = ts_div('op_procedure_',  '(OPERATIVE PROCEDURES:(.*?)\.,)[A-Z]') 
df = ts_div('endoscope_',     '(ENDOSCOPE USED:(.*?)\.,)[A-Z]')
df = ts_div('anesthesia_',    '(ANESTHESIA:(.*?)\.,)[A-Z]')
df = ts_div('indications_',   '(INDICATIONS:(.*?)\.,)[A-Z]') 
df = ts_div('allergies_',     '(ALLERGIES:(.*?)\.,)[A-Z]') 
df = ts_div('complications_', '(COMPLICATIONS:(.*?)\.,)[A-Z]') 
df = ts_div('bloodloss_',     '(BLOOD LOSS:(.*?)\.,)[A-Z]') 
df = ts_div('meds_',          '(MEDICATIONS:(.*?)\.,)[A-Z]') 
df = ts_div('exam_',          '(PHYSICAL EXAMINATION:(.*?)\.,)[A-Z]') 
df = ts_div('asmt_',          '(ASSESSMENT:(.*?)\.,)[A-Z]') 
df = ts_div('history_',       '(HISTORY:(.*?)\.,)[A-Z]') 
df = ts_div('op_name_',       '(TITLE OF OPERATION:(.*?)\.,)[A-Z]') 
df = ts_div('physical_',      '(PHYSICAL EXAMINATION:(.*?)\.,)[A-Z]') 
df = ts_div('diagnosis_',     '(DIAGNOSIS:(.*?)\.,)[A-Z]') 
df = ts_div('recommendations_',     '(PLAN:(.*?)\.,)[A-Z]') 


# recommendations and next steps

# other variables to create: cancer, heart disease, binaries for all of these?
# also need to drop the "_0" versions of the output variable or figure out why they're made. 



In [5]:
#Clean some of the text
def clean_text(text, single_character = True, numbers = True, punctuation = True, lowercase = True,  stop_words = True):
    
    
    # Remove punctuation - do this before tokenizing in case there are dashes that connect words
    if punctuation:
        text = re.sub(r'[^\w\s]', ' ', text)
        #[word for word in words if word.isalpha()]
    
    words = nltk.tokenize.word_tokenize(text)
    stopwrd = stopwords.words('english')
    stopwrd = set(nltk.corpus.stopwords.words('english'))

    # Lowercase all words (default_stopwords are lowercase too)
    if lowercase:
        words = [word.lower() for word in words]
    
    # Remove single-character tokens (mostly punctuation)
    if single_character:
        words = [word for word in words if len(word) > 1]

    # Remove numbers
    if numbers:
        words = [word for word in words if not word.isnumeric()]

    # Remove stopwords
    if stop_words:
        words = [word for word in words if word not in stopwrd]

    #Join words into one string
    words = ' '.join(str(e) for e in words)
    
    return words


In [6]:
def clean_column(column, single_character = True, numbers = True, lowercase = True, punctuation = True, stop_words = True):
    
    ans_list = []

    for row in range(len(df)):

        #If not a string, ignore (there are some null values)
        if type(df[column][row]) != str:
            ans = ''
        else:
            ans = clean_text(df[column][row], single_character, numbers, lowercase, punctuation, stop_words)
        ans_list.append(ans)

    return ans_list
    

In [7]:
df['sample_name_adj'] = clean_column('sample_name')
df['sample_name_adj']

0                         allergic rhinitis
1       laparoscopic gastric bypass consult
2       laparoscopic gastric bypass consult
3                            echocardiogram
4                            echocardiogram
                       ...                 
4994                      chronic sinusitis
4995     kawasaki disease discharge summary
4996                        followup asthma
4997                        asthma year old
4998             allergy evaluation consult
Name: sample_name_adj, Length: 4999, dtype: object

In [8]:
df['description_adj'] = clean_column('description', numbers = False)
df['description_adj']

0       23 year old white female presents complaint al...
1                     consult laparoscopic gastric bypass
2                     consult laparoscopic gastric bypass
3                                            mode doppler
4                                          echocardiogram
                              ...                        
4994    patient severe sinusitis two three months ago ...
4995    14 month old baby boy caucasian came presumpti...
4996    female complete physical follow asthma allergi...
4997                      mother states wheezing coughing
4998    acute allergic reaction etiology uncertain how...
Name: description_adj, Length: 4999, dtype: object

In [9]:
df['transcription_adj'] = clean_column('transcription', numbers = False, stop_words = False)
df['transcription_adj']

0       subjective this 23 year old white female prese...
1       history he has difficulty climbing stairs diff...
2       history have seen abc today he is very pleasan...
3       mode left atrial enlargement with left atrial ...
4       the left ventricular cavity size and wall thic...
                              ...                        
4994    history had the pleasure of meeting and evalua...
4995    admitting diagnosis kawasaki disease discharge...
4996    subjective this is 42 year old white female wh...
4997    chief complaint this year old male presents to...
4998    history 34 year old male presents today self r...
Name: transcription_adj, Length: 4999, dtype: object

In [10]:
df['keywords_adj'] = clean_column('keywords')
df['keywords_adj']

0       allergy immunology allergic rhinitis allergies...
1       bariatrics laparoscopic gastric bypass weight ...
2       bariatrics laparoscopic gastric bypass heart a...
3       cardiovascular pulmonary mode doppler aortic v...
4       cardiovascular pulmonary doppler echocardiogra...
                              ...                        
4994                                                     
4995    allergy immunology mucous membranes conjunctiv...
4996                                                     
4997                                                     
4998                                                     
Name: keywords_adj, Length: 4999, dtype: object

In [11]:
df.head()

Unnamed: 0.1,Unnamed: 0,Pseudo_Patient_Name,description,medical_specialty,sample_name,transcription,keywords,preop_0,preop_1,complaint_0,complaint_1,postop_0,postop_1,op_procedure_0,op_procedure_1,endoscope_0,endoscope_1,anesthesia_0,anesthesia_1,indications_0,indications_1,allergies_0,allergies_1,complications_0,complications_1,bloodloss_0,bloodloss_1,meds_0,meds_1,exam_0,exam_1,asmt_0,asmt_1,history_0,history_1,op_name_0,op_name_1,physical_0,physical_1,diagnosis_0,diagnosis_1,recommendations_0,recommendations_1,sample_name_adj,description_adj,transcription_adj,keywords_adj
0,0,James,A 23-year-old white female presents with comp...,Allergy / Immunology,Allergic Rhinitis,"SUBJECTIVE:, This 23-year-old white female pr...","allergy / immunology, allergic rhinitis, aller...",,,,,,,,,,,,,,,"ALLERGIES: , She has no known medicine allergi...",", She has no known medicine allergies",,,,,"MEDICATIONS: , Her only medication currently i...",", Her only medication currently is Ortho Tri-...",,,"ASSESSMENT:, Allergic rhinitis.,",", Allergic rhinitis",,,,,,,,,,,allergic rhinitis,23 year old white female presents complaint al...,subjective this 23 year old white female prese...,allergy immunology allergic rhinitis allergies...
1,1,Chester,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 2,"HISTORY:, He has difficulty climbing stairs, d...","bariatrics, laparoscopic gastric bypass, weigh...",,,,,,,,,,,,,,,"ALLERGIES:, He is allergic to Penicillin.,",", He is allergic to Penicillin",,,,,"MEDICATIONS:, None.,",", None",,,,,"HISTORY:, He has difficulty climbing stairs, d...",", He has difficulty climbing stairs, difficult...",,,,,,,,,laparoscopic gastric bypass consult,consult laparoscopic gastric bypass,history he has difficulty climbing stairs diff...,bariatrics laparoscopic gastric bypass weight ...
2,2,Shannon,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 1,"HISTORY: , I have seen ABC today. He is a ver...","bariatrics, laparoscopic gastric bypass, heart...",,,,,,,,,,,,,,,,,,,,,"MEDICATIONS:, Include Diovan, Crestor, and Tr...",", Include Diovan, Crestor, and Tricor","PHYSICAL EXAMINATION: ,He is alert and orient...",",He is alert and oriented x 3. Cranial nerv...",,,"HISTORY: , I have seen ABC today. He is a ver...",", I have seen ABC today. He is a very pleasa...",,,"PHYSICAL EXAMINATION: ,He is alert and orient...",",He is alert and oriented x 3. Cranial nerv...",,,,,laparoscopic gastric bypass consult,consult laparoscopic gastric bypass,history have seen abc today he is very pleasan...,bariatrics laparoscopic gastric bypass heart a...
3,3,Domingo,2-D M-Mode. Doppler.,Cardiovascular / Pulmonary,2-D Echocardiogram - 1,"2-D M-MODE: , ,1. Left atrial enlargement wit...","cardiovascular / pulmonary, 2-d m-mode, dopple...",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,echocardiogram,mode doppler,mode left atrial enlargement with left atrial ...,cardiovascular pulmonary mode doppler aortic v...
4,4,Eugene,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 2,1. The left ventricular cavity size and wall ...,"cardiovascular / pulmonary, 2-d, doppler, echo...",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,echocardiogram,echocardiogram,the left ventricular cavity size and wall thic...,cardiovascular pulmonary doppler echocardiogra...


### Analysis

In [13]:
df['Pseudo_Patient_Name'].value_counts()

James      95
John       89
Mary       85
Michael    83
Robert     80
           ..
Seth        1
Lana        1
Stephan     1
Barney      1
Nell        1
Name: Pseudo_Patient_Name, Length: 1280, dtype: int64

In [14]:
df['sample_name_adj'].value_counts()

gen med consult                                    108
colonoscopy                                         46
discharge summary                                   42
ct abdomen pelvis                                   36
anterior cervical discectomy fusion                 30
                                                  ... 
wound debridement                                    1
facial laceration closure                            1
year old exam                                        1
arthroscopic subacromial decompression shoulder      1
weight loss phentermine                              1
Name: sample_name_adj, Length: 1696, dtype: int64

In [15]:
df['medical_specialty'].value_counts()

 Surgery                          1103
 Consult - History and Phy.        516
 Cardiovascular / Pulmonary        372
 Orthopedic                        355
 Radiology                         273
 General Medicine                  259
 Gastroenterology                  230
 Neurology                         223
 SOAP / Chart / Progress Notes     166
 Obstetrics / Gynecology           160
 Urology                           158
 Discharge Summary                 108
 ENT - Otolaryngology               98
 Neurosurgery                       94
 Hematology - Oncology              90
 Ophthalmology                      83
 Nephrology                         81
 Emergency Room Reports             75
 Pediatrics - Neonatal              70
 Pain Management                    62
 Psychiatry / Psychology            53
 Office Notes                       51
 Podiatry                           47
 Dermatology                        29
 Cosmetic / Plastic Surgery         27
 Dentistry               

In [16]:
##Get all words in keywords
list_keywords = []

for row in range(len(df)):
    value = nltk.tokenize.word_tokenize(df['keywords_adj'][row])
    list_keywords.extend(value)
    
list_keywords

['allergy',
 'immunology',
 'allergic',
 'rhinitis',
 'allergies',
 'asthma',
 'nasal',
 'sprays',
 'rhinitis',
 'nasal',
 'erythematous',
 'allegra',
 'sprays',
 'allergic',
 'bariatrics',
 'laparoscopic',
 'gastric',
 'bypass',
 'weight',
 'loss',
 'programs',
 'gastric',
 'bypass',
 'atkin',
 'diet',
 'weight',
 'watcher',
 'body',
 'weight',
 'laparoscopic',
 'gastric',
 'weight',
 'loss',
 'pounds',
 'months',
 'weight',
 'laparoscopic',
 'band',
 'loss',
 'diets',
 'overweight',
 'lost',
 'bariatrics',
 'laparoscopic',
 'gastric',
 'bypass',
 'heart',
 'attacks',
 'body',
 'weight',
 'pulmonary',
 'embolism',
 'potential',
 'complications',
 'sleep',
 'study',
 'weight',
 'loss',
 'gastric',
 'bypass',
 'anastomosis',
 'loss',
 'sleep',
 'laparoscopic',
 'gastric',
 'bypass',
 'heart',
 'pounds',
 'weight',
 'cardiovascular',
 'pulmonary',
 'mode',
 'doppler',
 'aortic',
 'valve',
 'atrial',
 'enlargement',
 'diastolic',
 'function',
 'ejection',
 'fraction',
 'mitral',
 'mitral'

In [17]:
pd.DataFrame(list_keywords).value_counts()

surgery           1129
reports            788
sample             759
medical            518
transcription      506
                  ... 
sphincterotome       1
bending              1
leakingnote          1
inspiratory          1
rolled               1
Length: 5833, dtype: int64

In [19]:
df.to_csv(os.path.join(data_dir, 'medical_samples_adj.csv'), index = False)