# **Mount the Drive**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Import required modules**

In [2]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import warnings 
warnings.filterwarnings("ignore", category=DeprecationWarning)

# **Dataset loading**

In [3]:
# Load dataset
def load_data():
    data =pd.read_csv("/content/drive/MyDrive/Research Work/NLP Dataset/Medical NER Dataset/mtsamples.csv")
    return data

In [4]:
NER_df = load_data()
NER_df.head()

Unnamed: 0.1,Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords
0,0,A 23-year-old white female presents with comp...,Allergy / Immunology,Allergic Rhinitis,"SUBJECTIVE:, This 23-year-old white female pr...","allergy / immunology, allergic rhinitis, aller..."
1,1,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 2,"PAST MEDICAL HISTORY:, He has difficulty climb...","bariatrics, laparoscopic gastric bypass, weigh..."
2,2,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 1,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...","bariatrics, laparoscopic gastric bypass, heart..."
3,3,2-D M-Mode. Doppler.,Cardiovascular / Pulmonary,2-D Echocardiogram - 1,"2-D M-MODE: , ,1. Left atrial enlargement wit...","cardiovascular / pulmonary, 2-d m-mode, dopple..."
4,4,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 2,1. The left ventricular cavity size and wall ...,"cardiovascular / pulmonary, 2-d, doppler, echo..."


Observation: As you have seen that there is an unnecessary columns like 'Unnamed' which is not required and seems to be redundant one, so it has to be eliminated. Let's do that

In [5]:
NER_df.drop('Unnamed: 0',inplace=True,axis = 1)

In [6]:
NER_df.head(3)

Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords
0,A 23-year-old white female presents with comp...,Allergy / Immunology,Allergic Rhinitis,"SUBJECTIVE:, This 23-year-old white female pr...","allergy / immunology, allergic rhinitis, aller..."
1,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 2,"PAST MEDICAL HISTORY:, He has difficulty climb...","bariatrics, laparoscopic gastric bypass, weigh..."
2,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 1,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...","bariatrics, laparoscopic gastric bypass, heart..."


I'm gonna remove two attributes, e.g., sample_name and transcription because I do not know the hypothesis of those columns, however, it can be added later, at this moment, I'm focusing to work on other columns

In [7]:
NER_df.drop(['medical_specialty', 'sample_name'], axis=1, inplace= True)

In [8]:
NER_df.head(5)

Unnamed: 0,description,transcription,keywords
0,A 23-year-old white female presents with comp...,"SUBJECTIVE:, This 23-year-old white female pr...","allergy / immunology, allergic rhinitis, aller..."
1,Consult for laparoscopic gastric bypass.,"PAST MEDICAL HISTORY:, He has difficulty climb...","bariatrics, laparoscopic gastric bypass, weigh..."
2,Consult for laparoscopic gastric bypass.,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...","bariatrics, laparoscopic gastric bypass, heart..."
3,2-D M-Mode. Doppler.,"2-D M-MODE: , ,1. Left atrial enlargement wit...","cardiovascular / pulmonary, 2-d m-mode, dopple..."
4,2-D Echocardiogram,1. The left ventricular cavity size and wall ...,"cardiovascular / pulmonary, 2-d, doppler, echo..."


## **Text Preprocessing Layer**

Preprocessing the text is significant steps for the case of natural language processing because data often collect from various online repositories so that it is containing many noise and outliers. In order to make a clean corpus, these have to be filtered out properly. However, the following procedures will be applied for removing the missing values and making a clean corpus. I have considered the following steps for dealing with the unstructured text data. These steps are recommended by the researchers even though there might have another methods. <br>

**Step 1 :** Handelling null and missing values <br>
**Step 2 :** Converting html entities <br>
**Step 3 :** Removing "@user" from all the desriptions <br>
**Step 4 :**Changing all the text into lowercase <br>
**Step 5 :** Apostrophe Lookup <br>
**Step 6 :**Short Word Lookup <br>
**Step 7 :** Emoticon Lookup <br>
**Step 8 :** Replacing Special Characters with space <br>
**Step 9 :**Replacing Numbers (integers) with space <br>
**Step 10 :**Removing words whom length is 1 <br>
**Step 11 :**Removing URL from the text (if exists) <br>
**Step 12 :**Spelling correction <br>
**Step 13 :** Remove stopwords + Tokenization  <br>
**Step 14 :** Text normalization: Lammetization and stemming <br>
**Step 15 :**Part-of-speech tagging (POS tagging) <br>
**Step 16 :** Biomedical Name Entity Recognition (BIO_NER) <br>
**Step 17 :** Abbreviation detection <br>

**Step 1 : Handelling null and missing values**

In [9]:
NER_df.isnull().sum()

description         0
transcription      33
keywords         1068
dtype: int64

In [10]:
NER_df = NER_df.dropna(how='any',axis=0) 

In [11]:
NER_df.isnull().sum()

description      0
transcription    0
keywords         0
dtype: int64

**Step 2 : Converting html entities**

In [12]:
# Importing HTMLParser
from html.parser import HTMLParser
html_parser = HTMLParser()

In [13]:
# Created a new columns i.e. clean_tweet contains the same tweets but cleaned version
NER_df['clean_desc'] = NER_df['description'].apply(lambda x: html_parser.unescape(x))
NER_df.tail(10)

Unnamed: 0,description,transcription,keywords,clean_desc
4977,"Gentleman with long-standing morbid obesity, ...","HISTORY OF PRESENT ILLNESS: , In short, the pa...","bariatrics, medifast, medifast dieting, hypert...","Gentleman with long-standing morbid obesity, ..."
4978,Preop evaluation regarding gastric bypass sur...,"REASON FOR VISIT:, Preop evaluation regarding...","bariatrics, medifast, medifast diet, preop eva...",Preop evaluation regarding gastric bypass sur...
4979,Patient scheduled for laparoscopic gastric by...,"HISTORY: , The patient is scheduled for laparo...","bariatrics, medifast, laparoscopic gastric byp...",Patient scheduled for laparoscopic gastric by...
4980,Evaluation for elective surgical weight loss ...,"PAST MEDICAL HISTORY: , She has a history of h...","bariatrics, elective surgical weight loss, sur...",Evaluation for elective surgical weight loss ...
4982,Evaluation for bariatric surgery.,"PAST MEDICAL HISTORY: , Her medical conditions...","bariatrics, evaluation for bariatric surgery, ...",Evaluation for bariatric surgery.
4984,Patient suffered from morbid obesity for many...,"ADMISSION DIAGNOSIS:, Morbid obesity. BMI is...","bariatrics, laparoscopic gastric bypass, gastr...",Patient suffered from morbid obesity for many...
4985,Patient presented to the Bariatric Surgery Se...,"HISTORY OF PRESENT ILLNESS:, Ms. A is a 55-ye...","bariatrics, jenny craig, medifast, nutrisystem...",Patient presented to the Bariatric Surgery Se...
4989,Evaluation for elective surgical weight loss ...,"PAST MEDICAL HISTORY: ,She had a negative str...","bariatrics, elective surgical weight loss, sur...",Evaluation for elective surgical weight loss ...
4993,"Chronic glossitis, xerostomia, probable envir...","HISTORY:, A 55-year-old female presents self-...","allergy / immunology, chronic glossitis, xeros...","Chronic glossitis, xerostomia, probable envir..."
4995,This is a 14-month-old baby boy Caucasian who...,"ADMITTING DIAGNOSIS: , Kawasaki disease.,DISCH...","allergy / immunology, mucous membranes, conjun...",This is a 14-month-old baby boy Caucasian who...


**Remove HTML taggs**

In [14]:
import re

In [15]:
def remove_html(text):
    html_pattern = re.compile('<.*?>')
    return html_pattern.sub(r'', text)

text = """<div>
<h1> H2O</h1>
<p> AutoML</p>
<a href="https://www.h2o.ai/products/h2o-driverless-ai/"> Driverless AI</a>
</div>"""

print(remove_html(text))


 H2O
 AutoML
 Driverless AI



In [16]:
NER_df['clean_desc'] = NER_df['description'].apply(lambda x: remove_html(x))
NER_df.head(10)

Unnamed: 0,description,transcription,keywords,clean_desc
0,A 23-year-old white female presents with comp...,"SUBJECTIVE:, This 23-year-old white female pr...","allergy / immunology, allergic rhinitis, aller...",A 23-year-old white female presents with comp...
1,Consult for laparoscopic gastric bypass.,"PAST MEDICAL HISTORY:, He has difficulty climb...","bariatrics, laparoscopic gastric bypass, weigh...",Consult for laparoscopic gastric bypass.
2,Consult for laparoscopic gastric bypass.,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...","bariatrics, laparoscopic gastric bypass, heart...",Consult for laparoscopic gastric bypass.
3,2-D M-Mode. Doppler.,"2-D M-MODE: , ,1. Left atrial enlargement wit...","cardiovascular / pulmonary, 2-d m-mode, dopple...",2-D M-Mode. Doppler.
4,2-D Echocardiogram,1. The left ventricular cavity size and wall ...,"cardiovascular / pulmonary, 2-d, doppler, echo...",2-D Echocardiogram
5,Morbid obesity. Laparoscopic antecolic anteg...,"PREOPERATIVE DIAGNOSIS: , Morbid obesity.,POST...","bariatrics, gastric bypass, eea anastomosis, r...",Morbid obesity. Laparoscopic antecolic anteg...
6,"Liposuction of the supraumbilical abdomen, re...","PREOPERATIVE DIAGNOSES:,1. Deformity, right b...","bariatrics, breast reconstruction, excess, lma...","Liposuction of the supraumbilical abdomen, re..."
7,2-D Echocardiogram,"2-D ECHOCARDIOGRAM,Multiple views of the heart...","cardiovascular / pulmonary, 2-d echocardiogram...",2-D Echocardiogram
8,Suction-assisted lipectomy - lipodystrophy of...,"PREOPERATIVE DIAGNOSIS: , Lipodystrophy of the...","bariatrics, lipodystrophy, abd pads, suction-a...",Suction-assisted lipectomy - lipodystrophy of...
9,Echocardiogram and Doppler,"DESCRIPTION:,1. Normal cardiac chambers size....","cardiovascular / pulmonary, ejection fraction,...",Echocardiogram and Doppler


**STEP 3: Removing "@user" from all the text (if exists)**

In [17]:
def remove_pattern(input_txt, pattern):
    r = re.findall(pattern, input_txt)
    for i in r:
        input_txt = re.sub(i, '', input_txt)
    return input_txt

In [18]:
NER_df['clean_desc'] = np.vectorize(remove_pattern)(NER_df['clean_desc'], "@[\w]*")
NER_df.tail(10)

Unnamed: 0,description,transcription,keywords,clean_desc
4977,"Gentleman with long-standing morbid obesity, ...","HISTORY OF PRESENT ILLNESS: , In short, the pa...","bariatrics, medifast, medifast dieting, hypert...","Gentleman with long-standing morbid obesity, ..."
4978,Preop evaluation regarding gastric bypass sur...,"REASON FOR VISIT:, Preop evaluation regarding...","bariatrics, medifast, medifast diet, preop eva...",Preop evaluation regarding gastric bypass sur...
4979,Patient scheduled for laparoscopic gastric by...,"HISTORY: , The patient is scheduled for laparo...","bariatrics, medifast, laparoscopic gastric byp...",Patient scheduled for laparoscopic gastric by...
4980,Evaluation for elective surgical weight loss ...,"PAST MEDICAL HISTORY: , She has a history of h...","bariatrics, elective surgical weight loss, sur...",Evaluation for elective surgical weight loss ...
4982,Evaluation for bariatric surgery.,"PAST MEDICAL HISTORY: , Her medical conditions...","bariatrics, evaluation for bariatric surgery, ...",Evaluation for bariatric surgery.
4984,Patient suffered from morbid obesity for many...,"ADMISSION DIAGNOSIS:, Morbid obesity. BMI is...","bariatrics, laparoscopic gastric bypass, gastr...",Patient suffered from morbid obesity for many...
4985,Patient presented to the Bariatric Surgery Se...,"HISTORY OF PRESENT ILLNESS:, Ms. A is a 55-ye...","bariatrics, jenny craig, medifast, nutrisystem...",Patient presented to the Bariatric Surgery Se...
4989,Evaluation for elective surgical weight loss ...,"PAST MEDICAL HISTORY: ,She had a negative str...","bariatrics, elective surgical weight loss, sur...",Evaluation for elective surgical weight loss ...
4993,"Chronic glossitis, xerostomia, probable envir...","HISTORY:, A 55-year-old female presents self-...","allergy / immunology, chronic glossitis, xeros...","Chronic glossitis, xerostomia, probable envir..."
4995,This is a 14-month-old baby boy Caucasian who...,"ADMITTING DIAGNOSIS: , Kawasaki disease.,DISCH...","allergy / immunology, mucous membranes, conjun...",This is a 14-month-old baby boy Caucasian who...


**STEP 4 : Changing all the text into lowercase**

In [19]:
NER_df['clean_desc'] = NER_df['clean_desc'].apply(lambda x: x.lower())
NER_df.head(10)

Unnamed: 0,description,transcription,keywords,clean_desc
0,A 23-year-old white female presents with comp...,"SUBJECTIVE:, This 23-year-old white female pr...","allergy / immunology, allergic rhinitis, aller...",a 23-year-old white female presents with comp...
1,Consult for laparoscopic gastric bypass.,"PAST MEDICAL HISTORY:, He has difficulty climb...","bariatrics, laparoscopic gastric bypass, weigh...",consult for laparoscopic gastric bypass.
2,Consult for laparoscopic gastric bypass.,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...","bariatrics, laparoscopic gastric bypass, heart...",consult for laparoscopic gastric bypass.
3,2-D M-Mode. Doppler.,"2-D M-MODE: , ,1. Left atrial enlargement wit...","cardiovascular / pulmonary, 2-d m-mode, dopple...",2-d m-mode. doppler.
4,2-D Echocardiogram,1. The left ventricular cavity size and wall ...,"cardiovascular / pulmonary, 2-d, doppler, echo...",2-d echocardiogram
5,Morbid obesity. Laparoscopic antecolic anteg...,"PREOPERATIVE DIAGNOSIS: , Morbid obesity.,POST...","bariatrics, gastric bypass, eea anastomosis, r...",morbid obesity. laparoscopic antecolic anteg...
6,"Liposuction of the supraumbilical abdomen, re...","PREOPERATIVE DIAGNOSES:,1. Deformity, right b...","bariatrics, breast reconstruction, excess, lma...","liposuction of the supraumbilical abdomen, re..."
7,2-D Echocardiogram,"2-D ECHOCARDIOGRAM,Multiple views of the heart...","cardiovascular / pulmonary, 2-d echocardiogram...",2-d echocardiogram
8,Suction-assisted lipectomy - lipodystrophy of...,"PREOPERATIVE DIAGNOSIS: , Lipodystrophy of the...","bariatrics, lipodystrophy, abd pads, suction-a...",suction-assisted lipectomy - lipodystrophy of...
9,Echocardiogram and Doppler,"DESCRIPTION:,1. Normal cardiac chambers size....","cardiovascular / pulmonary, ejection fraction,...",echocardiogram and doppler


**STEP 5 : Apostrophe Lookup**

In [20]:
# Apostrophe Dictionary
apostrophe_dict = {
"ain't": "am not / are not",
"aren't": "are not / am not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he had / he would",
"he'd've": "he would have",
"he'll": "he shall / he will",
"he'll've": "he shall have / he will have",
"he's": "he has / he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how has / how is",
"i'd": "I had / I would",
"i'd've": "I would have",
"i'll": "I shall / I will",
"i'll've": "I shall have / I will have",
"i'm": "I am",
"i've": "I have",
"isn't": "is not",
"it'd": "it had / it would",
"it'd've": "it would have",
"it'll": "it shall / it will",
"it'll've": "it shall have / it will have",
"it's": "it has / it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she had / she would",
"she'd've": "she would have",
"she'll": "she shall / she will",
"she'll've": "she shall have / she will have",
"she's": "she has / she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as / so is",
"that'd": "that would / that had",
"that'd've": "that would have",
"that's": "that has / that is",
"there'd": "there had / there would",
"there'd've": "there would have",
"there's": "there has / there is",
"they'd": "they had / they would",
"they'd've": "they would have",
"they'll": "they shall / they will",
"they'll've": "they shall have / they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we had / we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what shall / what will",
"what'll've": "what shall have / what will have",
"what're": "what are",
"what's": "what has / what is",
"what've": "what have",
"when's": "when has / when is",
"when've": "when have",
"where'd": "where did",
"where's": "where has / where is",
"where've": "where have",
"who'll": "who shall / who will",
"who'll've": "who shall have / who will have",
"who's": "who has / who is",
"who've": "who have",
"why's": "why has / why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you had / you would",
"you'd've": "you would have",
"you'll": "you shall / you will",
"you'll've": "you shall have / you will have",
"you're": "you are",
"you've": "you have"
}

In [21]:
def lookup_dict(text, dictionary):
    for word in text.split():
        if word.lower() in dictionary:
            if word.lower() in text.split():
                text = text.replace(word, dictionary[word.lower()])
    return text

In [22]:
NER_df['clean_desc'] = NER_df['clean_desc'].apply(lambda x: lookup_dict(x,apostrophe_dict))
NER_df.head(10)

Unnamed: 0,description,transcription,keywords,clean_desc
0,A 23-year-old white female presents with comp...,"SUBJECTIVE:, This 23-year-old white female pr...","allergy / immunology, allergic rhinitis, aller...",a 23-year-old white female presents with comp...
1,Consult for laparoscopic gastric bypass.,"PAST MEDICAL HISTORY:, He has difficulty climb...","bariatrics, laparoscopic gastric bypass, weigh...",consult for laparoscopic gastric bypass.
2,Consult for laparoscopic gastric bypass.,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...","bariatrics, laparoscopic gastric bypass, heart...",consult for laparoscopic gastric bypass.
3,2-D M-Mode. Doppler.,"2-D M-MODE: , ,1. Left atrial enlargement wit...","cardiovascular / pulmonary, 2-d m-mode, dopple...",2-d m-mode. doppler.
4,2-D Echocardiogram,1. The left ventricular cavity size and wall ...,"cardiovascular / pulmonary, 2-d, doppler, echo...",2-d echocardiogram
5,Morbid obesity. Laparoscopic antecolic anteg...,"PREOPERATIVE DIAGNOSIS: , Morbid obesity.,POST...","bariatrics, gastric bypass, eea anastomosis, r...",morbid obesity. laparoscopic antecolic anteg...
6,"Liposuction of the supraumbilical abdomen, re...","PREOPERATIVE DIAGNOSES:,1. Deformity, right b...","bariatrics, breast reconstruction, excess, lma...","liposuction of the supraumbilical abdomen, re..."
7,2-D Echocardiogram,"2-D ECHOCARDIOGRAM,Multiple views of the heart...","cardiovascular / pulmonary, 2-d echocardiogram...",2-d echocardiogram
8,Suction-assisted lipectomy - lipodystrophy of...,"PREOPERATIVE DIAGNOSIS: , Lipodystrophy of the...","bariatrics, lipodystrophy, abd pads, suction-a...",suction-assisted lipectomy - lipodystrophy of...
9,Echocardiogram and Doppler,"DESCRIPTION:,1. Normal cardiac chambers size....","cardiovascular / pulmonary, ejection fraction,...",echocardiogram and doppler


**STEP 6 : Short Word Lookup**

In [23]:
short_word_dict = {
"121": "one to one",
"a/s/l": "age, sex, location",
"adn": "any day now",
"afaik": "as far as I know",
"afk": "away from keyboard",
"aight": "alright",
"alol": "actually laughing out loud",
"b4": "before",
"b4n": "bye for now",
"bak": "back at the keyboard",
"bf": "boyfriend",
"bff": "best friends forever",
"bfn": "bye for now",
"bg": "big grin",
"bta": "but then again",
"btw": "by the way",
"cid": "crying in disgrace",
"cnp": "continued in my next post",
"cp": "chat post",
"cu": "see you",
"cul": "see you later",
"cul8r": "see you later",
"cya": "bye",
"cyo": "see you online",
"dbau": "doing business as usual",
"fud": "fear, uncertainty, and doubt",
"fwiw": "for what it's worth",
"fyi": "for your information",
"g": "grin",
"g2g": "got to go",
"ga": "go ahead",
"gal": "get a life",
"gf": "girlfriend",
"gfn": "gone for now",
"gmbo": "giggling my butt off",
"gmta": "great minds think alike",
"h8": "hate",
"hagn": "have a good night",
"hdop": "help delete online predators",
"hhis": "hanging head in shame",
"iac": "in any case",
"ianal": "I am not a lawyer",
"ic": "I see",
"idk": "I don't know",
"imao": "in my arrogant opinion",
"imnsho": "in my not so humble opinion",
"imo": "in my opinion",
"iow": "in other words",
"ipn": "I’m posting naked",
"irl": "in real life",
"jk": "just kidding",
"l8r": "later",
"ld": "later, dude",
"ldr": "long distance relationship",
"llta": "lots and lots of thunderous applause",
"lmao": "laugh my ass off",
"lmirl": "let's meet in real life",
"lol": "laugh out loud",
"ltr": "longterm relationship",
"lulab": "love you like a brother",
"lulas": "love you like a sister",
"luv": "love",
"m/f": "male or female",
"m8": "mate",
"milf": "mother I would like to fuck",
"oll": "online love",
"omg": "oh my god",
"otoh": "on the other hand",
"pir": "parent in room",
"ppl": "people",
"r": "are",
"rofl": "roll on the floor laughing",
"rpg": "role playing games",
"ru": "are you",
"shid": "slaps head in disgust",
"somy": "sick of me yet",
"sot": "short of time",
"thanx": "thanks",
"thx": "thanks",
"ttyl": "talk to you later",
"u": "you",
"ur": "you are",
"uw": "you’re welcome",
"wb": "welcome back",
"wfm": "works for me",
"wibni": "wouldn't it be nice if",
"wtf": "what the fuck",
"wtg": "way to go",
"wtgp": "want to go private",
"ym": "young man",
"gr8": "great"
}

In [24]:
NER_df['clean_desc'] = NER_df['clean_desc'].apply(lambda x: lookup_dict(x,short_word_dict))
NER_df.head(10)

Unnamed: 0,description,transcription,keywords,clean_desc
0,A 23-year-old white female presents with comp...,"SUBJECTIVE:, This 23-year-old white female pr...","allergy / immunology, allergic rhinitis, aller...",a 23-year-old white female presents with comp...
1,Consult for laparoscopic gastric bypass.,"PAST MEDICAL HISTORY:, He has difficulty climb...","bariatrics, laparoscopic gastric bypass, weigh...",consult for laparoscopic gastric bypass.
2,Consult for laparoscopic gastric bypass.,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...","bariatrics, laparoscopic gastric bypass, heart...",consult for laparoscopic gastric bypass.
3,2-D M-Mode. Doppler.,"2-D M-MODE: , ,1. Left atrial enlargement wit...","cardiovascular / pulmonary, 2-d m-mode, dopple...",2-d m-mode. doppler.
4,2-D Echocardiogram,1. The left ventricular cavity size and wall ...,"cardiovascular / pulmonary, 2-d, doppler, echo...",2-d echocardiogram
5,Morbid obesity. Laparoscopic antecolic anteg...,"PREOPERATIVE DIAGNOSIS: , Morbid obesity.,POST...","bariatrics, gastric bypass, eea anastomosis, r...",morbid obesity. laparoscopic antecolic anteg...
6,"Liposuction of the supraumbilical abdomen, re...","PREOPERATIVE DIAGNOSES:,1. Deformity, right b...","bariatrics, breast reconstruction, excess, lma...","liposuction of the supraumbilical abdomen, re..."
7,2-D Echocardiogram,"2-D ECHOCARDIOGRAM,Multiple views of the heart...","cardiovascular / pulmonary, 2-d echocardiogram...",2-d echocardiogram
8,Suction-assisted lipectomy - lipodystrophy of...,"PREOPERATIVE DIAGNOSIS: , Lipodystrophy of the...","bariatrics, lipodystrophy, abd pads, suction-a...",suction-assisted lipectomy - lipodystrophy of...
9,Echocardiogram and Doppler,"DESCRIPTION:,1. Normal cardiac chambers size....","cardiovascular / pulmonary, ejection fraction,...",echocardiogram and doppler


**STEP 7 : Emoticon Lookup**

In [25]:
emoticon_dict = {
":)": "happy",
":‑)": "happy",
":-]": "happy",
":-3": "happy",
":->": "happy",
"8-)": "happy",
":-}": "happy",
":o)": "happy",
":c)": "happy",
":^)": "happy",
"=]": "happy",
"=)": "happy",
"<3": "happy",
":-(": "sad",
":(": "sad",
":c": "sad",
":<": "sad",
":[": "sad",
">:[": "sad",
":{": "sad",
">:(": "sad",
":-c": "sad",
":-< ": "sad",
":-[": "sad",
":-||": "sad"
}

In [26]:
emoticon_dict

{'8-)': 'happy',
 ':(': 'sad',
 ':)': 'happy',
 ':-(': 'sad',
 ':-3': 'happy',
 ':-< ': 'sad',
 ':->': 'happy',
 ':-[': 'sad',
 ':-]': 'happy',
 ':-c': 'sad',
 ':-||': 'sad',
 ':-}': 'happy',
 ':<': 'sad',
 ':[': 'sad',
 ':^)': 'happy',
 ':c': 'sad',
 ':c)': 'happy',
 ':o)': 'happy',
 ':{': 'sad',
 ':‑)': 'happy',
 '<3': 'happy',
 '=)': 'happy',
 '=]': 'happy',
 '>:(': 'sad',
 '>:[': 'sad'}

In [27]:
NER_df['clean_desc'] = NER_df['clean_desc'].apply(lambda x: lookup_dict(x,emoticon_dict))
NER_df.head(10)

Unnamed: 0,description,transcription,keywords,clean_desc
0,A 23-year-old white female presents with comp...,"SUBJECTIVE:, This 23-year-old white female pr...","allergy / immunology, allergic rhinitis, aller...",a 23-year-old white female presents with comp...
1,Consult for laparoscopic gastric bypass.,"PAST MEDICAL HISTORY:, He has difficulty climb...","bariatrics, laparoscopic gastric bypass, weigh...",consult for laparoscopic gastric bypass.
2,Consult for laparoscopic gastric bypass.,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...","bariatrics, laparoscopic gastric bypass, heart...",consult for laparoscopic gastric bypass.
3,2-D M-Mode. Doppler.,"2-D M-MODE: , ,1. Left atrial enlargement wit...","cardiovascular / pulmonary, 2-d m-mode, dopple...",2-d m-mode. doppler.
4,2-D Echocardiogram,1. The left ventricular cavity size and wall ...,"cardiovascular / pulmonary, 2-d, doppler, echo...",2-d echocardiogram
5,Morbid obesity. Laparoscopic antecolic anteg...,"PREOPERATIVE DIAGNOSIS: , Morbid obesity.,POST...","bariatrics, gastric bypass, eea anastomosis, r...",morbid obesity. laparoscopic antecolic anteg...
6,"Liposuction of the supraumbilical abdomen, re...","PREOPERATIVE DIAGNOSES:,1. Deformity, right b...","bariatrics, breast reconstruction, excess, lma...","liposuction of the supraumbilical abdomen, re..."
7,2-D Echocardiogram,"2-D ECHOCARDIOGRAM,Multiple views of the heart...","cardiovascular / pulmonary, 2-d echocardiogram...",2-d echocardiogram
8,Suction-assisted lipectomy - lipodystrophy of...,"PREOPERATIVE DIAGNOSIS: , Lipodystrophy of the...","bariatrics, lipodystrophy, abd pads, suction-a...",suction-assisted lipectomy - lipodystrophy of...
9,Echocardiogram and Doppler,"DESCRIPTION:,1. Normal cardiac chambers size....","cardiovascular / pulmonary, ejection fraction,...",echocardiogram and doppler


**STEP 7 : ReplacingPunctuations with space**

In [28]:
NER_df['clean_desc'] = NER_df['clean_desc'].apply(lambda x: re.sub(r'[^\w\s]',' ',x))
NER_df.head(10)

Unnamed: 0,description,transcription,keywords,clean_desc
0,A 23-year-old white female presents with comp...,"SUBJECTIVE:, This 23-year-old white female pr...","allergy / immunology, allergic rhinitis, aller...",a 23 year old white female presents with comp...
1,Consult for laparoscopic gastric bypass.,"PAST MEDICAL HISTORY:, He has difficulty climb...","bariatrics, laparoscopic gastric bypass, weigh...",consult for laparoscopic gastric bypass
2,Consult for laparoscopic gastric bypass.,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...","bariatrics, laparoscopic gastric bypass, heart...",consult for laparoscopic gastric bypass
3,2-D M-Mode. Doppler.,"2-D M-MODE: , ,1. Left atrial enlargement wit...","cardiovascular / pulmonary, 2-d m-mode, dopple...",2 d m mode doppler
4,2-D Echocardiogram,1. The left ventricular cavity size and wall ...,"cardiovascular / pulmonary, 2-d, doppler, echo...",2 d echocardiogram
5,Morbid obesity. Laparoscopic antecolic anteg...,"PREOPERATIVE DIAGNOSIS: , Morbid obesity.,POST...","bariatrics, gastric bypass, eea anastomosis, r...",morbid obesity laparoscopic antecolic anteg...
6,"Liposuction of the supraumbilical abdomen, re...","PREOPERATIVE DIAGNOSES:,1. Deformity, right b...","bariatrics, breast reconstruction, excess, lma...",liposuction of the supraumbilical abdomen re...
7,2-D Echocardiogram,"2-D ECHOCARDIOGRAM,Multiple views of the heart...","cardiovascular / pulmonary, 2-d echocardiogram...",2 d echocardiogram
8,Suction-assisted lipectomy - lipodystrophy of...,"PREOPERATIVE DIAGNOSIS: , Lipodystrophy of the...","bariatrics, lipodystrophy, abd pads, suction-a...",suction assisted lipectomy lipodystrophy of...
9,Echocardiogram and Doppler,"DESCRIPTION:,1. Normal cardiac chambers size....","cardiovascular / pulmonary, ejection fraction,...",echocardiogram and doppler


**STEP 8 : Replacing Special Characters with space**

In [29]:
NER_df['clean_desc'] = NER_df['clean_desc'].apply(lambda x: re.sub(r'[^a-zA-Z0-9]',' ',x))
NER_df.head(10)

Unnamed: 0,description,transcription,keywords,clean_desc
0,A 23-year-old white female presents with comp...,"SUBJECTIVE:, This 23-year-old white female pr...","allergy / immunology, allergic rhinitis, aller...",a 23 year old white female presents with comp...
1,Consult for laparoscopic gastric bypass.,"PAST MEDICAL HISTORY:, He has difficulty climb...","bariatrics, laparoscopic gastric bypass, weigh...",consult for laparoscopic gastric bypass
2,Consult for laparoscopic gastric bypass.,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...","bariatrics, laparoscopic gastric bypass, heart...",consult for laparoscopic gastric bypass
3,2-D M-Mode. Doppler.,"2-D M-MODE: , ,1. Left atrial enlargement wit...","cardiovascular / pulmonary, 2-d m-mode, dopple...",2 d m mode doppler
4,2-D Echocardiogram,1. The left ventricular cavity size and wall ...,"cardiovascular / pulmonary, 2-d, doppler, echo...",2 d echocardiogram
5,Morbid obesity. Laparoscopic antecolic anteg...,"PREOPERATIVE DIAGNOSIS: , Morbid obesity.,POST...","bariatrics, gastric bypass, eea anastomosis, r...",morbid obesity laparoscopic antecolic anteg...
6,"Liposuction of the supraumbilical abdomen, re...","PREOPERATIVE DIAGNOSES:,1. Deformity, right b...","bariatrics, breast reconstruction, excess, lma...",liposuction of the supraumbilical abdomen re...
7,2-D Echocardiogram,"2-D ECHOCARDIOGRAM,Multiple views of the heart...","cardiovascular / pulmonary, 2-d echocardiogram...",2 d echocardiogram
8,Suction-assisted lipectomy - lipodystrophy of...,"PREOPERATIVE DIAGNOSIS: , Lipodystrophy of the...","bariatrics, lipodystrophy, abd pads, suction-a...",suction assisted lipectomy lipodystrophy of...
9,Echocardiogram and Doppler,"DESCRIPTION:,1. Normal cardiac chambers size....","cardiovascular / pulmonary, ejection fraction,...",echocardiogram and doppler


**STEP 9 : Replacing Numbers (integers) with space**

In [30]:
NER_df['clean_desc'] = NER_df['clean_desc'].apply(lambda x: re.sub(r'[^a-zA-Z]',' ',x))
NER_df.head(10)

Unnamed: 0,description,transcription,keywords,clean_desc
0,A 23-year-old white female presents with comp...,"SUBJECTIVE:, This 23-year-old white female pr...","allergy / immunology, allergic rhinitis, aller...",a year old white female presents with comp...
1,Consult for laparoscopic gastric bypass.,"PAST MEDICAL HISTORY:, He has difficulty climb...","bariatrics, laparoscopic gastric bypass, weigh...",consult for laparoscopic gastric bypass
2,Consult for laparoscopic gastric bypass.,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...","bariatrics, laparoscopic gastric bypass, heart...",consult for laparoscopic gastric bypass
3,2-D M-Mode. Doppler.,"2-D M-MODE: , ,1. Left atrial enlargement wit...","cardiovascular / pulmonary, 2-d m-mode, dopple...",d m mode doppler
4,2-D Echocardiogram,1. The left ventricular cavity size and wall ...,"cardiovascular / pulmonary, 2-d, doppler, echo...",d echocardiogram
5,Morbid obesity. Laparoscopic antecolic anteg...,"PREOPERATIVE DIAGNOSIS: , Morbid obesity.,POST...","bariatrics, gastric bypass, eea anastomosis, r...",morbid obesity laparoscopic antecolic anteg...
6,"Liposuction of the supraumbilical abdomen, re...","PREOPERATIVE DIAGNOSES:,1. Deformity, right b...","bariatrics, breast reconstruction, excess, lma...",liposuction of the supraumbilical abdomen re...
7,2-D Echocardiogram,"2-D ECHOCARDIOGRAM,Multiple views of the heart...","cardiovascular / pulmonary, 2-d echocardiogram...",d echocardiogram
8,Suction-assisted lipectomy - lipodystrophy of...,"PREOPERATIVE DIAGNOSIS: , Lipodystrophy of the...","bariatrics, lipodystrophy, abd pads, suction-a...",suction assisted lipectomy lipodystrophy of...
9,Echocardiogram and Doppler,"DESCRIPTION:,1. Normal cardiac chambers size....","cardiovascular / pulmonary, ejection fraction,...",echocardiogram and doppler


**STEP 10 : Removing words whom length is 1**

In [31]:
NER_df['clean_desc'] = NER_df['clean_desc'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>1]))
NER_df['clean_desc'][0:5]

0    year old white female presents with complaint ...
1              consult for laparoscopic gastric bypass
2              consult for laparoscopic gastric bypass
3                                         mode doppler
4                                       echocardiogram
Name: clean_desc, dtype: object

**STEP 11: Remove URL from Text**

In [32]:
import re
def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

In [33]:
NER_df['clean_desc'] = NER_df['clean_desc'].apply(lambda x: remove_urls(x))
NER_df.head(10)

Unnamed: 0,description,transcription,keywords,clean_desc
0,A 23-year-old white female presents with comp...,"SUBJECTIVE:, This 23-year-old white female pr...","allergy / immunology, allergic rhinitis, aller...",year old white female presents with complaint ...
1,Consult for laparoscopic gastric bypass.,"PAST MEDICAL HISTORY:, He has difficulty climb...","bariatrics, laparoscopic gastric bypass, weigh...",consult for laparoscopic gastric bypass
2,Consult for laparoscopic gastric bypass.,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...","bariatrics, laparoscopic gastric bypass, heart...",consult for laparoscopic gastric bypass
3,2-D M-Mode. Doppler.,"2-D M-MODE: , ,1. Left atrial enlargement wit...","cardiovascular / pulmonary, 2-d m-mode, dopple...",mode doppler
4,2-D Echocardiogram,1. The left ventricular cavity size and wall ...,"cardiovascular / pulmonary, 2-d, doppler, echo...",echocardiogram
5,Morbid obesity. Laparoscopic antecolic anteg...,"PREOPERATIVE DIAGNOSIS: , Morbid obesity.,POST...","bariatrics, gastric bypass, eea anastomosis, r...",morbid obesity laparoscopic antecolic antegast...
6,"Liposuction of the supraumbilical abdomen, re...","PREOPERATIVE DIAGNOSES:,1. Deformity, right b...","bariatrics, breast reconstruction, excess, lma...",liposuction of the supraumbilical abdomen revi...
7,2-D Echocardiogram,"2-D ECHOCARDIOGRAM,Multiple views of the heart...","cardiovascular / pulmonary, 2-d echocardiogram...",echocardiogram
8,Suction-assisted lipectomy - lipodystrophy of...,"PREOPERATIVE DIAGNOSIS: , Lipodystrophy of the...","bariatrics, lipodystrophy, abd pads, suction-a...",suction assisted lipectomy lipodystrophy of th...
9,Echocardiogram and Doppler,"DESCRIPTION:,1. Normal cardiac chambers size....","cardiovascular / pulmonary, ejection fraction,...",echocardiogram and doppler


**STEP 12 : Spelling Correction - With TextBlob Library**

In [34]:
from textblob import TextBlob
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [35]:
# Spelling correction is a cool feature which TextBlob offers, we can be accessed using the correct function as shown below.
blob = TextBlob("Why are you stting on this bech??") # Scentence with two errors
print(blob.correct()) # Correct function give us the best possible word simmilar to "gret"

Why are you sitting on this bench??


In [36]:
# we can see all the similar matches our first error along with the probability score.
blob.words[3].spellcheck()

[('sitting', 0.8078078078078078),
 ('setting', 0.11411411411411411),
 ('string', 0.036036036036036036),
 ('sting', 0.02702702702702703),
 ('stating', 0.015015015015015015)]

In [37]:
# Not cleaning the just showing the spelling check as its take lot of time to process all these tweets
## Shown sample how its must done
text = NER_df['clean_desc'][0:10].apply(lambda x: str(TextBlob(x).correct()))
text

0    year old white female presents with complaint ...
1                consult for laparoscopic gastric pass
2                consult for laparoscopic gastric pass
3                                          mode copper
4                                       echocardiogram
5    morbid obesity laparoscopic antecolic antegast...
6    liposuction of the supraumbilical abdomen revi...
7                                       echocardiogram
8    suction assisted lipectomy lipodystrophy of th...
9                            echocardiogram and copper
Name: clean_desc, dtype: object

In [38]:
# Importing stop words from NLTK coupus and word tokenizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [39]:
# Creating token for the clean tweets
NER_df['NER_token'] = NER_df['clean_desc'].apply(lambda x: word_tokenize(x))
NER_df.head(10)

Unnamed: 0,description,transcription,keywords,clean_desc,NER_token
0,A 23-year-old white female presents with comp...,"SUBJECTIVE:, This 23-year-old white female pr...","allergy / immunology, allergic rhinitis, aller...",year old white female presents with complaint ...,"[year, old, white, female, presents, with, com..."
1,Consult for laparoscopic gastric bypass.,"PAST MEDICAL HISTORY:, He has difficulty climb...","bariatrics, laparoscopic gastric bypass, weigh...",consult for laparoscopic gastric bypass,"[consult, for, laparoscopic, gastric, bypass]"
2,Consult for laparoscopic gastric bypass.,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...","bariatrics, laparoscopic gastric bypass, heart...",consult for laparoscopic gastric bypass,"[consult, for, laparoscopic, gastric, bypass]"
3,2-D M-Mode. Doppler.,"2-D M-MODE: , ,1. Left atrial enlargement wit...","cardiovascular / pulmonary, 2-d m-mode, dopple...",mode doppler,"[mode, doppler]"
4,2-D Echocardiogram,1. The left ventricular cavity size and wall ...,"cardiovascular / pulmonary, 2-d, doppler, echo...",echocardiogram,[echocardiogram]
5,Morbid obesity. Laparoscopic antecolic anteg...,"PREOPERATIVE DIAGNOSIS: , Morbid obesity.,POST...","bariatrics, gastric bypass, eea anastomosis, r...",morbid obesity laparoscopic antecolic antegast...,"[morbid, obesity, laparoscopic, antecolic, ant..."
6,"Liposuction of the supraumbilical abdomen, re...","PREOPERATIVE DIAGNOSES:,1. Deformity, right b...","bariatrics, breast reconstruction, excess, lma...",liposuction of the supraumbilical abdomen revi...,"[liposuction, of, the, supraumbilical, abdomen..."
7,2-D Echocardiogram,"2-D ECHOCARDIOGRAM,Multiple views of the heart...","cardiovascular / pulmonary, 2-d echocardiogram...",echocardiogram,[echocardiogram]
8,Suction-assisted lipectomy - lipodystrophy of...,"PREOPERATIVE DIAGNOSIS: , Lipodystrophy of the...","bariatrics, lipodystrophy, abd pads, suction-a...",suction assisted lipectomy lipodystrophy of th...,"[suction, assisted, lipectomy, lipodystrophy, ..."
9,Echocardiogram and Doppler,"DESCRIPTION:,1. Normal cardiac chambers size....","cardiovascular / pulmonary, ejection fraction,...",echocardiogram and doppler,"[echocardiogram, and, doppler]"


**STEP 13 : Remove the stopwords**

In [40]:
# Importing stop words from NLTK corpus for english language
import nltk
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stop_words

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [41]:
NER_df['NER_token_filtered'] = NER_df['NER_token'].apply(lambda x: [word for word in x if not word in stop_words])

NER_df[['NER_token', 'NER_token_filtered']].head(10)

Unnamed: 0,NER_token,NER_token_filtered
0,"[year, old, white, female, presents, with, com...","[year, old, white, female, presents, complaint..."
1,"[consult, for, laparoscopic, gastric, bypass]","[consult, laparoscopic, gastric, bypass]"
2,"[consult, for, laparoscopic, gastric, bypass]","[consult, laparoscopic, gastric, bypass]"
3,"[mode, doppler]","[mode, doppler]"
4,[echocardiogram],[echocardiogram]
5,"[morbid, obesity, laparoscopic, antecolic, ant...","[morbid, obesity, laparoscopic, antecolic, ant..."
6,"[liposuction, of, the, supraumbilical, abdomen...","[liposuction, supraumbilical, abdomen, revisio..."
7,[echocardiogram],[echocardiogram]
8,"[suction, assisted, lipectomy, lipodystrophy, ...","[suction, assisted, lipectomy, lipodystrophy, ..."
9,"[echocardiogram, and, doppler]","[echocardiogram, doppler]"


**STEP 14: Text Normalization**

Normalization is helpful in reducing the number of unique tokens present in the text, removing the variations in a text. and also cleaning the text by removing redundant information.

Two popular methods used for normalization are stemming and lemmatization. Let’s discuss them in detail. Nevertheless, for this purpose, I will be creating two columns, one for stemming and another one for lemmatization

**A. Stemming**

In [42]:
# Importing library for stemming
from nltk.stem import PorterStemmer
# Initialize the stemming object
stemming = PorterStemmer()

In [43]:
# Created one more columns tweet_stemmed it shows tweets' stemmed version
NER_df['NER_stemmed'] = NER_df['NER_token_filtered'].apply(lambda x: ' '.join([stemming.stem(i) for i in x]))
NER_df['NER_stemmed'].head(10)

0       year old white femal present complaint allergi
1                    consult laparoscop gastric bypass
2                    consult laparoscop gastric bypass
3                                         mode doppler
4                                       echocardiogram
5    morbid obes laparoscop antecol antegastr roux ...
6    liposuct supraumbil abdomen revis right breast...
7                                       echocardiogram
8    suction assist lipectomi lipodystrophi abdomen...
9                               echocardiogram doppler
Name: NER_stemmed, dtype: object

**B. Lemmatization**

In [44]:
# Importing library for lemmatizing
import nltk
nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer
lemmatizing = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


Created one more columns tweet_lemmatized it shows tweets' lemmatized version

In [45]:
NER_df['NER_lemmatized'] = NER_df['NER_token_filtered'].apply(lambda x: ' '.join([lemmatizing.lemmatize(i) for i in x]))
NER_df['NER_lemmatized'].head(10)

0      year old white female present complaint allergy
1                  consult laparoscopic gastric bypass
2                  consult laparoscopic gastric bypass
3                                         mode doppler
4                                       echocardiogram
5    morbid obesity laparoscopic antecolic antegast...
6    liposuction supraumbilical abdomen revision ri...
7                                       echocardiogram
8    suction assisted lipectomy lipodystrophy abdom...
9                               echocardiogram doppler
Name: NER_lemmatized, dtype: object

In [46]:
NER_df.head(3)

Unnamed: 0,description,transcription,keywords,clean_desc,NER_token,NER_token_filtered,NER_stemmed,NER_lemmatized
0,A 23-year-old white female presents with comp...,"SUBJECTIVE:, This 23-year-old white female pr...","allergy / immunology, allergic rhinitis, aller...",year old white female presents with complaint ...,"[year, old, white, female, presents, with, com...","[year, old, white, female, presents, complaint...",year old white femal present complaint allergi,year old white female present complaint allergy
1,Consult for laparoscopic gastric bypass.,"PAST MEDICAL HISTORY:, He has difficulty climb...","bariatrics, laparoscopic gastric bypass, weigh...",consult for laparoscopic gastric bypass,"[consult, for, laparoscopic, gastric, bypass]","[consult, laparoscopic, gastric, bypass]",consult laparoscop gastric bypass,consult laparoscopic gastric bypass
2,Consult for laparoscopic gastric bypass.,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...","bariatrics, laparoscopic gastric bypass, heart...",consult for laparoscopic gastric bypass,"[consult, for, laparoscopic, gastric, bypass]","[consult, laparoscopic, gastric, bypass]",consult laparoscop gastric bypass,consult laparoscopic gastric bypass


In [47]:
# !python -m spacy download en_core_web_sm

# **STEP 15: Parts-of-speech tagging (POS Tagging)**

In [48]:
import spacy
import en_core_web_sm
nlp = en_core_web_sm.load()

In [49]:
test_data1= NER_df['clean_desc'].to_string()

In [50]:
doc = nlp(test_data1)
fmt_str = "{:<8}| {:<6}| {:<8}| {:<8}"
print(fmt_str.format("token","pos", "label","parent"))
for token in doc:
    print(fmt_str.format(token.text, token.pos_, token.ent_type_, token.head.text))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m

       | SPACE |         | ...     
4264    | NUM   | DATE    | consultation
        | SPACE |         | 4264    
neurologic| ADJ   |         | consultation
consultation| NOUN  |         | requested
was     | AUX   |         | requested
requested| VERB  |         | requested
to      | ADP   |         | requested
evalu   | VERB  |         | to      
...     | PUNCT |         | requested

       | SPACE |         | ...     
4268    | NUM   | DATE    | 4268    
        | SPACE |         | 4268    
the     | DET   |         | patient 
patient | NOUN  |         | admitted
is      | AUX   |         | admitted
admitted| VERB  |         | admitted
with    | ADP   |         | admitted
diagnosis| NOUN  |         | with    
of      | ADP   |         | diagnosis
acut    | NOUN  |         | of      
...     | PUNCT |         | admitted

       | SPACE |         | ...     
4269    | NUM   | DATE    | patient 
        | SPACE |        

# **STEP 16: Biomedical Name Entity Recognition (BIO-NER)**

For this task, we will apply Med7 model which is developed in the context of biomedical name entity recognition in terms of extracting significant information from the large number of medical dataset. The Med7 trained model comprises three components in its pipeline: <br>


*   tagger
*   parser

*   clinical NER with seven categories

In [51]:
import spacy
import en_core_web_sm
nlp = en_core_web_sm.load()

**Test using dummy text**

In [52]:
def medicalNER_Exp2():
    
    col_dict={}
    seven_colours=['#e6194B', '#3cb44b', '#ffe119', '#ffd8b1', '#f58231', '#f032e6', '#42d4f4']
    for label, colour in zip(nlp.pipe_labels['ner'], seven_colours):
        col_dict[label]= colour
    options= {'ents': nlp.pipe_labels['ner'], 'colors':col_dict}  
    
    text='A patient was prescribed Magnesium hydroxide 400mg/5ml suspension PO of total 30ml bid for the next 5 days.'
    doc = nlp(text)
    spacy.displacy.render(doc, style='ent', jupyter=True, options=options)
    
    return [(ent.text, ent.label_) for ent in doc.ents]

In [53]:
medicalNER_Exp2()

[('400mg/5ml', 'CARDINAL'), ('30ml', 'ORDINAL'), ('the next 5 days', 'DATE')]

The Med7 model identifies correctly all seven entities in the following example and highlights them in different colours for better visualisation:

**Test with original dataset**

First, we have to convert the dataframe into the from of string owing to the fact that it is quite difficult to feed the dataframe directly to the Med7 model instead of converting to the string format that I have experimented. Let's do that.

In [54]:
test_data2= NER_df['clean_desc'].to_string()

In [55]:
type(test_data2)

str

Observation: We have successfully converted our dataframe into the form of string format. Now let's apply the Med7 which is RoBERTa-base implementation. The authors of this model stated that their future works will improve its performance and introduce new feautres. Some entities may not be identified correctrly.

In [56]:
col_dict={}
seven_colours=['#e6194B', '#3cb44b', '#ffe119', '#ffd8b1', '#f58231', '#f032e6', '#42d4f4']
for label, colour in zip(nlp.pipe_labels['ner'], seven_colours):
  
  col_dict[label]= colour
  options= {'ents': nlp.pipe_labels['ner'], 'colors':col_dict}  
    
  #text='Objectives: The United Kingdom Parkinsons Disease Research Group (UKPDRG) trial found an increased mortality in patients with Parkinsons disease (PD) randomized to receive 10 mg selegiline per day and L-dopa compared with those taking L-dopa alone.'
  doc = nlp(test_data2)
  spacy.displacy.render(doc, style='ent', jupyter=True, options=options)

Output hidden; open in https://colab.research.google.com to view.

**Observation: Look, we have applied the Med7 model to our dataset and it has roughly identified the entities. Some of them may not be correct and we have to measure the performance more, for this purpose, we need to experiment with a vast amount of dataset**

# **STEP 17: Abbreviation detection**

# **STEP 18: Negation with negspaCy**

# **STEP 19: Keywords Extraction**

In [67]:
NER_df.head(3)

Unnamed: 0,description,transcription,keywords,clean_desc,NER_token,NER_token_filtered,NER_stemmed,NER_lemmatized
0,A 23-year-old white female presents with comp...,"SUBJECTIVE:, This 23-year-old white female pr...","allergy / immunology, allergic rhinitis, aller...",year old white female presents with complaint ...,"[year, old, white, female, presents, with, com...","[year, old, white, female, presents, complaint...",year old white femal present complaint allergi,year old white female present complaint allergy
1,Consult for laparoscopic gastric bypass.,"PAST MEDICAL HISTORY:, He has difficulty climb...","bariatrics, laparoscopic gastric bypass, weigh...",consult for laparoscopic gastric bypass,"[consult, for, laparoscopic, gastric, bypass]","[consult, laparoscopic, gastric, bypass]",consult laparoscop gastric bypass,consult laparoscopic gastric bypass
2,Consult for laparoscopic gastric bypass.,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...","bariatrics, laparoscopic gastric bypass, heart...",consult for laparoscopic gastric bypass,"[consult, for, laparoscopic, gastric, bypass]","[consult, laparoscopic, gastric, bypass]",consult laparoscop gastric bypass,consult laparoscopic gastric bypass


In [57]:
pip install keybert

Collecting keybert
  Downloading keybert-0.5.1.tar.gz (19 kB)
Collecting sentence-transformers>=0.3.8
  Downloading sentence-transformers-2.2.0.tar.gz (79 kB)
[K     |████████████████████████████████| 79 kB 3.3 MB/s 
Collecting rich>=10.4.0
  Downloading rich-12.3.0-py3-none-any.whl (232 kB)
[K     |████████████████████████████████| 232 kB 10.4 MB/s 
Collecting commonmark<0.10.0,>=0.9.0
  Downloading commonmark-0.9.1-py2.py3-none-any.whl (51 kB)
[K     |████████████████████████████████| 51 kB 6.1 MB/s 
Collecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 40.4 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 39.4 MB/s 
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB

In [58]:
from keybert import KeyBERT

In [59]:
test_data3= NER_df['clean_desc'].to_string()

In [60]:
kw_model= KeyBERT()
keywords= kw_model.extract_keywords(test_data3)
keywords

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

[('laparoscopic', 0.3729),
 ('tracheostomy', 0.3698),
 ('postoperative', 0.3672),
 ('laparotomy', 0.3635),
 ('metastasectomy', 0.3621)]

keyphrase_ngram_range to set the length of the resulting keywords/keyphrases

In [61]:
kw_model.extract_keywords(test_data3, keyphrase_ngram_range=(1,1), stop_words=None)

[('laparoscopic', 0.3729),
 ('tracheostomy', 0.3698),
 ('postoperative', 0.3672),
 ('laparotomy', 0.3635),
 ('metastasectomy', 0.3621)]

To extract keyphrases, simply set keyphrase_ngram_range to (1, 2) or higher depending on the number of words

In [62]:
kw_model.extract_keywords(test_data3, keyphrase_ngram_range=(1,2), stop_words=None)

[('bariatric surgery', 0.4977),
 ('tracheostomy urgent', 0.4891),
 ('diagnostic operative', 0.4772),
 ('laparoscopic roux', 0.472),
 ('laparoscopic supracervical', 0.4644)]

We can highlight the keywords in the document by simply setting hightlight:

In [63]:
keywords = kw_model.extract_keywords(test_data3, highlight=True)

**Max sum similarity**

In [64]:
kw_model.extract_keywords(test_data3, keyphrase_ngram_range=(3, 3), stop_words='english',
                              use_maxsum=True, nr_candidates=20, top_n=5)

[('218 transurethral resection', 0.4689),
 ('620 laparoscopic supracervical', 0.4705),
 ('post gastric bypass', 0.4726),
 ('exploration tracheostomy urgent', 0.473),
 ('patient presented bariatric', 0.4839)]

**Maximal Marginal Relevance**

In [65]:
kw_model.extract_keywords(test_data3, keyphrase_ngram_range=(3, 3), stop_words='english',
                              use_mmr=True, diversity=0.7)

[('evaluation bariatric surgery', 0.5359),
 ('ph 2481 normal', 0.1078),
 ('cervical epidural steroid', 0.2255),
 ('creation autologous right', 0.0712),
 ('bypass mode doppler', 0.1128)]

In [66]:
kw_model.extract_keywords(test_data3, keyphrase_ngram_range=(3, 3), stop_words='english',use_mmr=True, diversity=0.2)

[('evaluation bariatric surgery', 0.5359),
 ('exploration tracheostomy urgent', 0.473),
 ('2590 laparoscopic supracervical', 0.4855),
 ('obesity laparoscopic roux', 0.505),
 ('gastric bypass consult', 0.4998)]