In [143]:
import warnings
warnings.filterwarnings('ignore',category=DeprecationWarning)

import pandas as pd
import numpy as np

import gensim
from gensim import corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import pyLDAvis
import pyLDAvis.gensim_models
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

import spacy
from nltk.corpus import stopwords
from nltk import word_tokenize
import itertools
import string

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF 
from sklearn.manifold import TSNE
from sklearn.decomposition import LatentDirichletAllocation

import pickle

In [144]:
df = pd.read_csv('data/cleaned_topics_reduced.csv').drop(['cleaned','cleaned_lemma'],axis=1)
df

Unnamed: 0,medical_specialty,transcription
0,Cardiovascular / Pulmonary,"2-D M-MODE: , ,1. Left atrial enlargement wit..."
1,Cardiovascular / Pulmonary,1. The left ventricular cavity size and wall ...
2,Cardiovascular / Pulmonary,"2-D ECHOCARDIOGRAM,Multiple views of the heart..."
3,Cardiovascular / Pulmonary,"DESCRIPTION:,1. Normal cardiac chambers size...."
4,Cardiovascular / Pulmonary,"2-D STUDY,1. Mild aortic stenosis, widely calc..."
...,...,...
3709,Cardiovascular / Pulmonary,"INDICATION: , Chest pain.,TYPE OF TEST: , Aden..."
3710,Cardiovascular / Pulmonary,"CHIEF COMPLAINT: , Chest pain.,HISTORY OF PRES..."
3711,Cardiovascular / Pulmonary,"HISTORY OF PRESENT ILLNESS: , The patient is a..."
3712,Cardiovascular / Pulmonary,"HISTORY OF PRESENT ILLNESS: , Mr. ABC is a 60-..."


In [145]:
useless_words = ['patient','abc','use','place','history','procedure','right','left','perform','normal','well']

In [146]:
stop_words = stopwords.words('english')
stop_words.extend(list(string.punctuation))
stop_words.extend(useless_words)
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [213]:
def clean(doc):
    cleaned = [x.lower() for x in simple_preprocess(doc) if 
                ((x.lower() not in stop_words) and (x.isalpha()) and (len(x) > 1))]
    return cleaned

def lemmatization(docs,pos_tags):
    en_core = spacy.load('en_core_web_sm')
    final_docs = []
    for doc in docs:
        doc = en_core(doc)
        new_doc = []
        for token in doc:
            if token.lemma_ in ['use', 'place', 'note']:
                continue
            if token.pos_ in pos_tags:
                new_doc.append(token.lemma_)
            else:
                new_doc.append(token.text)
        joined = ' '.join(new_doc)
        final_docs.append(joined)
    return final_docs

In [214]:
def preprocess(doc):
    norm_doc = clean(doc)
    norm_doc = lemmatization(norm_doc,pos_tags)
    return norm_doc

In [215]:
pos_tags = ['ADJ','ADV','NOUN','VERB','PRON']

In [216]:
df['second_gensim_clean'] = df['transcription'].apply(preprocess)

In [217]:
dictionary = gensim.corpora.Dictionary(df['second_gensim_clean'])
len(dictionary)

15376

In [218]:
df

Unnamed: 0,medical_specialty,transcription,second_gensim_clean
0,Cardiovascular / Pulmonary,"2-D M-MODE: , ,1. Left atrial enlargement wit...","[mode, atrial, enlargement, atrial, diameter, ..."
1,Cardiovascular / Pulmonary,1. The left ventricular cavity size and wall ...,"[ventricular, cavity, size, wall, thickness, a..."
2,Cardiovascular / Pulmonary,"2-D ECHOCARDIOGRAM,Multiple views of the heart...","[echocardiogram, multiple, view, heart, great,..."
3,Cardiovascular / Pulmonary,"DESCRIPTION:,1. Normal cardiac chambers size....","[description, cardiac, chamber, size, ventricu..."
4,Cardiovascular / Pulmonary,"2-D STUDY,1. Mild aortic stenosis, widely calc...","[study, mild, aortic, stenosis, widely, calcif..."
...,...,...,...
3709,Cardiovascular / Pulmonary,"INDICATION: , Chest pain.,TYPE OF TEST: , Aden...","[indication, chest, pain, type, test, adenosin..."
3710,Cardiovascular / Pulmonary,"CHIEF COMPLAINT: , Chest pain.,HISTORY OF PRES...","[chief, complaint, chest, pain, present, illne..."
3711,Cardiovascular / Pulmonary,"HISTORY OF PRESENT ILLNESS: , The patient is a...","[present, illness, year, old, woman, follow, a..."
3712,Cardiovascular / Pulmonary,"HISTORY OF PRESENT ILLNESS: , Mr. ABC is a 60-...","[present, illness, mr, year, old, gentleman, m..."


In [221]:
word_count = {}
for doc in df['second_gensim_clean']:
    for word in doc:
        if word not in word_count:
            word_count[word]=1
        if word in word_count:
            word_count[word]+=1
word_count = dict(sorted(word_count.items(), key=lambda x: x[1], reverse=True))
word_count = list(word_count.items())[:25]
keys = [item[0] for item in word_count]
values = [item[1] for item in word_count]
word_count

[('', 16509),
 ('pain', 4244),
 ('diagnosis', 3741),
 ('time', 3682),
 ('year', 3503),
 ('remove', 3448),
 ('take', 3385),
 ('perform', 3343),
 ('incision', 3265),
 ('also', 3106),
 ('artery', 3102),
 ('anesthesia', 3052),
 ('mg', 2957),
 ('skin', 2934),
 ('blood', 2933),
 ('suture', 2831),
 ('without', 2714),
 ('room', 2707),
 ('position', 2614),
 ('present', 2535),
 ('see', 2526),
 ('make', 2510),
 ('old', 2500),
 ('anterior', 2426),
 ('general', 2418)]

In [222]:
for doc in df['second_gensim_clean']:
    for word in doc:
        if word == '':
            doc.remove(word)

word_count = {}
for doc in df['second_gensim_clean']:
    for word in doc:
        if word not in word_count:
            word_count[word]=1
        if word in word_count:
            word_count[word]+=1
word_count = dict(sorted(word_count.items(), key=lambda x: x[1], reverse=True))
word_count = list(word_count.items())[:25]
keys = [item[0] for item in word_count]
values = [item[1] for item in word_count]
word_count

[('pain', 4244),
 ('diagnosis', 3741),
 ('time', 3682),
 ('year', 3503),
 ('remove', 3448),
 ('take', 3385),
 ('perform', 3343),
 ('incision', 3265),
 ('also', 3106),
 ('artery', 3102),
 ('anesthesia', 3052),
 ('mg', 2957),
 ('skin', 2934),
 ('blood', 2933),
 ('suture', 2831),
 ('without', 2714),
 ('room', 2707),
 ('position', 2614),
 ('present', 2535),
 ('see', 2526),
 ('make', 2510),
 ('old', 2500),
 ('anterior', 2426),
 ('general', 2418),
 ('follow', 2393)]

In [223]:
with open('data/second_gensim.pickle', 'wb') as f:
    pickle.dump(df, f)