In [1]:
import numpy as np
import pandas as pd
import nltk
import re
import os
import codecs
from sklearn import feature_extraction
import mpld3
from pymongo import MongoClient
client = MongoClient()

In [2]:
drs = client.docinfo.doctors

In [3]:
docnames = []
docbios = []

In [4]:
for dr in drs.find({}, {"profile.bio": 1, "profile.first_name":1,
                        "profile.last_name":1, "profile.title":1, "_id": 0}):
    first_name = dr['profile']['first_name']
    last_name = dr['profile']['last_name']
    title = dr['profile']['title']
    try:
        name = first_name + " " + last_name + ", " + title
    except TypeError:
        name = first_name + " " + last_name
    docnames.append(name)
    docbios.append(dr['profile']['bio'].replace(u"\u000A", " "))

In [5]:
for d in docnames[0:10]:
    print d

Jagga Alluri, MD
Michael Amoashiy, MD
Brian Anziska, MD
Yasemin Baldik, MD
Ramani Balu, MD
Ron Ben-Meir, DO
Monica Bilboul, PhD
Ivan Bodis-Wollner, MD
Yuri Brosgol, MD
Alan Carver, MD


In [7]:
print docbios[0]

Dr. Jagga Alluri specializes in neurology and clinical neurophysiology in Forest Hills, New York. Dr. Alluri has 20 years experience as an MD, graduated from Rangaraya Medical College, Kakinda, Neurology Residency at Newyork University Medical Center, New York City, and fellowship in Neuromuscular physiology at NYU Medical Center, Hospital for Joint Diseases, NY. Dr. Alluri have extensive experience in the diagnosis and treatment of various neurological disorders like Alzheimer's, seizures, neck pain, memory loss, tumours, Parkinson's disease, migraine, vertigo, Sleep disorders, Strokes etc. His ability to provide onsite ancillary neurological testing allows early diagnosis. Individualized follow up care with thorough discussions on the progress are designed to establish realistic expectations. Dr. Alluri works closely with referring and/or primary care physicians and other specialists to create a plan of care that takes into consideration all aspects of medical problems including pers

In [8]:
print len(docbios)

2691


In [9]:
print len(docnames)

2691


### stopwords, stemming and tokenizing

In [10]:
# load nltk's English stopwords as variable called 'stopwords'
stopwords = nltk.corpus.stopwords.words('english')

In [11]:
print stopwords[:10]

[u'i', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves', u'you', u'your']


In [12]:
# load nltk's SnowballStemmer as variabled 'stemmer'
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

In [13]:
# here he defines a tokenizer and stemmer which returns the set of stems in the text that it is passed

def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems


def tokenize_only(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens

In [14]:
#not super pythonic, no, not at all.
#use extend so it's a big flat list of vocab
totalvocab_stemmed = []
totalvocab_tokenized = []
for i in docbios:
    allwords_stemmed = tokenize_and_stem(i) #for each item in 'synopses', tokenize/stem
    totalvocab_stemmed.extend(allwords_stemmed) #extend the 'totalvocab_stemmed' list
    
    allwords_tokenized = tokenize_only(i)
    totalvocab_tokenized.extend(allwords_tokenized)

In [15]:
vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed)
print 'there are ' + str(vocab_frame.shape[0]) + ' items in vocab_frame'

there are 191885 items in vocab_frame


In [16]:
df = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_tokenized)
print 'there are ' + str(vocab_frame.shape[0]) + ' items in vocab_frame'

there are 191885 items in vocab_frame


In [17]:
print vocab_frame.head()

               words
dr.              dr.
jagga          jagga
alluri        alluri
special  specializes
in                in


### Tf-idf and document similarity

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000, stop_words='english',
                                   use_idf=True,
                                   tokenizer=tokenize_only, ngram_range=(1,3))

%time tfidf_matrix = tfidf_vectorizer.fit_transform(docbios) #fit the vectorizer to synopses

print(tfidf_matrix.shape)

CPU times: user 9.04 s, sys: 961 ms, total: 10 s
Wall time: 9.79 s
(2691, 60133)


In [25]:
terms = tfidf_vectorizer.get_feature_names()

In [26]:
print len(terms)

60133


In [27]:
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(tfidf_matrix)

In [28]:
print dist.shape

(2691, 2691)


### K-means clustering

In [29]:
from sklearn.cluster import KMeans

n = 5
km = KMeans(n_clusters=n)

%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

CPU times: user 4.32 s, sys: 340 ms, total: 4.66 s
Wall time: 5.55 s


In [30]:
from sklearn.externals import joblib

joblib.dump(km,  'doc_cluster.pkl')

# km = joblib.load('doc_cluster.pkl')
clusters = km.labels_.tolist()

In [30]:
joblib.dump(docbios, 'docbios.pkl')

['docbios.pkl']

In [31]:
joblib.dump(docnames, 'docnames.pkl')

['docnames.pkl']

In [32]:
#!pip install textstat
#import textstat

### hacky attempt at reading level

In [31]:
sentlen = []
wordlen = []
fkgl  = []

In [32]:
sentlen = [nltk.sent_tokenize(d) for d in docbios]

In [33]:
wordlen = [[nltk.word_tokenize(t) for t in bio] for bio in sentlen]

In [34]:
counts = []

In [35]:
for dr in wordlen:
    sentence_cnt = len(dr)
    word_cnt = 0
    letter_cnt = 0
    for sentence in dr:
        word_cnt += len(sentence)
        for word in sentence:
            letter_cnt += len(word)
    counts.append((sentence_cnt, word_cnt, letter_cnt))

In [36]:
from __future__ import division

In [37]:
counts[0]

(6, 174, 914)

In [38]:
def catch(func, handle=lambda e : e, *args, **kwargs):
    try:
        return func(*args, **kwargs)
    except ZeroDivisionError:
        return np.nan

In [39]:
fkgl = [catch(lambda: 0.39 * (dr[1]/dr[0]) + 11.8 * (dr[2]/dr[1])/2.83 - 15.59) \
        for dr in counts]

In [40]:
fkgl.count(np.nan)

186

In [41]:
len(fkgl)

2691

In [42]:
fkgl[0]

17.622441005645587

In [43]:
max(fkgl)

47.35116607773851

In [44]:
min(fkgl)

5.826548881036516

In [45]:
clusters[0]

4

In [46]:
docs = { 'name': docnames, 'bio': docbios, 'flesch_kincaid': fkgl, 'cluster': clusters }

In [47]:
len(df)

191885

In [48]:
docsdf = pd.DataFrame(docs, index = [clusters] , columns = ['name', 'flesch_kincaid', 'cluster'])

In [53]:
docsdf['cluster'].value_counts()

1    768
4    664
0    487
2    397
3    375
dtype: int64

In [54]:
grouped = docsdf['flesch_kincaid'].groupby(docsdf['cluster']) #groupby cluster for aggregation purposes

grouped.mean() #average flesch_kincaid per cluster

cluster
0    24.183680
1    12.136408
2    13.264695
3    12.937860
4    13.332016
Name: flesch_kincaid, dtype: float64

In [55]:
grouped.median()

cluster
0    23.509470
1    11.867986
2    12.914639
3    12.690212
4    12.145712
Name: flesch_kincaid, dtype: float64

In [56]:
num_clusters = len(pd.unique(docsdf['cluster']))

In [59]:
print "Top terms per cluster:"
print
#sort cluster centers by proximity to centroid
order_centroids = km.cluster_centers_.argsort()[:, ::-1] 

for i in range(num_clusters):
    print "Cluster %d words:" % i, end=''
    
    for ind in order_centroids[i, :6]: #replace 6 with n words per cluster
        print ' %s' % vocab_frame.ix[terms[ind].split(' ')].values.tolist()[0][0], end=','
    print #add whitespace
    print #add whitespace
    
    print"Cluster %d doctors:" % i, end=''
    for title in docsdf.ix[i]['name'].values.tolist():
        print(' %s,' % title, end=''
    print() #add whitespace
    print() #add whitespace
    
print()
print()

Top terms per cluster:

Cluster 0 words: nan, nan, nan, nan, nan, nan,

Cluster 0 doctors: Brian Anziska, MD, Ramani Balu, MD, Ron Ben-Meir, DO, Ivan Bodis-Wollner, MD, Kina Cliette, MD, Joan Cracco, MD, Roger Cracco, MD, Howard Crystal, MD, Robert Delgado Jr, MD, Anna Delios, MD, Evan Fertig, MD, Zorica Filipovic Jewell, MD, Eric Fremed, MD, Miriam Friedlander, MD, Miriam Galescu, MD, Radha Giridharan, MD, Zlatin Ivanov, MD, James Jeng, MD, Daniel Labovitz, MD, Betty Lau, MD, Marguerite Lederberg, MD, David Levine, MD, John Lops, DO, Paul MacCabee, MD, Paolo Manfredi, MD, Tresa McSween, MD, Dr Md, MD, Andreas Neophytides, MD, Nikolaos Papamitsakis, MD, Chilvana Patel, MD, Steven Pavlakis, MD, Thomas Perera, MD, Jerome Posner, MD, David Prince, MD, Aaron Rabin, MD, Martin Sadowski, MD, Sophia Sharfstein, MD, Leonid Shkolnik, MD, Beth Silverstein, DO, Seymour Solomon, MD, Mahendra Somasundaram, MD, Jose Torrijos, MD, George Vas, MD, Thomas Wisniewski, MD, Anna Yusim, MD, Brian Anziska, 

In [60]:
docsdf.head()

Unnamed: 0,name,flesch_kincaid,cluster
4,"Jagga Alluri, MD",17.622441,4
4,"Michael Amoashiy, MD",13.747045,4
0,"Brian Anziska, MD",22.949994,0
4,"Yasemin Baldik, MD",,4
0,"Ramani Balu, MD",24.276038,0


In [64]:
docnames.index('Brian Anziska, MD')

2

In [65]:
docbios[2]

u'Dr. Brian Anziska, MD--specialist in child & adolescent psychiatry, clinical neurophysiology, neurology, chiropractic neurology, neurology physical therapy, psychiatry, psychiatry & neurology behavioral neurology & neuropsychiatry, psychiatry & neurology diagnostic neuroimaging, psychiatry & neurology forensic psychiatry, psychiatry & neurology hospice and palliative medicine, psychiatry & neurology psychosomatic medicine, and psychiatry & neurology pain medicine--currently practices medicine at Brooklyn, New York, New york, New York, and New rochelle, New York.  Dr. Anziska is licensed to treat patients in New York.  Dr. Anziska has been found during an automated background check to be clear of any malpractice history and holds one or more active medical licenses.'

In [66]:
docnames[2]

u'Brian Anziska, MD'

In [70]:
count = 0
for bio in docbios:

In [73]:
docbios[3]

u''

In [78]:
count = 0
for bio in docbios:
    if bio == '':
        count += 1
print(count)

186


In [80]:
print (len(docbios))

2691
