In [3]:
import numpy as np
import pandas as pd
import nltk
import re
import os
import codecs
from sklearn import feature_extraction
import mpld3

In [4]:
import pickle

In [6]:
from sklearn.externals import joblib

docbios = joblib.load('docbios.pkl')
docnames = joblib.load('docnames.pkl')

In [7]:
for d in docnames[0:10]:
    print d

Jagga Alluri, MD
Michael Amoashiy, MD
Brian Anziska, MD
Yasemin Baldik, MD
Ramani Balu, MD
Ron Ben-Meir, DO
Monica Bilboul, PhD
Ivan Bodis-Wollner, MD
Yuri Brosgol, MD
Alan Carver, MD


In [8]:
print docbios[0]

Dr. Jagga Alluri specializes in neurology and clinical neurophysiology in Forest Hills, New York. Dr. Alluri has 20 years experience as an MD, graduated from Rangaraya Medical College, Kakinda, Neurology Residency at Newyork University Medical Center, New York City, and fellowship in Neuromuscular physiology at NYU Medical Center, Hospital for Joint Diseases, NY. Dr. Alluri have extensive experience in the diagnosis and treatment of various neurological disorders like Alzheimer's, seizures, neck pain, memory loss, tumours, Parkinson's disease, migraine, vertigo, Sleep disorders, Strokes etc. His ability to provide onsite ancillary neurological testing allows early diagnosis. Individualized follow up care with thorough discussions on the progress are designed to establish realistic expectations. Dr. Alluri works closely with referring and/or primary care physicians and other specialists to create a plan of care that takes into consideration all aspects of medical problems including pers

In [9]:
print len(docbios)

2691


In [10]:
print len(docnames)

2691


### removing null descriptions

In [64]:
docnames.index('Brian Anziska, MD')

2

In [41]:
docbios = np.array(docbios)

emptybios = []
emptybios.append(np.where(docbios==''))
print len(emptybios)

emptybios = np.array(emptybios[0][0])

print len(docnames)
print len(docbios)

print len(emptybios)

print len(docnames)-len(emptybios)

new_docnames = np.delete(docnames, emptybios)
new_docbios = np.delete(docbios, emptybios)

print len(docnames)
print len(docbios)

docnames = new_docnames
docbios = new_docbios

len(np.unique(docnames))

2505-1899

In [78]:
from collections import defaultdict

docdict = defaultdict(list)

for name, bio in zip(docnames, docbios):
    docdict[name] = bio

In [116]:
print docdict['Ludmila Davidov, MD']
print
print docdict['Igor Gavrilovic, MD']

Dr. Ludmila Davidov, MD is one of the country's most highly ranked doctors. Her specialties include psychiatry and she currently treats patients in Flushing, New York, Rego park, New York, and Whitestone, New York.  Dr. Davidov completed medical school at Tajik State Medical University Named After Abuali Ibn Sino and is licensed to see patients in New York.  Based on an in-depth analysis of Dr. Davidov's credentials, experience and network, she has been found to be among the 20% of doctors nationwide.  Dr. Davidov has been found to hold one or more active medical licenses, and successfully passed a malpractice history screening.

I am a neurologist with specialty training in neuro-oncology who sees patients at Memorial Sloan-Kettering facilities in both New York City and Basking Ridge, New Jersey. I treat patients with primary brain tumors and patients with neurological complications of systemic cancer. As a neuro-oncologist, I provide continuity of care for those patients with primary

In [113]:
len(docdict)

1899

In [None]:
#joblib.dump(docdict,'docdict.pkl')

In [112]:
2691-2051

640

### stopwords, stemming and tokenizing

In [99]:
# load nltk's English stopwords as variable called 'stopwords'
stopwords = nltk.corpus.stopwords.words('english')

In [100]:
print stopwords[:10]

[u'i', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves', u'you', u'your']


In [101]:
# load nltk's SnowballStemmer as variabled 'stemmer'
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

In [102]:
# here he defines a tokenizer and stemmer which returns the set of stems in the text that it is passed

def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems


def tokenize_only(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens

In [103]:
#not super pythonic, no, not at all.
#use extend so it's a big flat list of vocab
totalvocab_stemmed = []
totalvocab_tokenized = []
for i in docdict.values():
    allwords_stemmed = tokenize_and_stem(i) #for each item in 'synopses', tokenize/stem
    totalvocab_stemmed.extend(allwords_stemmed) #extend the 'totalvocab_stemmed' list
    
    allwords_tokenized = tokenize_only(i)
    totalvocab_tokenized.extend(allwords_tokenized)

In [104]:
vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed)
print 'there are ' + str(vocab_frame.shape[0]) + ' items in vocab_frame'

there are 135761 items in vocab_frame


In [105]:
df = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_tokenized)
print 'there are ' + str(vocab_frame.shape[0]) + ' items in vocab_frame'

there are 135761 items in vocab_frame


In [264]:
vocab_frame.ix['malpractice'].values.tolist()[0]

KeyError: 'malpractice'

In [169]:
vocab_frame.head()

Unnamed: 0,words
dr.,dr.
ludmila,ludmila
davidov,davidov
md,md
is,is


In [106]:
print vocab_frame.head()

           words
dr.          dr.
ludmila  ludmila
davidov  davidov
md            md
is            is


### Tf-idf and document similarity

In [214]:
from sklearn.feature_extraction.text import TfidfVectorizer

#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000, stop_words='english',
                                   use_idf=True,
                                   tokenizer=tokenize_only, ngram_range=(2,3))

%time tfidf_matrix = tfidf_vectorizer.fit_transform(docdict.values()) #fit the vectorizer to synopses

print(tfidf_matrix.shape)

CPU times: user 6.18 s, sys: 146 ms, total: 6.32 s
Wall time: 6.47 s
(1899, 53839)


In [143]:
terms = tfidf_vectorizer.get_feature_names()

In [171]:
terms[0:10]

[u"'s",
 u"'s adult",
 u"'s adult long-term",
 u"'s advisory",
 u"'s advisory committee",
 u"'s affairs",
 u"'s affairs medical",
 u"'s areas",
 u"'s areas expertise",
 u"'s best"]

In [222]:
'' in terms_np:

In [144]:
print len(terms)

60113


In [145]:
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(tfidf_matrix)

In [146]:
print dist.shape

(1899, 1899)


### K-means clustering

In [181]:
print(tfidf_matrix.shape)

(1899, 60113)


In [147]:
from sklearn.cluster import KMeans

n = 5
km = KMeans(n_clusters=n)

%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

CPU times: user 2.24 s, sys: 288 ms, total: 2.53 s
Wall time: 2.77 s


In [148]:
#from sklearn.externals import joblib

joblib.dump(km,  'doc_cluster.pkl')

# km = joblib.load('doc_cluster.pkl')
clusters = km.labels_.tolist()

In [17]:
#joblib.dump(docbios, 'docbios.pkl')
#joblib.dump(docnames, 'docnames.pkl')

In [32]:
#!pip install textstat
#import textstat

### hacky attempt at reading level

In [119]:
sentlen = []
wordlen = []
fkgl  = []

In [120]:
sentlen = [nltk.sent_tokenize(d) for d in docdict.values()]

In [121]:
wordlen = [[nltk.word_tokenize(t) for t in bio] for bio in sentlen]

In [122]:
counts = []

In [123]:
for dr in wordlen:
    sentence_cnt = len(dr)
    word_cnt = 0
    letter_cnt = 0
    for sentence in dr:
        word_cnt += len(sentence)
        for word in sentence:
            letter_cnt += len(word)
    counts.append((sentence_cnt, word_cnt, letter_cnt))

In [124]:
from __future__ import division

In [125]:
counts[0]

(5, 119, 532)

In [126]:
def catch(func, handle=lambda e : e, *args, **kwargs):
    try:
        return func(*args, **kwargs)
    except ZeroDivisionError:
        return np.nan

In [127]:
fkgl = [catch(lambda: 0.39 * (dr[1]/dr[0]) + 11.8 * (dr[2]/dr[1])/2.83 - 15.59) \
        for dr in counts]

In [128]:
fkgl.count(np.nan)

0

In [129]:
len(fkgl)

1899

In [130]:
fkgl[0]

12.332615256703388

In [131]:
max(fkgl)

47.35116607773851

In [132]:
min(fkgl)

5.826548881036516

### looking at the clusters

In [133]:
clusters[0]

2

In [154]:
print len(docdict.keys())
print len(docdict.values())
print len(fkgl)
print len(clusters)

1899
1899
1899
1899


In [155]:
docs = { 'name': docdict.keys(), 'bio': docdict.values(), 'flesch_kincaid': fkgl, 'cluster': clusters }

In [156]:
len(df)

135761

In [157]:
docsdf = pd.DataFrame(docs, index = [clusters] , columns = ['name', 'flesch_kincaid', 'cluster'])

In [158]:
docsdf['cluster'].value_counts()

1    533
2    407
0    396
4    388
3    175
dtype: int64

In [159]:
grouped = docsdf['flesch_kincaid'].groupby(docsdf['cluster']) #groupby cluster for aggregation purposes

grouped.mean() #average flesch_kincaid per cluster

cluster
0    12.130809
1    12.662505
2    12.930212
3    23.717654
4    12.568479
Name: flesch_kincaid, dtype: float64

In [160]:
grouped.median()

cluster
0    11.799146
1    11.820971
2    12.710589
3    23.112517
4    12.443783
Name: flesch_kincaid, dtype: float64

In [161]:
num_clusters = len(pd.unique(docsdf['cluster']))

In [251]:
terms[7048].split(" ")

[u'check', u'clear']

In [253]:
vocab_frame.ix[terms[7048].split(" ")].values.tolist()[0][0]

u'check'

In [269]:
terms[33679].encode('utf-8', 'ignore')

'malpractice history holds'

In [252]:
terms[33679].split(" ")

[u'malpractice', u'history', u'holds']

In [255]:
vocab_frame.ix[terms[33679].split(" ")]

Unnamed: 0,words
malpractice,
history,
holds,


In [275]:
from __future__ import print_function

print("Top terms per cluster:")
print()
#sort cluster centers by proximity to centroid
order_centroids = km.cluster_centers_.argsort()[:, ::-1] 

for i in range(num_clusters):
    print("Cluster %d words:" % i, end='')
    
    for ind in order_centroids[i, :6]: #replace 6 with n words per cluster
        try:
            print(' %s' % terms[ind].encode('utf-8', 'ignore'), end=',')
            print()
        except AttributeError as e:
            print("AttributeError")
    print() #add whitespace
    print() #add whitespace
    
    print("Cluster %d titles:" % i, end='')
    for title in docsdf.ix[i]['name'].values.tolist():
        print(' %s,' % title, end='')
    print() #add whitespace
    print() #add whitespace
    
print()
print()

Top terms per cluster:

Cluster 0 words: check clear,
 malpractice history holds,
 background check clear,
 history holds,
 holds active,
 holds active medical,


Cluster 0 titles: Paula Marcus, MD, Eve Faber, MD, Antonio Giancotti, MD, Hersha Diaz, PSYD, Griselda Bartha, MD, Grace Hennessy, MD, Kevin Cotterell, MD, Gregorio Sungcad, MD, Carlos Saavedra Vele, MD, Vicki Kalira, MD, Irene Shulga, MD, Deborah Petrone, MD, Daniel Rosell, MD, Irina Kiblitsky, MD, Yvanka Pachas, MD, Ralph OConnell, MD, Dinshaw Bamji, MD, June Christmas, MD, Charles Lee, MD, James Bernard, MD, Alan Engelberg, MD, Omar Pena, MD, Sharon Sageman, MD, Hamayun Ahmed, MD, Ho-Chong Chyu, MD, William Weiss, MD, Mehmet Tosyali, MD, Mikhail Nickita, MD, Jana Colton, MD, Gary Weinstein, MD, Jeffrey Schwam, MD, Ramotse Saunders, MD, Joon Chang, MD, Pervez Akhter, MD, Mark Finger, MD, Oksana Cohen, MD, Natasha Wallace, MD, Roger Wolfsohn, MD, M Sublette, MD, Ronald Brenner, MD, Linda Brady, MD, Richard Hess, MD, Richard C

In [281]:
print(docdict['Sharon Lee, MD'])
print()
print(docdict['Ifeanyi Mbbs, MBBS'])

Dr. Sharon Lee, MD MDMPH treats patients in Somerville, Massachusetts, specializing in aerospace medicine, environmental preventive medicine, occupational medicine, preventive medical toxicology, preventive medicine, preventive sports medicine, undersea and hyperbaric medicine, dentist dental public health, and preventive medicine clinical informatics.  Dr. Lee is licensed to treat patients in Massachusetts and New York.  In addition to having active medical licenses, Dr. Lee has been found during an automated background check to be clear of any malpractice history and holds one or more active medical licenses.

Dr. Ifeanyi Mbbs, MBBS MD--specialist in addiction psychiatry, child & adolescent psychiatry, clinical neurophysiology, psychiatry, psychiatry & neurology behavioral neurology & neuropsychiatry, psychiatry & neurology diagnostic neuroimaging, psychiatry & neurology forensic psychiatry, psychiatry & neurology hospice and palliative medicine, psychiatry & neurology psychosomatic 

In [213]:
vocab_frame.ix[terms[2].split(' ')].values.tolist()[360][0]

u'long-term'