In [1]:
import pickle
import pandas as pd
from spacy.en import English, STOPWORDS
import re
import pyLDAvis
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.stem import WordNetLemmatizer 
import numpy as np

In [2]:
from sklearn.externals import joblib

In [12]:
# Load the three per-cluster DataFrames and the name -> bio dict from disk
# (presumably written by an earlier run of the clustering cells -- confirm).
# NOTE(review): joblib.load unpickles arbitrary objects; only load files you trust.
doc_cluster0 = joblib.load('doc_cluster0.pkl')
doc_cluster1 = joblib.load('doc_cluster1.pkl')
doc_cluster2 = joblib.load('doc_cluster2.pkl')
docdict = joblib.load('docdict.pkl')

In [27]:
# Mean Flesch-Kincaid grade level of the bios in each cluster.
# Single-argument print(...) produces the same output under the Python 2 print
# statement and the print function, so this cell keeps working after
# `from __future__ import print_function` has been executed in the kernel.
print(doc_cluster0['flesch_kincaid'].mean())
print(doc_cluster1['flesch_kincaid'].mean())
print(doc_cluster2['flesch_kincaid'].mean())

12.5798756854
12.5911252657
23.7176540296


In [28]:
# Number of doctors (rows) in each cluster.
# Written as print(...) so the cell runs whether or not print_function has
# been imported into the kernel.
print(len(doc_cluster0))
print(len(doc_cluster1))
print(len(doc_cluster2))

1335
389
175


In [4]:
doc_cluster0.head()

Unnamed: 0,name,flesch_kincaid,bio,cluster
0,"Ludmila Davidov, MD",12.332615,"Dr. Ludmila Davidov, MD is one of the country'...",0
0,"Faiq Hameedi, MD",9.976244,"Specializing in psychiatry, Dr. Faiq Hameedi, ...",0
0,"Hersha Diaz, PSYD",11.019929,"Dr. Hersha Diaz, PSYD specializes in psycholog...",0
0,"Paula Marcus, MD",10.896992,"Dr. Paula Marcus, MD specializes in psychiatry...",0
0,"Igor Gavrilovic, MD",19.305265,I am a neurologist with specialty training in ...,0


In [13]:
# Positional index 0..len(docdict)-1, used as the DataFrame index in the next cell.
ind = list(range(len(docdict)))

In [14]:
# One row per doctor: dict key -> 'name', dict value -> 'bio'.
# keys() and values() are taken from the same unmodified dict, so they line up.
# NOTE(review): `df` is rebound to a different frame (without 'bio') further down
# the notebook; cells that use df.bio need this version.
df = pd.DataFrame({'name':docdict.keys(), 'bio':docdict.values()}, index=ind)

In [5]:
def prep_pylda(docs, n_topics = 5):
    """Fit an LDA topic model on ``docs`` and package the result for pyLDAvis.

    Parameters
    ----------
    docs : iterable of str
        One document per element (e.g. a pandas Series of bios).
    n_topics : int, optional
        Number of LDA topics to fit. Defaults to 5.

    Returns
    -------
    The object returned by ``pyLDAvis.prepare``, suitable for
    ``pyLDAvis.display`` / ``pyLDAvis.save_html``.
    """
    # Uni- to tri-gram TF-IDF features; terms in more than half of the documents
    # or in fewer than 5 documents are dropped, and at most 10000 terms are kept.
    # NOTE(review): LDA is usually fit on raw term counts (CountVectorizer);
    # TF-IDF input is kept here so existing results do not change.
    vect = TfidfVectorizer(max_df = 0.5, max_features = 10000,
                                 min_df = 5, stop_words = STOPWORDS,
                                 use_idf = True, tokenizer = None, ngram_range=(1, 3))
    matrix = vect.fit_transform(docs)
    vocab = vect.get_feature_names()

    # Fit LDA and get the per-document topic weights in one pass.
    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                learning_method='online', learning_offset=50.,
                random_state=0, doc_topic_prior = .001)
    doc_topic_dists = lda.fit_transform(matrix)

    def _row_normalise(data):
        """Return ``data`` as a DataFrame whose rows each sum to 1."""
        frame = pd.DataFrame(data)
        return frame.div(frame.sum(axis = 1), axis = 0)

    # Document lengths and term frequencies must be on the same scale, so both
    # are taken from the document-term matrix (row sums / column sums).
    # Character counts (docs.str.len()) are not document lengths in terms.
    doc_lengths = np.asarray(matrix.sum(axis = 1)).ravel().tolist()
    term_frequency = np.asarray(matrix.sum(axis = 0)).ravel().tolist()

    prepared = pyLDAvis.prepare(
            doc_lengths = doc_lengths,
            vocab = vocab,
            term_frequency = term_frequency,
            topic_term_dists = _row_normalise(lda.components_),
            doc_topic_dists = _row_normalise(doc_topic_dists))

    return prepared

In [15]:
prepared = prep_pylda(df.bio, n_topics = 3)

In [23]:
prep0 = prep_pylda(doc_cluster0.bio, n_topics = 2)
prep1 = prep_pylda(doc_cluster1.bio, n_topics = 2)
prep2 = prep_pylda(doc_cluster2.bio, n_topics = 2)

In [31]:
# NOTE(review): the `_5` suffix is misleading -- this model is fit with n_topics=4.
prep0_5 = prep_pylda(doc_cluster0.bio, n_topics=4)



In [33]:
prep1_4 = prep_pylda(doc_cluster1.bio, n_topics = 4)



In [35]:
prep2_4 = prep_pylda(doc_cluster2.bio, n_topics=4)

In [36]:
pyLDAvis.display(prep2_4)

In [34]:
pyLDAvis.display(prep1_4)

In [32]:
pyLDAvis.display(prep0_5)

In [9]:
pyLDAvis.display(prep0)

In [10]:
pyLDAvis.display(prep1)

In [11]:
pyLDAvis.display(prep2)

In [16]:

# display in notebook
pyLDAvis.display(prepared)

In [None]:
# export to html
pyLDAvis.save_html(prepared, 'three_topics_doctors.html')

In [18]:
cluster1_stopwords = joblib.load('doc_cluster1_stop.pkl')

In [22]:
prep1stop = prep_pylda(cluster1_stopwords.bio, n_topics = 3)

In [25]:
pyLDAvis.display(prep1stop)

### attempting gensim

In [None]:
import gensim

In [None]:
# Load a saved gensim dictionary, Matrix Market corpus and LDA model from disk.
# NOTE(review): none of these 'newsgroups' files is created in the visible notebook,
# and dictionary/corpus/lda are not used by any later visible cell -- presumably
# leftovers from a tutorial; confirm before relying on them.
dictionary = gensim.corpora.Dictionary.load('newsgroups.dict')
corpus = gensim.corpora.MmCorpus('newsgroups.mm')
lda = gensim.models.ldamodel.LdaModel.load('newsgroups_50.model')

### stopwords, stemming and tokenizing

In [None]:
import nltk

In [None]:
# load nltk's English stopwords as variable called 'stopwords'
stopwords = nltk.corpus.stopwords.words('english')

In [None]:
# print(...) form runs both before and after print_function is imported in the kernel.
print(len(stopwords))

In [None]:
# print(...) form runs both before and after print_function is imported in the kernel.
print(stopwords[:10])

In [None]:
# load nltk's SnowballStemmer as the variable 'stemmer'
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

In [None]:
# Define a tokenizer-and-stemmer that returns the list of stems in the text it is passed,
# plus a tokenizer-only variant that returns the lower-cased tokens.

def tokenize_and_stem(text):
    """Tokenize ``text``, drop tokens without letters, and stem what remains.

    Returns a list of stems, one per kept token, in document order.
    """
    # Split into sentences first, then words, so punctuation becomes its own token.
    words = []
    for sentence in nltk.sent_tokenize(text):
        words.extend(nltk.word_tokenize(sentence))
    # Keep only tokens containing at least one ASCII letter
    # (drops numeric tokens and raw punctuation).
    alphabetic = [w for w in words if re.search('[a-zA-Z]', w)]
    return [stemmer.stem(w) for w in alphabetic]


def tokenize_only(text):
    """Tokenize ``text`` into lower-cased word tokens, dropping tokens without letters.

    Returns a list of tokens in document order.
    """
    kept = []
    # Split into sentences first, then words, so punctuation becomes its own token.
    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            lowered = word.lower()
            # Skip numeric tokens and raw punctuation (no ASCII letter present).
            if re.search('[a-zA-Z]', lowered):
                kept.append(lowered)
    return kept

In [None]:
# Build two flat, parallel vocab lists over every bio in docdict:
# one of stems and one of plain lower-cased tokens (extend keeps them flat).
totalvocab_stemmed = []
totalvocab_tokenized = []
for bio_text in docdict.values():
    totalvocab_stemmed.extend(tokenize_and_stem(bio_text))
    totalvocab_tokenized.extend(tokenize_only(bio_text))

In [None]:
# Lookup table from stem (index) to the token it came from ('words' column).
vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed)
# print(...) form runs both before and after print_function is imported in the kernel.
print('there are ' + str(vocab_frame.shape[0]) + ' items in vocab_frame')

In [None]:
vocab_frame.head()

In [None]:
# print(...) form runs both before and after print_function is imported in the kernel.
print(vocab_frame.head())

### Tf-idf and document similarity

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Define the vectorizer parameters:
#   - tokens come from tokenize_only (lower-cased, no stemming)
#   - ngram_range=(2,3): features are bigrams and trigrams only, no single words
#   - max_df=0.8: n-grams appearing in more than 80% of documents are dropped
#   - no stop_words argument is passed here
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                   use_idf=True,
                                   tokenizer=tokenize_only, ngram_range=(2,3))

%time tfidf_matrix = tfidf_vectorizer.fit_transform(docdict.values()) #fit the vectorizer to synopses

print(tfidf_matrix.shape)

In [None]:
terms = tfidf_vectorizer.get_feature_names()

In [None]:
terms[0:10]

In [None]:
#terms_np = np.array(terms)

In [None]:
#print (np.nan in terms_np)

In [None]:
#'' in terms_np

In [None]:
print len(terms)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(tfidf_matrix)

In [None]:
print dist.shape

### K-means clustering

In [None]:
print(tfidf_matrix.shape)

In [None]:
from sklearn.cluster import KMeans

# Number of clusters to fit.
n = 5
# Fix the seed so the centroid initialisation -- and therefore the cluster
# labels used downstream -- is reproducible from run to run
# (0 matches the random_state used for LDA in prep_pylda).
km = KMeans(n_clusters=n, random_state=0)

%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

In [None]:
#from sklearn.externals import joblib

# Persist the fitted KMeans model to disk.
joblib.dump(km,  'doc_cluster_nostopwords.pkl')

# NOTE(review): the commented-out load below points at 'doc_cluster.pkl', which is
# not the file written above -- confirm which model is intended before enabling it.
# km = joblib.load('doc_cluster.pkl')
# Cluster label for each document, as a plain Python list.
clusters = km.labels_.tolist()

### hacky attempt at reading level

In [None]:
# Empty placeholders for the reading-level computation in the next cell.
sentlen = []
wordlen = []
fkgl  = []
counts = []

In [None]:
from __future__ import division
# Tokenize every bio into sentences, then each sentence into word tokens.
# (Despite the names, these hold the tokens themselves, not lengths.)
sentlen = [nltk.sent_tokenize(d) for d in docdict.values()]
wordlen = [[nltk.word_tokenize(t) for t in bio] for bio in sentlen]

# Rebuild `counts` from scratch so re-running this cell cannot append a second
# copy of every row and leave it out of step with docdict.
# Each entry is (sentence count, word-token count, total letters in those tokens).
counts = []
for dr in wordlen:
    word_cnt = sum(len(sentence) for sentence in dr)
    letter_cnt = sum(len(word) for sentence in dr for word in sentence)
    counts.append((len(dr), word_cnt, letter_cnt))

# print(...) form runs both before and after print_function is imported in the kernel.
print(counts[0])

def catch(func, handle=lambda e : e, *args, **kwargs):
    """Call ``func(*args, **kwargs)``, mapping a ZeroDivisionError to NaN.

    ``handle`` is accepted for interface compatibility but is not used.
    Any other exception raised by ``func`` propagates unchanged.
    """
    try:
        result = func(*args, **kwargs)
    except ZeroDivisionError:
        result = np.nan
    return result

# Approximate Flesch-Kincaid grade per bio from its (sentences, words, letters) tuple:
#   0.39 * words/sentences + 11.8 * (letters/words)/2.83 - 15.59
# NOTE(review): the usual formula uses syllables per word; dividing letters per
# word by 2.83 looks like a stand-in for that -- confirm the constant.
# Bios with zero sentences or zero words come back as NaN via catch().
# Relies on true division (`from __future__ import division` at the top of this cell).
fkgl = [catch(lambda: 0.39 * (dr[1]/dr[0]) + 11.8 * (dr[2]/dr[1])/2.83 - 15.59) \
        for dr in counts]

In [None]:
fkgl.count(np.nan)

In [None]:
len(fkgl)

In [None]:
fkgl[0]

In [None]:
max(fkgl)

In [None]:
min(fkgl)

### looking at the clusters

In [None]:
clusters[0]

In [None]:
# Sanity check: all four sequences must be the same length before they are
# combined into one DataFrame.
# print(...) form runs both before and after print_function is imported in the kernel.
print(len(docdict.keys()))
print(len(docdict.values()))
print(len(fkgl))
print(len(clusters))

In [None]:
docs = { 'name': docdict.keys(), 'bio': docdict.values(), 'flesch_kincaid': fkgl, 'cluster': clusters }

In [None]:
len(vocab_frame)

In [None]:
# Build the per-doctor summary frame, indexed by cluster label.
# Only the listed columns are kept, so 'bio' from `docs` is not in this frame.
# NOTE(review): this rebinds `df`, which earlier held the name/bio frame passed to
# prep_pylda(df.bio, ...); re-running that earlier call after this cell will fail.
df = pd.DataFrame(docs, index = [clusters] , columns = ['name', 'flesch_kincaid', 'cluster'])

In [None]:
df['cluster'].value_counts()

In [None]:
grouped = df['flesch_kincaid'].groupby(df['cluster']) #groupby cluster for aggregation purposes

grouped.mean() #average flesch_kincaid per cluster

In [None]:
grouped.median()

In [None]:
num_clusters = len(pd.unique(df['cluster']))

In [None]:
terms[7048].split(" ")

In [None]:
#vocab_frame.ix[terms[7048].split(" ")].values.tolist()[0][0]

In [None]:
terms[33679].encode('utf-8', 'ignore')

In [None]:
terms[33679].split(" ")

In [None]:
#vocab_frame.ix[terms[33679].split(" ")]

In [None]:
from __future__ import print_function

print("Top terms per cluster:")
print()
# For each cluster centre, term indices ordered from largest to smallest weight.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]

for i in range(num_clusters):
    print("Cluster %d words:" % i, end='')

    # The loop variable is deliberately not called `ind`: that name holds the
    # DataFrame index list built near the top of the notebook, and overwriting
    # it with an integer would break a re-run of the cell that uses it.
    for term_idx in order_centroids[i, :6]: #replace 6 with n words per cluster
        try:
            print(' %s' % terms[term_idx].encode('utf-8', 'ignore'), end=',')
            print()
        except AttributeError:
            # Raised if the term is not a string-like object with .encode
            print("AttributeError")
    print() #add whitespace
    print() #add whitespace

#     print("Cluster %d titles:" % i, end='')
#     for title in df.ix[i]['name'].values.tolist():
#         print(' %s,' % title, end='')
#     print() #add whitespace
#     print() #add whitespace

print()
print()

In [None]:
print(df.columns)

In [None]:
print(df[df['name']=='Ludmila Davidov, MD'])
print(docdict['Ludmila Davidov, MD'])
print()
print(df[df['name']=='Igor Gavrilovic, MD'])
print(docdict['Igor Gavrilovic, MD'])

In [None]:
print(docdict['Sharon Lee, MD'])
print()
print(docdict['Carol Bernstein, MD'])

In [None]:
# Look up the words of the third n-gram term in the stem-indexed vocab frame.
# .loc is the label-based replacement for the deprecated .ix indexer (removed in
# pandas 1.0); the keys here are strings, so the lookup is label-based either way.
# NOTE(review): the hard-coded row 360 depends on how many rows match -- confirm.
vocab_frame.loc[terms[2].split(' ')].values.tolist()[360][0]