# Analyzing sessions with topic modeling
Develop topic summaries from the original session descriptions using latent Dirichlet allocation (LDA).

In [5]:
import lda
import numpy as np
import pandas as pd
import sklearn.lda
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Read the pickled session table; the preview below shows its 'link',
# 'topic' and 'descriptions' columns.
# NOTE(review): unpickling executes arbitrary code - only load trusted files.
sessions_path = 'data_train/strata_sessions.pkl'
sessions = pd.read_pickle(sessions_path)

# Display the first rows as a sanity check.
sessions.head()

Unnamed: 0,link,topic,descriptions
0,/strata/hadoop-big-data-ny/public/schedule/det...,Parallel SQL and analytics with Solr,Analytics has increasingly become a major focu...
1,/strata/hadoop-big-data-ny/public/schedule/det...,JupyterLab: The evolution of the Jupyter Notebook,Project Jupyter provides building blocks for i...
2,/strata/hadoop-big-data-ny/public/schedule/det...,Designing a location intelligence platform for...,CartoDB has enabled hundreds of thousands of u...
3,/strata/hadoop-big-data-ny/public/schedule/det...,The future of column-oriented data processing ...,"In pursuit of speed and efficiency, big data p..."
4,/strata/hadoop-big-data-ny/public/schedule/det...,Beyond Hadoop at Yahoo: Interactive analytics ...,Yahoo initially built Hadoop as an answer to a...


In [9]:
# Break every session description into sentences; each sentence is treated
# as one document by the topic-model cell that reads `data`.
data = []

for description in sessions['descriptions']:
    # Naive sentence split on '. ' (period followed by a space).
    for sentence in description.split('. '):
        # Byte strings (Python 2 `str`) are decoded to unicode; text that is
        # already unicode is kept as is, since decoding it again would fail
        # on non-ASCII characters.
        if isinstance(sentence, bytes):
            sentence = sentence.decode('utf-8')
        data.append(sentence)

# Derive the session count from the table instead of hard-coding it, and use
# true division so the average is not truncated to an integer.
n_sessions = len(sessions)
avg_sentences = float(len(data)) / n_sessions if n_sessions else 0.0

print('Number of sentences: {}'.format(len(data)))
print('That\'s an average of {:.1f} sentences per session description.'
      .format(avg_sentences))

Number of sentences: 584
That's an average of 3 sentences per session description.


In [12]:
# Approach adapted from http://mike.place/talks/pygotham/#p1
N_TOPICS = 100    # number of topics to fit
RANDOM_SEED = 0   # fixed seed so re-running this cell reproduces the same topics

documents = data

# binary=True records only whether a word occurs in a sentence, not how often;
# English stop words are dropped.
vec = CountVectorizer(stop_words='english', binary=True)

X = vec.fit_transform(documents)

# Vocabulary words ordered by their column index in X, so vocab[j] is the
# word counted in column j. The topic-printing helpers read this global.
vocab = sorted(vec.vocabulary_, key=vec.vocabulary_.get)

# Without random_state the sampler is seeded differently on every run and
# the topic numbering/content changes between executions.
lda_model = lda.LDA(n_topics=N_TOPICS, random_state=RANDOM_SEED)

# `model` is the name the later cells use for the fitted estimator.
model = lda_model.fit(X)

INFO:lda:n_documents: 584
INFO:lda:vocab_size: 3020
INFO:lda:n_words: 8227
INFO:lda:n_topics: 100
INFO:lda:n_iter: 2000
INFO:lda:<0> log likelihood: -112354
INFO:lda:<10> log likelihood: -84502
INFO:lda:<20> log likelihood: -83686
INFO:lda:<30> log likelihood: -82837
INFO:lda:<40> log likelihood: -82626
INFO:lda:<50> log likelihood: -82702
INFO:lda:<60> log likelihood: -82598
INFO:lda:<70> log likelihood: -82115
INFO:lda:<80> log likelihood: -82687
INFO:lda:<90> log likelihood: -82706
INFO:lda:<100> log likelihood: -82425
INFO:lda:<110> log likelihood: -82368
INFO:lda:<120> log likelihood: -82297
INFO:lda:<130> log likelihood: -82506
INFO:lda:<140> log likelihood: -82366
INFO:lda:<150> log likelihood: -82481
INFO:lda:<160> log likelihood: -82717
INFO:lda:<170> log likelihood: -82349
INFO:lda:<180> log likelihood: -82233
INFO:lda:<190> log likelihood: -82095
INFO:lda:<200> log likelihood: -82556
INFO:lda:<210> log likelihood: -82285
INFO:lda:<220> log likelihood: -82244
INFO:lda:<230> l

In [11]:
def topn_indices(arr, n):
    '''Indices of the top n elements in arr, sorted by value (largest first).

    arr: numeric array or plain sequence (lists are accepted).
    n: number of indices to keep; if arr has fewer elements, all are returned.
    Returns an integer numpy array of positions into arr. Equal values keep
    their original relative order.
    '''
    values = np.asarray(arr)
    # Unary minus raises on boolean arrays and wraps around on unsigned
    # integers (0 would be ranked as the largest), so negate those as floats.
    if values.dtype.kind in 'bu':
        values = values.astype(float)
    # A stable sort makes the ranking of tied values deterministic.
    return np.argsort(-values, kind='mergesort')[:n]
def print_lda(lda_model, n_words=10):
    '''Print top n_words words for all topics of an LDA model.

    lda_model: object exposing `topic_word_`, one row per topic.
    n_words: how many words to print for each topic.
    '''
    for topic_idx in range(len(lda_model.topic_word_)):
        # Use the model passed in (not the notebook-global `model`) and
        # forward n_words, which was previously ignored.
        print_topic(lda_model, topic_idx, n_words)
def print_topic(lda_model, i, n_words=10):
    '''Show the n_words highest-valued vocabulary words of topic i on one line.

    Looks words up in the notebook-global `vocab` list and prints them as
    "*i* word1 word2 ...".
    '''
    topic_row = lda_model.topic_word_[i]
    best = topn_indices(topic_row, n_words)
    label = ' '.join(np.array(vocab)[best])
    print('*{}* {}'.format(i, label))

In [13]:
# Print one line per topic of the fitted model: the topic index followed by
# its top words (10 by default).
print_lda(model)

*0* company architectures high manage ted requires value promise hdfs emerging
*1* code interactive visualizations python computing web jupyter choices building output
*2* information quickly scale problem years levels odpi greater ot integrates
*3* open source single organizations project difficult introduces rapid billions architecture
*4* time real challenge stack used case metrics druid custom caravel
*5* data complex science getting project design languages considering exploration mature
*6* models text unstructured making big significant applied deliver meng start
*7* data world enable come 2015 required narrative nature results figures
*8* central customer ll metrics need requirements interoperability dashboards historical general
*9* industry use technology healthcare parallel internal better solr scores enforcement
*10* data permanente gain connected kaiser today explain concept latency enabled
*11* critical language days similar compliance matter spot concerned session 40
*12