## Data Breach Analytics 2005 - 2017 (Part III - Unsupervised Topic Modeling, LDA, LSI)
#### by Miriam Rodriguez 

###  Dataset - databreach.csv

### Topic Modeling with Gensim - Text preprocessing

In [19]:
import csv
import pandas as pd

# import packages for text processing
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
import re

import gensim
from gensim.corpora import Dictionary
from gensim.models import ldamodel
from gensim import corpora, models, similarities

from gensim.models.coherencemodel import CoherenceModel
from gensim.models.wrappers import LdaVowpalWabbit, LdaMallet

import numpy
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings(action= 'ignore')  # To ignore all warnings that arise here to enhance clarity

In [20]:
warnings.filterwarnings(action='ignore')


In [28]:
#import breach data ... open or read the breach data
texts = []
r = csv.reader(open('databreach.csv', 'rb'))
for i in r:
    texts.append(i)  
len(texts)

8177

In [29]:
# Remove useless numbers and alphanumerical words
documents = [re.sub("[^a-zA-Z]+", " ", str(text)) for text in texts]

# tokenize
texts = [[word for word in text.lower().split() ] for text in documents]

# stemming words: having --> have; friends --> friend
lmtzr = WordNetLemmatizer()
texts = [[lmtzr.lemmatize(word) for word in text ] for text in texts]

porter_stemmer = PorterStemmer()
texts = [[porter_stemmer.stem(word) for word in text ] for text in texts]

# remove common words 
stoplist = stopwords.words('english')
texts = [[word for word in text if word not in stoplist] for text in texts]

#remove short words
texts = [[ word for word in tokens if len(word) >= 3 ] for tokens in texts]

In [30]:
# A list of extra stopwords 
extra_stopwords = ['will', 'people', 'need', 'think', 'well','going', 'can', 'country', 'know', 'lot', 'get','make','way','want',
                'like','say','got','said','just','something','tell','put','now', 'bad','back','want','right','every','one','use','come','never', 
                'many','along','things','day','also','first','guy', 'great', 'take', 'good', 'much','anderson', 'let', 'would', 'year', 'thing', 'america',
                'talk', 'talking', 'thank', 'does', 'give', 'look', 'believe', 'tonight','today','see']

extra_stoplist = extra_stopwords
texts = [[word for word in text if word not in extra_stoplist] for text in texts]

In [31]:
# this is text processing required for topic modeling with Gensim

## Create a dictionary representation of the documents.
dictionary = Dictionary(texts)
dictionary.save('databreach.dict')  # store the dictionary, for future reference

len(dictionary)

10721

In [32]:
## Remove rare and common tokens.
# ignore words that appear in less than 2 documents or more than 40% documents (remove too frequent & infrequent words) - an optional step
dictionary.filter_extremes(no_below=2, no_above=0.4) 
len(dictionary)

5091

In [33]:
# convert words to vectors or integers, this is text processing required for topic modeling with Gensim
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('databreach.mm', corpus)  # store to disk, for later use  # store to disk, for later use
len(corpus)

8177

In [34]:
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

Number of unique tokens: 5091
Number of documents: 8177


In [37]:
# later you can retrive the saved dict and corpus
# https://radimrehurek.com/gensim/tut1.html

saved_dict = dictionary.load('databreach.dict')

for i in saved_dict.token2id.iteritems():
    print i

(u'nalleg', 8657)
(u'yellow', 6698)
(u'interchang', 9813)
(u'four', 105)
(u'payoff', 10209)
(u'accret', 547)
(u'lori', 586)
(u'ntulsa', 5239)
(u'hermann', 10084)
(u'lord', 8397)
(u'ncqa', 7143)
(u'digit', 500)
(u'deli', 985)
(u'fradul', 3833)
(u'wjb', 5186)
(u'dell', 8968)
(u'formspr', 3098)
(u'honeywel', 8253)
(u'delt', 8690)
(u'delv', 9384)
(u'oceana', 10651)
(u'broward', 5834)
(u'viewabl', 1582)
(u'kasperski', 7510)
(u'disturb', 8909)
(u'prize', 4142)
(u'cashout', 6295)
(u'carthag', 7985)
(u'voyeur', 9233)
(u'wednesday', 1806)
(u'assail', 7740)
(u'inur', 4172)
(u'bcbsnc', 9148)
(u'terabyt', 4907)
(u'unathor', 10568)
(u'onehitplay', 2978)
(u'charter', 7864)
(u'kuroi', 7422)
(u'miller', 3140)
(u'seper', 8315)
(u'frederick', 7391)
(u'budget', 5479)
(u'second', 672)
(u'nget', 8787)
(u'nsuggest', 1941)
(u'melvin', 9975)
(u'maloney', 7575)
(u'decre', 8514)
(u'launchpoint', 7060)
(u'clorox', 7932)
(u'ongo', 1455)
(u'widget', 4196)
(u'intellectu', 3279)
(u'avert', 8744)
(u'cenabl', 10249)
(

(u'homeport', 2306)
(u'lifestyl', 8677)
(u'abm', 6346)
(u'believ', 39)
(u'airway', 1744)
(u'nfeinstein', 8664)
(u'unreimburs', 5849)
(u'bartholomew', 8948)
(u'pcpa', 9407)
(u'narrest', 9909)
(u'nmention', 10116)
(u'penetr', 2585)
(u'bumper', 9041)
(u'testament', 10425)
(u'resuscit', 8565)
(u'major', 653)
(u'purport', 4170)
(u'guam', 6899)
(u'mockeri', 4910)
(u'number', 14)
(u'precautionari', 2863)
(u'rail', 8184)
(u'guess', 2365)
(u'guest', 1823)
(u'jet', 10068)
(u'arlington', 5415)
(u'evoni', 5465)
(u'swipe', 4647)
(u'nwere', 895)
(u'databreach', 852)
(u'dfr', 4304)
(u'jeb', 6158)
(u'contra', 4123)
(u'saint', 3040)
(u'crow', 8460)
(u'illeg', 1290)
(u'northbound', 9843)
(u'hotl', 3509)
(u'jen', 9404)
(u'relationship', 1525)
(u'ndownload', 10192)
(u'nextbu', 3973)
(u'grace', 4258)
(u'obama', 5639)
(u'reinstal', 9297)
(u'aam', 7149)
(u'depriv', 2982)
(u'aat', 7130)
(u'reus', 3301)
(u'aap', 6673)
(u'torrentfreak', 4774)
(u'reinstat', 6742)
(u'arrang', 2703)
(u'estiamt', 9209)
(u'listen', 

(u'saver', 10551)
(u'nshovel', 8837)
(u'nemerg', 7047)
(u'defraud', 1319)
(u'houstonchronicl', 10193)
(u'jasacar', 8884)
(u'voter', 887)
(u'spew', 4658)
(u'privacyright', 1086)
(u'albuquerqu', 5485)
(u'farber', 9902)
(u'bike', 6959)
(u'hedg', 8244)
(u'worth', 1974)
(u'compassion', 7275)
(u'aurora', 9755)
(u'sheridan', 1985)
(u'kcase', 6140)
(u'aldridg', 8739)
(u'kshb', 8052)
(u'chapel', 9094)
(u'croissant', 3181)
(u'pinch', 9305)
(u'identificaiton', 10684)
(u'upload', 769)
(u'nbeen', 3658)
(u'red', 3800)
(u'nfor', 4238)
(u'nunemploy', 6115)
(u'wholli', 337)
(u'triumph', 9112)
(u'robley', 7200)
(u'jamaica', 8867)
(u'homecar', 6082)
(u'clarifi', 1977)
(u'gasolin', 3299)
(u'envisionrx', 9294)
(u'sunbelt', 5986)
(u'horn', 10718)
(u'consequ', 698)
(u'hors', 1673)
(u'chen', 2656)
(u'panda', 7031)
(u'longstand', 9490)
(u'supervis', 386)
(u'disk', 841)
(u'amus', 8763)
(u'lakeshor', 3564)
(u'near', 2020)
(u'siskiy', 3305)
(u'toll', 1348)
(u'told', 1770)
(u'wilshir', 8045)
(u'umpc', 9575)
(u'mca

(u'hccl', 3038)
(u'medicin', 2624)
(u'engrav', 5533)
(u'metropark', 2619)
(u'intermedi', 7719)
(u'wesleyan', 5311)
(u'nmedicaid', 10535)
(u'glitch', 2244)
(u'purchas', 127)
(u'famili', 411)
(u'asi', 10492)
(u'aggress', 4711)
(u'ash', 9319)
(u'taker', 8842)
(u'injuri', 646)
(u'viru', 149)
(u'taken', 62)
(u'burlington', 7517)
(u'sentara', 3255)
(u'assessor', 10339)
(u'wentworth', 7551)
(u'linden', 2201)
(u'turbo', 2348)
(u'hurri', 1816)
(u'preciou', 9996)
(u'broke', 1893)
(u'alderman', 6665)
(u'grill', 5953)
(u'aspx', 6872)
(u'nwith', 5669)
(u'cquench', 9632)
(u'histori', 413)
(u'kunz', 6735)
(u'ninc', 10078)
(u'nebraska', 2347)
(u'nine', 1263)
(u'landrysinc', 10166)
(u'transcend', 9957)
(u'addreass', 7390)
(u'templat', 3662)
(u'hhc', 9626)
(u'experianidwork', 4984)
(u'anoth', 978)
(u'wfmynew', 9144)
(u'dentrix', 9594)
(u'reject', 2360)
(u'idaho', 4995)
(u'ransom', 1590)
(u'kroll', 3561)
(u'nwithin', 4024)
(u'threat', 1753)
(u'cmug', 10708)
(u'winnebago', 10671)
(u'timesunion', 3416)
(u'

(u'met', 1542)
(u'pedro', 9662)
(u'unpublish', 7180)
(u'scandatafact', 4299)
(u'interpret', 3733)
(u'luther', 8706)
(u'cindi', 9807)
(u'dri', 6970)
(u'jame', 648)
(u'harass', 6918)
(u'sunshin', 6242)
(u'cthey', 4781)
(u'permit', 6076)
(u'serco', 6331)
(u'drc', 8546)
(u'drb', 9333)
(u'drd', 10007)
(u'fell', 2272)
(u'counti', 478)
(u'nineti', 9361)
(u'volunt', 2393)
(u'campaign', 1013)
(u'avenu', 2018)
(u'moral', 6326)
(u'org', 766)
(u'mammogram', 3576)
(u'fiserv', 2406)
(u'reguard', 6429)
(u'landfil', 1217)
(u'poulsbo', 10487)
(u'schneck', 6997)
(u'edward', 8048)
(u'call', 748)
(u'intrust', 3334)
(u'recommend', 419)
(u'partzilla', 6065)
(u'swedishamerican', 6765)
(u'alphabet', 6964)
(u'frenzi', 8728)
(u'kgw', 9448)
(u'warn', 743)
(u'adult', 1626)
(u'accesscatalog', 8022)
(u'wari', 8721)
(u'bujanda', 5171)
(u'ward', 6667)
(u'ware', 9179)
(u'flora', 3859)
(u'room', 1718)
(u'flore', 3890)
(u'nvarieti', 10215)
(u'olen', 9816)
(u'hansel', 4234)
(u'hansen', 10576)
(u'endow', 5563)
(u'defer', 

(u'marriott', 5425)
(u'cabarru', 9042)
(u'allianc', 1458)
(u'road', 5335)
(u'skagit', 10510)
(u'healthitsecur', 1548)
(u'kansan', 7170)
(u'hmsa', 6600)
(u'coupon', 4932)
(u'terminolog', 9660)
(u'mari', 2584)
(u'quietli', 8518)
(u'inbox', 4504)
(u'strip', 1365)
(u'jerusalem', 125)
(u'hutton', 9818)
(u'nmanag', 10503)
(u'complianc', 235)
(u'maass', 6358)
(u'brute', 2738)
(u'compliant', 2104)
(u'amerita', 8131)
(u'newsdetail', 6874)
(u'monadnock', 8213)
(u'alameda', 3168)
(u'mdinr', 6184)
(u'slurper', 2756)
(u'downer', 9320)
(u'daytim', 5094)
(u'enough', 639)
(u'tinley', 6678)
(u'sunris', 9754)
(u'cryptograph', 1357)
(u'woman', 439)
(u'affectedmor', 6206)
(u'possibl', 1025)
(u'mailng', 10590)
(u'vice', 1874)
(u'ember', 7553)
(u'ludlow', 10329)
(u'thedarkoverlord', 8078)
(u'tigta', 5755)
(u'unusu', 1006)
(u'govwin', 10413)
(u'flex', 7826)
(u'neill', 7739)
(u'embed', 1821)
(u'vha', 4003)
(u'proveedor', 9666)
(u'deer', 7855)
(u'casualti', 4183)
(u'fellow', 5363)
(u'grainger', 5023)
(u'deem',

(u'ckimpton', 4276)
(u'protectingourguest', 6556)
(u'immigr', 1788)
(u'wakem', 9059)
(u'blumenth', 5319)
(u'dementia', 5071)
(u'nedlloyd', 8484)
(u'mil', 3070)
(u'colburn', 8535)
(u'min', 3483)
(u'passwordpassword', 10507)
(u'mid', 2035)
(u'npose', 10202)
(u'mix', 3507)
(u'mir', 4404)
(u'micu', 7970)
(u'uppercas', 10508)
(u'analysi', 157)
(u'unless', 2975)
(u'sfx', 6651)
(u'alton', 8240)
(u'salli', 10047)
(u'randstad', 7555)
(u'sfn', 8548)
(u'eight', 460)
(u'nmatter', 5762)
(u'cattl', 7119)
(u'transcript', 2924)
(u'payment', 914)
(u'johnstown', 9467)
(u'zotec', 8871)
(u'wishbon', 4700)
(u'seguro', 204)
(u'request', 420)
(u'nsan', 5238)
(u'expecteda', 2835)
(u'occasion', 4976)
(u'trapwir', 9971)
(u'charley', 9812)
(u'catalina', 1989)
(u'nretir', 3493)
(u'text', 1032)
(u'nnumber', 879)
(u'insync', 3387)
(u'cflash', 10686)
(u'clearanc', 2894)
(u'empir', 4065)
(u'portfolio', 4845)
(u'texa', 935)
(u'urogynecolog', 9378)
(u'staff', 206)
(u'wear', 3028)
(u'cmail', 7789)
(u'bicycl', 5872)
(u'r

(u'serf', 4470)
(u'ubiqu', 3912)
(u'naccess', 1553)
(u'balanc', 695)
(u'posit', 352)
(u'driveway', 1165)
(u'pre', 3551)
(u'rene', 1430)
(u'prc', 3429)
(u'cdigniti', 1997)
(u'anm', 9720)
(u'ann', 7748)
(u'ninvolv', 1239)
(u'pri', 532)
(u'hershey', 9579)
(u'reno', 8150)
(u'mateo', 2183)
(u'cjacket', 10011)
(u'homedepot', 6459)
(u'rent', 3759)
(u'techniqu', 2370)
(u'moreov', 1635)
(u'bridgewat', 5376)
(u'dinclud', 9011)
(u'hingham', 7525)
(u'ideal', 961)
(u'nazareth', 9612)
(u'publicintegr', 10468)
(u'nsell', 7531)
(u'bradi', 6132)
(u'surf', 10499)
(u'surg', 10102)
(u'sure', 4026)
(u'mcpn', 5167)
(u'ukrainian', 6320)
(u'greenplum', 3075)
(u'cigarett', 2491)
(u'lur', 9963)
(u'nvulner', 4022)
(u'fcc', 5580)
(u'connor', 9341)
(u'freshman', 6923)
(u'librarian', 6576)
(u'intermedix', 6047)
(u'icon', 8946)
(u'feinstein', 8660)
(u'disenrol', 8686)
(u'later', 144)
(u'intelenet', 9484)
(u'tweet', 4663)
(u'senior', 2256)
(u'recipi', 544)
(u'unaccompani', 10179)
(u'cheap', 10594)
(u'trespass', 7984)

(u'safekeep', 9054)
(u'providencenightlif', 9686)
(u'buffalo', 7808)
(u'scroll', 6473)
(u'mardi', 8181)
(u'trustco', 7496)
(u'edg', 6783)
(u'edd', 3381)
(u'ganz', 8988)
(u'miner', 8070)
(u'edm', 9153)
(u'alien', 6363)
(u'nembarrass', 8904)
(u'gang', 2495)
(u'edt', 9429)
(u'edu', 3061)
(u'issuer', 4875)
(u'broadridg', 8262)
(u'theorist', 10216)
(u'simon', 9676)
(u'bpgwi', 6494)
(u'gold', 10301)
(u'prequalif', 9982)
(u'susquehanna', 9591)
(u'newenglandcalendar', 8866)
(u'outgo', 7793)
(u'breach', 42)
(u'cybercrim', 4287)
(u'sniper', 6003)
(u'imposs', 3932)
(u'religi', 5553)
(u'carefirst', 7420)
(u'divisadero', 2931)
(u'heard', 4583)
(u'lifega', 6420)
(u'electron', 171)
(u'spoof', 3911)
(u'goldthwait', 7542)
(u'relev', 1333)
(u'weapon', 6247)
(u'kmtv', 8125)
(u'misplac', 739)
(u'happenedth', 4832)
(u'aprehend', 3837)
(u'inacit', 9555)
(u'prep', 6063)
(u'watersid', 9789)
(u'pleas', 297)
(u'cfo', 7897)
(u'cfm', 6165)
(u'edina', 7865)
(u'locatd', 9814)
(u'bellin', 10666)
(u'battl', 3214)
(u'

In [38]:
# you can retrieve the saved corpus

corpus_saved = corpora.MmCorpus('databreach.mm')

In [39]:
numpy.random.seed(1) # setting random seed to get the same results each time. 
model = ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=35, passes=50)

# Optimal k value

In [None]:
numpy.random.seed(1) # setting random seed to get the same results each time.
k_range = range(2,35)
scores = []
for k in k_range:
    goodLdaModel = ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=k, passes=50)
    goodcm = CoherenceModel(model=goodLdaModel, corpus=corpus, dictionary=dictionary, coherence='u_mass')
    scores.append(goodcm.get_coherence())
    
plt.figure()
plt.plot(k_range, scores)

In [None]:
print goodcm.get_coherence()

In [43]:
model.save('lda.model') # same for tfidf, lda, ...
#model = models.LdaModel.load('data/lda.model')

# Prints the topics (Term-Topic Distribution)

In [44]:
# print words without probability
for i in range(0,35):
    topics = model.show_topic(i, 10)
    print ', '.join([str(word[0]) for word in topics])
  

email, employe, sent, includ, person, contain, spreadsheet, account, social, phish
card, payment, system, affect, date, point, malwar, locat, credit, restaur
card, credit, custom, debit, bank, may, account, purchas, affect, store
network, bar, list, soni, gang, seek, intrus, facebook, download, automat
updat, million, lawsuit, boe, prescript, file, claim, action, class, five
laptop, stolen, contain, comput, social, employe, person, drive, address, offic
investig, thi, data, washington, gener, attorney, law, incid, hotel, forens
ephi, devic, electron, comput, encrypt, drive, portabl, health, contain, laptop
mail, sent, address, letter, error, social, receiv, client, print, form
access, unauthor, may, address, parti, includ, account, third, thi, investig
server, comput, network, system, access, viru, hacker, technolog, group, malwar
research, data, center, offici, agenc, ani, test, state, taxpay, googl
data, compani, com, http, www, servic, accord, notifi, custom, thi
record, document, f

# Assign the topics to the documents in corpus (Document-Topic Distribution)

In [45]:
lda_corpus = model[corpus]

results = []
for i in lda_corpus:
#    print i
    results.append(i)
print 




In [46]:
# finding highest value from each row
toptopic = [max(collection, key=lambda x: x[1])[0] for collection in results]
# toptopic

In [47]:
toptopic = pd.DataFrame(toptopic)
documents = pd.DataFrame(documents)
documents = documents.rename(columns = {0: 'documents'})
summary = documents.join(toptopic)
summary.head(10)

Unnamed: 0,documents,0
0,An external auditor lost a CD with names Soci...,25
1,There are TennCare r r n enrollees at risk of...,5
2,A laptop computer was stolen from a vehicle x...,5
3,A Fidelity laptop used by a former Fidelity e...,16
4,A former contract worker of a Japanese commer...,12
5,Patient names addresses Social Security numbe...,27
6,At least colleges are scrambling to inform te...,5
7,Hundreds of files with Social Security number...,28
8,A computer was compromised by a virus That ca...,22
9,N r r nLocation of breached information Other...,21


# pyLDAvis


In [48]:
import pyLDAvis.gensim

In [49]:
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(model, corpus, dictionary)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]


### get_term_topics

The function `get_term_topics` returns the odds of that particular word belonging to a particular topic. 


In [50]:
model.get_term_topics('hack')

[(19, 0.010074164), (23, 0.013244809), (30, 0.01502259)]

In [55]:
model.get_term_topics('password')

[(23, 0.07222867)]

In [56]:
summary.groupby(0).count()

Unnamed: 0_level_0,documents
0,Unnamed: 1_level_1
0,174
1,80
2,355
3,2
4,14
5,884
6,40
7,118
8,105
9,534


### Here are the 35 topics

In [57]:
model.show_topics(num_topics=35)
#show_topics(num_topics=35, num_words=10, log=False, formatted=True)

[(0,
  u'0.108*"email" + 0.098*"employe" + 0.037*"sent" + 0.030*"includ" + 0.027*"person" + 0.027*"contain" + 0.026*"spreadsheet" + 0.024*"account" + 0.022*"social" + 0.016*"phish"'),
 (1,
  u'0.088*"card" + 0.087*"payment" + 0.037*"system" + 0.027*"affect" + 0.024*"date" + 0.024*"point" + 0.023*"malwar" + 0.023*"locat" + 0.023*"credit" + 0.021*"restaur"'),
 (2,
  u'0.164*"card" + 0.134*"credit" + 0.115*"custom" + 0.027*"debit" + 0.026*"bank" + 0.020*"may" + 0.019*"account" + 0.017*"purchas" + 0.017*"affect" + 0.015*"store"'),
 (3,
  u'0.031*"network" + 0.023*"bar" + 0.021*"list" + 0.020*"soni" + 0.018*"gang" + 0.016*"seek" + 0.015*"intrus" + 0.015*"facebook" + 0.014*"download" + 0.014*"automat"'),
 (4,
  u'0.092*"updat" + 0.051*"million" + 0.040*"lawsuit" + 0.038*"boe" + 0.037*"prescript" + 0.032*"file" + 0.030*"claim" + 0.026*"action" + 0.023*"class" + 0.023*"five"'),
 (5,
  u'0.097*"laptop" + 0.072*"stolen" + 0.069*"contain" + 0.052*"comput" + 0.042*"social" + 0.035*"employe" + 0.03

## Apply topic model to new data to determine how breaches occurred


In [72]:
unlabeled = [] 

In [76]:
# Remove useless numbers and alphanumerical words
unlabeled = [re.sub("[^a-zA-Z]+", " ", text) for text in unlabeled]
# tokenize
unlabeled = [[word for word in text.lower().split() ] for text in unlabeled]
# stemming words: having --> have; friends --> friend
lmtzr = WordNetLemmatizer()
unlabeled = [[lmtzr.lemmatize(word) for word in text ] for text in unlabeled]
# remove common words 
stoplist = stopwords.words('english')
unlabeled = [[word for word in text if word not in stoplist] for text in unlabeled]
#remove short words
unlabeled = [[ word for word in tokens if len(word) >= 3 ] for tokens in unlabeled]

In [77]:
unlabeled

[]

## Results

### LSI and Results

In [58]:
tfidf = models.TfidfModel(corpus)

In [59]:
corpus_tfidf = tfidf[corpus]


In [60]:
for doc in corpus_tfidf:
    print(doc)

[(0, 0.054436636896890787), (1, 0.21778874225475314), (2, 0.071567874369151366), (3, 0.32327270743129993), (4, 0.214010470303582), (5, 0.25026541800794805), (6, 0.15768733172955074), (7, 0.2754249562985957), (8, 0.13188581052092682), (9, 0.18378888396835491), (10, 0.13303659623879602), (11, 0.64591811494302354), (12, 0.12339022719209709), (13, 0.11210403307592905), (14, 0.074172392655797101), (15, 0.035832988006378498), (16, 0.24470792840626038), (17, 0.14907641529648277), (18, 0.1334770868933732), (19, 0.10118790670216064)]
[(10, 0.4234211923190167), (14, 0.11803580301731009), (15, 0.057023581987845663), (20, 0.069964610900259849), (21, 0.38168569943017039), (22, 0.38543246893940231), (23, 0.4032120176626699), (24, 0.17613697909134216), (25, 0.072795034409568207), (26, 0.096710948730237806), (27, 0.19335290693224602), (28, 0.48888884808961092), (29, 0.14049627227181308)]
[(30, 0.31336177523400194), (31, 0.38320258138222013), (32, 0.34362947069350258), (33, 0.79804696999305813)]
[(0, 0

[(1, 0.47389290185816224), (4, 0.058208956159889115), (30, 0.12951271145198368), (31, 0.23756697181628175), (32, 0.071011188986103951), (46, 0.12513925010427898), (102, 0.20771275662250696), (103, 0.080557608611511552), (105, 0.21581522401420328), (122, 0.062587895359464768), (124, 0.15636817847722437), (138, 0.28444834512589789), (152, 0.15895586781403501), (184, 0.11136718561896974), (320, 0.096002224834958086), (327, 0.17402623547287413), (339, 0.22870892749054816), (377, 0.18053946960758202), (386, 0.14348709858604669), (420, 0.19711901150484085), (425, 0.18482679647708886), (426, 0.2793383596134858), (427, 0.2304332167943732), (428, 0.15785834151770631), (429, 0.077765548018908517), (430, 0.23421954437668807)]
[(0, 0.049649599415629339), (20, 0.040098803615715189), (22, 0.44180566947350619), (25, 0.041721003682129582), (43, 0.070416847005428107), (69, 0.14262725861415326), (112, 0.1314294276776887), (114, 0.19532396299148386), (116, 0.096126051518943559), (127, 0.13056434072676071

[(2, 0.066358248094219946), (3, 0.049956796749257133), (4, 0.024804006001900941), (7, 0.04256266689140787), (15, 0.033224604322039303), (19, 0.093822155209503363), (24, 0.10262563719754574), (25, 0.042413789708662122), (29, 0.040929847724643476), (30, 0.027593984535244993), (35, 0.063830755118374521), (48, 0.060688542310770865), (63, 0.033960510976865713), (67, 0.068214440577395188), (85, 0.028354048330629752), (87, 0.023934708778029314), (91, 0.091312501817138916), (98, 0.023193070648192489), (99, 0.038624839168093235), (116, 0.097722244788536347), (118, 0.071870740706135861), (121, 0.1060576788255973), (122, 0.026669959995129888), (160, 0.097788743758826985), (219, 0.09980565479518369), (318, 0.075308703534624308), (332, 0.051312818598772868), (341, 0.16943919876257441), (342, 0.099251623663748872), (362, 0.20850053910923352), (367, 0.066454236695578719), (370, 0.15716890710135303), (388, 0.044203921582569534), (428, 0.067266611682575389), (429, 0.033137462807335066), (437, 0.0755634

[(53, 0.11496268764130513), (63, 0.20978173651349802), (85, 0.058383117380449254), (87, 0.14785009488874476), (91, 0.12534620452848522), (98, 0.047756276291724853), (131, 0.07623473922107106), (132, 0.10359893960459887), (161, 0.18220305998703759), (185, 0.10136886484734799), (189, 0.063392985209097488), (200, 0.2483580283729368), (353, 0.10427232897788373), (401, 0.094796340629867581), (420, 0.17295484574083592), (429, 0.068232527440569013), (438, 0.17811214778599357), (441, 0.11714101748732773), (442, 0.11979811662631076), (483, 0.14923561948637951), (505, 0.11006270903619314), (508, 0.1401540696530858), (612, 0.13454400975978686), (630, 0.14515228786132989), (700, 0.19125612671831527), (866, 0.14126902885578232), (869, 0.12798469180424105), (918, 0.17596239738790653), (976, 0.14654268106724727), (1090, 0.12870738148769525), (1094, 0.21939828520249458), (1178, 0.17923565215198003), (1199, 0.15000532478354098), (1540, 0.22483328747325682), (1578, 0.3082949933095081), (1579, 0.17062245

[(15, 0.038706258783398804), (25, 0.049411547675200972), (26, 0.065645104678595453), (69, 0.084459060986290344), (90, 0.11731282759893473), (98, 0.054039288822777372), (99, 0.089994933021108225), (112, 0.077828082482135186), (114, 0.11566427527721287), (122, 0.12428071236708467), (147, 0.14370414548640939), (421, 0.4701886928190242), (422, 0.27592940261947269), (424, 0.19463379147987669), (450, 0.162239143447033), (482, 0.18074689213224454), (546, 0.20092381934656944), (720, 0.21824278265594133), (1702, 0.20281663324864474), (1812, 0.23000152241819863), (2029, 0.3104169206072146), (2030, 0.34885555323239376), (2031, 0.33184618732838633)]
[(2, 0.27481093001164442), (19, 0.38854783646806823), (25, 0.17564919704640328), (30, 0.22855119809946067), (32, 0.2506270178315026), (63, 0.28128288112645466), (69, 0.30023682607660324), (138, 0.33464416656911411), (607, 0.58852222533492515)]
[(0, 0.21301280779036294), (4, 0.20935795162295304), (15, 0.070107980781533602), (20, 0.086018405462995814), (

[(2, 0.081677101356254209), (20, 0.050175204491739872), (30, 0.067928154718853426), (32, 0.14897870573904745), (43, 0.17622369644845073), (53, 0.27488440344026982), (63, 0.083600642778550172), (85, 0.069799204948049212), (87, 0.058920109886560081), (91, 0.14985608531882938), (121, 0.13054117659567782), (127, 0.16337376441642687), (132, 0.12385641504445348), (138, 0.099460256220473428), (148, 0.15335407831667414), (167, 0.069046427866956375), (172, 0.11361309754104219), (200, 0.14846066552024395), (429, 0.081574543817396622), (441, 0.14004647669189069), (505, 0.13158409365319912), (508, 0.16755944305390605), (509, 0.15750947950745839), (587, 0.21566811269925829), (630, 0.17353500024827279), (634, 0.20239296203821527), (847, 0.15387449834589953), (980, 0.19699911901298761), (1041, 0.24300473565530073), (1512, 0.23925698902773865), (1581, 0.27372635642586607), (1795, 0.17872139551796287), (2110, 0.33785659820740577), (2245, 0.27119298521252666)]
[(15, 0.068883878674904114), (37, 0.3052802

[(2, 0.065650465337167896), (20, 0.080659706786179883), (26, 0.055747303295338453), (43, 0.0708225897048851), (63, 0.067196570516858858), (66, 0.15923401188086991), (69, 0.14344907855337599), (99, 0.2292771799543149), (116, 0.096679930956749566), (131, 0.14651551031202667), (163, 0.082060346349756647), (288, 0.11435615861933118), (318, 0.074505454453333481), (343, 0.11727973120165289), (364, 0.13079908888114777), (374, 0.16927355504721123), (395, 0.065926630326001823), (400, 0.056547901148261984), (410, 0.11125636916415889), (439, 0.14922640658933572), (441, 0.22513326773317102), (511, 0.096811632050025312), (809, 0.084777681221427564), (1023, 0.19231011389997094), (1697, 0.1750879886727221), (1868, 0.36757506214031105), (2084, 0.19532247974033154), (2530, 0.24686912066211772), (2531, 0.59251200611309462)]
[(20, 0.054764892707026606), (26, 0.037850312198238451), (80, 0.09271131583346319), (87, 0.032154861439128629), (122, 0.071659018389035448), (129, 0.072216790373398732), (131, 0.0497

[(4, 0.072186949587648055), (112, 0.097212434396236891), (124, 0.19391761270967364), (125, 0.18232905372051392), (128, 0.11961811943773895), (148, 0.18129957751750886), (318, 0.2191704672684093), (377, 0.22389327091427555), (395, 0.096967198442039867), (409, 0.14297940732100148), (593, 0.17013995873620308), (702, 0.33296180686072796), (927, 0.18780361628123957), (999, 0.23745727190547106), (1003, 0.36424306806104739), (2331, 0.30339893290870429), (2527, 0.5320826463070476)]
[(48, 0.089411078846250241), (90, 0.074178621385924848), (120, 0.095782354151376287), (122, 0.039292258557472169), (124, 0.098166887760013602), (296, 0.13799811219444824), (320, 0.06026954922582025), (366, 0.13799811219444824), (374, 0.063018905467726732), (380, 0.10985767358414178), (384, 0.15821445888083624), (385, 0.20411171924715968), (468, 0.12907292926040626), (497, 0.078561577163529572), (632, 0.069915567970873144), (702, 0.084277606004268507), (756, 0.10732949830652543), (881, 0.11874207901071518), (999, 0.1

[(0, 0.09645149715292424), (10, 0.11785793185263671), (13, 0.099313646501845806), (27, 0.21527664785904857), (30, 0.052729701148436572), (47, 0.11874147354211959), (53, 0.10669046217848117), (61, 0.099163076434562805), (74, 0.095261296960197125), (91, 0.058163412703034358), (98, 0.044319938007927419), (103, 0.065596319924049643), (108, 0.11441453989767632), (136, 0.067219781579388571), (143, 0.038469282443007069), (144, 0.04117279222959478), (145, 0.0532597775665411), (146, 0.040415392701947793), (150, 0.13035185480587566), (157, 0.12361722241185859), (158, 0.076890704897345494), (163, 0.23775113622108526), (165, 0.077246419552651546), (167, 0.053597768434318897), (169, 0.18183909146190505), (171, 0.10992291402235345), (176, 0.10513670947128484), (177, 0.082842591016188322), (180, 0.075503521043241928), (182, 0.13847401080054175), (187, 0.061265079717711689), (189, 0.058831495936618905), (191, 0.07539040793709996), (207, 0.089472519787376151), (228, 0.17401591758370391), (233, 0.094927

[(0, 0.031807106951989024), (20, 0.05137712893008204), (25, 0.026727797240677954), (38, 0.065156240747055311), (66, 0.050712968621027692), (70, 0.037723460605660113), (102, 0.055776502124090271), (105, 0.057952234115865073), (122, 0.03361309083971073), (128, 0.051801960089189303), (131, 0.046662370615309), (134, 0.10746385485542766), (136, 0.044334548350142042), (140, 0.050023349803731326), (166, 0.080448151049084635), (177, 0.054638512214154861), (296, 0.11805234036409126), (380, 0.093979223826542266), (395, 0.041992726246374068), (443, 0.12111629205625407), (448, 0.083096837377230837), (453, 0.072909339387567385), (559, 0.27760611499843624), (631, 0.13262488366020231), (676, 0.049378511501920512), (697, 0.069283648895387101), (702, 0.21628923332808347), (711, 0.099042757329604825), (713, 0.086096055581621644), (726, 0.083978183918916663), (774, 0.39984711189846539), (787, 0.088160370880293423), (788, 0.094869213225276888), (809, 0.10800024031956851), (818, 0.084317718745956796), (927

[(0, 0.036728481887851723), (4, 0.036098297628323409), (12, 0.083251574360606664), (25, 0.030863272737699451), (30, 0.040158668983884614), (47, 0.04521644753127961), (70, 0.04356024713890029), (92, 0.080691383453887267), (101, 0.08435005731858583), (102, 0.064406557035337875), (122, 0.15525559114376841), (132, 0.073223080972643101), (134, 0.186136902640063), (136, 0.051194239656632841), (166, 0.09289554259610816), (179, 0.062114138376265317), (189, 0.044805764487354426), (232, 0.1146203805921714), (242, 0.09570567871618256), (261, 0.10565888905196121), (267, 0.11289750859531879), (291, 0.08041410226968991), (318, 0.054799938243598564), (393, 0.10180104056589349), (400, 0.041591874225407499), (410, 0.081830816336026374), (440, 0.089175520345699005), (467, 0.09128089465562271), (555, 0.1143672112083053), (589, 0.098302140533968588), (593, 0.085081346476247863), (631, 0.15314535348168701), (648, 0.16870011463717166), (711, 0.1143672112083053), (796, 0.093006807423117413), (809, 0.12471064

[(4, 0.11315071937076072), (15, 0.037890934629264315), (20, 0.092979935864033025), (25, 0.048370723023606427), (26, 0.064262329873501098), (38, 0.11791673088714621), (70, 0.06827016263320132), (72, 0.2039379455566297), (85, 0.064672637263476634), (87, 0.054592583068156883), (91, 0.13884955071855906), (160, 0.11152298041923417), (167, 0.19192544902633349), (177, 0.098882235484430203), (184, 0.10824173802799446), (189, 0.070222210142718372), (229, 0.1780751215299011), (263, 0.093946211477889036), (284, 0.1459408232705029), (320, 0.093307984869468841), (393, 0.15954853455035128), (395, 0.075996480816537565), (400, 0.065185213677369166), (503, 0.14273482053317252), (524, 0.13976108210516733), (535, 0.13769995989416453), (583, 0.1070673397198641), (745, 0.15157520697993834), (799, 0.20770816954757682), (911, 0.15280116670529242), (1017, 0.1328311145774638), (1048, 0.45276527571427461), (1067, 0.36008081131095404), (1326, 0.29006040305327346), (1796, 0.22895013640961157), (2239, 0.2316889244

[(2, 0.25599232315763387), (63, 0.52404217111002538), (70, 0.23093378141796997), (143, 0.15532280242172933), (144, 0.16623843925611761), (145, 0.21504060857513807), (146, 0.16318037813004091), (318, 0.29052077963910589), (766, 0.47743641771308826), (855, 0.41175086501469188)]
[(20, 0.076460370575610961), (25, 0.039776790261541101), (32, 0.11351191454095334), (66, 0.075471955216297765), (70, 0.11228146988238598), (102, 0.083007597166883662), (112, 0.12530477020065212), (128, 0.077092611974339434), (374, 0.16046077109938664), (395, 0.062494333123351876), (443, 0.090123725928395571), (453, 0.10850499481165712), (485, 0.59956314713649173), (497, 0.10001801171316548), (676, 0.073485992046128185), (697, 0.10310917678138458), (702, 0.10729518027555043), (713, 0.12812970385767872), (714, 0.10769365542900498), (751, 0.10612721219688871), (769, 0.31509620818647732), (774, 0.19835322957905915), (870, 0.12531387545247555), (911, 0.1256532790876202), (974, 0.29224542790493169), (3132, 0.20307304312

[(3, 0.12105715784147429), (7, 0.10313942885266905), (15, 0.040255545104515233), (20, 0.049391207139803772), (25, 0.05138933207815171), (26, 0.068272665851478023), (30, 0.066866764058646874), (85, 0.068708578763246625), (87, 0.11599894336598757), (91, 0.14751455477099046), (96, 0.36342353753580292), (116, 0.11840207921422571), (122, 0.064627633612322308), (140, 0.28853854040913351), (178, 0.11360448418512352), (185, 0.11929665538615535), (198, 0.12932080063139995), (220, 0.14978543752197029), (346, 0.17745985178850596), (449, 0.093506054691850554), (511, 0.2371267420944578), (518, 0.42053723287108524), (724, 0.14449998183623627), (809, 0.10382561952871423), (857, 0.15997791123979171), (903, 0.22151754349202168), (1157, 0.16436647114220848), (1223, 0.21902375669777627), (1654, 0.2321171373440738), (2351, 0.33257752299462112)]
[(15, 0.04112211591026331), (17, 0.17108083836651514), (20, 0.05045443900157303), (25, 0.10499115412707391), (26, 0.069742354037388321), (30, 0.068306187756863521)

[(15, 0.018907568835321392), (25, 0.02413698115740728), (27, 0.064110904118758402), (47, 0.035362048326594384), (66, 0.04579718848581342), (70, 0.068133595120606974), (87, 0.027241688080767691), (91, 0.10392884724214482), (99, 0.087923010097610124), (101, 0.065966942696845401), (105, 0.052334727410885212), (109, 0.18520639113753518), (110, 0.070430275251547242), (129, 0.061182265744499115), (131, 0.042139228689879869), (154, 0.061182265744499115), (157, 0.073628161459136948), (161, 0.1007138804597305), (167, 0.12769434779835007), (172, 0.05252896797140718), (185, 0.05603227375709658), (187, 0.03649034570792823), (207, 0.053291094918088047), (210, 0.089442206775050201), (211, 0.16333123958829204), (213, 0.059791616399022689), (218, 0.12427800520782005), (261, 0.082631762218936755), (285, 0.076455796448311089), (288, 0.065779797782414273), (304, 0.067869987753932548), (313, 0.28445590349879396), (318, 0.17142771449472868), (326, 0.099391342517874681), (376, 0.11785046371749956), (388, 0.

[(2, 0.12232940382033106), (4, 0.18290171025975299), (7, 0.1569259531516411), (63, 0.12521032970415785), (69, 0.26729498668268431), (322, 0.34683781252660861), (409, 0.18113508245132828), (412, 0.26410957350172232), (418, 0.20975036719450199), (437, 0.27859764410467752), (440, 0.22591585000028272), (495, 0.23258526743051697), (627, 0.26859653970379738), (704, 0.27859764410467752), (719, 0.1876611418113685), (766, 0.22814946800179062), (855, 0.19676073570665908), (1822, 0.33835199969146279)]
[(17, 0.13075008599386051), (20, 0.077120527362564625), (26, 0.053301228091173367), (35, 0.12075808756639209), (57, 0.11445177527168757), (66, 0.076123578051573354), (98, 0.043877764740115281), (116, 0.092437817565168306), (138, 0.15287286796412478), (164, 0.23159461101603859), (232, 0.14899894623170543), (332, 0.097076054170715576), (395, 0.06303390751786421), (398, 0.14379554892893134), (401, 0.087097484464087202), (561, 0.20157991975115269), (577, 0.1898985527959525), (583, 0.088805069887091956),

[(4, 0.15396784627800345), (15, 0.10311884238600795), (20, 0.12652080827823206), (25, 0.13163921693597352), (26, 0.17488766456727203), (30, 0.17128629822112607), (31, 0.20946189618923941), (32, 0.18783088636392292), (37, 0.45700312290619599), (66, 0.24977050737314521), (70, 0.18579484008829539), (98, 0.14396816127209855), (114, 0.30814567325526004), (332, 0.3185180718590962), (1484, 0.51995514852852198)]
[(4, 0.18148528403122247), (15, 0.060774222838723829), (19, 0.17161885549166137), (20, 0.074566428580079447), (66, 0.14720499301974652), (85, 0.10373006913328799), (87, 0.087562416741318747), (98, 0.084849217779999653), (285, 0.24575034744718335), (395, 0.24378533310371023), (425, 0.28812871650788696), (465, 0.22017310628004857), (492, 0.26036435051981738), (522, 0.29701554893992305), (583, 0.17172799843947015), (784, 0.2887713338385034), (847, 0.22867613410711587), (2004, 0.41078371333452079), (2541, 0.37392307831846477)]
[(4, 0.094477491372706793), (26, 0.10731427515400549), (30, 0.1

[(0, 0.091492578283442844), (25, 0.038441017051281917), (38, 0.093710384698034013), (47, 0.11263654670200583), (50, 0.087154826244740635), (61, 0.09406474550759418), (71, 0.14213505470754187), (90, 0.091266609087172337), (91, 0.11034604432375524), (99, 0.070013932320716871), (103, 0.12444755370134586), (122, 0.048343729431959484), (131, 0.067111740197817765), (136, 0.063763770494895594), (140, 0.14389127734334012), (143, 0.072982876146427719), (144, 0.039055950684119774), (145, 0.050521500569758418), (146, 0.038337491794205238), (158, 0.072937476812159563), (163, 0.1503516626475758), (165, 0.073274902895816466), (167, 0.15252634232892534), (169, 0.086245044409163266), (171, 0.10427138109000743), (176, 0.09973125255395246), (177, 0.078583355028015264), (179, 0.077364791243641817), (180, 0.14324322615276164), (182, 0.13135456314694194), (187, 0.17434566391106021), (191, 0.07151431552243856), (200, 0.10931853146288376), (207, 0.084872415281271901), (211, 0.13006222536851367), (229, 0.1415

[(20, 0.096132420891732948), (43, 0.16881655719876879), (69, 0.17096649300474745), (70, 0.1411697254383823), (90, 0.23747082296257191), (121, 0.25010838439659305), (122, 0.12578779170290103), (385, 0.43562037183870317), (395, 0.15714628349129536), (473, 0.23832599563211077), (881, 0.38013350337502644), (999, 0.38482629556665332), (2348, 0.46311967049035352)]
[(143, 0.25374673960140864), (144, 0.27157932576527749), (145, 0.35130613443149128), (146, 0.26658345247336468), (938, 0.60245931042855549), (939, 0.55174025049353248)]
[(143, 0.25374673960140864), (144, 0.27157932576527749), (145, 0.35130613443149128), (146, 0.26658345247336468), (938, 0.60245931042855549), (939, 0.55174025049353248)]
[(0, 0.072425979258173442), (2, 0.095218471971911509), (15, 0.047674496332755216), (43, 0.10272004529075139), (85, 0.081371321088130388), (87, 0.068688564342468242), (96, 0.4304011798251417), (122, 0.076538272519870054), (125, 0.17979404665316726), (159, 0.16036955205367218), (318, 0.1080616183020844

[(0, 0.036978006608499599), (3, 0.073198131461748042), (6, 0.10711468465284706), (19, 0.068735456413664853), (20, 0.029864769153164147), (25, 0.062145901196752625), (27, 0.24760117392162814), (29, 0.059971586839228895), (30, 0.080862995187710493), (31, 0.19777082556472833), (32, 0.088673572947097365), (47, 0.091047275012886744), (53, 0.081806933680762278), (87, 0.070139643597500939), (91, 0.13379369669476321), (98, 0.16991576169595468), (99, 0.056594222189736483), (103, 0.050297221365033437), (107, 0.1360970032581526), (143, 0.029497051313705287), (144, 0.031570018674617895), (145, 0.040837943732490112), (146, 0.03098926823392335), (150, 0.099949754865898646), (158, 0.05895740507414262), (163, 0.060766729709562504), (165, 0.059230156026961779), (167, 0.041097104635352696), (169, 0.069714284632626106), (176, 0.080615564348597624), (177, 0.063521126545187392), (180, 0.057893755566599633), (182, 0.053088785945637686), (184, 0.069533593218873799), (187, 0.046976160858963147), (189, 0.18044

[(122, 0.23606702466824347), (140, 0.70263478050370454), (143, 0.1781911597021755), (144, 0.1907139184734615), (145, 0.24670128807632682), (146, 0.18720561544261488), (179, 0.37777962721454122), (809, 0.37924651912069163)]
[(30, 0.24988421579269315), (143, 0.18230458860970541), (144, 0.19511642725463907), (145, 0.25239622946171386), (146, 0.1915271372930851), (179, 0.38650042818950225), (328, 0.68296303179466655), (374, 0.38735630978453955)]
[(4, 0.1797502329573423), (122, 0.19327247065333711), (143, 0.14588842186926471), (144, 0.15614103775462002), (145, 0.20197894020513951), (146, 0.15326872470909733), (170, 0.65992672553822007), (374, 0.30998013361337856), (946, 0.53739593407266517)]
[(30, 0.13296945924497586), (32, 0.14581301393455467), (102, 0.21325669593920771), (103, 0.16541544895556329), (132, 0.24244910818858553), (143, 0.097008698562300891), (144, 0.10382618901941733), (145, 0.13430616271832682), (146, 0.10191624067095022), (336, 0.26603129845515328), (374, 0.2061216987387071

[(2, 0.039534361100161279), (4, 0.029555045775692637), (14, 0.081946213458624045), (15, 0.019794276407244286), (18, 0.073733241323075974), (20, 0.024286423241181611), (24, 0.061141442331949056), (25, 0.025268932289889804), (26, 0.033570729583090521), (29, 0.048769683534234699), (31, 0.12062237823912671), (32, 0.072110516302880248), (50, 0.057290612269115811), (70, 0.035664426933544484), (77, 0.094053355740253849), (84, 0.22144490425286767), (98, 0.027635546638461093), (102, 0.052732091718935156), (103, 0.040902362233634852), (105, 0.10957813445942557), (106, 0.11573366170522023), (152, 0.080708335271540824), (162, 0.084938230837971543), (288, 0.068864670580246506), (343, 0.070625230441908313), (378, 0.068287418296204785), (389, 0.080261376420163122), (392, 0.08918223574320315), (395, 0.03970066618780551), (400, 0.10215853605774555), (401, 0.054856636573360651), (443, 0.057252742446619466), (457, 0.082968660635382968), (542, 0.097465810221123508), (586, 0.065335896701372173), (590, 0.10

[(50, 0.16496531205286913), (70, 0.10269384608149026), (90, 0.17274803125410568), (106, 0.16662450323047928), (131, 0.1270280676488891), (181, 0.19829220568153183), (295, 0.20019307887673138), (367, 0.22800357249768433), (405, 0.24782716698893387), (705, 0.18844970727958418), (873, 0.42892387721348413), (1069, 0.25125991143944781), (1081, 0.22984769465122362), (2447, 0.34851281776730203), (4638, 0.48865735072163963)]
[(43, 0.18002836729150257), (50, 0.12091653448544654), (84, 0.15579254227516118), (85, 0.071306170272725192), (87, 0.060192195472509528), (91, 0.22963721887881897), (138, 0.10160760385028095), (172, 0.11606600511428478), (189, 0.077424968042730288), (213, 0.13211327621242838), (295, 0.14673783854625211), (341, 0.14203804005871581), (397, 0.29871842154654099), (400, 0.14374264425945851), (429, 0.25000721637270179), (441, 0.14307008110650835), (479, 0.14645572647654434), (508, 0.085588526302682169), (511, 0.12304576919184286), (551, 0.34515092453376567), (718, 0.094071718304

[(0, 0.065926738521243505), (20, 0.053244807049858442), (25, 0.027699414828810758), (38, 0.067524821626219769), (40, 0.1554734035963444), (47, 0.081162431988243453), (50, 0.062801087787699106), (57, 0.079018685416988713), (64, 0.07548839244841582), (70, 0.039094796128735364), (85, 0.037034679154164953), (87, 0.031262352730249562), (90, 0.065763912066946525), (93, 0.50045475324658373), (102, 0.11560821538139088), (103, 0.089673080444403308), (116, 0.063820022096432505), (125, 0.081829995415919413), (136, 0.04594621228754138), (140, 0.051841814903902529), (143, 0.052589216333239652), (144, 0.028142517095471397), (145, 0.036404239778277933), (146, 0.027624817711955064), (152, 0.088471235475335447), (157, 0.084495114531479495), (158, 0.052556502968992079), (161, 0.11557847833912778), (165, 0.0527996418290489), (167, 0.14654105614510393), (176, 0.071863273861967156), (177, 0.05662474920101309), (179, 0.055746689608737035), (180, 0.051608332023699452), (182, 0.047325029530443041), (187, 0.08

[(14, 0.091368766273684357), (20, 0.054157975947667608), (46, 0.2833767627290561), (50, 0.12775630114346004), (64, 0.15356609476379973), (85, 0.075339676261111918), (87, 0.063597028181431189), (98, 0.061626417990136441), (99, 0.20526048658936091), (131, 0.098376051681836205), (138, 0.10735514122367006), (151, 0.19458658618394617), (167, 0.074527145779876069), (392, 0.19887363941470315), (396, 0.19776532534446672), (410, 0.44821038944687058), (630, 0.18730973724425282), (637, 0.14966998241405643), (794, 0.18309821437900972), (796, 0.16980814499493482), (1067, 0.20973608700604471), (1173, 0.18880020309886592), (1199, 0.38714454156294875), (1543, 0.24995922522938493), (1889, 0.23842201208855451)]
[(3, 0.1225579720549997), (4, 0.060851152840944008), (7, 0.10441810682229132), (15, 0.081509231836396037), (20, 0.050003537934779692), (26, 0.069119080794073556), (30, 0.067695749239705574), (43, 0.087810387417731964), (47, 0.07622168191919286), (74, 0.12229890803122516), (112, 0.0819467886823944

[(0, 0.078954969796182009), (25, 0.033173284622231881), (30, 0.043164409930678715), (31, 0.26392359592877424), (32, 0.047333671524551202), (33, 0.10992797871084646), (47, 0.097201492292107519), (50, 0.075211637958510547), (61, 0.081174662248879564), (85, 0.044353354035345842), (87, 0.037440318919751796), (91, 0.047612433490066532), (98, 0.036280197509391134), (101, 0.090663374656086357), (103, 0.10739398788455215), (134, 0.20006862209518686), (136, 0.055025955853145264), (143, 0.031490864559133816), (144, 0.033703951342071777), (145, 0.043598329245738747), (146, 0.033083945861657131), (158, 0.062942551043573908), (160, 0.15296788388138569), (165, 0.063233738227752287), (167, 0.087750015557868502), (169, 0.14885305462264262), (176, 0.17212932853937732), (178, 0.073334945908689608), (180, 0.061807005587718032), (182, 0.056677250551011686), (185, 0.077009405329238903), (187, 0.050151450847914128), (191, 0.061714411463804832), (193, 0.10670572321709348), (207, 0.073241995261101062), (246, 

[(15, 0.056870610863552072), (17, 0.11829974661547069), (20, 0.034888462124113874), (25, 0.036299877440072403), (48, 0.10388067951625257), (80, 0.11812513714712457), (85, 0.048533672015709892), (87, 0.081938162203774922), (91, 0.15629976229318585), (98, 0.03969961787292902), (99, 0.13222840378802064), (116, 0.083635665036273829), (118, 0.12302126723786448), (138, 0.069157971893744896), (220, 0.10580392475400122), (284, 0.10952149703524405), (287, 0.11142279959586623), (870, 0.11436011529727488), (941, 0.15899051886273799), (1116, 0.32948163206102066), (1117, 0.16896914724654427), (1235, 0.17607353801966899), (1401, 0.12835572421261246), (1407, 0.19631449075483498), (1433, 0.14500146777032594), (1574, 0.14760739867964587), (1730, 0.17844536099797007), (1798, 0.19219925503717897), (1905, 0.18532230801757454), (2127, 0.20993110844803686), (2170, 0.2037467408830643), (2192, 0.20668405658447295), (2346, 0.24378867887640471), (2453, 0.1698891704546964), (2569, 0.25628450073787418), (3232, 0.

[(20, 0.084850125189811165), (43, 0.074501951992661575), (70, 0.062300880209540728), (74, 0.10376343440499775), (90, 0.10480038300499354), (98, 0.048275523502904309), (99, 0.08039618208986489), (102, 0.18423151649129224), (122, 0.055512540797039753), (332, 0.10680574460537753), (348, 0.11336998878636301), (385, 0.0961237705835324), (395, 0.069351638623388903), (400, 0.059485667414714623), (401, 0.095827047781350538), (447, 0.120183777357731), (473, 0.10517778694112763), (671, 0.20658860413785415), (685, 0.18744495713982245), (713, 0.14218897097950139), (722, 0.20332777498519367), (775, 0.28118802275017707), (782, 0.14364251194766159), (784, 0.16429835982497087), (950, 0.21274600789421033), (958, 0.20332777498519367), (965, 0.10994432665735042), (1021, 0.17069270363805633), (1025, 0.18813260751803215), (1026, 0.29869787361296452), (1239, 0.21852294245235518), (1696, 0.19414159169767917), (1961, 0.35065049433114226), (2518, 0.22178377160501567), (3847, 0.25133191617250122)]
[(15, 0.11936

[(140, 0.507644438004063), (143, 0.25748156406221684), (144, 0.27557662287542461), (146, 0.27050721677752165), (179, 0.54588167812936528), (718, 0.47843174310463199)]
[(140, 0.507644438004063), (143, 0.25748156406221684), (144, 0.27557662287542461), (146, 0.27050721677752165), (179, 0.54588167812936528), (718, 0.47843174310463199)]
[(31, 0.54350064370073892), (143, 0.32424734700873364), (144, 0.34703451173456185), (146, 0.34065059262120062), (718, 0.60249060545925481)]
[(0, 0.090941163458700944), (4, 0.044690401242584679), (7, 0.076686913443970092), (25, 0.038209337639925019), (38, 0.093145603419330938), (91, 0.1096810019144874), (103, 0.061848761590958659), (109, 0.097728508878783163), (136, 0.12675894775541102), (143, 0.072543017082288455), (144, 0.038820565141317667), (145, 0.050217013529333492), (146, 0.038106436317188783), (158, 0.072497891364377148), (163, 0.074722755581121514), (165, 0.07283328382139162), (167, 0.15160708430929373), (178, 0.08446796091915712), (180, 0.0711899581

[(15, 0.044935306970730178), (20, 0.055133002142183495), (25, 0.057363411822077881), (35, 0.34531692809536696), (66, 0.10884057812668201), (74, 0.13484457774301578), (98, 0.12547180265187191), (99, 0.10447793375598105), (134, 0.23063978323952361), (136, 0.095151161618529004), (179, 0.11544721552808439), (189, 0.083277348521488256), (263, 0.33423576584746095), (320, 0.11065504147505471), (410, 0.30418645858554516), (418, 0.15388442912535347), (631, 0.14232029206800395), (648, 0.15677556674900392), (674, 0.12514937615676894), (676, 0.10597655560387768), (734, 0.11296191268154726), (781, 0.24359193814771846), (801, 0.15603653333857503), (802, 0.15813476288703199), (805, 0.15459128285732834), (854, 0.22657686674449148), (929, 0.16146200347439252), (931, 0.16485720139618493), (1644, 0.26996846627072396), (3416, 0.31383800257861372)]
[(143, 0.32545369737392399), (144, 0.34832564091056512), (146, 0.34191797066022528), (395, 0.53864669984231184), (718, 0.60473214966501176)]
[(143, 0.2420603281

[(7, 0.15265623072381765), (20, 0.07310371597686656), (25, 0.076061132214251148), (70, 0.10735224825558533), (85, 0.10169527569684951), (87, 0.085844771777301751), (106, 0.34836586063916197), (121, 0.19019444352653195), (198, 0.1914071679296965), (401, 0.16512204945457573), (495, 0.22625696728140293), (734, 0.14978207715913766), (781, 0.32299122428851657), (792, 0.31320478708183402), (873, 0.22419036926921349), (1378, 0.20060093490473288), (1658, 0.2956444834204825), (3407, 0.51082384415672699)]
[(1, 0.3670659705889267), (3, 0.18161699404918086), (4, 0.1803489936736217), (7, 0.15473577415968007), (15, 0.060393711747860714), (25, 0.077097266982353543), (26, 0.10242662696498173), (31, 0.12267575050211905), (32, 0.11000709614198664), (33, 0.25548108425238003), (46, 0.19385966794685575), (98, 0.084317971687358911), (105, 0.16716524843486791), (570, 0.29737495459101299), (873, 0.2272443789783935), (976, 0.25873419354847993), (2671, 0.4218028793172206), (4263, 0.45358196426599506)]
[(3, 0.41

[(2, 0.10970263973695112), (14, 0.037898323056589772), (25, 0.023372651860256347), (27, 0.062080747904721525), (38, 0.056977165674777654), (62, 0.070518359893772448), (63, 0.18714366377324457), (66, 0.044346960196758795), (69, 0.15980331090640701), (70, 0.032988027549023136), (87, 0.02637904414993841), (88, 0.096873984095644169), (119, 0.14930332638370436), (122, 0.029393633267617646), (128, 0.04529925036249656), (136, 0.038769224213977486), (154, 0.059244848721648119), (166, 0.070349479620037625), (187, 0.035334830852700519), (246, 0.086801625817165259), (260, 0.082645283341388695), (281, 0.14439750569565873), (282, 0.078820176990269594), (288, 0.06369679385255729), (318, 0.12449942260659773), (391, 0.26642356955363533), (400, 0.031497385411709253), (410, 0.1239403998349326), (442, 0.19236672312916747), (631, 0.057988228619888373), (648, 0.063878012576800022), (719, 0.056096958948082226), (720, 0.30969962407740925), (768, 0.075724535953578573), (782, 0.076058044852706536), (854, 0.092

[(15, 0.034336944795846286), (20, 0.042129429586818014), (25, 0.087667557554200803), (26, 0.058234828327899529), (27, 0.11642811377342172), (30, 0.057035630251622324), (31, 0.27899000414919745), (32, 0.062544716627928584), (102, 0.091473862700023556), (103, 0.14190588482658797), (121, 0.10960842837169431), (138, 0.083511445614877944), (167, 0.057974584269429835), (308, 0.11362568008143352), (315, 0.22196953366233679), (339, 0.201440579497592), (352, 0.1848095341767573), (400, 0.059071150011079238), (511, 0.10113150662569233), (566, 0.26285411390747088), (669, 0.10444487050835062), (841, 0.19707101984978126), (1049, 0.16781138500976137), (1245, 0.16699344551944903), (1825, 0.17935372272567685), (4971, 0.30947537296481831), (4972, 0.61895074592963661)]
[(0, 0.041845036873652312), (15, 0.055089101380753791), (25, 0.035162759781198268), (26, 0.046715052558120022), (30, 0.091506149889809082), (47, 0.15454632075613647), (70, 0.099257037267069373), (98, 0.038456001097407128), (105, 0.15248248

[(31, 0.59352826809534898), (143, 0.3540933548747951), (144, 0.37897801061767084), (145, 0.49023356092907983), (146, 0.37200647065920045)]
[(0, 0.10114711447519378), (4, 0.066274427091223789), (8, 0.081684382276597647), (15, 0.044386814303443545), (24, 0.2056559525994876), (29, 0.21872291182915263), (47, 0.20753708291805109), (50, 0.064234420996292715), (71, 0.10475567832327919), (81, 0.09373853710314338), (84, 0.082761582534354727), (85, 0.075759871558059688), (87, 0.063951730689696215), (91, 0.081326698380003906), (92, 0.074072401760695045), (98, 0.03098506486366424), (103, 0.045859861701539349), (124, 0.089017327637781987), (130, 0.1557528153022516), (136, 0.14098458070853909), (143, 0.026894740049997094), (144, 0.028784824509997642), (145, 0.037235107644519427), (146, 0.0282553094757561), (162, 0.095233020947617819), (174, 0.10592925021871369), (177, 0.057917117475468266), (180, 0.052786208693284507), (182, 0.048405146751548625), (187, 0.042831794317747272), (189, 0.205652105977277

[(0, 0.1105541301134078), (14, 0.075317514232405874), (31, 0.14782017466223696), (32, 0.066277435023774603), (47, 0.1361032301553575), (62, 0.14014518708425017), (64, 0.12658829707016334), (70, 0.06555900193031175), (98, 0.050800167314956221), (99, 0.084600625851382366), (108, 0.13114363492625059), (140, 0.086934783651620259), (143, 0.044094059523312398), (144, 0.047192862357190724), (145, 0.061047143411024025), (146, 0.046324719835830391), (158, 0.088133261220974757), (165, 0.088540986610904163), (169, 0.10421332571598045), (177, 0.094955400968162559), (179, 0.093482961763839814), (180, 0.086543250606057776), (182, 0.079360477852905939), (184, 0.10394321675259251), (185, 0.10782991670698518), (187, 0.070222938932522835), (245, 0.13182007954723365), (246, 0.17250585673984167), (287, 0.14257811952468616), (308, 0.12040695097284386), (323, 0.13675598252301011), (346, 0.16040249389430283), (353, 0.11091844192455429), (404, 0.21507205795972645), (471, 0.12404317179592654), (599, 0.12196999

[(4, 0.14678000374498706), (15, 0.098304837259773653), (25, 0.12549376523694214), (30, 0.32657992044607392), (31, 0.199683366691874), (32, 0.17906217999660101), (91, 0.36023346007513501), (98, 0.13724714452732129), (167, 0.16597813537850811), (391, 0.35762412723912157), (511, 0.28953409686807802), (654, 0.4480231771196419), (1039, 0.43110150307095535)]
[(3, 0.11085331112164989), (4, 0.11007936350254016), (7, 0.094445858463704319), (15, 0.03686242553031089), (25, 0.094115502441466492), (26, 0.062517997313707344), (70, 0.13283408343614453), (83, 0.16109988585860074), (85, 0.062917167346209737), (87, 0.053110725495353589), (100, 0.14805274998897441), (159, 0.24799887254796976), (172, 0.10241111307831285), (232, 0.1747636227879252), (393, 0.31043551872643532), (429, 0.073531485471358599), (450, 0.1545106277743056), (479, 0.12922555532420821), (511, 0.10856972435732527), (623, 0.19135249855522679), (627, 0.32330956432926145), (631, 0.11675164856949105), (648, 0.12860995159155875), (875, 0.2

[(2, 0.10212985390574927), (4, 0.15270020424014932), (63, 0.10453506909065007), (105, 0.14153762135237882), (112, 0.10281885210403402), (121, 0.16322997600330877), (122, 0.16418752430638966), (189, 0.09476692583678524), (320, 0.12592185384272969), (400, 0.087969351828479911), (418, 0.17511549708616719), (428, 0.20705577441169937), (442, 0.17908762611928386), (568, 0.25052563629818175), (618, 0.18172301542160202), (712, 0.2234749974104403), (719, 0.15667373827089492), (820, 0.38404460802486862), (898, 0.27425636008729665), (1040, 0.35713748387964156), (5044, 0.46087384385585872)]
[(29, 0.21390665433802208), (31, 0.17635224383685638), (91, 0.15907178460034926), (103, 0.17940012782278952), (134, 0.22280785659243127), (167, 0.14658515171683836), (189, 0.16089893309953493), (191, 0.20618609147872755), (374, 0.22354779637727973), (400, 0.14935775039318494), (549, 0.32667463592538126), (559, 0.28778431381865499), (751, 0.29570472907155465), (2272, 0.61389339583502212)]
[(4, 0.1102769844274249

[(31, 0.37597094675244086), (143, 0.22430071325539275), (144, 0.24006391794534113), (146, 0.23564779049294568), (718, 0.41677772780834588), (938, 0.53254695311059241), (939, 0.48771358367720041)]
[(143, 0.38628063722589812), (144, 0.41342732197773152), (146, 0.4058220651703679), (718, 0.71775592659868237)]
[(140, 0.507644438004063), (143, 0.25748156406221684), (144, 0.27557662287542461), (146, 0.27050721677752165), (179, 0.54588167812936528), (718, 0.47843174310463199)]
[(140, 0.507644438004063), (143, 0.25748156406221684), (144, 0.27557662287542461), (146, 0.27050721677752165), (179, 0.54588167812936528), (718, 0.47843174310463199)]
[(0, 0.09250585404514905), (2, 0.060808746811950962), (4, 0.090918646239733086), (8, 0.11205881033561672), (12, 0.10484040710349116), (14, 0.18906514767415439), (15, 0.0304460501922461), (25, 0.038866749406454244), (29, 0.075013817236574826), (30, 0.050572631657006005), (31, 0.12368811086665091), (33, 0.12879469880559122), (38, 0.094748222555663439), (47, 

[(13, 0.091790756780326749), (15, 0.058680084856145953), (20, 0.035998521676782406), (30, 0.0487354894689674), (43, 0.063216409587395189), (52, 0.415466092102239), (53, 0.098608787508608922), (58, 0.10764179905765622), (75, 0.11260109547696452), (128, 0.1451847560970761), (131, 0.065390043977397991), (136, 0.062127963669782967), (179, 0.075380061473781443), (355, 0.21325009294008948), (393, 0.12354302734535273), (400, 0.15142436736041109), (408, 0.14167125650777471), (441, 0.10047724125927547), (456, 0.13219026180610283), (526, 0.086888911239108801), (552, 0.1141123499521131), (559, 0.097255432100715078), (593, 0.10325245258658095), (611, 0.12354302734535273), (638, 0.15524126382367565), (719, 0.089895784560353451), (822, 0.12888962212763211), (855, 0.094254785707325039), (862, 0.16272402879829698), (904, 0.16543196640571203), (1424, 0.14269163450277111), (1554, 0.15093104562471321), (2059, 0.16762587433951495), (2206, 0.17832537261401865), (2651, 0.48479475858301663), (2974, 0.1794034

[(0, 0.055704458836096475), (25, 0.04680895635746507), (30, 0.060906871407204501), (32, 0.066789881974679097), (47, 0.13715555943268012), (61, 0.11454100086288437), (91, 0.13436645461484528), (98, 0.051192946408423694), (102, 0.097682567335742584), (103, 0.15153761676566593), (132, 0.11105419800217427), (143, 0.04443498802103741), (144, 0.0475577503226155), (145, 0.061519150550274161), (146, 0.046682895456580657), (158, 0.088814694064140109), (163, 0.18308059866766693), (165, 0.089225571924179459), (167, 0.061909556068837628), (169, 0.105019087148776), (177, 0.19137916309666855), (178, 0.10347881801901475), (180, 0.087212389731289905), (182, 0.079974080882103685), (187, 0.070765891913829593), (189, 0.067954914965630608), (192, 0.13908010078712535), (207, 0.10334766058748762), (233, 0.10964868416285203), (245, 0.13283929216171905), (263, 0.090912929105309473), (286, 0.10556913974560583), (287, 0.14368051165241277), (290, 0.12522869770595838), (333, 0.14648690078225329), (337, 0.11568532

In [61]:
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=35) # initialize an LSI transformation
corpus_lsi = lsi[corpus_tfidf] # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi

In [62]:
lsi.print_topics(35)


[(0,
  u'0.445*"nbusi" + 0.437*"present" + 0.420*"locat" + 0.419*"associ" + 0.264*"nlocat" + 0.163*"film" + 0.163*"network" + 0.158*"paper" + 0.156*"server" + 0.139*"email"'),
 (1,
  u'-0.205*"locat" + 0.194*"laptop" + 0.169*"patient" + 0.169*"employe" + 0.165*"health" + 0.137*"individu" + 0.135*"comput" + 0.131*"report" + 0.128*"stolen" + 0.124*"contain"'),
 (2,
  u'-0.444*"locat" + 0.410*"film" + 0.404*"nlocat" + 0.398*"paper" + -0.355*"network" + -0.351*"server" + -0.120*"email" + 0.082*"laptop" + 0.061*"nbusi" + 0.059*"present"'),
 (3,
  u'-0.415*"film" + -0.388*"paper" + 0.322*"nlocat" + -0.290*"locat" + -0.274*"human" + 0.251*"laptop" + -0.178*"servic" + -0.167*"ocrport" + -0.167*"jsf" + -0.167*"report"'),
 (4,
  u'0.348*"human" + -0.274*"locat" + 0.273*"nlocat" + -0.234*"paper" + -0.228*"film" + 0.215*"ocrport" + 0.214*"jsf" + 0.204*"jsessionid" + 0.200*"servic" + 0.181*"report"'),
 (5,
  u'-0.512*"server" + -0.501*"network" + 0.299*"locat" + 0.260*"email" + 0.207*"electron" + -