In [2]:
import pandas as pd
from pandas import DataFrame
import re
import gensim
import gensim.corpora as corpora
from typing import Generator, List
import spacy
from pprint import pprint

from gensim.models import Phrases
from gensim.models.phrases import Phraser
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from spacy.lang.en import English

import nltk
from nltk.corpus import stopwords  

nlp = spacy.load('en_core_web_lg', disable=['parser', 'ner'])

In [6]:
def read_sample() -> DataFrame:
    df = pd.read_json('C:/Users/chiruco/Desktop/python/ProyPython/Topic_Model/Topic_model_py/data/raw/newsgroups.json')
    return df

nlp = spacy.load('en_core_web_lg', disable=['parser', 'ner'])

def clean_up_text(df:pd.DataFrame) -> List[str]:
    data = df.content.values.tolist() # convertir a lista
    data = [re.sub(r'\S*@\S*\s?', '', sent) for sent in data] # Quitar e-mail
    data = [re.sub(r'\s+', ' ', sent) for sent in data] # quitar enters (new line)
    data = [re.sub(r"\'", "", sent) for sent in data] # quitar comillas
    return(data)

def sentence_to_words(sentences: List[str]) -> List[List[str]]:
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))    

def remove_stopwords(texts: List[List[str]]) -> List[List[str]]:
    stop_words = stopwords.words('english')
    stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]


def lemmatization(npl: English, texts:List[List[str]], allowed_postags:List[str]=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out


def core_bigram(data_words: List[List[str]], min_count:int=5,threshold:int=10):
    bigram = gensim.models.Phrases(data_words, min_count=min_count, threshold=threshold)
    return(bigram)

def build_bigrams(data_words: List[List[str]], min_count:int=5,threshold:int=10) -> List[List[str]]:
    bigram = core_bigram(data_words,min_count,threshold)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    return [bigram_mod[doc] for doc in data_words]
    
def build_trigrams(data_words:List[List[str]],min_count:int=5,threshold:int=10) -> List[List[str]]:
    bigram = core_bigram(data_words, min_count, threshold)
    trigram = gensim.models.Phrases(bigram[data_words], threshold)  
    trigram_mod = gensim.models.phrases.Phraser(trigram)
    return[trigram_mod[bigram_mod[doc]] for doc in data_words]

def load_doc() -> pd.DataFrame:
    return read_sample()

def lda_model(raw_file: pd.DataFrame):
    doc       = clean_up_text(raw_file)
    lemma     = tokenize(doc)
    id2word, corpus = create_dictionary(lemma)
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,id2word=id2word,num_topics=20, random_state=100,
                                    update_every=1,chunksize=100,passes=10,alpha='auto',per_word_topics=True)
    return(lda_model)

def tokenize(documents: List[str]) -> List[List[str]]:
    document_words = list(sentence_to_words(documents))
    document_words = remove_stopwords(document_words)
    document_words = build_bigrams(document_words)
    document_words = lemmatization(nlp, document_words)
    return document_words

def create_dictionary(documents: List[List[str]]):
    id2word = corpora.Dictionary(documents)
    texts   = documents
    corpus  = [id2word.doc2bow(text) for text in texts ]
    return (id2word,  corpus)

In [7]:
df              = load_doc()
doc             = clean_up_text(df)
lemma           = tokenize(doc)
id2word, corpus = create_dictionary(lemma)
lda_model       = gensim.models.ldamodel.LdaModel(corpus=corpus,id2word=id2word,num_topics=20, random_state=100,
                  update_every=1,chunksize=100,passes=10,alpha='auto',per_word_topics=True)

In [250]:
pprint(lda_model.print_topics())

[(0,
  '0.118*"space" + 0.046*"launch" + 0.043*"mount" + 0.036*"orbit" + '
  '0.034*"satellite" + 0.033*"mission" + 0.028*"earth" + 0.027*"cool" + '
  '0.025*"moon" + 0.022*"fuel"'),
 (1,
  '0.064*"patient" + 0.032*"kill" + 0.031*"headache" + 0.030*"disease" + '
  '0.028*"soldier" + 0.027*"civilian" + 0.026*"murder" + 0.023*"village" + '
  '0.019*"turkish" + 0.018*"treatment"'),
 (2,
  '0.119*"sell" + 0.119*"price" + 0.051*"soon" + 0.031*"market" + 0.030*"pack" '
  '+ 0.030*"insurance" + 0.028*"communication" + 0.022*"cap" + 0.022*"ensure" '
  '+ 0.021*"agent"'),
 (3,
  '0.026*"people" + 0.014*"say" + 0.014*"may" + 0.013*"reason" + '
  '0.012*"believe" + 0.011*"evidence" + 0.010*"mean" + 0.009*"fact" + '
  '0.009*"state" + 0.009*"claim"'),
 (4,
  '0.076*"gun" + 0.041*"week" + 0.036*"fire" + 0.033*"kill" + 0.031*"carry" + '
  '0.026*"city" + 0.025*"police" + 0.025*"weapon" + 0.024*"office" + '
  '0.023*"shoot"'),
 (5,
  '0.037*"program" + 0.030*"file" + 0.024*"window" + 0.021*"available

In [227]:
data_words = list(sentence_to_words(data))
data_words = build_bigrams(data_words, 5,10)
#data_words = build_trigrams(data_words, 5,10)
data_words_nostops = remove_stopwords(data_words)

In [199]:
data_lemmatized = lemmatization(nlp, data_words_nostops, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

In [201]:
print(data_lemmatized[:1])

[['where', 'car', 'nntp', 'post', 'line', 'wonder', 'could', 'enlighten', 'car', 'see', 'day', 'door', 'sport', 'car', 'look', 'late', 'early', 'call', 'door', 'really', 'small', 'addition', 'front', 'bumper', 'separate', 'rest', 'body', 'know', 'model', 'name', 'engine', 'spec', 'year', 'production', 'car', 'make', 'history', 'info', 'funky', 'look', 'car', 'mail', 'thank', 'bring', 'neighborhood']]


In [212]:
def create_dictionary(documents: List[List[str]]):
    id2word = corpora.Dictionary(documents)
    texts   = documents
    corpus  = [id2word.doc2bow(text) for text in texts ]
    return (id2word, texts, corpus)

In [214]:
id2word, texts, corpus = create_dictionary(data_lemmatized)

In [215]:
for key, value in id2word.items():
    print(key, value)

0 addition
1 body
2 bring
3 bumper
4 call
5 car
6 could
7 day
8 door
9 early
10 engine
11 enlighten
12 front
13 funky
14 history
15 info
16 know
17 late
18 line
19 look
20 mail
21 make
22 model
23 name
24 neighborhood
25 nntp
26 post
27 production
28 really
29 rest
30 see
31 separate
32 small
33 spec
34 sport
35 thank
36 where
37 wonder
38 year
39 acceleration
40 adapter
41 add
42 answer
43 article
44 attain
45 base
46 brave
47 brief
48 card
49 clock
50 detail
51 disk
52 do
53 especially
54 experience
55 final
56 floppy
57 functionality
58 guy
59 heat
60 hour
61 keyword
62 knowledge
63 message
64 network
65 next
66 number
67 oscillator
68 poll
69 procedure
70 rate
71 report
72 request
73 send
74 share
75 si
76 sink
77 soul
78 speed
79 summarize
80 summary
81 top
82 upgrade
83 usage
84 access
85 active
86 actually
87 advance
88 anymore
89 appearence
90 back
91 be
92 bite
93 breifly
94 bunch
95 computer
96 conviction
97 daily
98 dangerous
99 dirt
100 display
101 distribution
102 drop
103

1308 thus
1309 unhappy
1310 unit
1311 useless
1312 warranty
1313 acne
1314 advice
1315 anti
1316 appearance
1317 bury
1318 cap
1319 chemist
1320 chin
1321 counter
1322 cradle
1323 dandruff
1324 diet
1325 disfigure
1326 doctor
1327 escalate
1328 father
1329 flogging
1330 greasy
1331 hairline
1332 improve
1333 incline
1334 kid
1335 malady
1336 morale
1337 neighbour
1338 odd
1339 ointment
1340 overdose
1341 pharmacist
1342 prescription
1343 scaliness
1344 scalp
1345 senstitive
1346 serious
1347 severe
1348 shall
1349 shampoo
1350 son
1351 spot
1352 spotty
1353 teenage
1354 teenager
1355 vitamin
1356 wary
1357 wash
1358 wierd
1359 domestic
1360 explode
1361 television
1362 union
1363 act
1364 arab
1365 arguable
1366 army
1367 autumn
1368 battle
1369 begin
1370 bomb
1371 citizenship
1372 constant
1373 context
1374 cut
1375 desert
1376 easy
1377 elsewhere
1378 entirely
1379 expel
1380 fight
1381 fighting
1382 force
1383 grant
1384 grief
1385 imho
1386 intention
1387 invade
1388 invasion
1389

2528 revolutionary
2529 reward
2530 ringleader
2531 row
2532 sabotage
2533 saintly
2534 savage
2535 savagely
2536 scene
2537 scholar
2538 screw
2539 shocking
2540 slaughterhouse
2541 soldier
2542 spanish
2543 specially
2544 spell
2545 spurious
2546 squad
2547 stab
2548 stage
2549 stone
2550 street
2551 strike
2552 subject
2553 submit
2554 suffer
2555 summarily
2556 super
2557 superior
2558 supervise
2559 survive
2560 survivor
2561 systematic
2562 tartar
2563 tear
2564 terminate
2565 territorial
2566 terror
2567 terrorism
2568 text
2569 thunder
2570 tragedy
2571 traitorously
2572 treachery
2573 treasonable
2574 tree
2575 triangle
2576 troop
2577 uncultivated
2578 unfortunately
2579 urge
2580 urgently
2581 victory
2582 vilayet
2583 volunteer
2584 wire
2585 withdrawal
2586 witness
2587 attribution
2588 defend
2589 discussion
2590 forum
2591 glad
2592 harmless
2593 honest
2594 limit
2595 potentially
2596 prank
2597 punk
2598 taunt
2599 ugly
2600 vandalism
2601 alternative
2602 amp
2603 cam

3960 sketchiness
3961 testament
3962 roster
3963 brochure
3964 microwave
3965 blade
3966 blazer
3967 bull
3968 hardly
3969 instruction
3970 oakley
3971 orange
3972 perfect
3973 terminator
3974 advise
3975 ahead
3976 award
3977 mislable
3978 refund
3979 telephone
3980 visual
3981 digitally
3982 preset
3983 tune
3984 bi
3985 clue
3986 dramatically
3987 flawed
3988 hatemonger
3989 impose
3990 incredibly
3991 median
3992 mindless
3993 obsession
3994 repellent
3995 sexual
3996 shame
3997 statistic
3998 together
3999 doc
4000 gum
4001 processor
4002 butt
4003 colon
4004 fan
4005 headline
4006 origonal
4007 rectify
4008 resent
4009 projector
4010 semd
4011 forwards
4012 njd
4013 oiler
4014 partial
4015 shooter
4016 soo
4017 capital
4018 semiconductor
4019 symptom
4020 xrdb
4021 xresource
4022 xterm
4023 chastity
4024 climate
4025 clinic
4026 incarcerate
4027 intellect
4028 precendent
4029 quarantine
4030 resistant
4031 sanitarium
4032 shameful
4033 skepticism
4034 sporadically
4035 surrender


5307 fidonet
5308 powerpc
5309 quadra
5310 chap
5311 agnostic
5312 alternate
5313 amusing
5314 bake
5315 geek
5316 polite
5317 shoulder
5318 whirrr
5319 wholesome
5320 banding
5321 disturb
5322 drawing
5323 gxxor
5324 inverse
5325 redraw
5326 underlying
5327 xsetfunction
5328 colour
5329 cook
5330 equipped
5331 gramming
5332 grayscale
5333 premier
5334 quickdraw
5335 signature
5336 tidewater
5337 blessing
5338 criterion
5339 remark
5340 anticipate
5341 grantor
5342 guilt
5343 inadvertantly
5344 innocence
5345 string
5346 uphold
5347 verdict
5348 removable
5349 cannibalize
5350 convertible
5351 outrageous
5352 immerse
5353 knowledgeably
5354 merit
5355 creation
5356 ecological
5357 uucp
5358 critical
5359 standout
5360 compelling
5361 considerably
5362 council
5363 enormous
5364 existance
5365 harmony
5366 remarkable
5367 tract
5368 binary
5369 taxis
5370 endometriosis
5371 adrift
5372 afterward
5373 altitude
5374 compensate
5375 costar
5376 grapple
5377 guidance
5378 gyro
5379 hst
5380

6557 lambaste
6558 notation
6559 rancher
6560 sheriff
6561 abusive
6562 accent
6563 addict
6564 admiration
6565 adversary
6566 affirmation
6567 allege
6568 ambiguous
6569 amphiboly
6570 ample
6571 ant
6572 antecedent
6573 arguer
6574 argumentum
6575 arouse
6576 axe
6577 baculum
6578 bifurcation
6579 blackmail
6580 breaking
6581 careless
6582 cast
6583 causality
6584 causally
6585 certainty
6586 circular
6587 circumstantial
6588 collapse
6589 communist
6590 concise
6591 conclusive
6592 conditional
6593 consequent
6594 crumenam
6595 cum
6596 damaging
6597 deductive
6598 delve
6599 demolish
6600 demonstrando
6601 denote
6602 discrimination
6603 dislike
6604 disprove
6605 dolphin
6606 emotive
6607 employee
6608 english
6609 environmentally
6610 equivalently
6611 equivocation
6612 everyday
6613 examination
6614 fallacie
6615 fallacious
6616 fallacy
6617 falseness
6618 favourable
6619 gallery
6620 generalization
6621 gentle
6622 hasty
6623 hawking
6624 heroin
6625 herring
6626 hypostatizatio

8057 capitalism
8058 coercion
8059 coersive
8060 corrupt
8061 counterbalance
8062 decree
8063 delusion
8064 demonstrably
8065 dilute
8066 electorate
8067 geezer
8068 litigation
8069 multitude
8070 peril
8071 priciple
8072 productive
8073 protector
8074 questionare
8075 sarcasm
8076 socialize
8077 surprising
8078 ths
8079 totalitarian
8080 undesirable
8081 unworthy
8082 voter
8083 wholesale
8084 adder
8085 beverage
8086 excited
8087 mediot
8088 nominate
8089 tomorrow
8090 acre
8091 approx
8092 coastal
8093 imately
8094 residential
8095 abomination
8096 dolt
8097 misguided
8098 revocation
8099 riddance
8100 titty
8101 abstain
8102 concession
8103 intifadah
8104 neighbouring
8105 preferential
8106 secretary
8107 therein
8108 spoiler
8109 sunroof
8110 ipx
8111 recompile
8112 undefined
8113 isle
8114 overtime
8115 dialup
8116 cv
8117 fog
8118 gt
8119 moisture
8120 stopper
8121 taillamp
8122 taillight
8123 cannuck
8124 aquainte
8125 beleiver
8126 carelessly
8127 conversely
8128 handedly
8129

9216 formalize
9217 hypothesize
9218 meditate
9219 phenomena
9220 resonant
9221 farther
9222 mining
9223 reformation
9224 sighted
9225 yeasteryear
9226 asking
9227 yellowish
9228 ambassador
9229 unconfirmed
9230 devout
9231 explictly
9232 following
9233 ingore
9234 rhetorical
9235 tangential
9236 unmistakable
9237 biannual
9238 foward
9239 outrigger
9240 criticality
9241 hazardous
9242 corrupted
9243 interperate
9244 millennium
9245 owe
9246 pare
9247 pcurrent
9248 plagerize
9249 pshouldnt
9250 satanic
9251 scribe
9252 translator
9253 accumulate
9254 adventurer
9255 aficionado
9256 bazerie
9257 beneficiary
9258 bibliography
9259 bureau
9260 cabal
9261 cleartext
9262 cryptanalysis
9263 cryptogram
9264 cryptographic
9265 cryptologic
9266 discovery
9267 expertise
9268 fraught
9269 gillogly
9270 grille
9271 homophonic
9272 innkeeper
9273 journal
9274 jurisdiction
9275 mathematician
9276 miscellany
9277 ninth
9278 nutshell
9279 prerogative
9280 provably
9281 quest
9282 scholarly
9283 severi

10556 eosvcr
10557 concurrently
10558 gcc
10559 immortalize
10560 ioctl
10561 kink
10562 openlook
10563 architecture
10564 crypt
10565 cyphertext
10566 foo
10567 actuate
10568 carburated
10569 exaggerated
10570 idling
10571 linkage
10572 suction
10573 tach
10574 facist
10575 bluntly
10576 coaching
10577 demise
10578 dissappointe
10579 drastic
10580 finesse
10581 goaltending
10582 leafs
10583 monkey
10584 officiating
10585 skate
10586 stir
10587 clicking
10588 undimmed
10589 ixel
10590 targas
10591 protester
10592 synthesize
10593 explosive
10594 fertilizer
10595 microscopic
10596 taggant
10597 technologically
10598 apologie
10599 caulmny
10600 croak
10601 mischief
10602 pond
10603 proscribe
10604 sawdust
10605 swampland
10606 tranny
10607 babysitter
10608 beacon
10609 childcare
10610 disciplinary
10611 prospective
10612 sitter
10613 stats
10614 synthesizer
10615 boarding
10616 congenially
10617 defensively
10618 deserving
10619 dynasty
10620 flattery
10621 formulate
10622 honour
10623 

11788 experimentation
11789 dam
11790 saxophone
11791 biodegradable
11792 gimic
11793 replenish
11794 modal
11795 mwm
11796 slavishly
11797 civilised
11798 numberless
11799 occult
11800 rosicrucian
11801 aesthetic
11802 demonstable
11803 universalize
11804 diplomat
11805 internationally
11806 korean
11807 rig
11808 stunt
11809 transffere
11810 anyways
11811 excersize
11812 skinny
11813 compliment
11814 accomplice
11815 angst
11816 determinism
11817 discord
11818 existential
11819 getaway
11820 juxtaposition
11821 operationally
11822 repertoire
11823 subtext
11824 symbolize
11825 uncertainty
11826 uncountably
11827 waddle
11828 fielder
11829 infield
11830 puzzled
11831 relieve
11832 accommodate
11833 entitled
11834 exportable
11835 innovative
11836 intensify
11837 lawfully
11838 monopolism
11839 obscurity
11840 proprietary
11841 snooper
11842 totalitaristic
11843 trapdoor
11844 trusted
11845 unbreakable
11846 techwork
11847 braindead
11848 dmorph
11849 trekker
11850 cabling
11851 drivew

12959 voodoo
12960 automagically
12961 eg
12962 multitasker
12963 aggravate
12964 countrys
12965 ferigner
12966 fishermen
12967 infrequently
12968 potato
12969 unresolved
12970 appeasement
12971 defenceless
12972 depose
12973 emotionalism
12974 foam
12975 footage
12976 gobble
12977 grandiose
12978 hallucination
12979 incisive
12980 incredulity
12981 judicious
12982 militarily
12983 misidentifie
12984 misjudge
12985 naught
12986 overconfidence
12987 oversimplify
12988 reich
12989 shalt
12990 slavic
12991 thou
12992 wimpe
12993 commpression
12994 fps
12995 grainy
12996 colourmap
12997 meekly
12998 purche
12999 remedial
13000 sobriety
13001 circumcision
13002 foreskin
13003 bios
13004 flawlessly
13005 harddisk
13006 param
13007 performace
13008 singificant
13009 jennise
13010 tele
13011 unanimously
13012 rightness
13013 stain
13014 brazilian
13015 oher
13016 arbitration
13017 humbug
13018 overpay
13019 underpaid
13020 analytic
13021 avg
13022 disoriented
13023 gassing
13024 indefinitely
1

14306 heir
14307 immutable
14308 judging
14309 memorial
14310 mosaic
14311 nnregular
14312 numbering
14313 offering
14314 othere
14315 recorded
14316 sanctify
14317 teachings
14318 thinsg
14319 undergird
14320 outgrown
14321 vid
14322 skilled
14323 tichonov
14324 chlorine
14325 eminently
14326 pigment
14327 purification
14328 sulfate
14329 trendy
14330 clipperize
14331 scrambling
14332 talkie
14333 conflagration
14334 frantic
14335 imolation
14336 westernize
14337 trader
14338 putimage
14339 ximagehandler
14340 condescending
14341 destroyer
14342 entropic
14343 incresase
14344 intruder
14345 snooze
14346 overbroad
14347 folkert
14348 rabinic
14349 animator
14350 inumerable
14351 awe
14352 boggling
14353 entertaining
14354 excrement
14355 gator
14356 spafford
14357 dns
14358 expandibility
14359 hex
14360 hexdump
14361 sniffer
14362 subnet
14363 tcp
14364 tcpdump
14365 timestamp
14366 ttt
14367 uw
14368 although
14369 fraudulant
14370 nondisclosure
14371 revival
14372 rose
14373 differnt

15555 jowl
15556 keeping
15557 misinform
15558 aerosol
15559 asthsma
15560 builing
15561 css
15562 fireman
15563 igniter
15564 inhalation
15565 liberally
15566 mustard
15567 nonflammable
15568 nontoxic
15569 saferoom
15570 searflame
15571 baboon
15572 intercourse
15573 uniformly
15574 condidere
15575 mayfly
15576 gratefull
15577 backgroun
15578 hardcodin
15579 weared
15580 func
15581 quart
15582 allocation
15583 bignum
15584 bugfixe
15585 decprl
15586 exponentiation
15587 gcd
15588 gennum
15589 modified
15590 modulo
15591 multiplicative
15592 primality
15593 prl
15594 radix
15595 scalar
15596 uncommente
15597 undocumented
15598 unixes
15599 citadel
15600 alittle
15601 cooler
15602 cooling
15603 oscilator
15604 compilation
15605 fasten
15606 mossad_agent
15607 sigint
15608 unhumoristic
15609 vulgar
15610 inconsiderate
15611 hotly
15612 splitsville
15613 feeeling
15614 insistance
15615 benevolence
15616 polytheist
15617 plunk
15618 confessional
15619 prescribed
15620 budge
15621 disguste

16805 walled
16806 aboritionist
16807 sentencing
16808 strychnine
16809 wich
16810 cking
16811 plus
16812 scheduling
16813 hardwire
16814 oterhwise
16815 backplane
16816 breadboard
16817 pneumatic
16818 powered
16819 prototyping
16820 twiddle
16821 quixotic
16822 disinterested
16823 dipstick
16824 floation
16825 inlet
16826 abscence
16827 spat
16828 admiral
16829 affectionately
16830 breathtake
16831 carzy
16832 circuitous
16833 congenial
16834 exaggeration
16835 fundie
16836 institional
16837 luther
16838 martyrdom
16839 overrepresente
16840 piety
16841 prudent
16842 uncharitable
16843 upsetting
16844 weirdness
16845 wierdo
16846 worrisome
16847 camcorder
16848 jumpy
16849 martyrs
16850 metallurgy
16851 suprising
16852 victimize
16853 activation
16854 hepatic
16855 insulin
16856 intravenous
16857 mailinglist
16858 mouthful
16859 wotta
16860 byet
16861 cuckoo
16862 turncoat
16863 acrid
16864 chrisitan
16865 disturbed
16866 snidenes
16867 sweetness
16868 unloving
16869 cus
16870 formatt

17922 acquiescent
17923 addesse
17924 alltogether
17925 anonimity
17926 anonymitys
17927 anonymization
17928 anonymized
17929 assail
17930 astonishment
17931 attentive
17932 authentifiable
17933 authorship
17934 bouncer
17935 candidly
17936 commotion
17937 corrosive
17938 courtesey
17939 cowardice
17940 coworker
17941 detractor
17942 disappearance
17943 disconnection
17944 disservice
17945 draconian
17946 enforceable
17947 gutless
17948 heed
17949 heterogeneity
17950 identifier
17951 immeasurable
17952 impoliteness
17953 infiltration
17954 insensitivity
17955 interesection
17956 irrationally
17957 kbt
17958 louse
17959 mailbombe
17960 mindlessly
17961 newness
17962 newsadmin
17963 obsessively
17964 outbound
17965 overwrought
17966 parenthood
17967 parlay
17968 passed
17969 penet
17970 piercingly
17971 pseudonymous
17972 pseudonymously
17973 publius
17974 puerile
17975 quandary
17976 reasearch
17977 respectful
17978 responsibilty
17979 retriction
17980 saturation
17981 scandinavian
1798

18554 preferrable
18555 utils
18556 alu
18557 conserved
18558 electrophoresis
18559 rationality
18560 toricelli
18561 diagonally
18562 paging
18563 across
18564 amusement
18565 palye
18566 determinent
18567 depopulate
18568 dropping
18569 turkmanchay
18570 calulate
18571 charley
18572 charleys
18573 dispassionate
18574 failing
18575 fantastical
18576 gauntlet
18577 incoherence
18578 incoherent
18579 indefensible
18580 intransigence
18581 meta
18582 redefining
18583 specious
18584 theorize
18585 thingie
18586 umbrage
18587 unverifiable
18588 fob
18589 overvoltage
18590 accustomed
18591 hilltop
18592 immesurably
18593 pressumption
18594 revel
18595 whims
18596 withough
18597 yucky
18598 yukky
18599 oxymoron
18600 adventurous
18601 desolder
18602 nonacid
18603 resolder
18604 warmup
18605 wattage
18606 blacken
18607 cave
18608 cypruss
18609 dastardly
18610 doorway
18611 enosis
18612 ferocious
18613 ferocity
18614 glimpsed
18615 madden
18616 onslaught
18617 paratroop
18618 regiment
18619 to

19582 leaky
19583 spooky
19584 doddery
19585 napoleonic
19586 reabsorb
19587 airtime
19588 beleaguer
19589 demoralized
19590 keenan
19591 orgasm
19592 reclamation
19593 unprecedente
19594 comforting
19595 imbecile
19596 inconceivable
19597 insensitive
19598 judgemental
19599 dusting
19600 devastating
19601 phyic
19602 princple
19603 significence
19604 sugnificance
19605 confounding
19606 contently
19607 multiplexing
19608 muxed
19609 mack
19610 blit
19611 blitte
19612 dislocation
19613 drawable
19614 masking
19615 simplest
19616 wheeze
19617 abstacte
19618 malleable
19619 nre
19620 untainted
19621 facinating
19622 tilde
19623 uptodate
19624 affirmative
19625 demeaning
19626 segregation
19627 crosswise
19628 dispell
19629 isomorphic
19630 jaggie
19631 planar
19632 populous
19633 unfilled
19634 supplementary
19635 americas
19636 balderdash
19637 certitude
19638 crass
19639 feckless
19640 gonad
19641 manly
19642 pusillanimous
19643 tarnished
19644 aspire
19645 attractiveness
19646 chagne


20815 mitsu
20816 changed
20817 exept
20818 tournament
20819 butterfly
20820 enrish
20821 unfit
20822 voelkerde
20823 voelkerding
20824 romany
20825 fluoro
20826 violet
20827 superuser
20828 macx
20829 mailserver
20830 recharger
20831 supersport
20832 tropical
20833 grandma
20834 headlamp
20835 invincible
20836 neglegence
20837 pedistrian
20838 multipling
20839 rms
20840 sine
20841 vrm
20842 netware
20843 swathe
20844 pith
20845 abnormally
20846 cardiopulmonary
20847 catheterization
20848 clubbing
20849 congenital
20850 distal
20851 distally
20852 hypoxemia
20853 internist
20854 illicitly
20855 panda
20856 hlive
20857 kuwaiti
20858 laudable
20859 saved
20860 shenanigan
20861 surrendering
20862 tyrannical
20863 vigilantism
20864 adjourn
20865 arpanet
20866 beneficence
20867 borne
20868 casesighting
20869 cofounder
20870 compendium
20871 conferencing
20872 convene
20873 cpsr
20874 crosslink
20875 cud
20876 cyberpunk
20877 cypher
20878 declassify
20879 effector
20880 enbgineere
20881 face

21943 charred
21944 kindsa
21945 warming
21946 dataport
21947 faxstf
21948 autoanswer
21949 commnad
21950 ter
21951 menstruation
21952 psychologicall
21953 anatomically
21954 ape
21955 bilateral
21956 bipedally
21957 empircist
21958 exlude
21959 fancifully
21960 hominid
21961 inviolability
21962 laugher
21963 loving
21964 probaby
21965 disembodied
21966 keratotomy
21967 sunglasse
21968 blindfold
21969 bullock
21970 consular
21971 dagger
21972 debrief
21973 drama
21974 gurvitz
21975 innocuous
21976 launder
21977 skinhead
21978 stationery
21979 earthing
21980 btr
21981 connective
21982 moralistic
21983 mearly
21984 nuther
21985 radiology
21986 theyare
21987 trident
21988 cavitate
21989 cavitation
21990 condensate
21991 conductivity
21992 econazis
21993 entrain
21994 feedwater
21995 grandfathere
21996 hotwell
21997 impure
21998 micromho
21999 reflash
22000 stepchild
22001 removing
22002 tkp
22003 cber
22004 sofa
22005 thoroughfare
22006 vcrs
22007 setvalue
22008 xtvasetvalue
22009 convert

23303 bdftool
23304 bitmaps
23305 bldfamily
23306 combining
23307 cron
23308 cshrc
23309 decwrl
23310 displaymanager
23311 illegality
23312 interwork
23313 lc_ctype
23314 likeli
23315 lpr
23316 mkfontdir
23317 poskanzer
23318 prebuilt
23319 rsh
23320 screendump
23321 sdsc
23322 subshell
23323 umlaut
23324 unreferenced
23325 unrestricted
23326 unsetenv
23327 unspecified
23328 utoronto
23329 xgrabsc
23330 xloadimage
23331 xmh
23332 xpm
23333 xprinter
23334 xselection
23335 xset
23336 xsnap
23337 xtiff
23338 xwebster
23339 friendlyness
23340 gouroud
23341 airstrike
23342 ared
23343 beleagured
23344 coldhearted
23345 enthusiasm
23346 peacekeeping
23347 wre
23348 deshaie
23349 mahome
23350 pagliarulo
23351 trombley
23352 thiokol
23353 althoug
23354 runway
23355 chile
23356 tomatoe
23357 ketchup
23358 pickle
23359 islote
23360 exlcude
23361 origianl
23362 venial
23363 astral
23364 astrological
23365 coming
23366 eckankar
23367 etheric
23368 hypnotic
23369 hypnotist
23370 lowest
23371 transmu

24534 disgruntled
24535 wepaon
24536 descramble
24537 functionallity
24538 artur
24539 arturs
24540 evalute
24541 fide
24542 goonism
24543 irbe
24544 prompting
24545 undrafte
24546 calamari
24547 archiver
24548 bilevel
24549 blocky
24550 blurred
24551 blurriness
24552 buggier
24553 checkbox
24554 chrominance
24555 cjpeg
24556 colormappe
24557 colorview
24558 cray
24559 cropping
24560 cshow
24561 decodable
24562 decompressor
24563 denominator
24564 djpeg
24565 dvpeg
24566 filtering
24567 forevermore
24568 frill
24569 fullview
24570 gifconverter
24571 hiview
24572 hoopla
24573 indexing
24574 irretrievably
24575 jbig
24576 jpege
24577 jpegsrc
24578 jpegview
24579 kbytes
24580 microsystem
24581 milnet
24582 nonprofessional
24583 pictpixie
24584 pmjpeg
24585 pnmconvol
24586 precompute
24587 prepress
24588 qt
24589 quantize
24590 quantizer
24591 recompresse
24592 roundoff
24593 slideshow
24594 sloppiness
24595 spiffier
24596 subsample
24597 technologys
24598 transportable
24599 unusably
2460

25631 zumdahl
25632 chessboard
25633 chesspiece
25634 prejudgment
25635 repeated
25636 unsolvable
25637 salad
25638 movment
25639 orelativity
25640 eliass
25641 underpredict
25642 overnite
25643 rollerblader
25644 rucksack
25645 shifting
25646 surf
25647 carel
25648 spidery
25649 bundled
25650 macland
25651 outscore
25652 dewy
25653 mananger
25654 asinine
25655 reoccure
25656 gobs
25657 assad
25658 digita
25659 emirs
25660 gulf
25661 infidelity
25662 mistress
25663 mentionne
25664 packagaing
25665 nourish
25666 waif
25667 acetylene
25668 propane
25669 verboten
25670 supertuner
25671 impertinent
25672 imortant
25673 prosyletize
25674 electoral
25675 pavarottis
25676 shakeup
25677 usa
25678 dcw
25679 stubby
25680 olvwm
25681 commendation
25682 hamartia
25683 iota
25684 uncompromising
25685 aestetic
25686 bewilder
25687 sculptured
25688 tookit
25689 scandanavian
25690 sideline
25691 unmarried
25692 pesky
25693 unaccessible
25694 befuddlement
25695 entwine
25696 fertilize
25697 kilobyte
25

27053 dolphins
27054 indeterminism
27055 petri
27056 solipsism
27057 stony
27058 circularize
27059 cryogenically
27060 descoping
27061 sirtf
27062 classc
27063 uims
27064 visuallib
27065 acquaintace
27066 surviving
27067 roy
27068 salmon
27069 duplexer
27070 hardline
27071 stil
27072 synching
27073 sieger
27074 spotlight
27075 inherence
27076 contex
27077 preform
27078 twiddling
27079 voronoi
27080 densitometry
27081 micrograph
27082 egotistical
27083 globule
27084 shuold
27085 ramcheck
27086 ruffle
27087 reccurre
27088 deemphasize
27089 genealogical
27090 knowledgably
27091 messge
27092 complane
27093 stirring
27094 backround
27095 hershis
27096 lingo
27097 steamed
27098 appology
27099 empathic
27100 contardictory
27101 giveth
27102 taketh
27103 wince
27104 ifthe
27105 jurisdication
27106 kisser
27107 pike
27108 shedule
27109 monstrosity
27110 audibly
27111 judged
27112 yardstick
27113 nonserbian
27114 legalitie
27115 ssn
27116 interferance
27117 lurid
27118 overstatement
27119 robohe

28320 cx_scub
28321 gazebo
28322 gcx_
28323 gcx_g
28324 gcx_s
28325 gcx_sc
28326 glryxj
28327 iak_q
28328 mcx
28329 mcx_scx
28330 neccesary
28331 pieced
28332 qsw
28333 reguarding
28334 rfusv
28335 scx_s
28336 scx_scx
28337 simrmc
28338 sqs
28339 txr
28340 uucoder
28341 wxte
28342 x_g
28343 x_syx
28344 y
28345 yx_g
28346 yx_s
28347 yx_scx
28348 indefinable
28349 doper
28350 foriegn
28351 mecha
28352 asymmetry
28353 chiltepin
28354 commissioning
28355 moonlite
28356 niven
28357 sundae
28358 adlib
28359 amplified
28360 audition
28361 bagging
28362 czechoslovakian
28363 varialble
28364 intercessory
28365 lovingkindness
28366 aeronautic
28367 alluvium
28368 asteroids
28369 astrodynamic
28370 ellipsoidal
28371 minovitch
28372 parallax
28373 parallelize
28374 perubation
28375 topos
28376 trig
28377 trignometry
28378 polarising
28379 canonicity
28380 crosswire
28381 glassner
28382 reencrypted
28383 karate
28384 mallard
28385 raptor
28386 tailed
28387 cheapish
28388 arrese
28389 deference
2839

In [216]:
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 5), (6, 1), (7, 1), (8, 2), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 2), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1)]]


In [222]:
[[(id2word[id],freq) for id, freq in cp] for cp in corpus[1:]]

[[('call', 2),
  ('day', 2),
  ('thank', 1),
  ('acceleration', 1),
  ('adapter', 1),
  ('add', 2),
  ('answer', 1),
  ('article', 1),
  ('attain', 1),
  ('base', 1),
  ('brave', 1),
  ('brief', 1),
  ('card', 1),
  ('clock', 4),
  ('detail', 1),
  ('disk', 1),
  ('do', 1),
  ('especially', 1),
  ('experience', 2),
  ('final', 2),
  ('floppy', 2),
  ('functionality', 1),
  ('guy', 1),
  ('heat', 1),
  ('hour', 1),
  ('keyword', 1),
  ('knowledge', 1),
  ('message', 1),
  ('network', 1),
  ('next', 1),
  ('number', 1),
  ('oscillator', 1),
  ('poll', 3),
  ('procedure', 1),
  ('rate', 1),
  ('report', 1),
  ('request', 1),
  ('send', 1),
  ('share', 1),
  ('si', 1),
  ('sink', 1),
  ('soul', 1),
  ('speed', 2),
  ('summarize', 1),
  ('summary', 1),
  ('top', 1),
  ('upgrade', 2),
  ('usage', 1)],
 [('could', 3),
  ('day', 1),
  ('info', 1),
  ('know', 1),
  ('line', 1),
  ('look', 2),
  ('make', 1),
  ('post', 1),
  ('really', 2),
  ('thank', 1),
  ('wonder', 1),
  ('answer', 1),
  ('di