In [1]:
import numpy as np

import artm
from artm import hARTM

import sys
sys.path.append('utils/')
# you need sklearn for simple loading
from sklearn.datasets import fetch_20newsgroups

import glob 
import os

In [2]:
# Empty hierarchical topic model; its levels are created later via hier.add_level().
hier = hARTM()

In [3]:
# Regularizers that are attached to every level of the hierarchy.
# Each Phi regularizer is limited to class_ids=['text']; the Theta regularizer
# takes no class_ids. topic_names is left unset everywhere (an earlier
# restriction to `topics_subj` was commented out).
regularizers_list = [
    artm.SmoothSparsePhiRegularizer(name='SPPhiCatReg', class_ids=['text'], tau=0.05),
    artm.SmoothSparsePhiRegularizer(name='SPPhiAuthorReg', class_ids=['text'], tau=0.05),
    artm.SmoothSparsePhiRegularizer(name='SPPhiTagReg', class_ids=['text'], tau=0.05),
    artm.DecorrelatorPhiRegularizer(name='DecorrPhiReg', class_ids=['text'], tau=50000),
    artm.SmoothSparsePhiRegularizer(
        name='SPPhiNgrammRegSubj',
        class_ids=['text'],
        tau=0.01,
    ),
    artm.SmoothSparseThetaRegularizer(
        name='SmoothThetaRegSubj',
        tau=0.01,
    ),
]

In [4]:
# Quality scores that are attached to every level of the hierarchy.
# Every score that accepts a class restriction is computed on 'text';
# topic_names is left unset (an earlier `topics_subj` restriction was commented out).
scores_list = [
    artm.SparsityPhiScore(name='SparsityScoreNgrammSubj', class_id='text'),
    artm.SparsityThetaScore(name='SparsityScoreThetaSubj'),
    artm.SparsityPhiScore(name='SparsityScoreCat', class_id='text'),
    artm.SparsityPhiScore(name='SparsityScoreAuthor', class_id='text'),
    artm.SparsityPhiScore(name='SparsityScoreTag', class_id='text'),
    artm.TopTokensScore(name='TopTokensScoreNgramm', class_id='text', num_tokens=800),
    artm.TopTokensScore(name='TopTokensTag', class_id='text', num_tokens=800),
    artm.TopicKernelScore(
        name='TopicKernelNgramm',
        class_id='text',
        probability_mass_threshold=0.25,
    ),
    artm.PerplexityScore(name='PerplexityScore', class_ids=['text']),
]


In [5]:
# Source collection; it is handed to BatchVectorizer with data_format='vowpal_wabbit'.
data_path = 'data/lang_data_small.vw'
# Folder used as BatchVectorizer's target_folder and later read back as batches.
batches_path = 'data/batches_small__'

In [6]:
# Vectorize the Vowpal Wabbit collection only when the batches folder holds no
# *.batch files yet; otherwise reuse the batches that are already on disk.
# The pattern has to be <folder>/*.batch. Gluing '*.batch' directly onto the
# folder name produces 'data/batches_small__*.batch', which never matches files
# stored inside the folder, so existing batches were never detected.
if not glob.glob(os.path.join(batches_path, '*.batch')):
    batch_vectorizer = artm.BatchVectorizer(data_path=data_path, data_format='vowpal_wabbit',
                                            target_folder=batches_path)
else:
    batch_vectorizer = artm.BatchVectorizer(data_path=batches_path, data_format='batches')

In [7]:
# Build a dictionary named 'dictionary' from the batches folder, then filter it
# with min_df=30 and max_tf=12245 (see BigARTM's Dictionary.filter for the exact
# meaning of these limits).
dictionary = artm.Dictionary('dictionary')
dictionary.gather(batches_path)
# Last expression of the cell: whatever filter() returns is shown as the cell output.
dictionary.filter(min_df=30, max_tf=12245)

artm.Dictionary(name=dictionary, num_entries=8328)

In [8]:
# Top level of the hierarchy: topicNum0 topics, with every shared regularizer
# and score attached (regularizers first, then scores).
topicNum0 = 6
level0 = hier.add_level(num_topics=topicNum0)

for regularizer in regularizers_list:
    level0.regularizers.add(regularizer)

for quality_score in scores_list:
    level0.scores.add(quality_score)

In [9]:
# Initialize the top level from the dictionary built above, then fit it offline
# with 45 passes over the collection.
level0.initialize(dictionary=dictionary)
level0.fit_offline(batch_vectorizer, num_collection_passes=45)

In [10]:
# For every top-level topic print its name and, on the next line, the
# comma-separated tokens recorded by the 'TopTokensTag' score.
top_tokens_level0 = level0.score_tracker['TopTokensTag'].last_tokens
for topic_name in level0.topic_names:
    header = topic_name + ': '
    token_line = ", ".join(top_tokens_level0[topic_name])
    print(header)
    print(token_line)

topic_0: 
topic_1: 
topic_2: 


affinity, routing, jobs, sack, spi, cores, xxx, nskb, dai, rwnd, dst, voice, xx, submit, full, serial, extern, jd, retained, audio, stop, space, ip, after, chunks, dm, contexts, running, eq, ticks, tp, pkt, para, jsctx, single, caller, led, retain, ref, peer, pctrl, scheduled, inline, association, sender, up, ready, spin, more, coreref, checksum, trace, pa, band, spcr, pcm, flight, bundle, backend, frontend, been, refcount, jm, transmit, timeout, retry, unicode, sched, controls, allowed, tmp, overhead, resource, ipv, dequeued, evict, jctx, cookie, need, hold, gpu, datasize, before, padding, delay, ecn, wait, vtag, print, was, sock, pending, fit, rt, sh, power, change, hard, don, io, there, dl, tap, pmtu, sent, empty, echo, bt, ve, atomic, always, finish, nolock, spaces, route, done, inc, ucon, schedule, capable, asm, va, soft, later, into, chip, held, given, xfer, opt, buff, fast, socket, each, private, uart, disable, scheduling, tsn, send, portqs, last, keep, ufcon, reserved, destinat

topic_5: 


In [11]:
# Second level: 80 explicitly named child topics added with parent_level_weight=1,
# carrying the same shared regularizers and scores (regularizers first, then scores).
num_child_topics = 80
child_topic_names = ['child_topic_' + str(idx) for idx in range(num_child_topics)]
level1 = hier.add_level(
    num_topics=num_child_topics,
    topic_names=child_topic_names,
    parent_level_weight=1,
)

for regularizer in regularizers_list:
    level1.regularizers.add(regularizer)

for quality_score in scores_list:
    level1.scores.add(quality_score)

In [12]:
# Initialize the second level from the same dictionary, then fit it offline
# with 60 passes over the collection.
level1.initialize(dictionary=dictionary)
level1.fit_offline(batch_vectorizer, num_collection_passes=60)

In [13]:
# Number of rows of the level-1 Psi matrix (the table displayed further down
# has one row per child topic).
len(level1.get_psi())

80

In [14]:
# Keep the level-1 Psi matrix for the checks in the following cells.
psi = level1.get_psi()

In [15]:
# Take the largest Psi value in each row, then report the smallest of those maxima.
print ("Psi support:", psi.values.max(axis=1).min())

Psi support: 0.0006697354


In [16]:
# For every level below the top one, count per Psi row how many entries exceed
# psi_threshold, pool the counts of all levels and print their mean.
# `psi` is deliberately rebound to the raw values of the last level visited:
# the next cell reads it.
psi_threshold = 0.01
counts_per_level = [np.zeros(0)]  # float seed keeps the pooled array float-typed
for level_idx in range(1, hier.num_levels):
    psi = hier.get_level(level_idx).get_psi().values
    above_threshold = psi > psi_threshold
    counts_per_level.append(above_threshold.sum(axis=1))
parent_counts = np.hstack(counts_per_level)
print("Mean parents count:", parent_counts.mean())


Mean parents count: 0.7875


In [17]:
# Load the serialized batch 'phi1.batch', total the token weights of each item
# (item i is accumulated into slot i, one slot per level-0 topic), use those
# totals to rescale the columns of Psi, normalise every row to sum to one and
# transpose the result.
batch_name = 'phi1.batch'
batch = artm.messages.Batch()
with open(batch_name, "rb") as f:
    serialized_batch = f.read()
batch.ParseFromString(serialized_batch)

Ntw = np.zeros(len(level0.topic_names))
for i, item in enumerate(batch.item):
    first_field = item.field[0]
    # Walk ids and weights pairwise so the walk stops at the shorter sequence.
    for token_id, token_weight in zip(first_field.token_id, first_field.token_weight):
        Ntw[i] += token_weight

Nt1t0 = np.asarray(psi) * Ntw
row_totals = Nt1t0.sum(axis=1, keepdims=True)
psi_bayes = (Nt1t0 / row_totals).T

In [18]:
# For every column of psi_bayes, the index of the row holding the largest value.
# psi_bayes is the transposed, row-normalised Psi, so this is presumably the
# most probable parent of each child topic — confirm against the Psi orientation.
indexes_child = np.argmax(psi_bayes, axis=0)

In [19]:
# Show one parent topic with its top tokens, followed by every level-1 topic
# whose entry in indexes_child points at that parent.
topic_parent_name = 'topic_1'
print(topic_parent_name + ':')
print(" ".join(level0.score_tracker['TopTokensTag'].last_tokens[topic_parent_name]))
print('')
# Look the parent's position up by name instead of hard-coding it.
# indexes_child holds row indices of psi_bayes, i.e. values in
# range(len(level0.topic_names)); the previous constant 9 is outside that range
# for a 6-topic level, so no child was ever selected.
i = list(level0.topic_names).index(topic_parent_name)
for child in np.where(indexes_child == i)[0]:
    print('    ' + level1.topic_names[child] + ': ')
    print(" ".join(level1.score_tracker['TopTokensTag'].last_tokens[level1.topic_names[child]]))
    print('')

topic_1:



In [20]:
# Level-1 Psi matrix as returned by get_psi(); the bare name on the last line
# makes the notebook display it.
psi1 = level1.get_psi()
psi1

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5
child_topic_0,1.019807e-04,2.484762e-05,4.045079e-06,5.253645e-02,6.192169e-06,1.805299e-06
child_topic_1,2.791120e-05,1.490238e-05,2.373737e-05,1.188465e-04,3.662143e-02,1.478123e-06
child_topic_2,1.516472e-07,3.194613e-07,6.216765e-02,1.490964e-06,1.002261e-07,7.343132e-08
child_topic_3,6.550054e-08,1.939580e-08,8.764327e-09,2.047467e-02,1.454845e-08,3.462985e-08
child_topic_4,3.336809e-06,7.038830e-06,4.654015e-08,1.259463e-02,1.230452e-05,5.515368e-08
child_topic_5,6.699028e-08,2.666194e-08,9.957080e-09,4.550886e-07,9.336261e-02,2.209437e-08
child_topic_6,5.213396e-02,6.168172e-08,1.931733e-08,6.006938e-06,8.311237e-08,2.387240e-08
child_topic_7,9.541399e-06,3.370177e-06,4.926781e-07,5.967570e-02,2.724274e-06,1.769178e-07
child_topic_8,3.698001e-04,2.311767e-05,2.349500e-06,5.961562e-02,7.051883e-06,3.980674e-05
child_topic_9,4.446622e-06,4.208609e-05,6.445102e-07,9.435395e-06,1.000080e-02,2.520814e-06


In [21]:
def _print_token_line(words):
    """Print all words on a single line, each one followed by one space."""
    print(''.join(str(word) + ' ' for word in words))


# Walk the hierarchy: every level-0 topic with its top tokens, then, indented
# by a tab, each level-1 topic whose Psi entry for that parent exceeds 0.05,
# and finally a separator line.
tokens0 = level0.score_tracker["TopTokensTag"].last_tokens
tokens1 = level1.score_tracker["TopTokensTag"].last_tokens
for topic_name in level0.topic_names:
    print(topic_name + ': ')
    _print_token_line(tokens0[topic_name])
    parent_column = psi1[topic_name]
    for topic_name1 in level1.topic_names:
        if not (parent_column[topic_name1] > 0.05):
            continue
        print("\t", topic_name1 + ': ')
        _print_token_line(tokens1[topic_name1])
    print("==" * 30)

topic_0: 
	 child_topic_6: 
	 child_topic_19: 


ri ereg ebit supplyv voltage sys pdata matches enc uv regmap vinldo sel nbits vreg linear supply slew rdev pname ene step min rtc rw volt voltages vdata match disable vin vcc goreg gobit regs setting dvm exit apply sl mike vsel np compulab bv parse owner ltd probe ena enb settings rail preinit dt av fixed find author ti devm ro smode co il ascend rapoport initcall dbg constraints available description search enodev modify terms mfd gfp inline subsys foundation redistribute drvdata machine published ed pmic continue based gpl specific don converter alias determine da kzalloc clr want switching program slab under found unregister regulators allows clearing specified exiting probing child platdata ifdef supplyena supplyenc supplyenb supplyene supplyend marvell invalid international single actual ch bg sequential padding tv namelen lo processed bi leave tgt lowest tim vt congestion sup nmax cycle th march enter nc hdl derived operation porting trigger notes balance batching memmove csr med

inode dentry lookup link last op old open seq may unlikely sb rcu root hash follow o rename filename flag walk export permission need symbol vfs exit retry mount sync directory mounted dirty fs pathname opened current target acl don security enoent returns filesystem want drop managed child delegated up inline access creat links writeback source depth negative excl empty exec caller filp audit got getname bug spin eperm here component mapping done victim stack now created likely reference newname fail unlink then acc truncate cap move syscall backing generic wait exchange point nested super umode begin word atomic cookie non qstr also eacces ebusy there special was before permissions rmdir its real norm re does code complete mkdir pointer because more other manage symlinks know used already than continue uid them finish save sticky delete sequence failure zero otherwise saved operations owner internal total success right called trunc been how whether tmp always immutable look dead file

enet ndev mac phy eth tmp mii netdev writel local command buff bus netdata rxstatus phydev definitions readl ethtool netif clock tsv setup napi interface tbuff stats link mcfg rsv crc speed txstatus stat spin duplex errors rxfltrw maxf disable dbg sa last load buffers macint xffff pad mdio iram supp drv used txrx xx ethernet descriptors done mind rmii ptxrxdesc txidx rxconsidx ndo div unregister collision pa txdesc prxstat ptxstat wake change word jiffies hash host txstat io align va buffs ipgt clrt mcmd multicast receive range mdiobus stop madr ipgr probe hardware drvdata full budget controller si vlan recv ioctl match resume defer memcpy ether modname interrupt clkpwr nxp skblen statusinfo ethst passruntframe unprepare iounmap irqsave irqrestore ctrl wakeup down le overrun busy timeout underrun mcs alloc station xff collisions xf params resource enomem coherent pause transmit intclear txconsumeindex intenable phaddr regbase mrdd rxfltrwsts stigge wells txcidx hashlo hashhi phyreg kev

	 child_topic_13: 
	 child_topic_17: 


opcode bind cache cntr cccr retired bsq pebs hwc events pmu active branch perf uop entries allocation fsb reference miss cru ioq metric pack tc replay shared execution load firm nbogus rd gen ht instr op ovf clear cpuc bogus esel mispred ndl itlb alias activity unpack mob disable need access deliver assist ms match memory overflow bsu bpu counter arch vert cycles aliases may rdl wrmsrl saat conf dac used power checking os usr raw nmi drdy swap completed continue counters intel store dp split own packed sp machine assign wc scalar pass mmx dtlb tbpu other handled original writes warn same logical hits running which alternative cancel some apic netburst instructions regs period drv pmi prefetch misc boot done hit ts front eventmask non fp threads ids once ia again iq alf walk evnt pmh perfctr lvtpc rat uops hitm hite complete reserve allowed resource indirect allow reserved before fetch specific low sse build call misses high stall here conditional wt cleared second matrix pmc bus metric

	 child_topic_71: 
ar txq bf beacon common tsf rs ieee ds setup ts mac calibration iter work rad ghz channels short dbg tasklet chan queues radio channel bc update hardware nl spin tu iftype txbuf stats cal stop daddr band interrupts vif code ap antenna ibss timers imask jiffies unlikely rxbuf bh tune receive pending full chip bitrates need invalid rates intr active poll mgmt revision flag schedule beacons descriptors bus tries capabilities warn tstamp up nb called llx phy rssi current freq interrupt bufsize wiphy descriptor buff don support mesh unmap txdesc complete frames received bssid since found send resetting based modes da decrypt calibrate clear names cap msecs also after mactime linked link each preamble stat interval change min alloc used needed cc cab fc cw might allocate fifo disable supported done bitrate timestamp empty stuck atheros tail mapping mark buffers macaddr sr always iface le ch cache single station them keep assoc hdrlen bands which too padding then limits las

	 child_topic_78: 
pll cb expire vos commit hdd zone bh bmap tape pmac ent omap tz sha mux mark ref vm regs unreachable trace wlan ap scan total dirty dat psessionentry arg tree common elem handler mlm strcmp webkit elim reachable sta success fl complete beacon channel argv web reclaim found scsi disk lim bus mac right continue frac pbeacon limit webkitwebpolicydecision sc rpm incomplete insert delete stats arm collected rsp copy hostapd parse ie argc glimmlmstate limmlmstate bt pending sectors var root update pattern param formdatabuilder seen pos gfp extra memset timestamp left channels commits ue nb expiry recovery md sector move lookup webkitwebpolicydecisionprivate pt parents prxpacketinfo tip exit params wt lu loge alloc plug verbose pub scanned kind active macaddr each union always present clear fixed framepolicyfunction send dot inactive bs usage swap wl prune prefix reference git cache nf assoc private readl freq rst dry keep iscancelled show nbytes bi last div old custom writ

spi spcr xfer portqs immr uchar tranram transfer xx newlen printf eeprom curraddr reserved immap ddrqs pqspar volatile ssize alen sys newaddr spics dlen xff transmit recram comdram pc tm after dprint controll short im peri memdump denx de txbuf program dump used memory before qspi quickhex spimiso spsr spimosi mpc mhz connected pv received normal txd udelay rxd com wrap foundation delay rxbuf act ifdef disabled hex mpl warranty digit xe rdsr trinetcommunication halta nspi mennchen spiclk wren gerd wd command board ported done asm ch flag ushort pins wait ms place setup engineering into active clock controller inc irqs fault description polarity credits ma phase which accesses common option implied usec temple siemens malloc contributed later people fitness ram writing particular merchantability code suite lowest last copy published who either usa interface along your purpose even parts peek specific ignore redistribute ctype undef helper rom terms send project wired icn finished more m

nskb tp rwnd sack chunks ip full checksum cookie padding peer flight after delay datasize destination sender pmtu sent association tsn echo bundling pkt ecn round cwnd inflight make finish capable transmit vtag transmission receiver rule autoclose more cid big keep sctphdr bundle tmp overhead fmt been adler psize owner dst rtt timeout org empty offered implementation sending later calculate atomic tail section need protocol zero inc progress stats del karl source send until described common fill fragment sport csum pad assign pathmtu fast adding how just given reserve ipfragok fit outq our buff outbound each private generation its ntohs ibm trip including than longer back mac word allowed timeouts setting alloc resent pointer track sock bundled layer too com up last rto route containing does made es established hmac summed then rfc update computation total inability reports timestamp orphan cisco dport useful furthermore options other know wmem unreachable mod real under sri dest merch

	 child_topic_41: 
serial dm xx led pa resource ucon io va ufcon phys mapbase simtec upf mach ctrl samsung ulcon board virtual devices platdata bast mt pdata sz plat asm gpb baudbase ioresource boot machine power off ioremap uartclk initdata autoconf upio iotype ben devs regshift vam lcon co trigger uk isa iodesc hwport uartcfgs thorcom interrupt leds resources named space modify types macros pwm dooks electronics highlevel isaio tty uartcfg uarts ports clocks registers source terms some byte addresses wdt arch tmp hardware common regs http entries redistribute published limit physical st support external nor foundation program under extra areas ethernet lcd moment warn written tested word adc pnone designed fifomode definitions routines ourselves controls actually www ohci tlv uclk maintainer stopb dclk until xc used better direction mods cpld aic bitonly platf atag rxtrig audio pv drv communications rd clksel logical wireless duplicated wr made ints ms frequency ir choose clks mobile

	 child_topic_0: 
	 child_topic_7: 
stack segment cray trailer alloca direction storage current unknown stkl pseg ifdef ssptr hp linkage which segments sss ssa last stk afunc code depth initial ifndef your structure addresses implementation words o arg dummy abort automatically find more used deep memory routine align reclaimed high cplusplus sssize sspseg libiberty allocated pointer reserved space was garbage does following previous allocate water known reclaim calls overflow link some grows mp procedure once deeper growth stackseg stkstat mark blocks np system what information area pad measure there fprintf force want stat discard union work stdlib ac under okay stderr maximum exit eventually track library least common know contain after y auto date determine linked upon keep toward required preprocessor autoconf overflows hits than cell stko stkretn ssusr deftypefn stkofen grew deallocations probe extensions word fatal task contains sizes lm variables stdio like possibility structur

	 child_topic_20: 
	 child_topic_22: 


	 child_topic_50: 
fh dprintk vidioc encoder decoder fmt hfi link ioctl filp resources enc dec ctrl delay pwr cores collapse debugfs remove vid mplane buffers unregister parm poll invalid root handler cap fops snprintf dt probe show exit open warn prepare sub container kfree sys drvdata subscribe unsubscribe streamoff dqbuf framesizes qbuf reqbufs streamon querycap match close mach ssize initialize sysfs enomem capture suspend board exceeded venus pt work stop program foundation warranty format parse private gfp resume kzalloc defer allocate drv owner store subscription unload iommu fw fsize vfl streamparm grabber api domains reserved rights target sleep exist completion types operations system called usage capability purpose under hope distributed merchantability fitness useful supports details modify more redistribute implied terms qcom published even particular already inline response maximum tail controls devices uninit uses delayed instances compatible instance internal memory dbg

topic_4: 
	 child_topic_5: 


	 child_topic_15: 
	 child_topic_23: 
	 child_topic_29: 


	 child_topic_45: 
	 child_topic_51: 


	 child_topic_61: 
	 child_topic_66: 
topic_5: 


	 child_topic_34: 
	 child_topic_43: 


	 child_topic_67: 
	 child_topic_68: 


In [22]:
# Create the output directory from Python rather than through a shell command:
# this is a no-op when the directory already exists (so the cell can be re-run
# without an error message) and does not depend on the platform's shell.
os.makedirs('model_devsagul', exist_ok=True)

In [23]:
# Save the hierarchy under 'model_devsagul' (presumably a directory path: the
# preceding cell creates a directory with this name).
hier.save('model_devsagul')

In [24]:
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

import matplotlib.ticker as ticker
import matplotlib.cm as cm
import matplotlib as mpl


# Heat map of the level-1 Psi matrix.
# psi1 has one row per child topic and one column per parent topic, and imshow
# draws rows along the y axis and columns along the x axis. The tick labels are
# therefore taken from psi1.index (y) and psi1.columns (x) instead of being
# rebuilt by hand, which previously put the child names on the parent axis.
# Only plt.subplots() is used to create the figure; a preceding plt.figure()
# call would just leave an extra empty figure behind.
fig, ax = plt.subplots(1, 1, figsize=(11, 20))
heatplot = ax.imshow(psi1, cmap='hot')

# Fix the tick positions first (one tick per cell) so that the labels set
# afterwards line up with the matrix rows and columns.
ax.set_xticks(np.arange(psi1.shape[1]))
ax.set_xticklabels(list(psi1.columns), rotation=40)
ax.set_yticks(np.arange(psi1.shape[0]))
ax.set_yticklabels(list(psi1.index))

ax.set_xlabel('parent topic')
ax.set_ylabel('child topic')
ax.set_title('Psi (level 1)')
fig.colorbar(heatplot, ax=ax);


ModuleNotFoundError: No module named 'matplotlib'