In [1]:
import sys
sys.path.append('../..')
import src.data.data_loader as dl
from src.features.vectorizer import Vectorizer
from src.models.sampler import HierarchicalLDA
from src.visualization import hlda_graph as hg
from src.Intern_Consistency import hLda_jenson_shannon as h

### Get articles from Zeit

In [2]:
data = dl.get_articles_by_sources(["zeit"])
texts = data["article_texts"]

In [3]:
doc_converted,vocab2 = hg.param_for_Hlda(texts)

### Tree with 3 Levels

In [4]:
n_samples = 50      # no of iterations for the sampler
alpha = 10          # smoothing over level distributions
gamma = 1.0           # CRP smoothing parameter; number of imaginary customers at next, as yet unused table
eta = 0.1             # smoothing over topic-word distributions
num_levels = 3        # the number of levels in the tree
display_topics = 10   # the number of iterations between printing a brief summary of the topics so far
n_words = 5           # the number of most probable words to print for each topic after model estimation
with_weights = True  # whether to print the words with the weights

hlda = HierarchicalLDA(doc_converted, vocab2, alpha=alpha, gamma=gamma, eta=eta, num_levels=num_levels, verbose=False)
hlda.estimate(n_samples, display_topics=display_topics, n_words=n_words, with_weights=with_weights)

HierarchicalLDA sampling

.......... 10
topic=0 level=0 (documents=48): bio (105), prozent (93), konventionell (79), lebensmittel (75), deutschland (73), 
    topic=1 level=1 (documents=32): produkt (46), ökologisch (43), anbau (28), studie (28), bio (27), 
        topic=2 level=2 (documents=11): hof (23), hatje (22), biobauern (22), feld (20), assaf (19), 
        topic=5 level=2 (documents=4): dosch (29), schmecken (23), geschmack (13), unterschied (12), wissenschaftler (8), 
        topic=8 level=2 (documents=11): region (22), kunde (20), wachsen (15), alberti (15), italien (14), 
        topic=9 level=2 (documents=6): basic (11), lidl (8), münchner (7), münchen (5), spanrunft (5), 
    topic=3 level=1 (documents=6): dioxin (32), lebensmittel (9), häufig (9), fett (9), schmecken (8), 
        topic=4 level=2 (documents=6): huhn (24), fleisch (15), eiern (14), häde (10), güngörmüs (9), 
    topic=6 level=1 (documents=10): bauer (27), rehn (13), zahlen (12), 's (11), agrarpolitik (10)

In [None]:
dot = hg.graph(hlda)
dot.render("TestData/ZeitLevel3", view = True)

### Tree with 4 Levels

In [6]:
num_levels2 = 4        # the number of levels in the tree

hlda2 = HierarchicalLDA(doc_converted, vocab2, alpha=alpha, gamma=gamma, eta=eta, num_levels=num_levels2, verbose=False)
hlda2.estimate(n_samples, display_topics=display_topics, n_words=n_words, with_weights=with_weights)

HierarchicalLDA sampling

.......... 10
topic=0 level=0 (documents=48): bio (113), prozent (100), deutschland (81), konventionell (70), jed (69), 
    topic=1 level=1 (documents=30): gemüse (43), obst (37), landwirtschaft (34), bauer (26), teilen (24), 
        topic=2 level=2 (documents=11): alberti (15), kontrolle (14), italien (14), kontrolleure (12), italienisch (9), 
            topic=3 level=3 (documents=4): assaf (19), arbeiter (11), arbeit (9), schulz (8), chef (8), 
            topic=9 level=3 (documents=2): bauer (20), ökologisch (12), bauernverband (11), funken (9), agrarpolitik (9), 
            topic=15 level=3 (documents=5): dioxin (16), falsch (10), getreide (8), essen (6), skandal (6), 
        topic=7 level=2 (documents=3): huhn (19), dioxin (16), häde (10), eiern (6), stall (6), 
            topic=8 level=3 (documents=3): keimen (8), darm (7), mark (7), erkranken (6), erreger (6), 
        topic=13 level=2 (documents=16): dosch (29), ökologisch (24), konventionell (19

In [None]:
dot2 = hg.graph(hlda2)
dot2.render("TestData/ZeitLevel4", view = True)

In [7]:
num_levels3 = 5        # the number of levels in the tree

hlda3 = HierarchicalLDA(doc_converted, vocab2, alpha=alpha, gamma=gamma, eta=eta, num_levels=num_levels3, verbose=False)
hlda3.estimate(n_samples, display_topics=display_topics, n_words=n_words, with_weights=with_weights)

HierarchicalLDA sampling

.......... 10
topic=0 level=0 (documents=48): bio (88), lebensmittel (84), prozent (80), deutschland (79), stehen (72), 
    topic=1 level=1 (documents=37): konventionell (90), ökologisch (75), landwirtschaft (44), bauer (42), bioprodukt (41), 
        topic=2 level=2 (documents=25): verbraucher (37), schmecken (22), inzwischen (13), zeigen (10), enthalten (10), 
            topic=3 level=3 (documents=16): schwartze (20), frage (10), herkömmlich (8), bangladesch (7), organisation (6), 
                topic=4 level=4 (documents=7): meier (17), bioprodukt (13), geschmack (10), deutsch (7), forscher (6), 
                topic=14 level=4 (documents=6): mark (7), zeichen (6), hersteller (5), alternativ (4), grüne (4), 
                topic=24 level=4 (documents=3): assaf (19), schaffrin (14), arbeiten (12), arbeiter (11), magdalena (8), 
            topic=12 level=3 (documents=6): huhn (22), basic (11), häde (10), häufig (9), darm (7), 
                topic=13 

.......... 50
topic=0 level=0 (documents=48): prozent (99), deutschland (80), jed (76), landwirtschaft (71), stehen (71), 
    topic=1 level=1 (documents=42): konventionell (104), bio (104), lebensmittel (71), bioprodukt (59), gemüse (48), 
        topic=2 level=2 (documents=32): verbraucher (51), streng (16), standard (14), kriterium (14), hersteller (13), 
            topic=3 level=3 (documents=15): schwartze (20), frage (17), bangladesch (7), leute (6), asien (4), 
                topic=4 level=4 (documents=7): schmecken (27), studie (20), produkt (16), geschmack (14), wissenschaftler (13), 
                topic=14 level=4 (documents=5): mark (8), beispielsweise (4), alternativ (4), grüne (4), urteilen (3), 
                topic=24 level=4 (documents=3): assaf (19), arbeiter (11), arbeiten (7), arbeit (7), naturland (6), 
            topic=12 level=3 (documents=4): huhn (20), häde (10), tier (9), keimen (7), darm (7), 
                topic=13 level=4 (documents=4): dioxin (32), e

In [None]:
dot3 = hg.graph(hlda3)
dot3.render("TestData/ZeitLevel5", view = True)

### Calculate Jensen Shannon for every level

In [8]:
node_dict1 = hlda.dict_for_tree(5)
node_dict2 = hlda2.dict_for_tree(5)
node_dict3 = hlda3.dict_for_tree(5)

#### Show matrix with topic numbers 

##### For trees with level 3 and 4

In [10]:
res_l34,res_l34_tw = h.get_matrix_for_all_common_levels(node_dict1,node_dict2)
for l, df in enumerate(res_l34):
    print("Level: " + str(l))
    display(df)

Level: 0


Unnamed: 0,Topic 0
Topic 0,0.829


Level: 1


Unnamed: 0,Topic 1,Topic 4,Topic 10
Topic 1,0.542,0.345,0.364
Topic 3,0.444,0.335,0.324
Topic 6,0.353,0.34,0.367


Level: 2


Unnamed: 0,Topic 2,Topic 7,Topic 13,Topic 5,Topic 11
Topic 2,0.358,0.333,0.355,0.343,0.355
Topic 5,0.343,0.329,0.447,0.33,0.323
Topic 8,0.562,0.333,0.465,0.343,0.327
Topic 9,0.323,0.319,0.374,0.442,0.42
Topic 4,0.36,0.688,0.342,0.336,0.336
Topic 13,0.332,0.334,0.331,0.503,0.523
Topic 7,0.426,0.321,0.33,0.316,0.334
Topic 11,0.334,0.407,0.34,0.34,0.331
Topic 14,0.352,0.326,0.323,0.341,0.322


##### For trees with level 4 and 5

In [11]:
res_l45,res_l45_tw = h.get_matrix_for_all_common_levels(node_dict2,node_dict3)
for l, df in enumerate(res_l45):
    print("Level: " + str(l))
    display(df)

Level: 0


Unnamed: 0,Topic 0
Topic 0,0.817


Level: 1


Unnamed: 0,Topic 1,Topic 15
Topic 1,0.521,0.352
Topic 4,0.367,0.345
Topic 10,0.389,0.318


Level: 2


Unnamed: 0,Topic 2,Topic 9,Topic 19,Topic 16
Topic 2,0.408,0.361,0.335,0.32
Topic 7,0.351,0.334,0.328,0.32
Topic 13,0.377,0.327,0.456,0.316
Topic 5,0.36,0.323,0.34,0.432
Topic 11,0.334,0.325,0.344,0.37


Level: 3


Unnamed: 0,Topic 3,Topic 12,Topic 22,Topic 10,Topic 20,Topic 17
Topic 3,0.479,0.333,0.339,0.34,0.345,0.312
Topic 9,0.332,0.327,0.33,0.479,0.341,0.345
Topic 15,0.326,0.366,0.389,0.333,0.328,0.315
Topic 8,0.349,0.59,0.325,0.324,0.326,0.317
Topic 14,0.387,0.33,0.343,0.328,0.332,0.32
Topic 6,0.351,0.323,0.351,0.325,0.351,0.378
Topic 12,0.341,0.335,0.345,0.513,0.554,0.383


##### For trees with level 5 and 3

In [12]:
res_l53,res_l53_tw = h.get_matrix_for_all_common_levels(node_dict3,node_dict1)
for l, df in enumerate(res_l53):
    print("Level: " + str(l))
    display(df)

Level: 0


Unnamed: 0,Topic 0
Topic 0,0.796


Level: 1


Unnamed: 0,Topic 1,Topic 3,Topic 6
Topic 1,0.5,0.384,0.357
Topic 15,0.335,0.359,0.321


Level: 2


Unnamed: 0,Topic 2,Topic 5,Topic 8,Topic 9,Topic 4,Topic 13,Topic 7,Topic 11,Topic 14
Topic 2,0.351,0.373,0.48,0.384,0.365,0.354,0.363,0.38,0.35
Topic 9,0.343,0.329,0.33,0.333,0.333,0.316,0.327,0.329,0.511
Topic 19,0.42,0.354,0.327,0.321,0.344,0.331,0.337,0.338,0.326
Topic 16,0.318,0.321,0.334,0.329,0.315,0.5,0.316,0.318,0.326


In [13]:
display(res)
display(type(res))
print("res2")
display(res2)
display(type(res2))

NameError: name 'res' is not defined

# Todo: topic with words for all trees

#### show matrix with topic words

##### For trees with level 3 and 4

In [15]:
for l, df in enumerate(res_l34_tw):
    print("Level: " + str(l))
    display(df)

Level: 0


Unnamed: 0,"prozent, ökologisch, bio, deutschland, stehen,"
"bio, prozent, ökologisch, deutschland, konventionell,",0.829


Level: 1


Unnamed: 0,"gemüse, obst, fleisch, essen, boden,","schaffrin, grün, magdalena, mode, kleidung,","hatje, hans_hinrich, konventionell, entscheidung, rehn,"
"feld, obst, anbau, gemüse, klein,",0.542,0.345,0.364
"schwer, sonnleitner, darm, keimen, gesünder,",0.444,0.335,0.324
"alnatura, bauer, rehn, 's, mark,",0.353,0.34,0.367


Level: 2


Unnamed: 0,"italien, alberti, kontrolle, wachsen, italienisch,","huhn, dioxin, häde, eiern, wirken,","dosch, discounter, gentechnik, forscher, dennree,","münchner, oktoberfest, erklären, leute, jahrzehnt,","eiern, künftig, cent, fleisch, küken,"
"hatje, biobauern, assaf, hof, meier,",0.358,0.333,0.355,0.343,0.355
"dosch, schmecken, bio, konventionell, studie,",0.343,0.329,0.447,0.33,0.323
"region, kunde, regional, alberti, italien,",0.562,0.333,0.465,0.343,0.327
"basic, amazon, lidl, discounter, moralisch,",0.323,0.319,0.374,0.442,0.42
"dioxin, huhn, häde, essen, tier,",0.36,0.688,0.342,0.336,0.336
"eiern, künftig, münchner, küken, laengenfelder,",0.332,0.334,0.331,0.503,0.523
"schwartze, frage, bangladesch, löhne, ökobauern,",0.426,0.321,0.33,0.316,0.334
"mark, bioprodukt, grüne, eugh, bundesrepublik,",0.334,0.407,0.34,0.34,0.331
"schaffrin, reden, bse-krise, magdalena, mode,",0.352,0.326,0.323,0.341,0.322


##### For trees with level 4 and 5

In [16]:
for l, df in enumerate(res_l45_tw):
    print("Level: " + str(l))
    display(df)

Level: 0


Unnamed: 0,"prozent, deutschland, jed, landwirtschaft, stehen,"
"prozent, ökologisch, bio, deutschland, stehen,",0.817


Level: 1


Unnamed: 0,"konventionell, bio, lebensmittel, bioprodukt, gemüse,","lebensmittel, wies’n, münchen, kattendorfer, supermarktkette,"
"gemüse, obst, fleisch, essen, boden,",0.521,0.352
"schaffrin, grün, magdalena, mode, kleidung,",0.367,0.345
"hatje, hans_hinrich, konventionell, entscheidung, rehn,",0.389,0.318


Level: 2


Unnamed: 0,"verbraucher, streng, standard, kriterium, hersteller,","bauer, bauernverband, funken, agrarpolitik, politik,","feld, rauhaus, ernten, schäfer, studie,","sonnleitner, münchner, jahrzehnt, ausschließlich, oktoberfest,"
"italien, alberti, kontrolle, wachsen, italienisch,",0.408,0.361,0.335,0.32
"huhn, dioxin, häde, eiern, wirken,",0.351,0.334,0.328,0.32
"dosch, discounter, gentechnik, forscher, dennree,",0.377,0.327,0.456,0.316
"münchner, oktoberfest, erklären, leute, jahrzehnt,",0.36,0.323,0.34,0.432
"eiern, künftig, cent, fleisch, küken,",0.334,0.325,0.344,0.37


Level: 3


Unnamed: 0,"schwartze, frage, bangladesch, leute, asien,","huhn, häde, tier, keimen, darm,","transport, anbau, umweltfreundlich, völlig, problem,","rehn, alnatura, basic, bse-krise, konzept,","hatje, timmermann, hans_hinrich, dennree, greim,","amazon, walmart, google, whole_foods, hektar,"
"schwartze, assaf, arbeit, arbeiter, arbeiten,",0.479,0.333,0.339,0.34,0.345,0.312
"bauer, landwirtschaft, bauernverband, funken, bse-krise,",0.332,0.327,0.33,0.479,0.341,0.345
"dioxin, falsch, zertifikat, umso, gramm,",0.326,0.366,0.389,0.333,0.328,0.315
"keimen, mark, darm, erreger, ehec-bakterien,",0.349,0.59,0.325,0.324,0.326,0.317
"bio, konventionell, schmecken, produkt, studie,",0.387,0.33,0.343,0.328,0.332,0.32
"timmermann, gemüse, martina, perchlorat, kroh,",0.351,0.323,0.351,0.325,0.351,0.378
"alnatura, mitarbeiter, basic, 's, amazon,",0.341,0.335,0.345,0.513,0.554,0.383


##### For trees with level 5 and 3

In [17]:
for l, df in enumerate(res_l53_tw):
    print("Level: " + str(l))
    display(df)

Level: 0


Unnamed: 0,"bio, prozent, ökologisch, deutschland, konventionell,"
"prozent, deutschland, jed, landwirtschaft, stehen,",0.796


Level: 1


Unnamed: 0,"feld, obst, anbau, gemüse, klein,","schwer, sonnleitner, darm, keimen, gesünder,","alnatura, bauer, rehn, 's, mark,"
"konventionell, bio, lebensmittel, bioprodukt, gemüse,",0.5,0.384,0.357
"lebensmittel, wies’n, münchen, kattendorfer, supermarktkette,",0.335,0.359,0.321


Level: 2


Unnamed: 0,"hatje, biobauern, assaf, hof, meier,","dosch, schmecken, bio, konventionell, studie,","region, kunde, regional, alberti, italien,","basic, amazon, lidl, discounter, moralisch,","dioxin, huhn, häde, essen, tier,","eiern, künftig, münchner, küken, laengenfelder,","schwartze, frage, bangladesch, löhne, ökobauern,","mark, bioprodukt, grüne, eugh, bundesrepublik,","schaffrin, reden, bse-krise, magdalena, mode,"
"verbraucher, streng, standard, kriterium, hersteller,",0.351,0.373,0.48,0.384,0.365,0.354,0.363,0.38,0.35
"bauer, bauernverband, funken, agrarpolitik, politik,",0.343,0.329,0.33,0.333,0.333,0.316,0.327,0.329,0.511
"feld, rauhaus, ernten, schäfer, studie,",0.42,0.354,0.327,0.321,0.344,0.331,0.337,0.338,0.326
"sonnleitner, münchner, jahrzehnt, ausschließlich, oktoberfest,",0.318,0.321,0.334,0.329,0.315,0.5,0.316,0.318,0.326


In [19]:
res_l34,res_l34_tw = h.get_matrix_for_all_commmon_levels(node_dict1,node_dict1)
for l, df in enumerate(res_l34):
    print("Level: " + str(l))
    display(df)

Level: 0


Unnamed: 0,Topic 0
Topic 0,1.0


Level: 1


Unnamed: 0,Topic 1,Topic 3,Topic 6
Topic 1,1.0,0.372,0.363
Topic 3,0.372,1.0,0.339
Topic 6,0.363,0.339,1.0


Level: 2


Unnamed: 0,Topic 2,Topic 5,Topic 8,Topic 9,Topic 4,Topic 13,Topic 7,Topic 11,Topic 14
Topic 2,1.0,0.342,0.349,0.341,0.34,0.337,0.364,0.339,0.349
Topic 5,0.342,1.0,0.346,0.34,0.346,0.327,0.337,0.337,0.325
Topic 8,0.349,0.346,1.0,0.335,0.342,0.337,0.358,0.341,0.337
Topic 9,0.341,0.34,0.335,1.0,0.342,0.328,0.327,0.324,0.328
Topic 4,0.34,0.346,0.342,0.342,1.0,0.333,0.338,0.334,0.333
Topic 13,0.337,0.327,0.337,0.328,0.333,1.0,0.321,0.331,0.322
Topic 7,0.364,0.337,0.358,0.327,0.338,0.321,1.0,0.341,0.338
Topic 11,0.339,0.337,0.341,0.324,0.334,0.331,0.341,1.0,0.334
Topic 14,0.349,0.325,0.337,0.328,0.333,0.322,0.338,0.334,1.0
