In [221]:
# !pip install bertopic[all]
# !pip install bertopic

In [222]:
from bertopic import BERTopic

In [223]:
# from sklearn.datasets import fetch_20newsgroups
# docs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']
# docs[0]

## **Reading Dataset**

In [224]:
# Corpus name; it is interpolated into the input path below and into the
# output file names written later in the notebook.
dataset = "20News"
# Text file for this corpus; read_input() turns each of its lines into one document.
input_dataset = f"Baselines/textual_folds/{dataset}/{dataset}Pre.txt"

In [225]:
def read_input(input_dataset):
    """Read a UTF-8 text file and return its lines as a list of documents.

    Args:
        input_dataset: Path of the text file to read.

    Returns:
        A list with one string per line of the file, with trailing
        whitespace (including the line terminator) removed. Blank lines are
        kept as empty strings.

    Raises:
        OSError: If the file cannot be opened.
    """
    # The context manager guarantees the handle is closed even if reading fails.
    with open(input_dataset, 'r', encoding="utf-8") as arq:
        return [line.rstrip() for line in arq]


In [226]:
# Load the corpus: each line of the input file becomes one entry of `docs`.
docs = read_input(input_dataset=input_dataset)

In [227]:
# Peek at the first three documents as a sanity check on the loaded text.
docs[:3]

['faces week running world faces',
 'buy cable operator billion cable mobile phone',
 'voting austerity millionaires']

## **BERTopic**

In [228]:
# Model settings; both are also written to the config file and `top_words`
# is part of the output file names.
number_topics = 35  # passed to BERTopic as nr_topics
top_words = 10  # passed to BERTopic as top_n_words

In [229]:
# Build the model from the settings above and fit it on the corpus.
# NOTE(review): no random seed is set anywhere in this notebook; if the fit is
# stochastic, a rerun may produce different topics — confirm and pin a seed.
topic_model = BERTopic(nr_topics=number_topics, top_n_words=top_words)
# fit_transform returns two values; only the first is kept (presumably the
# per-document topic assignments — confirm against the BERTopic docs).
topics, _ = topic_model.fit_transform(docs)

## **Get Topic Information**

In [230]:
# Number of rows in the topic-info table. The run recorded below shows 16,
# even though nr_topics=35 was requested.
len(topic_model.get_topic_info())

16

## **Get Top Words of a Topic**

In [231]:
# Show the (word, score) pairs that describe topic 0.
topic_model.get_topic(0)

[('wind', 0.3880908116987635),
 ('mph', 0.30879826826356577),
 ('wain', 0.25627910056429754),
 ('wash', 0.202420325142793),
 ('wine', 0.1614931667475484),
 ('gust', 0.13899792236263692),
 ('slowly', 0.1275788752484128),
 ('walling', 0.11327872238058245),
 ('ave', 0.11227258067575165),
 ('wife', 0.1071744370000687)]

In [232]:
# Collect the top words of every topic as one space-separated string.
# Topic ids are taken from the topic-info table instead of assuming they are
# 0..len-2: that assumption silently dropped the last topic whenever the
# model produced no outlier topic (-1).
topic_results = []
for topic_id in sorted(int(t) for t in topic_model.get_topic_info()["Topic"]):
  if topic_id == -1:
    # -1 is the outlier bucket, not a real topic; it is not exported.
    continue
  topic_info = topic_model.get_topic(topic_id)
  topic_results.append(' '.join(word for word, _score in topic_info))


In [233]:
# Write one line of top words per topic. The `with` block closes the file, so
# no explicit close() is needed. UTF-8 matches the encoding used to read the
# input corpus.
with open(f"Baselines/BertTopicsResultsHierarchical/topic_words_{top_words}_{dataset}.txt", "w", encoding="utf-8") as output_file:
  for topic in topic_results:
    output_file.write(f"{topic}\n")

# Record the settings used for this run next to the results.
# The file name previously ended with a literal "\n", which created a file
# whose name contains a newline character.
with open(f"Baselines/BertTopicsResultsHierarchical/config_{dataset}_{top_words}.txt", "w", encoding="utf-8") as output_file:
  output_file.write('Number of topics:{}\n'.format(number_topics))
  output_file.write('Number of words:{}\n'.format(top_words))

## Hierarchy Structure

In [234]:
from scipy.cluster import hierarchy as sch

def linkage_function(x):
    """Cluster ``x`` with single linkage and optimal leaf ordering.

    ``x`` is whatever BERTopic hands to the linkage callback (presumably a
    distance matrix between topics — confirm against the BERTopic docs).
    """
    return sch.linkage(x, method='single', optimal_ordering=True)


# Compute the parent/child table of the topic hierarchy with the custom linkage.
hierarchical_topics = topic_model.hierarchical_topics(
    docs,
    linkage_function=linkage_function,
)

100%|██████████| 14/14 [00:00<00:00, 199.58it/s]


In [235]:
# import pandas as pd
# import io

# def print_hierarchy(parent: int, child: int, indent: str, output_file: io.TextIOWrapper):
#     # if parent == -1:
#     #     output_file.write(f"""{indent}{" ".join(hierarchical_topics.Parent_Name.values.astype(str)[0].split("_"))}\n""")
        
#     df_isin = hierarchical_topics["Parent_ID"].astype(int).isin([child])
#     if df_isin.any():
#         df_aux = hierarchical_topics[df_isin]
        
#         if df_aux.Child_Left_Name.values.astype(str) != df_aux.Parent_Name.values.astype(str):
#             indent_left = indent + "\t"
#         else:
#             indent_left = indent
            
#         print_hierarchy(parent=child, child=df_aux.Child_Left_ID.values.astype(int)[0], indent=indent_left, output_file=output_file)
        
#         if df_aux.Child_Right_Name.values.astype(str) != df_aux.Parent_Name.values.astype(str):
#             indent_right = indent + "\t"
#         else:
#             indent_right = indent
            
#         print_hierarchy(parent=child, child=df_aux.Child_Right_ID.values.astype(int)[0], indent=indent_right, output_file=output_file)
#     else:
#         df_isin_left = hierarchical_topics["Child_Left_ID"].astype(int).isin([child])
#         df_isin_right = hierarchical_topics["Child_Right_ID"].astype(int).isin([child])
#         if df_isin_left.any():
#             df_aux = hierarchical_topics[df_isin_left]
#             output_file.write(f"""{indent}{" ".join(df_aux.Child_Left_Name.values.astype(str)[0].split("_"))}\n""")
#         elif df_isin_right.any():
#             df_aux = hierarchical_topics[df_isin_right]
#             output_file.write(f"""{indent}{" ".join(df_aux.Child_Right_Name.values.astype(str)[0].split("_"))}\n""")

# output_file = open(f"Baselines/BertTopicsResultsHierarchical/hierarchical_topic_words_5_{dataset}.txt", "w")
# print_hierarchy(parent=-1, child=hierarchical_topics.head(1).Parent_ID.astype(int).values[0], indent="", output_file=output_file)
# output_file.close()


In [236]:
import pandas as pd
import io

def _write_topic_line(model, topic_id, depth: int, output_file) -> None:
    """Write the words of ``topic_id`` on one line, prefixed by ``depth`` tabs."""
    topic_info = model.get_topic(topic_id)
    words = ' '.join(word for word, _score in topic_info)
    # "\t" * depth is the empty string when depth is zero or negative.
    indent = "\t" * depth
    output_file.write(f"{indent}{words}\n")


def print_binary_hierarchy(df: pd.DataFrame, output_file: io.TextIOWrapper, model=None):
    """Write the topic hierarchy in ``df`` as tab-indented lines of topic words.

    Rows are processed in positional order. For each row except the last,
    the topic that is present in the row but absent from the next row is
    written at the current depth and the depth grows by one. A row holding
    exactly two topics (other than the row just before the last) has both
    topics written and the depth shrinks by one. Finally, every topic of the
    last row is written at the resulting depth.

    NOTE(review): the traversal assumes every row handled by the "difference"
    branch has at least one topic that is missing from the following row —
    verify this holds for the linkage in use.

    Args:
        df: Table with a ``Topics`` column holding a collection of topic ids
            per row. Its index labels are ignored; only row order matters.
        output_file: Writable text stream that receives the lines.
        model: Object whose ``get_topic(topic_id)`` returns ``(word, score)``
            pairs. Defaults to the notebook-level ``topic_model``.

    Raises:
        KeyError: If a row has no topic that is missing from the next row.
    """
    if model is None:
        model = topic_model

    topic_rows = df["Topics"].tolist()
    if not topic_rows:
        # Nothing to write for an empty hierarchy.
        return

    # Use the positional last row. The index label maximum only equals this
    # position when the labels happen to be a permutation of 0..n-1.
    last = len(topic_rows) - 1
    depth = 0
    for pos in range(last):
        current = topic_rows[pos]
        if len(current) == 2 and pos + 1 != last:
            for topic_id in current:
                _write_topic_line(model, topic_id, depth, output_file)
            depth -= 1
        else:
            # The topic that disappears in the next row sits at this level.
            topic_id = set(current).difference(topic_rows[pos + 1]).pop()
            _write_topic_line(model, topic_id, depth, output_file)
            depth += 1

    for topic_id in topic_rows[last]:
        _write_topic_line(model, topic_id, depth, output_file)


In [237]:
# Inspect the first three rows of the hierarchy table.
hierarchical_topics.head(3)

Unnamed: 0,Parent_ID,Parent_Name,Topics,Child_Left_ID,Child_Left_Name,Child_Right_ID,Child_Right_Name,Distance
13,28,wind_mph_wee_west_wain,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",0,wind_mph_wain_wash_wine,27,wee_west_wan_wet_pips,0.97686
12,27,wee_west_wan_wet_pips,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]",26,wee_west_wet_pips_troops,13,wan_wave_heat_people_walking,0.949624
11,26,wee_west_wet_pips_troops,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14]",14,wee_woody_wings_wax_wad,25,west_wet_pips_troops_season,0.948877


In [238]:
# Write the tab-indented hierarchy. The context manager closes the file even
# if print_binary_hierarchy raises part-way through. UTF-8 matches the
# encoding used to read the input corpus.
with open(f"Baselines/BertTopicsResultsHierarchical/hierarchical_topic_words_{top_words}_{dataset}.txt", "w", encoding="utf-8") as output_file:
    print_binary_hierarchy(hierarchical_topics, output_file)


In [239]:
# Render the hierarchy figure from the table computed with the custom linkage function.
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

In [240]:
# Build and print a text rendering of the topic hierarchy table.
tree = topic_model.get_topic_tree(hierarchical_topics)
print(tree)

.
├─■──wind_mph_wain_wash_wine ── Topic: 0
└─wee_west_wan_wet_pips
     ├─wee_west_wet_pips_troops
     │    ├─■──wee_woody_wings_wax_wad ── Topic: 14
     │    └─west_wet_pips_troops_season
     │         ├─west_pips_troops_wet_season
     │         │    ├─pips_troops_wet_season_goals
     │         │    │    ├─pips_troops_military_plane_shot
     │         │    │    │    ├─■──pips_total_engineer_cases_reports ── Topic: 2
     │         │    │    │    └─troops_military_plane_shot_toll
     │         │    │    │         ├─■──flooding_floods_toll_flood_storms ── Topic: 3
     │         │    │    │         └─troops_military_plane_shot_landing
     │         │    │    │              ├─troops_plane_military_landing_crashes
     │         │    │    │              │    ├─■──plane_landing_crashes_flight_ship ── Topic: 5
     │         │    │    │              │    └─■──troops_army_military_waterloo_soldiers ── Topic: 1
     │         │    │    │              └─■──shot_police_church_woof_shoot

## **Topic Visualization**

In [241]:
# Display BERTopic's topic visualization for the fitted model.
topic_model.visualize_topics()

## **Hierarchy Visualization**

In [242]:
# Unlike the earlier visualize_hierarchy call, the precomputed
# `hierarchical_topics` table is not passed here, so the custom
# single-linkage result is not what this figure is given.
topic_model.visualize_hierarchy()