### Imports

In [17]:
import plotly.express as px
import plotly.io as pio
import matplotlib as plt
import os
from bertopic.representation import PartOfSpeech
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, OpenAI, PartOfSpeech
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import MaximalMarginalRelevance
from bertopic.backend import BaseEmbedder
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [18]:
# Name of the output folder and the file
output_folder_name = "tp1_compare"
# Create the output folder if it doesn't exist
if not os.path.exists(output_folder_name):
    os.makedirs(output_folder_name)
# Get the current working directory
current_working_directory = os.getcwd()

In [19]:
from transformers.pipelines import pipeline
from transformers import pipeline
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
#embedding_model = pipeline("feature-extraction", model="bert-base-german-cased")
#embedding_model = pipeline(model="ZurichNLP/swissbert")
#embedding_model = SentenceTransformer('sentence-transformers/distiluse-base-multilingual-cased-v2')

In [20]:
from umap import UMAP
from sklearn.decomposition import PCA
# n_neighbors: höhere Werte nehmen eine "globalere" Perspektive der Embeddings ein (grössere Cluster)
# n_neighbors: tiefere Werte nehmen eine "lokalere" Perspektive der Embeddings ein
# n_components: tiefere Werte beeinflussen die Qualität der Embeddings
# n_components: hohe Werte dauern länger und HDBScan braucht länger für die Berechnung

dim_model = UMAP(n_neighbors=15, n_components=10, min_dist=0.0, metric='cosine')


# Eine schnellere Art die Dimensionen zu reduzieren
#dim_model = PCA(n_components=5)
#topic_model = BERTopic(umap_model=dim_model)

In [21]:
from hdbscan import HDBSCAN
from sklearn.cluster import KMeans

# min_cluster_size: wie gross die Cluster mindestens sein müssen
# min_samples: die Zahl der Outlier, tiefere Zahlen reduzieren die Outlier
cluster_model = HDBSCAN(min_cluster_size=25, metric='euclidean', 
                        cluster_selection_method='eom', prediction_data=True, min_samples=3)

# andere Art des Clustering, das keine Ausreisser produziert
#cluster_model = KMeans(n_clusters=50)

In [22]:
from sklearn.feature_extraction.text import CountVectorizer
# min_df: wie oft ein Wort vorkommen muss, bevor es in die Repräsentation gelangt
# so kann man bei grossen Dokumenten die Berechnung verkürzen
# ngram_range: bestimmt die Länge der Ngrams, die in der Repräsentation erscheinen
vectorizer_model = CountVectorizer(min_df=1, ngram_range=(1, 3))

# max_features: topic term matrix wird kontrolliert. anstelle das man min_df einstellen muss
#vectorizer_model = CountVectorizer(max_features=10_000)

In [23]:
from bertopic.vectorizers import ClassTfidfTransformer
# man kann folgende Parameter einfügen: 
# reduce_frequent_words oder BM25
ctfidf_model = ClassTfidfTransformer()
# besser mit Stoppwörtern:
#ctfidf_model = ClassTfidfTransformer(bm25_weighting=True)
#ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

In [24]:
# verschiedene Repräsentationsmöglichkeiten:
# Keywords
representation_model = [KeyBERTInspired(top_n_words=10), MaximalMarginalRelevance(diversity=0.95)]
# Sprachliche Muster
#pos_patterns = [[{'POS': 'ADJ'}, {'POS': 'NOUN'}], [{'POS': 'NOUN'}], [{'POS': 'ADJ'}]]
#representation_model = PartOfSpeech("de_core_web_sm", pos_patterns=pos_patterns)
# möglichst unterschiedliche Wörter
#representation_model = MaximalMarginalRelevance(diversity=0.3)

### Zwei unterschiedliche Quellen

In [25]:
import pandas as pd
# Path zu der .csv Datei
csv_file_path = "./sampled_subc_tp1.csv"  
# .csv Datei einlesen
df = pd.read_csv(csv_file_path, sep='\t')

# Quellen, die man miteinander vergleichen möchte
corpora = ['faz', 'BILD']  # nur zwei Quellen als Eingabemöglichkeit

# Listen für Korpus 1 (Position 1 in Liste)
korpus1_content = []
korpus1_text_date = []
korpus1_text_source = []
korpus1_text_id = []

# Listen für Korpus 2 (Position 2 in Liste)
korpus2_content = []
korpus2_text_date = []
korpus2_text_source = []
korpus2_text_id = []

# Loop durch alle Reihen in der .csv Datei
for index, row in df.iterrows():
    quelle = row['text_source']
    # Loop durch Quellen
    for corpus in quelle:
        if corpus in channel_id:
            # Liste für Korpus 1
            if corpora[0] == corpus:
                korpus1_content.append(row['text_content'])
                korpus1_text_date.append(row['text_date'])
                korpus1_text_source.append(row['text_source'])
                korpus1_text_id.append(row['text_id'])
            # Liste für Korpus 2
            elif corpora[1] == corpus:
                korpus2_content.append(row['text_content'])
                korpus2_text_date.append(row['text_date'])
                korpus2_text_source.append(row['text_source'])
                korpus2_text_id.append(row['text_id'])
                
# Kontrolle der Listenlänge: alle Listen müssen gleich lang sein         
print(len(korpus1_content))
print(len(korpus1_text_date))
print(len(korpus1_text_source))
print(len(korpus1_text_id))

print(len(korpus2_content))
print(len(korpus2_text_date))
print(len(korpus2_text_source))
print(len(korpus2_text_id))

NameError: name 'channel_id' is not defined

### Zwei unterschiedliche Zeiträume

In [26]:
import pandas as pd
from datetime import datetime

# Path zur .csv Datei
csv_file_path = "./sampled_subc_tp1.csv"  # neuer File Path erstellen
# .csv Datei einlesen
df = pd.read_csv(csv_file_path, sep='\t') 
# Funktion um Datum zu parsen
def parse_date(date_str):
    for fmt in ('%d-%m-%Y', '%Y-%m'):
        try:
            return datetime.strptime(date_str, fmt)
        except ValueError:
            pass
    raise ValueError('no valid date format found')

# Define your time frames as a dictionary with corpus names as keys
time_frames = {
    'korpus1': (parse_date('01-01-2020'), parse_date('01-01-2021')),
    'korpus2': (parse_date('01-01-2019'), parse_date('01-01-2020')),
}

# Initialize your sentences_corpus dictionary
korpus1_content = []
korpus1_text_date = []
korpus1_text_source = []
korpus1_text_id = []

korpus2_content = []
korpus2_text_date = []
korpus2_text_source = []
korpus2_text_id = []

# Loop through each row in the DataFrame
for index, row in df.iterrows():
    try:
        channel_date = parse_date(row['text_date'])
    except ValueError:
        continue  # Skip rows with invalid date formats

    # Check each corpus defined in time_frames
    for corpus_name, (start_date, end_date) in time_frames.items():
        if start_date <= channel_date <= end_date:
            if corpus_name == 'korpus1':
                korpus1_content.append(row['text_content'])
                korpus1_text_date.append(row['text_date'])
                korpus1_text_source.append(row['text_source'])
                korpus1_text_id.append(row['text_id'])
            elif corpus_name == 'korpus2':
                korpus2_content.append(row['text_content'])
                korpus2_text_date.append(row['text_date'])
                korpus2_text_source.append(row['text_source'])
                korpus2_text_id.append(row['text_id'])


print(len(korpus1_content))
print(len(korpus1_text_date))
print(len(korpus1_text_source))
print(len(korpus1_text_id))

print(len(korpus2_content))
print(len(korpus2_text_date))
print(len(korpus2_text_source))
print(len(korpus2_text_id))

721
721
721
721
663
663
663
663


### Zwei Korpora

In [None]:
filename = "./sampled_subc_tp1.csv"
csv_file_path = os.path.join(current_working_directory, filename)

df = pd.read_csv(csv_file_path, sep='\t', low_memory=False)

korpus1_content = df['text_content'].astype(str).tolist()
korpus1_text_date = df['text_date'].astype(str).tolist()
korpus1_text_source = df['text_source'].astype(str).tolist()
korpus1_text_id = df['text_id'].astype(str).tolist()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

<bertopic._bertopic.BERTopic at 0x17e5b7f70>

In [None]:
filename2 = "./modularity_sampled_telegram_data.csv"
csv_file_path2 = os.path.join(current_working_directory, filename2)
df_2 = pd.read_csv(csv_file_path2, sep=',', low_memory=False)

korpus2_content = df_2['message'].astype(str).tolist()
korpus2_text_date = df_2['date'].astype(str).tolist()
korpus2_text_source = df_2['mapped_number'].astype(str).tolist()
korpus2_text_source2 = df_2['channel_id'].astype(str).tolist()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

<bertopic._bertopic.BERTopic at 0x17e5d7f70>

### BERTopic

In [27]:
korpus1_model = BERTopic(embedding_model=embedding_model, representation_model=representation_model, umap_model=dim_model, hdbscan_model=cluster_model, vectorizer_model=vectorizer_model, ctfidf_model=ctfidf_model)
korpus1_model.fit(korpus1_content)

<bertopic._bertopic.BERTopic at 0x170b242b0>

In [28]:
korpus2_model = BERTopic(embedding_model=embedding_model, representation_model=representation_model, umap_model=dim_model, hdbscan_model=cluster_model, vectorizer_model=vectorizer_model, ctfidf_model=ctfidf_model)
korpus2_model.fit(korpus2_content)

<bertopic._bertopic.BERTopic at 0x170b240a0>

### Comparison

In [29]:
from sklearn.metrics.pairwise import cosine_similarity
sim_matrix = cosine_similarity(korpus1_model.topic_embeddings_, korpus2_model.topic_embeddings_)

import csv
# Define the file name for your CSV file
csv_file_path_compare = os.path.join(output_folder_name, 'compare' + '.csv')
print(csv_file_path_compare)
# Open the CSV file for writing
with open(csv_file_path_compare, mode='w', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)
    # Write a header row to the CSV file (optional)
    csv_writer.writerow(["Media Number", "Representation Media Topic", "Telegram Number", "Representation Media Topic"])
    # Initialize the topic variable
    topic = 0
    x = 0
    topic_dict_en = {}
    topic_dict_nl = {}
    # Loop 100 times
    for i in range(200):
        # Get the topics
        korpus1_topic = korpus1_model.get_topic(topic)
        korpus1_topic2 = korpus1_model.get_topic_info(topic)
        strings = korpus1_topic2['Topic']
        strings = str(strings).split('\n')
        strings =strings[0].split(' ')
        try:
            most_similar_topic_index = np.argmax(sim_matrix[topic + 1]) - 1
            korpus2_topic = korpus2_model.get_topic(most_similar_topic_index)
            korpus2_topic2 = korpus2_model.get_topic_info(most_similar_topic_index)
            strings2 = korpus2_topic2['Topic']
            strings2 = str(strings2).split('\n')
            strings2 = strings2[0].split(' ')
            if len(strings2) <= 4:
                strings2.append(strings2[3])
        except IndexError:
            break
        korpus1_topic = str(korpus1_topic).replace("[", "")
        korpus1_topic = korpus1_topic.replace("]", "")
        korpus1_topic = korpus1_topic.replace("(", "")
        korpus1_topic = korpus1_topic.replace('"', "")
        korpus1_topic = korpus1_topic.replace(')"', "")

        korpus2_topic = str(korpus2_topic).replace("[", "")
        korpus2_topic = korpus2_topic.replace("]", "")
        korpus2_topic = korpus2_topic.replace("(", "")
        korpus2_topic = korpus2_topic.replace('"', "")
        korpus2_topic = korpus2_topic.replace(')"', "")


        # Write the topics to the CSV file
        csv_writer.writerow([strings[4], korpus1_topic.replace("),", "\n"), strings2[4], korpus2_topic.replace("),", "\n")])
        # Increment the topic variable
        topic += 1
        x += 1


tp1_compare/compare.csv


In [30]:
def visualize_and_save_topics_html(model, output_folder_name, m, text_content, text_id, text_date, text_source):
    meta_data_id = {}
    meta_data_id['text_id'] = text_id
    meta_data_id['text_date'] = text_date
    meta_data_id['text_source'] = text_source
    topic_info = model.get_document_info(text_content, metadata=meta_data_id)
    # Assuming topic_info is a list of dictionaries with keys like 'document_id', 'topic_distribution', etc.
    # Saving the data to a .txt file
    topic_data = pd.DataFrame(topic_info)
    csv_file_path1 = os.path.join(output_folder_name, m + 'list_' + '.csv')
    topic_data.to_csv(csv_file_path1, index=False)  # Set index=False to exclude row indices in the output

visualize_and_save_topics_html(korpus1_model, output_folder_name, "korpus1", korpus1_content, korpus1_text_id, korpus1_text_date, korpus1_text_source)#, korpus1_embeddings)
visualize_and_save_topics_html(korpus2_model, output_folder_name, "korpus2", korpus2_content, korpus2_text_source, korpus2_text_date, korpus2_text_source)#, korpus2_embeddings)

In [31]:
import os
import plotly.io as pio
import pandas as pd


### Topics

In [32]:
def visualize_and_save_topics_html(model, output_folder_name, m):
    fig = model.visualize_topics()
    html = pio.to_html(fig)
    html_file_path = os.path.join(output_folder_name, m + '_topic' + '.html')
    with open(html_file_path, 'w') as f:
        f.write(html)
visualize_and_save_topics_html(korpus1_model, output_folder_name, "korpus1")#, korpus1_embeddings)
visualize_and_save_topics_html(korpus2_model, output_folder_name, "korpus2")#, korpus2_embeddings)

### Barchart

In [33]:
def visualize_and_save_barchart_topics(model, output_folder_name, m):
    fig0 = model.visualize_barchart(top_n_topics=20, n_words=20)
    # Convert the figure to HTML
    html0 = pio.to_html(fig0)
    html_file_path0 = os.path.join(output_folder_name, m + '_barchart' + '.html')
    with open(html_file_path0, 'w') as f:
        f.write(html0)
    

visualize_and_save_barchart_topics(korpus1_model, output_folder_name, "korpus1")#, korpus1_embeddings)
visualize_and_save_barchart_topics(korpus2_model, output_folder_name, "korpus2")#, korpus2_embeddings)


### Heatmap

In [34]:
# Function to visualize and save heatmap
def visualize_and_save_heatmap(model, output_folder_name, m):
    heatmap = model.visualize_heatmap()
    html4 = pio.to_html(heatmap)
    html_file_path3 = os.path.join(output_folder_name, m + '_heatmap' + '.html')
    with open(html_file_path3, 'w') as f:
        f.write(html4)

visualize_and_save_heatmap(korpus1_model, output_folder_name, "korpus1")#, korpus1_embeddings)
visualize_and_save_heatmap(korpus2_model, output_folder_name, "korpus2")#, korpus2_embeddings)


### Hierarchy

In [35]:
def visualize_and_save_hierarchical_topics(model, output_folder_name, m, text_content):
    hierarchical_topics = model.hierarchical_topics(text_content)
    fig2 = model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)
    html2 = pio.to_html(fig2)
    html_file_path2 = os.path.join(output_folder_name, m + '_hierarchical' + '.html')
    with open(html_file_path2, 'w') as f:
        f.write(html2)
    
visualize_and_save_hierarchical_topics(korpus1_model, output_folder_name, "korpus1", korpus1_content)#, korpus1_embeddings)
visualize_and_save_hierarchical_topics(korpus2_model, output_folder_name, "korpus2", korpus2_content)#, korpus2_embeddings)

100%|██████████| 9/9 [00:15<00:00,  1.70s/it]
100%|██████████| 9/9 [00:16<00:00,  1.85s/it]


### Dynamic

In [36]:
def visualize_clusters(model, output_folder_name, m, text_content, text_date):
    try:
        topics_over_time = model.topics_over_time(text_content, text_date, nr_bins=10)
        fig1 = model.visualize_topics_over_time(topics_over_time, top_n_topics=100)
        html1 = pio.to_html(fig1)
        html_file_path1 = os.path.join(output_folder_name, m + 'dynamic' + '.html')
        with open(html_file_path1, 'w') as f:
            f.write(html1)
    except ValueError:
        print('ValueError')
        pass
visualize_clusters(korpus1_model, output_folder_name, "korpus1", korpus1_content, korpus1_text_date)#, korpus1_embeddings)
visualize_clusters(korpus2_model, output_folder_name, "korpus2", korpus2_content, korpus2_text_date)#, korpus2_embeddings)

ValueError


### Document Clusters

In [37]:
korpus1_embeddings = embedding_model.encode(korpus1_content, show_progress_bar=True)
korpus2_embeddings = embedding_model.encode(korpus2_content, show_progress_bar=True)
def visualize_clusters(model, output_folder_name, m, text_content, embeddings):
    try:
        visualize_docs = model.visualize_documents(text_content, embeddings=embeddings, hide_annotations=True, hide_document_hover=True, width=2400, height=1400)
        html_file_path7 = os.path.join(output_folder_name, m + 'documents_' + '.html')
        html3 = pio.to_html(visualize_docs)
        with open(html_file_path7, 'w') as f:
            f.write(html3)
    except ValueError:
        pass
    # Get document info
    
visualize_clusters(korpus1_model, output_folder_name, "korpus1", korpus1_content, korpus1_embeddings)#, korpus1_embeddings)
visualize_clusters(korpus2_model, output_folder_name, "korpus2", korpus2_content, korpus2_embeddings)#, korpus2_embeddings)

Batches: 100%|██████████| 23/23 [00:24<00:00,  1.06s/it]
Batches: 100%|██████████| 21/21 [00:21<00:00,  1.04s/it]


### Class

In [40]:
def visualize_clusters(model, output_folder_name, m, text_content, classes):
    # Topics ordered by a specific class 
    try:
        topics_per_class = model.topics_per_class(text_content, classes=classes)
        visualize_class = model.visualize_topics_per_class(topics_per_class, top_n_topics=10)
        html5 = pio.to_html(visualize_class)
        html_file_path5 = os.path.join(output_folder_name, 'class' + '.html')
        with open(html_file_path5, 'w') as f:
            f.write(html5)
    except ValueError:
        print(f'inconsistent shapes {m}')
visualize_clusters(korpus1_model, output_folder_name, "korpus1", korpus1_content, korpus1_text_source)#, korpus1_embeddings)
visualize_clusters(korpus2_model, output_folder_name, "korpus2", korpus2_content, korpus2_text_source)#, korpus2_embeddings)


inconsistent shapes korpus1
inconsistent shapes korpus1


### Distribution of Topics in Topic

In [43]:
def visualize_clusters(model, output_folder_name, m, text_content, classes):
    # Topics ordered by a specific class 
    try:
        topic_distr, _ = model.approximate_distribution(text_content)
        visualize_class = model.visualize_distribution(topic_distr[0])
        html5 = pio.to_html(visualize_class)
        html_file_path5 = os.path.join(output_folder_name, 'distribution' + '.html')
        with open(html_file_path5, 'w') as f:
            f.write(html5)
    except ValueError:
        print(f'inconsistent shapes {m}')
visualize_clusters(korpus1_model, output_folder_name, "korpus1", korpus1_content, korpus1_text_source)#, korpus1_embeddings)
visualize_clusters(korpus2_model, output_folder_name, "korpus2", korpus2_content, korpus2_text_source)#, korpus2_embeddings)

inconsistent shapes korpus1
