## Installation

In [None]:
%pip install plotly
%pip install matplotlib
%pip install bertopic
%pip install sc

In [None]:
import plotly.express as px
import plotly.io as pio
import matplotlib as plt
import os
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, OpenAI, PartOfSpeech
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import MaximalMarginalRelevance
from bertopic.backend import BaseEmbedder
import pandas as pd

## Output Folder

In [None]:
# Name of the sub-project (used as the prefix of every output file) and of the
# folder that receives all outputs. Both are placeholders to fill in before running.
teilprojekt = ''
output_folder_name = ''
# Create the output folder if it doesn't exist. `exist_ok=True` avoids the
# check-then-create race, and the guard skips the call while the placeholder is
# still empty (os.makedirs('') raises FileNotFoundError).
if output_folder_name:
    os.makedirs(output_folder_name, exist_ok=True)

### Corpus

In [None]:
import os
import pandas as pd

# Path of the corpus file (placeholder, fill in before running).
# The file is read as comma-separated text despite the `tsv_` variable name.
filename = ''
tsv_file_path = os.path.join(filename)

# Load the corpus into a DataFrame
df = pd.read_csv(tsv_file_path, sep=',', low_memory=False)

# Show the header (to know which columns are available) and the frame itself
print(df.columns)
print(df)

# Number of rows (documents) in the corpus
num_rows = len(df)

# Pull the columns used by the rest of the notebook out as lists of strings
# (adjust the column names to the printed header if necessary)
korpus_content, korpus_text_date, korpus_text_source, korpus_text_id = (
    df[column].astype(str).tolist()
    for column in ('text_content', 'text_date', 'text_source', 'text_id')
)


## Data Processing

This section configures the models and algorithms that BERTopic uses to process the texts: embedding, dimensionality reduction, clustering, vectorization, c-TF-IDF weighting, and topic representation.

### BERTopic Parameters

In [None]:
from transformers.pipelines import pipeline
from transformers import pipeline
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from transformers import AutoModelForMaskedLM, AutoTokenizer
from safetensors import safe_open
import torch
# Embedding model handed to BERTopic below (as `embedding_model=`).
# The active choice is a multilingual sentence-transformers checkpoint.
embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
# Commented-out alternatives (enable at most one instead of the line above):
#matryoshka_dim = 1024 # How big your embeddings should be, choose from: 64, 128, 256, 512, 768, 1024
#model = SentenceTransformer("aari1995/German_Semantic_V3")
#embedding_model = pipeline("feature-extraction", model="dbmdz/bert-base-german-cased")
#embedding_model = pipeline(model="ZurichNLP/swissbert")
#embedding_model = SentenceTransformer('sentence-transformers/distiluse-base-multilingual-cased-v2')
#embedding_model = AutoModelForMaskedLM.from_pretrained('checkpoint-88500', use_safetensors=True)



In [None]:
from umap import UMAP
from sklearn.decomposition import PCA
# Dimensionality-reduction model handed to BERTopic below (as `umap_model=`).
# n_neighbors: higher values take a more "global" view of the embeddings (larger clusters),
#              lower values take a more "local" view of the embeddings
# n_components: lower values affect the quality of the embeddings,
#               higher values take longer and make HDBSCAN slower
# random_state: UMAP is stochastic; fixing the seed makes "Restart & Run All"
#               produce the same topics every time (UMAP then runs single-threaded).
dim_model = UMAP(
    n_neighbors=12,
    n_components=10,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)


# A faster way to reduce the dimensions
#dim_model = PCA(n_components=5)
#topic_model = BERTopic(umap_model=dim_model)

In [None]:
from hdbscan import HDBSCAN
from sklearn.cluster import KMeans

# Clustering model handed to BERTopic below (as `hdbscan_model=`).
# min_cluster_size: the minimum size a cluster must have
# min_samples: controls the number of outliers; lower values reduce the outliers
cluster_model = HDBSCAN(
    min_cluster_size=10,
    min_samples=3,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)
# Alternative way of clustering that does not produce outliers
#cluster_model = KMeans(n_clusters=20)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Vectorizer handed to BERTopic below (as `vectorizer_model=`).
# min_df: how often a word must occur before it enters the representation;
#         this shortens the computation for large documents
# ngram_range: determines the length of the n-grams that appear in the representation
stopword = []
vectorizer_model = CountVectorizer(
    stop_words=stopword,
    min_df=3,
    max_df=0.7,
)

# max_features: limits the topic-term matrix, as an alternative to tuning min_df
#vectorizer_model = CountVectorizer(max_features=10_000)

In [None]:
from bertopic.vectorizers import ClassTfidfTransformer
# c-TF-IDF model handed to BERTopic below (as `ctfidf_model=`).
# The following parameters can be set:
# reduce_frequent_words or BM25 weighting
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
# Better when stop words are present:
#ctfidf_model = ClassTfidfTransformer(bm25_weighting=True)
#ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

In [None]:
# Different ways to represent the topics. Exactly one assignment to
# `representation_model` should be active; previously the first option was
# assigned and then silently overwritten by the last one.
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, OpenAI, PartOfSpeech

# Option 1 - keywords (KeyBERT-inspired, followed by MMR)
#representation_model = [KeyBERTInspired(top_n_words=10), MaximalMarginalRelevance(diversity=0.3)]

# Option 2 - linguistic patterns (part-of-speech patterns via spaCy)
#import spacy
#spacy_de = spacy.load('de_core_news_sm')
#
#pos_patterns = [[{'POS': 'ADJ'}, {'POS': 'NOUN'}], [{'POS': 'NOUN'}], [{'POS': 'ADJ'}]]
#representation_model = PartOfSpeech(spacy_de, pos_patterns=pos_patterns)

# Option 3 (active) - words that are as different from each other as possible
representation_model = MaximalMarginalRelevance(diversity=0.3)


## BERTopic, Zero-Shot BERTopic, Guided BERTopic

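Two different ways to perform BERTopic modeling:
	1.	BERTopic: Standard topic modeling, without specifying in advance which topics will appear.
	2.	Zero-Shot BERTopic: Topic modeling with a list of predefined topics. Documents that are similar enough to a predefined topic are assigned to it; the remaining documents are clustered as in standard BERTopic.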

### Ausführung BERTopic

In [None]:
# Assemble the BERTopic pipeline from the components configured above
# and fit it on the corpus texts.
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=dim_model,
    hdbscan_model=cluster_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
)
topic_model.fit(korpus_content)

In [None]:
# Append the configuration of this run to the model log.
# - os.path.join keeps the file inside the output folder (the f-string path
#   resolved to '/model_log.txt' when the folder name was empty).
# - UTF-8 is set explicitly so the log does not depend on the platform default.
# - The tutorial boilerplate (fake debug/error/critical lines and the `divide`
#   demo) was removed: it wrote misleading error entries into every log.
log_path = os.path.join(output_folder_name, 'model_log.txt')
with open(log_path, 'a', encoding='utf-8') as log_file:
    for label, component in (
        ('embedding_model', embedding_model),
        ('dim_model', dim_model),
        ('cluster_model', cluster_model),
        ('vectorizer_model', vectorizer_model),
        ('ctfidf_model', ctfidf_model),
        ('representation_model', representation_model),
    ):
        log_file.write(f'{label}: {component}\n')


### Ausführung BERTopic Zero Shot

In [None]:
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired

# We define a number of topics that we know are in the documents
# (the closing bracket was missing, which made the whole cell a SyntaxError)
zeroshot_topic_list = ["Solidarität"]
# Similarity threshold passed to BERTopic as `zeroshot_min_similarity`
zeroshot_min_similarity = 0.37

stopword = []
vectorizer_model = CountVectorizer(stop_words=stopword, ngram_range=(1, 3), max_df=0.7)

# Components used only by the zero-shot model; named so that the log below
# records what this model actually uses.
zeroshot_cluster_model = KMeans()
zeroshot_representation_model = KeyBERTInspired()

topic_model = BERTopic(
    top_n_words=10,
    min_topic_size=10,
    nr_topics=None,
    low_memory=False,
    calculate_probabilities=False,
    seed_topic_list=None,
    zeroshot_topic_list=zeroshot_topic_list,
    zeroshot_min_similarity=zeroshot_min_similarity,
    umap_model=None,
    embedding_model=embedding_model,
    vectorizer_model=vectorizer_model,
    hdbscan_model=zeroshot_cluster_model,
    ctfidf_model=ctfidf_model,
    representation_model=zeroshot_representation_model,
)

# Append the configuration of this run to the model log. The labels now match
# the logged values (the zero-shot list and threshold were previously written
# under 'cluster_model' and 'ctfidf_model'), and the tutorial boilerplate
# (fake log levels, `divide` demo) is gone.
log_path = os.path.join(output_folder_name, 'model_log.txt')
with open(log_path, 'a', encoding='utf-8') as log_file:
    for label, component in (
        ('embedding_model', embedding_model),
        ('umap_model', None),  # umap_model=None is passed to BERTopic above
        ('cluster_model', zeroshot_cluster_model),
        ('vectorizer_model', vectorizer_model),
        ('ctfidf_model', ctfidf_model),
        ('representation_model', zeroshot_representation_model),
        ('zeroshot_topic_list', zeroshot_topic_list),
        ('zeroshot_min_similarity', zeroshot_min_similarity),
    ):
        log_file.write(f'{label}: {component}\n')

topics, _ = topic_model.fit_transform(korpus_content)

## Save the model

Save the fitted model so that it can be loaded later instead of being trained again.

In [None]:
# Method 1 - safetensors
# `save_embedding_model` takes a string pointing to the embedding model, so it
# must name the model the topics were fitted with ("paraphrase-multilingual-MiniLM-L12-v2"),
# not a different checkpoint. The pointer lives in its own variable so that the
# `embedding_model` object used elsewhere in the notebook is not replaced by a string.
embedding_model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
topic_model.save("model_solidarität_orgspende_tp2", serialization="safetensors", save_ctfidf=True, save_embedding_model=embedding_model_name)


In [None]:
# Method 2 - pytorch
# Pass an explicit string pointer to the embedding model the topics were fitted
# with. The former `embedding_model = embedding_model` was a no-op and passed
# whatever the variable happened to hold at that point.
embedding_model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
topic_model.save("model", serialization="pytorch", save_ctfidf=True, save_embedding_model=embedding_model_name)


In [None]:
# Load from directory
# "model" is the directory written by the pytorch save cell above; the loaded
# model replaces the in-memory `topic_model`.
topic_model = BERTopic.load("model")


In [None]:
# Fit the current `topic_model` on the corpus and keep the per-document topics.
# NOTE(review): if the model was just loaded from disk, this fits it again —
# confirm whether `transform` was intended here instead.
topics, probs = topic_model.fit_transform(korpus_content)
# (Removed: `topic_model_distr = BERTopic().fit(korpus_content)` trained a second,
# default-configured model on the whole corpus whose result was never used.)

# Reduce outliers: returns a new topic assignment per document.
# NOTE(review): `new_topics` is only computed, not written back to the model;
# see the BERTopic outlier-reduction docs (update_topics) if it should be applied.
new_topics = topic_model.reduce_outliers(korpus_content, topics)
# Versions of dependencies and Python matter: load and save a model with the
# same dependencies and Python version. A model saved with one version of
# BERTopic should not be loaded with another.
#topic_model = BERTopic.load("meditopic_model")

## Data Analysis

### Overview Data

In [None]:
def visualize_and_save_topics_html(model, output_folder_name, m, text_content, text_date, text_source, text_id):
    """Write the topic overview of ``model`` to ``<m>_topic_list.csv``.

    Despite its name, this function produces a CSV file, not HTML.

    Args:
        model: Object providing ``get_topic_info()``.
        output_folder_name: Folder the CSV file is written to.
        m: Prefix of the output file name.
        text_content, text_date, text_source, text_id: Accepted for signature
            compatibility with the calling cell; not used here.
    """
    # Use the model that was passed in. The previous version read the global
    # `topic_model` and silently ignored the `model` argument.
    topic_info = model.get_topic_info()
    topic_data = pd.DataFrame(topic_info)
    csv_file_path1 = os.path.join(output_folder_name, m + '_topic_list' + '.csv')
    # index=False keeps the row index out of the file
    topic_data.to_csv(csv_file_path1, index=False)

visualize_and_save_topics_html(topic_model, output_folder_name, teilprojekt, korpus_content, korpus_text_id, korpus_text_date, korpus_text_source)#, korpus_embeddings)


In [None]:
def visualize_and_save_topics_html(model, output_folder_name, m, text_content, text_id, text_date, text_source):
    """Write the per-document topic information to ``<m>_list.csv``.

    The id, date and source of every text are handed to
    ``model.get_document_info`` as metadata. Despite its name, this function
    produces a CSV file, not HTML.

    Args:
        model: Object providing ``get_document_info(docs, metadata=...)``.
        output_folder_name: Folder the CSV file is written to.
        m: Prefix of the output file name.
        text_content: The documents.
        text_id, text_date, text_source: Metadata values, passed under the keys
            ``text_id``, ``text_date`` and ``text_source``.
    """
    metadata = {
        'text_id': text_id,
        'text_date': text_date,
        'text_source': text_source,
    }
    document_frame = pd.DataFrame(model.get_document_info(text_content, metadata=metadata))
    target_path = os.path.join(output_folder_name, m + '_list' + '.csv')
    # The row index is not part of the exported file
    document_frame.to_csv(target_path, index=False)

visualize_and_save_topics_html(topic_model, output_folder_name, teilprojekt, korpus_content, korpus_text_id, korpus_text_date, korpus_text_source)#, korpus_embeddings)

### Topics

In [None]:
def visualize_and_save_barchart_topics(model, output_folder_name, m):
    """Save the figure returned by ``model.visualize_topics()`` as ``<m>_topics.html``.

    The function name is reused by several cells of this notebook; this
    version covers the topics overview, not a bar chart.

    Args:
        model: Object providing ``visualize_topics()``.
        output_folder_name: Folder the HTML file is written to.
        m: Prefix of the output file name.
    """
    fig0 = model.visualize_topics()
    # Convert the figure to HTML
    html0 = pio.to_html(fig0)
    html_file_path0 = os.path.join(output_folder_name, m + '_topics' + '.html')
    # Explicit UTF-8: the platform default encoding may not be able to
    # represent non-ASCII characters (e.g. umlauts in topic words).
    with open(html_file_path0, 'w', encoding='utf-8') as f:
        f.write(html0)
    

visualize_and_save_barchart_topics(topic_model, output_folder_name, teilprojekt)

### Term Rank

In [None]:
def visualize_and_save_barchart_topics(model, output_folder_name, m):
    """Save the figure returned by ``model.visualize_term_rank()`` as ``<m>_term_rank.html``.

    The function name is reused by several cells of this notebook; this
    version covers the term-rank plot.

    Args:
        model: Object providing ``visualize_term_rank()``.
        output_folder_name: Folder the HTML file is written to.
        m: Prefix of the output file name.
    """
    fig0 = model.visualize_term_rank()
    # Convert the figure to HTML
    html0 = pio.to_html(fig0)
    html_file_path0 = os.path.join(output_folder_name, m + '_term_rank' + '.html')
    # Explicit UTF-8: the platform default encoding may not be able to
    # represent non-ASCII characters (e.g. umlauts in topic words).
    with open(html_file_path0, 'w', encoding='utf-8') as f:
        f.write(html0)
    

visualize_and_save_barchart_topics(topic_model, output_folder_name, teilprojekt)

### Barchart

In [None]:
def visualize_and_save_barchart_topics(model, output_folder_name, m):
    """Save the topic bar chart of ``model`` as ``<m>_barchart.html``.

    The chart is requested with ``top_n_topics=20`` and ``n_words=20``.

    Args:
        model: Object providing ``visualize_barchart(top_n_topics=..., n_words=...)``.
        output_folder_name: Folder the HTML file is written to.
        m: Prefix of the output file name.
    """
    fig0 = model.visualize_barchart(top_n_topics=20, n_words=20)
    # Convert the figure to HTML
    html0 = pio.to_html(fig0)
    html_file_path0 = os.path.join(output_folder_name, m + '_barchart' + '.html')
    # Explicit UTF-8: the platform default encoding may not be able to
    # represent non-ASCII characters (e.g. umlauts in topic words).
    with open(html_file_path0, 'w', encoding='utf-8') as f:
        f.write(html0)
    

visualize_and_save_barchart_topics(topic_model, output_folder_name, teilprojekt)

### Heatmap

In [None]:
# Function to visualize and save heatmap
def visualize_and_save_heatmap(model, output_folder_name, m):
    """Save the figure returned by ``model.visualize_heatmap()`` as ``<m>_heatmap.html``.

    Args:
        model: Object providing ``visualize_heatmap()``.
        output_folder_name: Folder the HTML file is written to.
        m: Prefix of the output file name.
    """
    heatmap = model.visualize_heatmap()
    html4 = pio.to_html(heatmap)
    html_file_path3 = os.path.join(output_folder_name, m + '_heatmap' + '.html')
    # Explicit UTF-8: the platform default encoding may not be able to
    # represent non-ASCII characters (e.g. umlauts in topic words).
    with open(html_file_path3, 'w', encoding='utf-8') as f:
        f.write(html4)

visualize_and_save_heatmap(topic_model, output_folder_name, teilprojekt)

### Hierarchy

In [None]:
def visualize_and_save_hierarchical_topics(model, output_folder_name, m, korpus_content):
    """Save the topic hierarchy of ``model`` as ``<m>_hierarchical.html``.

    Args:
        model: Object providing ``hierarchical_topics(docs)`` and
            ``visualize_hierarchy(hierarchical_topics=...)``.
        output_folder_name: Folder the HTML file is written to.
        m: Prefix of the output file name.
        korpus_content: The documents the hierarchy is computed from.

    A ``TypeError`` raised while building or rendering the hierarchy is
    reported on stdout and no file is written; other exceptions propagate.
    """
    try:
        hierarchical_topics = model.hierarchical_topics(korpus_content)
        fig2 = model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)
        html2 = pio.to_html(fig2)
        html_file_path2 = os.path.join(output_folder_name, m + '_hierarchical' + '.html')
        # Explicit UTF-8 so non-ASCII topic words do not depend on the platform default
        with open(html_file_path2, 'w', encoding='utf-8') as f:
            f.write(html2)
    except TypeError as e:
        # Still best-effort, but no longer silent: say why the output is missing
        print(f'Hierarchy could not be visualized: {e}')
    
visualize_and_save_hierarchical_topics(topic_model, output_folder_name, teilprojekt, korpus_content)

In [None]:
def visualize_and_save_hierarchical_topics(model, output_folder_name, m, korpus_content):
    """Save the hierarchical document plot as ``<m>_hierarchical_documents.html``.

    Args:
        model: Object providing ``hierarchical_topics(docs)`` and
            ``visualize_hierarchical_documents(docs, hierarchical_topics)``.
        output_folder_name: Folder the HTML file is written to.
        m: Prefix of the output file name.
        korpus_content: The documents to plot.

    A ``TypeError`` raised while building or rendering the plot is reported on
    stdout and no file is written; other exceptions propagate.
    """
    try:
        hierarchical_topics = model.hierarchical_topics(korpus_content)
        fig2 = model.visualize_hierarchical_documents(korpus_content, hierarchical_topics)
        html2 = pio.to_html(fig2)
        html_file_path2 = os.path.join(output_folder_name, m + '_hierarchical_documents' + '.html')
        # Explicit UTF-8 so non-ASCII topic words do not depend on the platform default
        with open(html_file_path2, 'w', encoding='utf-8') as f:
            f.write(html2)
    except TypeError as e:
        # Still best-effort, but no longer silent: say why the output is missing
        print(f'Hierarchical documents could not be visualized: {e}')
visualize_and_save_hierarchical_topics(topic_model, output_folder_name, teilprojekt, korpus_content)

In [None]:
def visualize_and_save_hierarchical_topics(model, output_folder_name, m, korpus_content):
    """Print the topic tree of ``model``.

    Nothing is saved by this version; ``output_folder_name`` and ``m`` are
    accepted only so the signature matches the other hierarchy cells.

    Args:
        model: Object providing ``hierarchical_topics(docs)`` and
            ``get_topic_tree(hierarchical_topics)``.
        output_folder_name: Unused.
        m: Unused.
        korpus_content: The documents the hierarchy is computed from.

    A ``TypeError`` raised while building the tree is reported on stdout;
    other exceptions propagate.
    """
    try:
        hierarchical_topics = model.hierarchical_topics(korpus_content)
        tree = model.get_topic_tree(hierarchical_topics)
        print(tree)
    except TypeError as e:
        # Still best-effort, but no longer silent: say why nothing was printed
        print(f'Topic tree could not be built: {e}')
visualize_and_save_hierarchical_topics(topic_model, output_folder_name, teilprojekt, korpus_content)

### Dynamic

In [None]:
# Compute the topics over time from the documents and their date strings, using 20 bins.
# NOTE(review): the functions in the following cells call `topics_over_time`
# themselves (with nr_bins=100) and do not read this variable — confirm
# whether this cell is still needed.
topics_over_time = topic_model.topics_over_time(korpus_content, korpus_text_date, nr_bins=20)

In [None]:
import pandas as pd
import os
import plotly.io as pio

def visualize_clusters(model, output_folder_name, m, korpus_content, text_date):
    """Save the topics-over-time plot as ``<m>dynamic_loaded.html``.

    Dates are parsed with pandas (mixed formats allowed). Documents whose date
    cannot be parsed are excluded together with their topic assignment, so
    that documents, timestamps and topics stay aligned.

    Args:
        model: Object providing ``topics_over_time`` and
            ``visualize_topics_over_time``; ``model.topics_`` is read when
            documents have to be dropped.
        output_folder_name: Folder the HTML file is written to.
        m: Prefix of the output file name.
        korpus_content: The documents.
        text_date: One date per document, in any format pandas can parse.

    ``IndexError`` and ``ValueError`` are reported on stdout and no file is
    written; other exceptions propagate.
    """
    try:
        # Unparseable dates become NaT instead of raising
        formatted_dates = pd.to_datetime(text_date, errors='coerce', format='mixed')
        valid_mask = pd.notna(formatted_dates)

        # None lets the model fall back to its own topic assignments
        topics = None
        if not valid_mask.all():
            print("Some dates could not be parsed and will be excluded.")
            keep = list(valid_mask)
            formatted_dates = formatted_dates[valid_mask]
            korpus_content = [content for content, ok in zip(korpus_content, keep) if ok]
            # The fitted topic assignments have one entry per ORIGINAL document.
            # Filter them with the same mask; otherwise the document and topic
            # lists have different lengths and topics_over_time fails.
            fitted_topics = getattr(model, 'topics_', None)
            if fitted_topics is not None:
                topics = [topic for topic, ok in zip(fitted_topics, keep) if ok]

        topics_over_time = model.topics_over_time(korpus_content, formatted_dates, topics=topics, nr_bins=100)
        fig1 = model.visualize_topics_over_time(topics_over_time, top_n_topics=100)
        html1 = pio.to_html(fig1)
        html_file_path1 = os.path.join(output_folder_name, m + 'dynamic_loaded' + '.html')
        # Explicit UTF-8 so non-ASCII topic words do not depend on the platform default
        with open(html_file_path1, 'w', encoding='utf-8') as f:
            f.write(html1)
    except (IndexError, ValueError) as e:
        print(f'Error: {str(e)}')

# Example usage
visualize_clusters(topic_model, output_folder_name, teilprojekt, korpus_content, korpus_text_date)


In [None]:
def visualize_clusters(model, output_folder_name, m, korpus_content, text_date):
    """Save the topics-over-time plot as ``<m>dynamic_loaded.html``.

    The dates are passed to the model unchanged.

    Args:
        model: Object providing ``topics_over_time`` and
            ``visualize_topics_over_time``.
        output_folder_name: Folder the HTML file is written to.
        m: Prefix of the output file name.
        korpus_content: The documents.
        text_date: One timestamp per document.

    ``IndexError`` and ``ValueError`` are reported on stdout and no file is
    written; other exceptions propagate.
    """
    try:
        topics_over_time = model.topics_over_time(korpus_content, text_date, nr_bins=100)
        fig1 = model.visualize_topics_over_time(topics_over_time, top_n_topics=100)
        html1 = pio.to_html(fig1)
        html_file_path1 = os.path.join(output_folder_name, m + 'dynamic_loaded' + '.html')
        # Explicit UTF-8 so non-ASCII topic words do not depend on the platform default
        with open(html_file_path1, 'w', encoding='utf-8') as f:
            f.write(html1)
    # `except IndexError or ValueError` evaluates to `except IndexError` only;
    # a tuple is needed to catch both. The real message is printed instead of
    # a fixed text that may not match the actual error.
    except (IndexError, ValueError) as e:
        print(f'{type(e).__name__}: {e}')

visualize_clusters(topic_model, output_folder_name, teilprojekt, korpus_content, korpus_text_date)

### Mapping

In [None]:
# Pre-compute the document embeddings once so the document plot can reuse them.
# A SentenceTransformer is not called directly; `encode` is the method that
# turns texts into embeddings (calling the object with `show_progress_bar`
# raises a TypeError). `embedding_model` must still be the SentenceTransformer
# object here, not a model-name string.
korpus_embeddings = embedding_model.encode(korpus_content, show_progress_bar=True)

def visualize_clusters(model, output_folder_name, m, korpus_content, embeddings):
    """Save the document plot of ``model`` as ``<m>_documents.html``.

    The plot is requested without annotations and without document hover
    texts, at 2400 x 1400.

    Args:
        model: Object providing ``visualize_documents``.
        output_folder_name: Folder the HTML file is written to.
        m: Prefix of the output file name.
        korpus_content: The documents.
        embeddings: Precomputed embeddings, passed as ``embeddings=``.

    An ``IndexError`` is reported on stdout and no file is written; other
    exceptions propagate.
    """
    try:
        visualize_docs = model.visualize_documents(korpus_content, embeddings=embeddings, hide_annotations=True, hide_document_hover=True, width=2400, height=1400)
        html_file_path7 = os.path.join(output_folder_name, m + '_documents' + '.html')
        html3 = pio.to_html(visualize_docs)
        # Explicit UTF-8 so non-ASCII topic words do not depend on the platform default
        with open(html_file_path7, 'w', encoding='utf-8') as f:
            f.write(html3)
    except IndexError as e:
        # Report the real error. The old handler printed 'ValueError' for an
        # IndexError and wrapped that print in a try/except that could never fire.
        print(f'IndexError: {e}')
    # Get document info
    
visualize_clusters(topic_model, output_folder_name, teilprojekt, korpus_content, korpus_embeddings)

In [None]:
def visualize_and_save_barchart_topics(model, output_folder_name, m, docs=None, embeddings=None):
    """Save the document datamap of ``model`` as ``<m>_datamap.png``.

    The function name is reused by several cells of this notebook; this
    version covers the datamap. Per the BERTopic documentation,
    ``visualize_document_datamap`` needs the documents and returns a
    matplotlib figure, so it is saved with ``savefig`` instead of being passed
    to plotly's ``to_html``.

    Args:
        model: Object providing ``visualize_document_datamap(docs, embeddings=...)``.
        output_folder_name: Folder the image is written to.
        m: Prefix of the output file name.
        docs: The documents to plot (required).
        embeddings: Optional precomputed embeddings, passed as ``embeddings=``.

    Raises:
        ValueError: If ``docs`` is not given.
    """
    if docs is None:
        raise ValueError('docs is required to draw the document datamap')
    fig0 = model.visualize_document_datamap(docs, embeddings=embeddings)
    file_path0 = os.path.join(output_folder_name, m + '_datamap' + '.png')
    fig0.savefig(file_path0, bbox_inches='tight')


visualize_and_save_barchart_topics(topic_model, output_folder_name, teilprojekt, korpus_content)

### Class

In [None]:
def visualize_clusters(model, output_folder_name, m, korpus_content, classes):
    """Save the topics-per-class plot as ``<m>_class.html``.

    The plot is requested with ``top_n_topics=10``.

    Args:
        model: Object providing ``topics_per_class`` and
            ``visualize_topics_per_class``.
        output_folder_name: Folder the HTML file is written to.
        m: Prefix of the output file name.
        korpus_content: The documents.
        classes: One class label per document (the notebook passes the text source).

    An ``IndexError`` is reported on stdout and no file is written; other
    exceptions propagate.
    """
    # Topics ordered by a specific class
    try:
        topics_per_class = model.topics_per_class(korpus_content, classes=classes)
        visualize_class = model.visualize_topics_per_class(topics_per_class, top_n_topics=10)
        html5 = pio.to_html(visualize_class)
        html_file_path5 = os.path.join(output_folder_name, m + '_class' + '.html')
        # Explicit UTF-8 so non-ASCII topic words do not depend on the platform default
        with open(html_file_path5, 'w', encoding='utf-8') as f:
            f.write(html5)
    except IndexError as e:
        # Print the real error; the old fixed text described a ValueError
        print(f'IndexError: {e}')
visualize_clusters(topic_model, output_folder_name, teilprojekt, korpus_content, korpus_text_source)

### Distribution of Topics in a Document

In [None]:
def visualize_clusters(model, output_folder_name, m, text_content):
    """Save the topic distribution of the first document as ``<m>_distribution.html``.

    Args:
        model: Object providing ``approximate_distribution(docs)`` (returning a
            pair whose first element holds one distribution per document) and
            ``visualize_distribution``.
        output_folder_name: Folder the HTML file is written to.
        m: Prefix of the output file name.
        text_content: The documents; only the distribution of the first one is plotted.

    A ``ValueError`` is reported on stdout and no file is written; other
    exceptions propagate.
    """
    try:
        topic_distr, _ = model.approximate_distribution(text_content)
        visualize_class = model.visualize_distribution(topic_distr[0])
        html5 = pio.to_html(visualize_class)
        # Include the prefix `m`, as every other output of this notebook does;
        # it was accepted as a parameter but left out of the file name.
        html_file_path5 = os.path.join(output_folder_name, m + '_distribution' + '.html')
        # Explicit UTF-8 so non-ASCII topic words do not depend on the platform default
        with open(html_file_path5, 'w', encoding='utf-8') as f:
            f.write(html5)
    except ValueError as e:
        # Print the real error; the old fixed text named a different exception
        print(f'ValueError: {e}')
visualize_clusters(topic_model, output_folder_name, "korpus", korpus_content)#, korpus1_embeddings)
