In [None]:
# select Kernel conda_amazonei_pytorch_latest_p37
!pip install -r requirements.txt > /dev/null 2>&1 && echo "All packages were installed successfully." || echo "An error occurred while installing packages."

In [None]:
import boto3
import pandas as pd

# Name of the S3 bucket, key of the CSV object inside it, and local target file
bucket_name = 'task-bazaar-sagemaker'
file_key = 'pandas_df2.csv'
local_path = 'local-file-name.csv'

# Open an S3 resource handle and download the object to the local file
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name)
bucket.download_file(file_key, local_path)

# Load the downloaded CSV into a pandas DataFrame, requesting string dtype for all columns
df = pd.read_csv(local_path, dtype='str')

# Show the first five rows of the DataFrame
df.head()

In [None]:
# Text preprocessing: collapse the selected columns into one cleaned string per row.
# NOTE: this cell rebinds `df` from the loaded DataFrame to a Series of strings,
# so reload the data before running the cell a second time.
import re

from tqdm import tqdm

# track progress: registers progress_apply on pandas objects
tqdm.pandas()

# keep only the columns we deem necessary
TEXT_COLUMNS = ['Name', 'AT_MaraMatkl', 'AT_MaraMaktx', 'AT_MaraBrgew', 'AT_MaraMtart', 'AT_MaraLabor']


def clean_text(text):
    """Normalize the merged text of one row into a cleaned, space-separated string.

    Steps, in order: lower-case; drop tokens of two characters or fewer;
    drop tokens that contain a digit; replace every character outside
    [a-zA-Z0-9] with a space; collapse whitespace; drop repeated words,
    keeping the first occurrence of each.

    Args:
        text: Raw text of one row (all selected columns joined by spaces).

    Returns:
        The cleaned text. It is an empty string when no token survives.
    """
    tokens = [
        token
        for token in text.lower().split()
        if len(token) > 2 and not any(char.isdigit() for char in token)
    ]
    # NOTE(review): the ASCII-only pattern also blanks accented letters and umlauts,
    # and splitting on punctuation here can re-create fragments shorter than three
    # characters (e.g. "o-ring" -> "o ring"). Confirm both are intended.
    text = re.sub(r'[^a-zA-Z0-9]', ' ', ' '.join(tokens))
    # split() collapses repeated whitespace; dict.fromkeys de-duplicates while
    # preserving first-occurrence order in linear time.
    return ' '.join(dict.fromkeys(text.split()))


# Missing values are blanked first; otherwise astype(str) would turn them into
# the literal word "nan", which would then be merged into the documents.
df = (
    df[TEXT_COLUMNS]
    .fillna('')
    .astype(str)
    .progress_apply(lambda row: clean_text(' '.join(row)), axis=1)
)

In [None]:
# Build the plain Python list of documents from the preprocessed data.
# Only the first 20000 entries are kept for now; lift the cap once more
# computing power is available.
max_docs = 20000
df_list = df.values[:max_docs].tolist()

In [None]:
%%time
from bertopic import BERTopic

topic_model = BERTopic()
print("topic_model build")
topics, probs = topic_model.fit_transform(df_list)

In [None]:
# Fetch the topic overview from the fitted model and display it
topic_info = topic_model.get_topic_info()
topic_info

In [None]:
# Fetch the per-document information for the fitted documents and show the first ten rows
document_info = topic_model.get_document_info(df_list)
document_info.head(10)

In [None]:
# Visualization: hierarchical topic clustering (disabled by default).
# Set RUN_HIERARCHY_VIZ to True to compute the hierarchy and render the plot.
RUN_HIERARCHY_VIZ = False

if RUN_HIERARCHY_VIZ:
    from scipy.cluster import hierarchy as sch

    # Hierarchical topics, linked with single linkage and optimal leaf ordering
    linkage_function = lambda x: sch.linkage(x, 'single', optimal_ordering=True)
    hierarchical_topics = topic_model.hierarchical_topics(df_list, linkage_function=linkage_function)

    # Inside an `if` block the figure is not auto-displayed, so show it explicitly
    topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics).show()