## BERTopic
BERTopic is a topic modeling technique that leverages 🤗 transformers and a custom class-based TF-IDF to create dense clusters allowing for easily interpretable topics whilst keeping important words in the topic descriptions.

# Enabling the GPU

First, you'll need to enable GPUs for the notebook:

- Navigate to Edit→Notebook Settings
- Select GPU from the Hardware Accelerator drop-down

# **Installing BERTopic**

We start by installing BERTopic from PyPI:

In [1]:
%%capture
!pip install bertopic

## Restart the Notebook
Installing BERTopic updates some packages that were already loaded. To make sure the updated versions are the ones in use, restart the notebook now.

From the Menu:

Runtime → Restart Runtime

# Connect to SharePoint

In [2]:
!pip install office365
!pip install Office365-REST-Python-Client

Collecting office365
  Downloading office365-0.3.15-py3-none-any.whl (32 kB)
Collecting azure-storage-blob (from office365)
  Downloading azure_storage_blob-12.17.0-py3-none-any.whl (388 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m388.0/388.0 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting O365 (from office365)
  Downloading O365-2.0.27-py3-none-any.whl (164 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m164.2/164.2 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pymiscutils (from office365)
  Downloading pymiscutils-0.3.14-py3-none-any.whl (14 kB)
Collecting pathmagic (from office365)
  Downloading pathmagic-0.3.14-py3-none-any.whl (21 kB)
Collecting pyiotools (from office365)
  Downloading pyiotools-0.3.18-py3-none-any.whl (47 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.1/47.1 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pysubtypes (from office365)
  Downloading pysub

In [3]:
from office365.runtime.auth.authentication_context import AuthenticationContext
from office365.sharepoint.client_context import ClientContext
from office365.runtime.auth.client_credential import ClientCredential
from office365.sharepoint.files.file import File

####inputs########
import os
from getpass import getpass

# URL of the SharePoint site the notebook connects to.
url_shrpt = 'https://ihuedu.sharepoint.com/sites/EDYTEProject2023/'

# Credentials are never stored in the notebook: they are read from the
# SHAREPOINT_USERNAME / SHAREPOINT_PASSWORD environment variables and, when
# those are not set, asked for interactively (the password is not echoed).
username_shrpt = os.environ.get('SHAREPOINT_USERNAME') or input('SharePoint username: ')
password_shrpt = os.environ.get('SHAREPOINT_PASSWORD') or getpass('SharePoint password: ')

# Server-relative folders: raw input CSVs and the destination for analysis exports.
folder_files_url_shrpt = '/sites/EDYTEProject2023/Shared%20Documents/General/wp5_data/raw_Data/'
folder_analysis_url_shrpt = '/sites/EDYTEProject2023/Shared%20Documents/General/wp5_data/analysis_exports/'

In [4]:
### Authentication: sign in to the SharePoint site ###
ctx_auth = AuthenticationContext(url_shrpt)
if not ctx_auth.acquire_token_for_user(username_shrpt, password_shrpt):
  # Fail fast: without a token `ctx` would never be created and every later
  # cell would break with an unrelated NameError.
  raise RuntimeError('SharePoint authentication failed: {}'.format(ctx_auth.get_last_error()))

# `ctx` is the client context used by every later SharePoint call in this notebook.
ctx = ClientContext(url_shrpt, ctx_auth)
web = ctx.web
ctx.load(web)
ctx.execute_query()
print('Authenticated into sharepoint as: ',web.properties['Title'])
############################

Authenticated into sharepoint as:  EDYTE Project 2023


In [5]:
#### List the names of the files (or sub-folders) stored in a SharePoint folder ###

def print_folder_contents(ctx, folder_url, list_folders=False):
    """Return the names of the entries directly inside a SharePoint folder.

    Args:
        ctx: SharePoint client context used to load and execute the query.
        folder_url: server-relative URL of the folder to inspect.
        list_folders: when False (default) the folder's files are listed;
            when True its sub-folders are listed instead.

    Returns:
        A list with the "Name" property of every entry, or None when the
        query fails (the error is printed rather than raised).
    """
    try:
        folder = ctx.web.get_folder_by_server_relative_url(folder_url)
        entries = folder.folders if list_folders else folder.files
        ctx.load(entries)
        ctx.execute_query()
        return [entry.properties["Name"] for entry in entries]
    except Exception as e:
        # Best effort by design: report the problem and let the caller see None.
        print('Problem printing out library contents: ', e)
        return None
######################################################

# List the raw-data folder and show which files are available for import.
filelist_shrpt = print_folder_contents(ctx, folder_files_url_shrpt)
print(filelist_shrpt)

['process-provision-digital-locations.csv', 'process-steps-digital.csv', 'process-steps.csv', 'process-evidences-cost.csv', 'process.csv', 'process-rules.csv', 'process-conditions.csv', 'process-evidences.csv']


# Import Data

In [6]:
import pandas as pd
import csv
import numpy as np
import io
import os
import tempfile

In [37]:
def process_csv_file(file_name, column_names):
    """Download one CSV from the SharePoint raw-data folder and normalise its columns.

    The file is read from ``folder_files_url_shrpt + file_name`` through the
    notebook-level SharePoint context ``ctx``. Rows with a missing value in any
    of ``column_names`` are dropped, only those columns are kept, and the last
    of them is renamed to ``"word"``. Every ID column that the result does not
    contain is then added and filled with the placeholder ``'N/A'``, so that
    the frames produced for different files share the same ID columns.

    Earlier versions added the placeholders only when the file's ID column sat
    at a fixed position ("id" first, any other ID second); now every missing ID
    column is filled. For inputs that follow that layout the returned frame is
    the same, columns included and in the same order.

    Args:
        file_name: name of the CSV file inside the raw-data folder.
        column_names: list of column labels to keep; the last one is the text
            column that becomes ``"word"``.

    Returns:
        pandas.DataFrame with the kept columns, ``"word"`` and the placeholder
        ID columns.
    """
    id_columns = (
        "id",
        "ihu_unique_step_id",
        "ihu_unique_step_digital_id",
        "ihu_unique_evidence_id",
        "ihu_unique_condition_id",
    )

    file_url = folder_files_url_shrpt + file_name
    response = File.open_binary(ctx, file_url)
    raw = pd.read_csv(io.BytesIO(response.content))

    # Work on an explicit copy so the placeholder columns are added to a frame we own.
    df = (
        raw.dropna(subset=column_names)[column_names]
        .rename(columns={column_names[-1]: "word"})
        .copy()
    )

    # Fill in whichever ID columns this file does not provide.
    for id_column in id_columns:
        if id_column not in df.columns:
            df[id_column] = 'N/A'

    return df

In [38]:
# Load each source table; the last column listed is the text column.

# Steps of each process
process_steps = process_csv_file(
    "process-steps.csv",
    ["process_id", "ihu_unique_step_id", "step_title"],
)

# Digital steps of each process
process_digital_steps = process_csv_file(
    "process-steps-digital.csv",
    ["process_id", "ihu_unique_step_digital_id", "step_digital_title"],
)

# Process titles
process_title = process_csv_file(
    "process.csv",
    ["id", "title_el"],
)

# Evidences required by each process
process_evidences = process_csv_file(
    "process-evidences.csv",
    ["process_id", "ihu_unique_evidence_id", "evidence_description"],
)

# Conditions attached to each process
process_conditions = process_csv_file(
    "process-conditions.csv",
    ["process_id", "ihu_unique_condition_id", "conditions_name"],
)

In [40]:
# Stack the five tables into one frame with a fresh index, then keep only the
# ID columns and the text column, in a fixed order.
combined = pd.concat(
    [
        process_steps,
        process_digital_steps,
        process_title,
        process_evidences,
        process_conditions,
    ],
    ignore_index=True,
    sort=False,
)[[
    'id',
    'ihu_unique_step_id',
    'ihu_unique_step_digital_id',
    'ihu_unique_evidence_id',
    'ihu_unique_condition_id',
    'word',
]]

In [42]:
# Plain Python list of the texts, in the same row order as `combined`.
docs = combined["word"].to_numpy().tolist()

# **Topic Modeling**



## Functions

### Stop Words for Count Vectorizer

In [44]:
# Fetch the NLTK stop-word corpus and keep the Greek list.
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
greek_stopwords = stopwords.words('greek')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [45]:
# Extra stop words to add to the NLTK list.
new_words = ['της', 'τη', 'του', 'από']

# Only add what is not there yet, so re-running this cell does not keep
# appending the same words again.
greek_stopwords.extend(word for word in new_words if word not in greek_stopwords)

### Topic

In [46]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer # Embeddings
from umap import UMAP #Dimensionality reduction
from hdbscan import HDBSCAN #clustering
from sklearn.feature_extraction.text import CountVectorizer # CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from transformers.pipelines import pipeline

In [47]:
def create_topic_model(n_neighbors, n_components, min_dist, min_cluster_size, documents, random_state=None):
    """Build a BERTopic model from explicit sub-models and fit it on ``documents``.

    Args:
        n_neighbors: UMAP ``n_neighbors``; larger values generally give larger clusters.
        n_components: UMAP output dimensionality; too small loses information,
            too large hurts performance.
        min_dist: UMAP ``min_dist``; how far apart points should be in the
            low-dimensional space.
        min_cluster_size: HDBSCAN ``min_cluster_size``; increasing it gives fewer clusters.
        documents: list of texts to fit the model on.
        random_state: optional seed passed to UMAP so that repeated runs give
            the same topics. The default (None) keeps the previous unseeded
            behaviour.

    Returns:
        Tuple ``(topic_model, topics, probs)`` where ``topics`` and ``probs``
        are the values returned by ``BERTopic.fit_transform``.
    """
    # Multilingual sentence embeddings.
    embedding_model = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2")

    # Dimensionality reduction.
    umap_model = UMAP(
        n_neighbors=n_neighbors,
        n_components=n_components,
        min_dist=min_dist,
        metric='cosine',
        random_state=random_state,
    )

    # Clustering; cluster_selection_method also accepts "leaf" (smaller clusters).
    hdbscan_model = HDBSCAN(min_cluster_size=min_cluster_size, metric='euclidean', cluster_selection_method='eom')

    # Topic representation: 1- to 3-word n-grams without Greek stop words,
    # weighted by class-based TF-IDF with BM25 weighting.
    vectorizer_model = CountVectorizer(ngram_range=(1, 3), stop_words=greek_stopwords)
    ctfidf_model = ClassTfidfTransformer(bm25_weighting=True)

    topic_model = BERTopic(embedding_model=embedding_model, umap_model=umap_model, hdbscan_model=hdbscan_model,
                           vectorizer_model=vectorizer_model, ctfidf_model=ctfidf_model)

    # Fit the model and assign a topic to every document.
    topics, probs = topic_model.fit_transform(documents)

    return topic_model, topics, probs


In [48]:
def reduce_outliers(documents, topic_model, topics, threshold):
    """Reassign outlier documents to topics and refresh the topic representations.

    Outliers are reassigned with BERTopic's "embeddings" strategy using the
    given ``threshold``; the model's topics are then updated with the new
    assignments. The model is modified in place and nothing is returned.
    """
    # Reassign outliers first; the representation models are only needed for the update.
    reassigned = topic_model.reduce_outliers(
        documents,
        topics,
        strategy="embeddings",
        threshold=threshold,
    )

    # Same representation settings as used when the model was created:
    # 1- to 3-word n-grams without Greek stop words, BM25-weighted c-TF-IDF.
    representation = dict(
        vectorizer_model=CountVectorizer(ngram_range=(1, 3), stop_words=greek_stopwords),
        ctfidf_model=ClassTfidfTransformer(bm25_weighting=True),
    )

    topic_model.update_topics(documents, topics=reassigned, **representation)


In [49]:
def merge_topics(documents, topic_model, list):
    """Merge groups of topics in ``topic_model`` and refresh the topic representations.

    Args:
        documents: the texts the model was fitted on.
        topic_model: fitted BERTopic model; it is modified in place.
        list: the topics to merge, passed straight to ``BERTopic.merge_topics``
            (e.g. ``[[7, 45], [48, 79]]``). The name shadows the builtin but is
            kept so existing callers keep working.
    """
    # Use the argument that was passed in. The body used to read the global
    # `topics_to_merge`, silently ignoring this parameter.
    topics_to_merge = list

    # Define the vectorizer model (1- to 3-word n-grams without Greek stop words)
    vectorizer_model = CountVectorizer(ngram_range=(1, 3), stop_words=greek_stopwords)

    # Define the class-TFIDF transformer
    ctfidf_model = ClassTfidfTransformer(bm25_weighting=True)

    # Merge the specified topics
    # NOTE(review): BERTopic.merge_topics appears to update the model in place
    # and return None, in which case `topics=None` below makes update_topics
    # use the model's own assignments - confirm against the installed version.
    merged_topics = topic_model.merge_topics(documents, topics_to_merge)

    # Update the topic representations after the merge
    topic_model.update_topics(documents, topics=merged_topics, vectorizer_model=vectorizer_model,
                              ctfidf_model=ctfidf_model)

# Training

In [56]:
# Fit the topic model on every document, then show the first row of the topic summary.
full_topic_model, full_topics, steps_probs = create_topic_model(
    n_neighbors=50,
    n_components=10,
    min_dist=0.1,
    min_cluster_size=15,
    documents=docs,
)
freq = full_topic_model.get_topic_info()
freq.head(1)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,18058,-1_λειτουργίας_φορέα_ίδρυσης_υγείας,"[λειτουργίας, φορέα, ίδρυσης, υγείας, ασφάλεια...",[Ο ενδιαφερόμενος πρέπει να είναι κάτοχος πτυχ...


In [57]:
# Number of distinct topic ids in the summary table (the -1 outlier topic is counted as well).
freq["Topic"].nunique()

723

Topic -1 contains all outliers. Their number is quite high, so we want to reduce this noise: outlier documents that are closely related to an existing topic will be reassigned to that topic, where they will still be treated as unique items in the similarity process.

In [60]:
# Reassign outliers with a threshold of 0.6, then look at the first row of the refreshed summary.
reduce_outliers(docs, full_topic_model, full_topics, threshold=0.6)
freq = full_topic_model.get_topic_info()
freq.head(1)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,2650,-1_ηλικίας_τουλάχιστον_έτος_έτη,"[ηλικίας, τουλάχιστον, έτος, έτη, μόνο, άλλη, ...",[Ο ενδιαφερόμενος πρέπει να είναι κάτοχος πτυχ...


We reduced the outliers considerably, to 10%, which should be good for our semantic analysis. Finally, let's check whether some topics should be merged.

In [64]:
# Show the topic hierarchy (limited to 100 topics) to spot candidates for merging.
full_topic_model.visualize_hierarchy(top_n_topics=100)

In [65]:
# Topic groups to merge, chosen from the hierarchy plot.
topics_to_merge = [[7, 45], [48, 79]]
merge_topics(docs, full_topic_model, topics_to_merge)

# First three rows of the topic summary after the merge.
freq = full_topic_model.get_topic_info()
freq.head(3)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,2650,-1_ηλικίας_τουλάχιστον_έτος_έτη,"[ηλικίας, τουλάχιστον, έτος, έτη, μόνο, άλλη, ...",[Οι υποψήφιοι οδηγοί και οδηγοί προκειμένου να...
1,0,1259,0_πλοίου_ναυτικού_πλοίο_σκάφους,"[πλοίου, ναυτικού, πλοίο, σκάφους, πλοία, αλιε...",[Να είναι επιβατηγό πλοίο ή ταχύπλοο σκάφος πο...
2,1,762,1_κυκλοφορίας_οχήματος_οχημάτων_οδήγησης,"[κυκλοφορίας, οχήματος, οχημάτων, οδήγησης, άδ...","[Άδεια κυκλοφορίας οχήματος σε ισχύ, του δηλού..."


# Save Topics

In [87]:
# Per-document topic information, attached to `combined` by row index.
result = full_topic_model.get_document_info(docs)
combined = combined.assign(
    Topic=result["Topic"],
    Probability=result["Probability"],
)

Unique Topics

In [70]:
# Select the desired columns for the new dataframe
selected_columns = ["Topic", "Name", "Representation", "Representative_Docs", "Top_n_words"]
unique_topics = result[selected_columns]

# Drop duplicates based on the "Topic" column to keep only unique topics
unique_topics = unique_topics.drop_duplicates(subset=["Topic"]).reset_index(drop=True)

In [99]:
def upload_csv_to_target_folder(df, name):
  """Write a table to ``<name>.csv`` locally and upload that file to SharePoint.

  The file goes to the "topics_extraction/combined_docs_topic" folder under
  ``folder_analysis_url_shrpt``, using the notebook-level context ``ctx``.

  Args:
    df: the DataFrame to export. A column name (str) is accepted as well, in
      which case the rows of ``combined`` selected by
      ``filter_dataframe_by_step_id`` for that column are exported. This keeps
      the function usable with both call styles found in this notebook, since
      the name is defined again further down with an ID-column argument.
    name: file name without the ".csv" extension.
  """
  if isinstance(df, str):
    df = filter_dataframe_by_step_id(combined, df)

  path = name+".csv"
  df.to_csv(path, index=False)

  url = folder_analysis_url_shrpt+"topics_extraction/"+"combined_docs_topic"
  target_folder = ctx.web.get_folder_by_server_relative_url(url)

  # Read the file back as bytes; the local file is closed before uploading.
  with open(path, "rb") as content_file:
    file_content = content_file.read()
  target_folder.upload_file(os.path.basename(path), file_content).execute_query()

In [100]:
# Export the one-row-per-topic table as "unique_topics.csv".
upload_csv_to_target_folder(unique_topics, "unique_topics")

Dataframes

In [88]:
def filter_dataframe_by_step_id(df, column_name):
    """Return the rows of ``df`` whose ``column_name`` is not the 'N/A' placeholder.

    The result keeps only ``column_name``, "word", "Topic" and "Probability"
    (in that order) and gets a fresh 0-based index; ``df`` itself is left
    untouched.
    """
    wanted_columns = [column_name, "word", "Topic", "Probability"]
    has_real_id = df[column_name].ne("N/A")
    return df.loc[has_real_id, wanted_columns].reset_index(drop=True)

In [89]:
# Rows of `combined` whose step ID is not the 'N/A' placeholder, with their topic and probability.
step_topics = filter_dataframe_by_step_id(combined, "ihu_unique_step_id")

In [92]:
def upload_csv_to_target_folder(unique_id, name):
  """Export the rows of ``combined`` for one ID column and upload them to SharePoint.

  The rows are selected with ``filter_dataframe_by_step_id``, written locally
  to ``<name>.csv`` and uploaded to the
  "topics_extraction/combined_docs_topic" folder under
  ``folder_analysis_url_shrpt``, using the notebook-level context ``ctx``.

  Args:
    unique_id: name of the ID column to filter ``combined`` by. A ready-made
      DataFrame is accepted as well and is exported as it is; this keeps
      calls written for the earlier definition of this function (which took
      a DataFrame) working after this definition replaces it.
    name: file name without the ".csv" extension.
  """
  if isinstance(unique_id, str):
    result = filter_dataframe_by_step_id(combined, unique_id)
  else:
    result = unique_id

  path = name+".csv"
  result.to_csv(path, index=False)

  url = folder_analysis_url_shrpt+"topics_extraction/"+"combined_docs_topic"
  target_folder = ctx.web.get_folder_by_server_relative_url(url)

  # Read the file back as bytes; the local file is closed before uploading.
  with open(path, "rb") as content_file:
    file_content = content_file.read()
  target_folder.upload_file(os.path.basename(path), file_content).execute_query()

In [93]:
# Export the rows that carry a step ID as "step_topics_full.csv".
upload_csv_to_target_folder("ihu_unique_step_id", "step_topics_full")

In [94]:
# Export the rows that carry a digital-step ID as "steps_digital_topics_full.csv".
upload_csv_to_target_folder("ihu_unique_step_digital_id", "steps_digital_topics_full")

In [95]:
# Export the rows that carry a process "id" as "process_title_topics_full.csv".
upload_csv_to_target_folder("id", "process_title_topics_full")

In [96]:
# Export the rows that carry an evidence ID as "evidences_topics_full.csv".
upload_csv_to_target_folder("ihu_unique_evidence_id", "evidences_topics_full")

In [97]:
# Export the rows that carry a condition ID as "conditions_topics_full.csv".
upload_csv_to_target_folder("ihu_unique_condition_id", "conditions_topics_full")