## BERTopic
BERTopic is a topic modeling technique that leverages 🤗 transformers and a custom class-based TF-IDF to create dense clusters allowing for easily interpretable topics whilst keeping important words in the topic descriptions.

# Enabling the GPU

First, you'll need to enable GPUs for the notebook:

- Navigate to Edit→Notebook Settings
- Select GPU from the Hardware Accelerator drop-down

# **Installing BERTopic**

We start by installing BERTopic from PyPI:

In [1]:
%%capture
!pip install bertopic

## Restart the Notebook
After installing BERTopic, some packages that were already loaded have been updated. To use the updated versions correctly, restart the notebook now.

From the Menu:

Runtime → Restart Runtime

# Connect to SharePoint

In [2]:
!pip install office365
!pip install Office365-REST-Python-Client

Collecting office365
  Downloading office365-0.3.15-py3-none-any.whl (32 kB)
Collecting azure-storage-blob (from office365)
  Downloading azure_storage_blob-12.17.0-py3-none-any.whl (388 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m388.0/388.0 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting O365 (from office365)
  Downloading O365-2.0.27-py3-none-any.whl (164 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m164.2/164.2 kB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pymiscutils (from office365)
  Downloading pymiscutils-0.3.14-py3-none-any.whl (14 kB)
Collecting pathmagic (from office365)
  Downloading pathmagic-0.3.14-py3-none-any.whl (21 kB)
Collecting pyiotools (from office365)
  Downloading pyiotools-0.3.18-py3-none-any.whl (47 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.1/47.1 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pysubtypes (from office365)
  Downloading pysub

In [3]:
from office365.runtime.auth.authentication_context import AuthenticationContext
from office365.sharepoint.client_context import ClientContext
from office365.runtime.auth.client_credential import ClientCredential
from office365.sharepoint.files.file import File

#### Inputs ####
import os
from getpass import getpass

# URL of the SharePoint site to connect to.
url_shrpt = 'https://ihuedu.sharepoint.com/sites/EDYTEProject2023/'

# Credentials are not stored in the notebook: they are read from the environment
# variables SHAREPOINT_USERNAME / SHAREPOINT_PASSWORD and prompted for when unset.
username_shrpt = os.environ.get('SHAREPOINT_USERNAME') or input('SharePoint username: ')
password_shrpt = os.environ.get('SHAREPOINT_PASSWORD') or getpass('SharePoint password: ')

# Server-relative folders for the raw input CSV files and for the analysis exports.
folder_files_url_shrpt = '/sites/EDYTEProject2023/Shared%20Documents/General/wp5_data/raw_Data/'
folder_analysis_url_shrpt = '/sites/EDYTEProject2023/Shared%20Documents/General/wp5_data/analysis_exports/'

In [4]:
# Authenticate against the SharePoint site with the user credentials.
ctx_auth = AuthenticationContext(url_shrpt)

if not ctx_auth.acquire_token_for_user(username_shrpt, password_shrpt):
    # No client context is created on this path; only the error is reported.
    print(ctx_auth.get_last_error())
else:
    ctx = ClientContext(url_shrpt, ctx_auth)
    web = ctx.web
    # Load the site object so that its title can be printed as confirmation.
    ctx.load(web)
    ctx.execute_query()
    print('Authenticated into sharepoint as: ', web.properties['Title'])

Authenticated into sharepoint as:  EDYTE Project 2023


In [5]:
def print_folder_contents(ctx, folder_url):
    """Return the names of the files stored in a SharePoint folder.

    Despite its name, the function does not print the contents; it returns them.
    To collect sub-folder names instead of file names, read ``folder.folders``
    instead of ``folder.files`` below.

    Args:
        ctx: Client context used to reach the SharePoint site.
        folder_url: Server-relative URL of the folder to list.

    Returns:
        A list with the ``"Name"`` property of every entry in the folder, or
        ``None`` when the listing fails (the error is printed, not raised).
    """
    try:
        folder = ctx.web.get_folder_by_server_relative_url(folder_url)
        entries = folder.files  # replace with folder.folders to list sub-folders
        ctx.load(entries)
        ctx.execute_query()
        return [entry.properties["Name"] for entry in entries]
    except Exception as e:
        print('Problem printing out library contents: ', e)
        return None

# List the files found in the raw-data folder on SharePoint and show their names.
filelist_shrpt = print_folder_contents(ctx, folder_files_url_shrpt)
print(filelist_shrpt)

['process-provision-digital-locations.csv', 'process-steps-digital.csv', 'process-steps.csv', 'process-evidences-cost.csv', 'process.csv', 'process-rules.csv', 'process-conditions.csv', 'process-evidences.csv']


# Import Data

In [6]:
import pandas as pd
import csv
import numpy as np
import io
import os
import tempfile

In [7]:
def process_csv_file(file_name, column_names):
    """Download a CSV file from the SharePoint raw-data folder and extract its documents.

    Relies on the notebook-level ``ctx`` and ``folder_files_url_shrpt`` and on the
    imported ``File`` class.

    Args:
        file_name: Name of the CSV file inside ``folder_files_url_shrpt``.
        column_names: Columns that must not be missing; the last one holds the text.

    Returns:
        A ``(df, docs)`` tuple. ``df`` is the full DataFrame without the rows that
        have a missing value in any of ``column_names``; ``docs`` is the list of
        values of the last column of ``column_names``, in the same row order.
    """
    file_url = folder_files_url_shrpt + file_name
    response = File.open_binary(ctx, file_url)
    df = pd.read_csv(io.BytesIO(response.content))
    df = df.dropna(subset=column_names)
    docs = df[column_names[-1]].values.tolist()
    return df, docs

In [8]:
# Every call returns the cleaned DataFrame together with the list of texts taken
# from the last listed column.

# Process steps
process_steps, process_steps_docs = process_csv_file(
    "process-steps.csv",
    ["process_id", "ihu_unique_step_id", "step_title"],
)

# Digital process steps (the misspelt name `process_digtial_steps_docs` is kept
# because later cells refer to it)
process_digital_steps, process_digtial_steps_docs = process_csv_file(
    "process-steps-digital.csv",
    ["process_id", "ihu_unique_step_digital_id", "step_digital_title"],
)

# Process titles
process_title, process_title_docs = process_csv_file(
    "process.csv",
    ["id", "title_el"],
)

# Process evidences
process_evidences, process_evidences_docs = process_csv_file(
    "process-evidences.csv",
    ["process_id", "ihu_unique_evidence_id", "evidence_description"],
)

# Process conditions
process_conditions, process_conditions_docs = process_csv_file(
    "process-conditions.csv",
    ["process_id", "ihu_unique_condition_id", "conditions_name"],
)

# **Topic Modeling**



## Functions

### Stop Words for Count Vectorizer

In [9]:
# Download NLTK's stop-word corpus and load its Greek word list.
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
greek_stopwords = stopwords.words('greek')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [10]:
# Extra Greek stop words to add to the NLTK list.
new_words = ['της', 'τη', 'του', 'από']

# Only add words that are not in the list yet, so that re-running this cell
# does not append duplicates.
for word in new_words:
    if word not in greek_stopwords:
        greek_stopwords.append(word)

### Topic

In [11]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer # Embeddings
from umap import UMAP #Dimensionality reduction
from hdbscan import HDBSCAN #clustering
from sklearn.feature_extraction.text import CountVectorizer # CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from transformers.pipelines import pipeline

In [12]:
def create_topic_model(n_neighbors, n_components, min_dist, min_cluster_size, documents, random_state=None):
    """Build a BERTopic model from its sub-models and fit it on ``documents``.

    Args:
        n_neighbors: UMAP ``n_neighbors``; increasing it generally gives larger clusters.
        n_components: Dimension UMAP reduces the embeddings to; too small loses
            information, too high costs performance.
        min_dist: UMAP ``min_dist``; how far apart points should be in the
            low-dimensional space.
        min_cluster_size: HDBSCAN ``min_cluster_size``; increasing it gives fewer clusters.
        documents: List of texts to fit the model on.
        random_state: Seed forwarded to UMAP. The default ``None`` leaves UMAP
            unseeded, as before; pass an integer to make runs repeatable.

    Returns:
        A ``(topic_model, topics, probs)`` tuple: the fitted BERTopic model and the
        two values returned by its ``fit_transform``.
    """
    # Multilingual sentence-embedding model
    embedding_model = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2")

    # Dimensionality reduction
    umap_model = UMAP(
        n_neighbors=n_neighbors,
        n_components=n_components,
        min_dist=min_dist,
        metric='cosine',
        random_state=random_state,
    )

    # Clustering; the "leaf" selection method is also available (smaller clusters)
    hdbscan_model = HDBSCAN(
        min_cluster_size=min_cluster_size,
        metric='euclidean',
        cluster_selection_method='eom',
    )

    # Topic representation: 1- to 3-word n-grams without the Greek stop words
    vectorizer_model = CountVectorizer(ngram_range=(1, 3), stop_words=greek_stopwords)

    # Class-based TF-IDF with BM25 weighting
    ctfidf_model = ClassTfidfTransformer(bm25_weighting=True)

    topic_model = BERTopic(
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,
        ctfidf_model=ctfidf_model,
    )

    # Fit the model on the given documents
    topics, probs = topic_model.fit_transform(documents)

    return topic_model, topics, probs


In [13]:
def reduce_outliers(documents, topic_model, topics, threshold):
    """Reassign outlier documents to topics and refresh the topic representations.

    The model is updated in place through ``topic_model.update_topics``.

    Args:
        documents: Texts the model was fitted on.
        topic_model: Fitted BERTopic model to update.
        topics: Current topic assignment of ``documents``.
        threshold: Threshold forwarded to ``topic_model.reduce_outliers``, which is
            called with the ``"embeddings"`` strategy.

    Returns:
        The new topic assignment produced by ``topic_model.reduce_outliers``, so
        that callers can keep their own ``topics`` variable up to date. Previously
        nothing was returned and this value was lost.
    """
    # Same representation settings as used when the model was created
    vectorizer_model = CountVectorizer(ngram_range=(1, 3), stop_words=greek_stopwords)
    ctfidf_model = ClassTfidfTransformer(bm25_weighting=True)

    # Reduce outliers using embeddings
    new_topics = topic_model.reduce_outliers(documents, topics, strategy="embeddings", threshold=threshold)

    # Recompute the topic representations for the new assignment
    topic_model.update_topics(documents, topics=new_topics, vectorizer_model=vectorizer_model, ctfidf_model=ctfidf_model)

    return new_topics


In [60]:
def merge_topics(documents, topic_model, list):
    """Merge the given topics of a fitted model and refresh the topic representations.

    Args:
        documents: Texts the model was fitted on.
        topic_model: Fitted BERTopic model to update.
        list: Topics to merge, e.g. ``[[5, 27]]``. The parameter name shadows the
            built-in and is kept only for backward compatibility.
    """
    # Use the argument that was passed in; the previous version ignored it and
    # silently read the notebook-level variable `topics_to_merge` instead.
    topics_to_merge = list

    # Same representation settings as used when the model was created
    vectorizer_model = CountVectorizer(ngram_range=(1, 3), stop_words=greek_stopwords)
    ctfidf_model = ClassTfidfTransformer(bm25_weighting=True)

    # Merge the specified topics
    # NOTE(review): BERTopic's merge_topics appears to update the model in place;
    # confirm what (if anything) it returns before relying on `merged_topics`.
    merged_topics = topic_model.merge_topics(documents, topics_to_merge)

    # Update the topics with the merged topics
    topic_model.update_topics(documents, topics=merged_topics, vectorizer_model=vectorizer_model,
                              ctfidf_model=ctfidf_model)

# Training

## Steps

In [44]:
# Fit the topic model on the process-step titles.
steps_topic_model, steps_topics, steps_probs = create_topic_model(
    n_neighbors=100,
    n_components=10,
    min_dist=0.5,
    min_cluster_size=25,
    documents=process_steps_docs,
)

# Topic overview; only the first row is displayed.
freq = steps_topic_model.get_topic_info()
freq.head(1)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,10284,-1_στοιχείων_φακέλου_κοινοποίηση_φορέα,"[στοιχείων, φακέλου, κοινοποίηση, φορέα, εξέτα...","[Παραλαβή της Αίτησης, Διαβίβαση απόφασης στο ..."


In [45]:
# Count the distinct topic ids listed in `freq` for the steps model.
freq["Topic"].nunique()

163

Topic -1 contains all outliers. Their number is quite high, so we would like to reduce this noise by reassigning outliers to the topic they are most closely related to. Within that topic, they will still be treated as distinct items in the similarity process.

In [46]:
# Reassign outlier steps to topics, then refresh the topic overview.
reduce_outliers(
    documents=process_steps_docs,
    topic_model=steps_topic_model,
    topics=steps_topics,
    threshold=0.7,
)

freq = steps_topic_model.get_topic_info()
freq.head(1)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1995,-1_εγγραφή_αγγλικής_μητρώο_μελών,"[εγγραφή, αγγλικής, μητρώο, μελών, γλώσσας, μέ...","[Παραλαβή της Αίτησης, Διαβίβαση απόφασης στο ..."


This reduced the outliers substantially, to about 10% of the documents, which is good for our semantic analysis. Finally, we look at the topic hierarchy to see whether some topics should be merged.

In [51]:
# Plot the topic hierarchy of the steps model (top_n_topics=50) to look for merge candidates.
steps_topic_model.visualize_hierarchy(top_n_topics=50)

In [52]:
# Merge topics 5 and 27 of the steps model, then refresh the topic overview.
topics_to_merge = [[5, 27]]
merge_topics(process_steps_docs, steps_topic_model, topics_to_merge)

freq = steps_topic_model.get_topic_info()
freq.head(3)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1995,-1_εγγραφή_αγγλικής_μητρώο_μελών,"[εγγραφή, αγγλικής, μητρώο, μελών, γλώσσας, μέ...",[Συγκρότηση εκλεκτορικών σωμάτων επιλογής και ...
1,0,840,0_άδειας_κατηγορίας_χορήγηση άδειας_εμπορίας,"[άδειας, κατηγορίας, χορήγηση άδειας, εμπορίας...",[Τροποποίηση ως προς την επωνυμία του κατόχου ...
2,1,671,1_εκπαίδευσης_εκπαιδευτικών_δευτεροβάθμιας_δευ...,"[εκπαίδευσης, εκπαιδευτικών, δευτεροβάθμιας, δ...",[Διαβίβαση των αιτήσεων απόσπασης εκπαιδευτικώ...


In [53]:
# Count the distinct topic ids listed in `freq` after the merge.
freq["Topic"].nunique()

162

## Digital Steps

In [21]:
# Fit the topic model on the digital-step titles.
digital_steps_topic_model, digital_steps_topics, digital_steps_probs = create_topic_model(
    n_neighbors=15,
    n_components=8,
    min_dist=0.1,
    min_cluster_size=8,
    documents=process_digtial_steps_docs,
)

# Topic overview; the first three rows are displayed.
freq = digital_steps_topic_model.get_topic_info()
freq.head(3)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,274,-1_υποβολής_αρχής_otp_αρχής δημοσίων εσόδων,"[υποβολής, αρχής, otp, αρχής δημοσίων εσόδων, ...",[Είσοδος στο διαδικτυακό τόπο της Ανεξάρτητης ...
1,0,69,0_ταυτοποίηση χρήστη_ταυτοποίηση_χρήστη ταυτοπ...,"[ταυτοποίηση χρήστη, ταυτοποίηση, χρήστη ταυτο...","[Ταυτοποίηση Χρήστη, Ταυτοποίηση Χρήστη, Ταυτο..."
2,1,40,1_καταγραφή_συμπλήρωση αίτησης_καταγραφή αίτησ...,"[καταγραφή, συμπλήρωση αίτησης, καταγραφή αίτη...","[Καταγραφή της αίτησης - Έκδοση Βεβαίωσης, Κατ..."


In [22]:
# Count the distinct topic ids listed in `freq` for the digital-steps model.
freq["Topic"].nunique()

80

In [55]:
# Plot the topic hierarchy of the digital-steps model (top_n_topics=50) to look for merge candidates.
digital_steps_topic_model.visualize_hierarchy(top_n_topics=50)

No further processing is needed.

## Process Title

In [24]:
# Fit the topic model on the process titles.
process_title_topic_model, process_title_topics, process_title_probs = create_topic_model(
    n_neighbors=15,
    n_components=8,
    min_dist=0.1,
    min_cluster_size=8,
    documents=process_title_docs,
)

# Topic overview; the first three rows are displayed.
freq = process_title_topic_model.get_topic_info()
freq.head(3)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,703,-1_λειτουργίας_μτν_έγκριση_λόγω,"[λειτουργίας, μτν, έγκριση, λόγω, αναγγελία, π...",[Άδεια εμπορίας πετρελαιοειδών προϊόντων για τ...
1,0,199,0_ναυτικού_πλοίου_πλοίων_πλοία,"[ναυτικού, πλοίου, πλοίων, πλοία, σκαφών, σκάφ...",[Πρόσληψη έκτακτου εκπαιδευτικού προσωπικού στ...
2,1,143,1_απόφασης_βεβαίωση_δικαστικών_ενδίκου,"[απόφασης, βεβαίωση, δικαστικών, ενδίκου, πιστ...",[Βεβαίωση συζήτησης ή μη ενδίκου μέσου (αίτηση...


In [25]:
# Count the distinct topic ids listed in `freq` for the process-title model.
freq["Topic"].nunique()

70

Topic -1 contains all outliers. Their number is quite high, so we would like to reduce this noise by reassigning outliers to the topic they are most closely related to. Within that topic, they will still be treated as distinct items in the similarity process.

In [26]:
# Reassign outlier process titles to topics, then refresh the topic overview.
reduce_outliers(
    documents=process_title_docs,
    topic_model=process_title_topic_model,
    topics=process_title_topics,
    threshold=0.7,
)

freq = process_title_topic_model.get_topic_info()
freq.head(1)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,395,-1_λόγω_ταυτότητας_πιστοποιητικό_οδήγησης,"[λόγω, ταυτότητας, πιστοποιητικό, οδήγησης, λε...",[Άδεια εμπορίας πετρελαιοειδών προϊόντων για τ...


In [56]:
# Plot the topic hierarchy of the process-title model (top_n_topics=50) to look for
# merge candidates. The previous version plotted the digital-steps model here.
process_title_topic_model.visualize_hierarchy(top_n_topics=50)

In [28]:
# Count the distinct topic ids listed in `freq` after the outlier reduction.
freq["Topic"].nunique()

70

## Evidences

In [29]:
# Fit the topic model on the evidence descriptions.
process_evidences_topic_model, process_evidences_topics, process_evidences_title_probs = create_topic_model(
    n_neighbors=100,
    n_components=10,
    min_dist=0.5,
    min_cluster_size=25,
    documents=process_evidences_docs,
)

# Topic overview; the first three rows are displayed.
freq = process_evidences_topic_model.get_topic_info()
freq.head(3)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,7203,-1_ότι_βεβαίωση_δήλωση_υπεύθυνη,"[ότι, βεβαίωση, δήλωση, υπεύθυνη, οποία, υπεύθ...","[Υπεύθυνη Δήλωση, Υπεύθυνη δήλωση του νόμου 15..."
1,0,550,0_σπουδών_τίτλου_αλλοδαπής_πτυχίου,"[σπουδών, τίτλου, αλλοδαπής, πτυχίου, τίτλου σ...",[Πράξη ισοτιμίας τίτλου σπουδών της αλλοδαπής ...
2,1,413,1_πλοίου_σκάφους_ναυτικού_πλοίο,"[πλοίου, σκάφους, ναυτικού, πλοίο, εθνικότητας...","[Έγγραφο Εθνικότητας του πλοίου, Έγγραφο Εθνικ..."


In [30]:
# Count the distinct topic ids listed in `freq` for the evidences model.
freq["Topic"].nunique()

78

Topic -1 contains all outliers. Their number is quite high, so we would like to reduce this noise by reassigning outliers to the topic they are most closely related to. Within that topic, they will still be treated as distinct items in the similarity process.

In [31]:
# Reassign outlier evidences to topics, then refresh the topic overview.
reduce_outliers(
    documents=process_evidences_docs,
    topic_model=process_evidences_topic_model,
    topics=process_evidences_topics,
    threshold=0.6,
)

freq = process_evidences_topic_model.get_topic_info()
freq.head(1)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,866,-1_πιστοποιητικό_οποία_βεβαίωση_κάθε,"[πιστοποιητικό, οποία, βεβαίωση, κάθε, 000, ηχ...","[Υπεύθυνη Δήλωση, Υπεύθυνη δήλωση του νόμου 15..."


In [57]:
# Plot the topic hierarchy of the evidences model (top_n_topics=50) to look for merge candidates.
process_evidences_topic_model.visualize_hierarchy(top_n_topics=50)

In [61]:
# Merge topics 3 and 7 of the evidences model.
topics_to_merge = [[3, 7]]
merge_topics(process_evidences_docs, process_evidences_topic_model, topics_to_merge)

# Refresh the topic overview; without this, `freq` still holds the pre-merge
# table and the topic count in the next cell would be stale.
freq = process_evidences_topic_model.get_topic_info()
freq.head(3)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,866,-1_πιστοποιητικό_οποία_βεβαίωση_κάθε,"[πιστοποιητικό, οποία, βεβαίωση, κάθε, 000, ηχ...",[Τα δικαιολογητικά δεν αναφέρονται στην παρούσ...
1,0,766,0_σπουδών_τίτλου_πτυχίου_σχολής,"[σπουδών, τίτλου, πτυχίου, σχολής, αλλοδαπής, ...",[Πράξη ισοτιμίας τίτλου σπουδών της αλλοδαπής ...
2,1,665,1_ταυτότητας_αστυνομικής ταυτότητας_αστυνομική...,"[ταυτότητας, αστυνομικής ταυτότητας, αστυνομικ...",[Φωτοτυπία αστυνομικής ταυτότητας ή διαβατηρίο...


In [62]:
# Count the distinct topic ids currently listed in `freq`.
freq["Topic"].nunique()

77

## Conditions

In [33]:
# Fit the topic model on the condition names.
process_conditions_topic_model, process_conditions_topics, process_conditions_probs = create_topic_model(
    n_neighbors=15,
    n_components=8,
    min_dist=0.1,
    min_cluster_size=8,
    documents=process_conditions_docs,
)

# Topic overview; the first three rows are displayed.
freq = process_conditions_topic_model.get_topic_info()
freq.head(3)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,3055,-1_σύμφωνα_αίτηση_εφόσον_προϋπόθεση,"[σύμφωνα, αίτηση, εφόσον, προϋπόθεση, οποία, c...",[Δεν χορηγείται άδεια λειτουργίας αν δεν παρέλ...
1,0,205,0_εκπαιδευτικοί_εκπαίδευσης_διευθυντή_δευτεροβ...,"[εκπαιδευτικοί, εκπαίδευσης, διευθυντή, δευτερ...","[Να έχει διοριστεί νομίμως, με πράξη του κατά ..."
2,1,121,1_taxisnet αιτών είναι_κωδικών taxisnet αιτών_...,"[taxisnet αιτών είναι, κωδικών taxisnet αιτών,...","[Ο αιτών να είναι κάτοχος κωδικών TAXISnet., Ο..."


In [34]:
# Count the distinct topic ids listed in `freq` for the conditions model.
freq["Topic"].nunique()

322

Topic -1 contains all outliers. Their number is quite high, so we would like to reduce this noise by reassigning outliers to the topic they are most closely related to. Within that topic, they will still be treated as distinct items in the similarity process.

In [35]:
# Reassign outlier conditions to topics, then refresh the topic overview.
reduce_outliers(
    documents=process_conditions_docs,
    topic_model=process_conditions_topic_model,
    topics=process_conditions_topics,
    threshold=0.6,
)

freq = process_conditions_topic_model.get_topic_info()
freq.head(1)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,254,-1_00_γεγονότα_συσκευή_24 00,"[00, γεγονότα, συσκευή, 24 00, μήνα, ευρώ μήνα...",[Δεν χορηγείται άδεια λειτουργίας αν δεν παρέλ...


In [63]:
# Plot the topic hierarchy of the conditions model (top_n_topics=50) to look for merge candidates.
process_conditions_topic_model.visualize_hierarchy(top_n_topics=50)

No topics need to be merged.

# Save Topics

In [37]:
def upload_csv_to_target_folder(topic_model, documents, dataframe, process_id, unique_id, name):
    """Export the per-document topic info as CSV and upload it to SharePoint.

    Relies on the notebook-level ``ctx`` and ``folder_analysis_url_shrpt``. The CSV
    is first written to ``<name>.csv`` in the current working directory and then
    uploaded to the ``topics_extraction`` folder under ``folder_analysis_url_shrpt``.

    Args:
        topic_model: Fitted model providing ``get_document_info``.
        documents: Texts the model was fitted on.
        dataframe: DataFrame the documents were taken from, row for row.
        process_id: Name of the process-id column to copy into the export.
        unique_id: Name of an additional id column to copy, or ``None`` (the
            legacy string ``"None"`` is also accepted) to copy none.
        name: Base name (without extension) of the CSV file.
    """
    result = topic_model.get_document_info(documents)
    # Re-number the rows so that their labels can be looked up with `result.index`
    # (assumes `result` carries a default 0..n-1 index — TODO confirm).
    df = dataframe.reset_index(drop=True)

    result[process_id] = df.loc[result.index, process_id]
    # Both a real None and the string sentinel "None" mean "no unique-id column".
    if unique_id is not None and unique_id != "None":
        result[unique_id] = df.loc[result.index, unique_id]

    path = name + ".csv"
    result.to_csv(path, index=False)

    url = folder_analysis_url_shrpt + "topics_extraction"
    target_folder = ctx.web.get_folder_by_server_relative_url(url)
    with open(path, "rb") as content_file:
        file_content = content_file.read()
    # The local file is already closed here; only its bytes are needed for the upload.
    target_folder.upload_file(os.path.basename(path), file_content).execute_query()

In [54]:
# Export and upload the topics of the process steps.
upload_csv_to_target_folder(
    topic_model=steps_topic_model,
    documents=process_steps_docs,
    dataframe=process_steps,
    process_id="process_id",
    unique_id="ihu_unique_step_id",
    name="step_topics",
)

In [39]:
# Export and upload the topics of the digital process steps.
upload_csv_to_target_folder(
    topic_model=digital_steps_topic_model,
    documents=process_digtial_steps_docs,
    dataframe=process_digital_steps,
    process_id="process_id",
    unique_id="ihu_unique_step_digital_id",
    name="steps_digital_topics",
)

In [40]:
# Export and upload the topics of the process titles (no extra unique-id column).
upload_csv_to_target_folder(
    topic_model=process_title_topic_model,
    documents=process_title_docs,
    dataframe=process_title,
    process_id="id",
    unique_id="None",
    name="process_title_topics",
)

In [64]:
# Export and upload the topics of the process evidences.
upload_csv_to_target_folder(
    topic_model=process_evidences_topic_model,
    documents=process_evidences_docs,
    dataframe=process_evidences,
    process_id="process_id",
    unique_id="ihu_unique_evidence_id",
    name="evidences_topics",
)

In [42]:
# Export and upload the topics of the process conditions.
upload_csv_to_target_folder(
    topic_model=process_conditions_topic_model,
    documents=process_conditions_docs,
    dataframe=process_conditions,
    process_id="process_id",
    unique_id="ihu_unique_condition_id",
    name="conditions_topics",
)