# Import Models

In [None]:
%%capture
!pip install bertopic

In [None]:
import pandas as pd

# Connect to SharePoint

In [None]:
!pip install office365
!pip install Office365-REST-Python-Client

Collecting office365
  Downloading office365-0.3.15-py3-none-any.whl (32 kB)
Collecting azure-storage-blob (from office365)
  Downloading azure_storage_blob-12.17.0-py3-none-any.whl (388 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m388.0/388.0 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting O365 (from office365)
  Downloading O365-2.0.27-py3-none-any.whl (164 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m164.2/164.2 kB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pymiscutils (from office365)
  Downloading pymiscutils-0.3.14-py3-none-any.whl (14 kB)
Collecting pathmagic (from office365)
  Downloading pathmagic-0.3.14-py3-none-any.whl (21 kB)
Collecting pyiotools (from office365)
  Downloading pyiotools-0.3.18-py3-none-any.whl (47 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.1/47.1 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pysubtypes (from office365)
  Downloading pysub

In [None]:
from office365.runtime.auth.authentication_context import AuthenticationContext
from office365.sharepoint.client_context import ClientContext
from office365.runtime.auth.client_credential import ClientCredential
from office365.sharepoint.files.file import File

#### Inputs ####
import os
from getpass import getpass

# Root URL of the SharePoint site; change it to point at your own site.
url_shrpt = 'https://ihuedu.sharepoint.com/sites/EDYTEProject2023/'

# Credentials are never stored in the notebook: they are taken from the
# SHAREPOINT_USERNAME / SHAREPOINT_PASSWORD environment variables and are
# prompted for interactively only when a variable is unset or empty.
username_shrpt = os.environ.get('SHAREPOINT_USERNAME') or input('SharePoint username: ')
password_shrpt = os.environ.get('SHAREPOINT_PASSWORD') or getpass('SharePoint password: ')

# Server-relative folder holding the input keyword CSV files.
folder_files_url_shrpt = '/sites/EDYTEProject2023/Shared%20Documents/General/wp5_data/keywords/'
# Server-relative base folder under which analysis exports are uploaded.
folder_analysis_url_shrpt = '/sites/EDYTEProject2023/Shared%20Documents/General/wp5_data/analysis_exports/'

In [None]:
### Authentication: sign in to the SharePoint site ###
ctx_auth = AuthenticationContext(url_shrpt)

if not ctx_auth.acquire_token_for_user(username_shrpt, password_shrpt):
  # Sign-in failed: show the error recorded by the authentication context.
  # `ctx` is NOT defined on this path, so later cells that use it cannot run.
  print(ctx_auth.get_last_error())
else:
  # Sign-in succeeded: build the client context used by the rest of the notebook
  # and load the site's properties to confirm the connection.
  ctx = ClientContext(url_shrpt, ctx_auth)
  web = ctx.web
  ctx.load(web)
  ctx.execute_query()
  print('Authenticated into sharepoint as: ',web.properties['Title'])
############################

Authenticated into sharepoint as:  EDYTE Project 2023


In [None]:
#### List the names of the files stored in a SharePoint folder ####
# To list sub-folder names instead of file names, read `folder.folders`
# instead of `folder.files` inside the function below.

def print_folder_contents(ctx, folder_url):
    """Return the names of the files in the SharePoint folder at ``folder_url``.

    Args:
        ctx: Client context whose ``web`` resolves the folder; it is also used
            to load the collection and execute the query.
        folder_url: Server-relative URL of the folder to inspect.

    Returns:
        A list with the ``"Name"`` property of every item in the folder's
        ``files`` collection. If anything raises, the error is printed and
        ``None`` is returned instead.
    """
    try:
        folder = ctx.web.get_folder_by_server_relative_url(folder_url)
        entries = folder.files  # use folder.folders to list sub-folders
        ctx.load(entries)
        ctx.execute_query()
        return [entry.properties["Name"] for entry in entries]
    except Exception as e:
        # Best effort: report the problem and fall through (returns None).
        print('Problem printing out library contents: ', e)
######################################################
######################################################

# List the files in the SharePoint keywords folder (folder_files_url_shrpt).
# print_folder_contents reports errors itself and then yields None instead of a list.
filelist_shrpt=print_folder_contents(ctx,folder_files_url_shrpt)
# Show the file names that were found
print(filelist_shrpt)

['process-steps-digital-keywords.csv', 'process-evidences-keywords.csv', 'process-steps-keywords.csv', 'process-conditions-keywords.csv', 'process-keywords.csv', 'process-rules-keywords.csv']


# Import Data

In [None]:
import pandas as pd
import csv
import numpy as np
import io
import os
import tempfile

In [None]:
def process_csv_file(file_name):
    """Download one keyword CSV from SharePoint and expand it to one keyword per row.

    Uses the notebook-level ``ctx``, ``File`` and ``folder_files_url_shrpt``.

    Args:
        file_name: Name of the CSV file inside ``folder_files_url_shrpt``.

    Returns:
        tuple: ``(df, docs)`` where ``df`` is the frame after splitting the
        ``keywords`` column on commas, exploding it and resetting the index,
        and ``docs`` is that frame's ``keywords`` column.
    """
    # Fetch the raw bytes of the file and parse them as CSV.
    download = File.open_binary(ctx, folder_files_url_shrpt + file_name)
    raw_frame = pd.read_csv(io.BytesIO(download.content))

    # Turn the comma-separated string of each row into a list of keywords ...
    keyword_lists = raw_frame['keywords'].str.split(',')
    # ... then give every keyword its own row with a fresh 0..n-1 index.
    exploded = raw_frame.assign(keywords=keyword_lists)
    exploded = exploded.explode('keywords')
    exploded = exploded.reset_index(drop=True)

    return exploded, exploded["keywords"]

In [None]:
# Load every keyword CSV: each call returns the exploded DataFrame and its
# "keywords" column (the documents used for topic modeling).
# NOTE: the document variables of the last three files are spelled with a
# double underscore (e.g. process__evidences_keywords), unlike their *_df
# counterparts; later cells reference them with exactly this spelling.

# steps
steps_keywords_df, steps_keywords = process_csv_file( "process-steps-keywords.csv")

# steps digital
steps_digital_keywords_df, steps_digital_keywords = process_csv_file( "process-steps-digital-keywords.csv")

# process
process_keywords_df, process_keywords = process_csv_file( "process-keywords.csv")

# process evidences
process_evidences_keywords_df, process__evidences_keywords = process_csv_file( "process-evidences-keywords.csv")

# process conditions
process_conditions_keywords_df, process__conditions_keywords = process_csv_file( "process-conditions-keywords.csv")

# process rules
process_rules_keywords_df, process__rules_keywords = process_csv_file( "process-rules-keywords.csv")

# Topic Modeling

## Functions

### Stop Words

In [None]:
# Fetch the NLTK stop-word corpus and load its Greek word list.
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
# Stop-word list that is extended in the next cell and later handed to
# CountVectorizer(stop_words=...).
greek_stopwords = stopwords.words('greek')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Additional Greek words to treat as stop words on top of the NLTK list.
new_words = ['της', 'τη', 'του', 'από']

# Append only the words that are still missing, so re-running this cell
# does not keep growing the list with duplicates.
for word in new_words:
  if word not in greek_stopwords:
    greek_stopwords.append(word)

### Topic

In [None]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer # Embeddings
from umap import UMAP #Dimensionality reduction
from hdbscan import HDBSCAN #clustering
from sklearn.feature_extraction.text import CountVectorizer # CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from transformers.pipelines import pipeline

In [None]:
def create_topic_model(n_neighbors, n_components, min_dist, min_cluster_size, documents, random_state=None):
    """Build a BERTopic pipeline from explicit sub-models and fit it on ``documents``.

    Relies on the notebook-level ``greek_stopwords`` list for the vectorizer.
    The sentence-transformer embedding model is instantiated on every call.

    Args:
        n_neighbors: UMAP ``n_neighbors``; increasing it generally gives
            larger clusters.
        n_components: Number of dimensions UMAP reduces the embeddings to.
            Too small loses information, too large hurts performance.
        min_dist: UMAP ``min_dist`` (how far apart points should be in the
            low-dimensional space).
        min_cluster_size: HDBSCAN ``min_cluster_size``; increasing it yields
            fewer clusters.
        documents: The texts to model, passed unchanged to ``fit_transform``.
        random_state: Optional seed forwarded to UMAP. ``None`` (the default)
            keeps the previous unseeded behaviour; pass an int to make the
            dimensionality-reduction step repeatable between runs.

    Returns:
        tuple: ``(topic_model, topics, probs)`` -- the fitted BERTopic
        instance and the two values returned by ``fit_transform``.
    """
    # Multilingual sentence embeddings.
    embedding_model = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2")

    # Dimensionality reduction; seeded only when the caller asks for it.
    umap_model = UMAP(n_neighbors=n_neighbors, n_components=n_components, min_dist=min_dist,
                      metric='cosine', random_state=random_state)

    # Clustering. cluster_selection_method also accepts "leaf" (smaller clusters).
    hdbscan_model = HDBSCAN(min_cluster_size=min_cluster_size, metric='euclidean', cluster_selection_method='eom')

    # Topic words: uni- to tri-grams with the Greek stop words removed.
    vectorizer_model = CountVectorizer(ngram_range=(1, 3), stop_words=greek_stopwords)

    # Class-based TF-IDF with BM25 weighting.
    ctfidf_model = ClassTfidfTransformer(bm25_weighting=True)

    # Assemble the pipeline from the sub-models above.
    topic_model = BERTopic(embedding_model=embedding_model, umap_model=umap_model, hdbscan_model=hdbscan_model,
                           vectorizer_model=vectorizer_model, ctfidf_model=ctfidf_model)

    # Fit on the supplied documents (any of the keyword sets, not only steps).
    topics, probs = topic_model.fit_transform(documents)

    return topic_model, topics, probs

In [None]:
def reduce_outliers(documents, topic_model, topics, threshold):
    """Reassign outlier documents and refresh the model's topic representations.

    Relies on the notebook-level ``greek_stopwords`` list for the vectorizer.
    ``topic_model`` is updated in place through ``update_topics``.

    Args:
        documents: The documents the model was fitted on.
        topic_model: Fitted BERTopic model to update.
        topics: Current topic assignment of ``documents``.
        threshold: Threshold forwarded to ``topic_model.reduce_outliers``
            (used with the ``"embeddings"`` strategy).

    Returns:
        The new topic assignment computed by ``topic_model.reduce_outliers``.
        It used to be discarded, which left the caller holding only the stale
        ``topics``; callers that ignore the return value are unaffected.
    """
    # Reassign outliers using the embeddings strategy.
    new_topics = topic_model.reduce_outliers(documents, topics, strategy="embeddings", threshold=threshold)

    # Same representation settings as the ones used when the model was created.
    vectorizer_model = CountVectorizer(ngram_range=(1, 3), stop_words=greek_stopwords)
    ctfidf_model = ClassTfidfTransformer(bm25_weighting=True)

    # Recompute the topic representations for the new assignment.
    topic_model.update_topics(documents, topics=new_topics, vectorizer_model=vectorizer_model, ctfidf_model=ctfidf_model)

    return new_topics

# Training

## Steps

In [None]:
# Fit the topic model on the steps keywords.
steps_keywords_topic_model, steps_keywords_topics, steps_probs = create_topic_model(
    n_neighbors=100,
    n_components=10,
    min_dist=0.5,
    min_cluster_size=25,
    documents=steps_keywords,
)

# Topic overview table; display only its first row.
freq = steps_keywords_topic_model.get_topic_info()
freq.head(1)

Downloading (…)9e268/.gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)f2cd19e268/README.md:   0%|          | 0.00/3.77k [00:00<?, ?B/s]

Downloading (…)cd19e268/config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)9e268/tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/402 [00:00<?, ?B/s]

Downloading (…)d19e268/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,11179,-1_συνδρομής_προσωπικού_δελτίου_ανάκληση,"[συνδρομής, προσωπικού, δελτίου, ανάκληση, δικ...","[αίτησης αιτούντα, παραλαβή αίτησης τροποποίησ..."


In [None]:
# Reassign outlier documents of the steps model, then refresh the overview.
reduce_outliers(
    documents=steps_keywords,
    topic_model=steps_keywords_topic_model,
    topics=steps_keywords_topics,
    threshold=0.7,
)

freq = steps_keywords_topic_model.get_topic_info()
freq.head(1)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1120,-1_αγροτικής_γέννησης_ανάπτυξης_αγροτικής ανάπ...,"[αγροτικής, γέννησης, ανάπτυξης, αγροτικής ανά...","[αίτησης αιτούντα, παραλαβή αίτησης τροποποίησ..."


## Steps Digital

In [None]:
# Fit the topic model on the digital steps keywords.
steps_digital_keywords_topic_model, steps_digital_keywords_topics, steps_digital_probs = create_topic_model(
    n_neighbors=15,
    n_components=8,
    min_dist=0.1,
    min_cluster_size=8,
    documents=steps_digital_keywords,
)

# Topic overview table; display only its first row.
freq = steps_digital_keywords_topic_model.get_topic_info()
freq.head(1)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,402,-1_μεταβολής_κατάστασης_σύστασης_πληρότητας,"[μεταβολής, κατάστασης, σύστασης, πληρότητας, ...","[προεπισκόπιση περιουσιακής κατάστασης, προεπι..."


In [None]:
# Number of distinct topic ids in `freq`. `freq` is reassigned by every
# training cell, so this reflects whichever of them ran last.
freq["Topic"].nunique()

210

## Process

In [None]:
# Fit the topic model on the process keywords.
process_keywords_topic_model, process_keywords_topics, process_probs = create_topic_model(
    n_neighbors=15,
    n_components=8,
    min_dist=0.1,
    min_cluster_size=8,
    documents=process_keywords,
)

# Topic overview table; display only its first row.
freq = process_keywords_topic_model.get_topic_info()
freq.head(1)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1282,-1_ευκαιρίας_επιστροφή_κατόχου_απογραφή,"[ευκαιρίας, επιστροφή, κατόχου, απογραφή, διαγ...","[δεύτερης ευκαιρίας, απογραφή μητρώο ασφαλισμέ..."


In [None]:
# Number of distinct topic ids in `freq`. `freq` is reassigned by every
# training cell, so this reflects whichever of them ran last.
freq["Topic"].nunique()

363

## Process Evidences

In [None]:
# Fit the topic model on the process evidences keywords.
process_evidences_keywords_topic_model, process_evidences_keywords_topics, process_evidences_probs = create_topic_model(
    n_neighbors=100,
    n_components=10,
    min_dist=0.5,
    min_cluster_size=25,
    documents=process__evidences_keywords,
)

# Topic overview table; display only its first row.
freq = process_evidences_keywords_topic_model.get_topic_info()
freq.head(1)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,11839,-1_μόνιμης_γενικής_κάθε_δημόσιο,"[μόνιμης, γενικής, κάθε, δημόσιο, απόφασης, χρ...","[μαθημάτων αναλυτική βαθμολογία, εταιρεία περι..."


In [None]:
# Number of distinct topic ids in `freq`. `freq` is reassigned by every
# training cell, so this reflects whichever of them ran last.
freq["Topic"].nunique()

419

In [None]:
# Reassign outlier documents of the process evidences model, then refresh the overview.
reduce_outliers(
    documents=process__evidences_keywords,
    topic_model=process_evidences_keywords_topic_model,
    topics=process_evidences_keywords_topics,
    threshold=0.7,
)

freq = process_evidences_keywords_topic_model.get_topic_info()
freq.head(1)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,2139,-1_000_βιοκαυσίμων_χορού_ισχύει,"[000, βιοκαυσίμων, χορού, ισχύει, προγράμματα ...","[μαθημάτων αναλυτική βαθμολογία, εταιρεία περι..."


## Process Conditions

In [None]:
# Fit the topic model on the process conditions keywords.
process_conditions_keywords_topic_model, process_conditions_keywords_topics, process_conditions_probs = create_topic_model(
    n_neighbors=100,
    n_components=10,
    min_dist=0.5,
    min_cluster_size=25,
    documents=process__conditions_keywords,
)

# Topic overview table; display only its first row.
freq = process_conditions_keywords_topic_model.get_topic_info()
freq.head(1)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,11464,-1_τους_μηνών_φυσικό_έχουν,"[τους, μηνών, φυσικό, έχουν, εκπαίδευσης, σπου...","[νόμιμος εκπρόσωπος φυσικό, εκπληρώσει στρατιω..."


In [None]:
# Number of distinct topic ids in `freq`. `freq` is reassigned by every
# training cell, so this reflects whichever of them ran last.
freq["Topic"].nunique()

254

In [None]:
# Reassign outlier documents of the process conditions model, then refresh the overview.
reduce_outliers(
    documents=process__conditions_keywords,
    topic_model=process_conditions_keywords_topic_model,
    topics=process_conditions_keywords_topics,
    threshold=0.7,
)

freq = process_conditions_keywords_topic_model.get_topic_info()
freq.head(1)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,3804,-1_μόνο_000_500_ευρώ,"[μόνο, 000, 500, ευρώ, κοινωνικής, πληροί, συν...","[νόμιμος εκπρόσωπος φυσικό, εκπληρώσει στρατιω..."


## Process Rules

In [None]:
# Fit the topic model on the process rules keywords.
process_rules_keywords_topic_model, process_rules_keywords_topics, process_rules_probs = create_topic_model(
    n_neighbors=100,
    n_components=10,
    min_dist=0.5,
    min_cluster_size=25,
    documents=process__rules_keywords,
)

# Topic overview table; display only its first row.
freq = process_rules_keywords_topic_model.get_topic_info()
freq.head(1)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,12401,-1_άδεια_10_εσωτερικών_τροποποιήθηκε,"[άδεια, 10, εσωτερικών, τροποποιήθηκε, εργασία...","[αναδιάρθρωση δευτεροβάθμιας εκπαίδευσης, αναδ..."


In [None]:
# Number of distinct topic ids in `freq`. `freq` is reassigned by every
# training cell, so this reflects whichever of them ran last.
freq["Topic"].nunique()

323

In [None]:
# Reassign outlier documents of the process rules model, then refresh the overview.
reduce_outliers(
    documents=process__rules_keywords,
    topic_model=process_rules_keywords_topic_model,
    topics=process_rules_keywords_topics,
    threshold=0.7,
)

freq = process_rules_keywords_topic_model.get_topic_info()
freq.head(1)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,2712,-1_gallus_προσωπικού χαρακτήρα_χαρακτήρα_καταχ...,"[gallus, προσωπικού χαρακτήρα, χαρακτήρα, κατα...","[αναδιάρθρωση δευτεροβάθμιας εκπαίδευσης, αναδ..."


# Save Topics

In [None]:
def upload_csv_to_target_folder(topic_model, documents, dataframe, name):
  """Tag each row with its topic, save the frame as CSV and upload it to SharePoint.

  Uses the notebook-level ``ctx`` and ``folder_analysis_url_shrpt``.

  Args:
    topic_model: Fitted model providing ``get_document_info``.
    documents: Documents handed to ``get_document_info``.
    dataframe: Frame that receives a ``"topic"`` column. It is modified in
      place; values are assigned by index alignment with the document info.
    name: File name without extension. ``<name>.csv`` is written to the
      current working directory and uploaded under the same name.
  """
  # Add the per-document topic to the caller's frame.
  document_info = topic_model.get_document_info(documents)
  dataframe["topic"] = document_info["Topic"]

  # Write the local CSV copy and read it back as raw bytes.
  path = name + ".csv"
  dataframe.to_csv(path, index=False)
  with open(path, "rb") as csv_handle:
    payload = csv_handle.read()

  # Upload the bytes into the keywords/keywords_topics export folder.
  destination_url = folder_analysis_url_shrpt + "keywords/" + "keywords_topics"
  destination = ctx.web.get_folder_by_server_relative_url(destination_url)
  destination.upload_file(os.path.basename(path), payload).execute_query()

In [None]:
# Export: steps
upload_csv_to_target_folder(
    topic_model=steps_keywords_topic_model,
    documents=steps_keywords,
    dataframe=steps_keywords_df,
    name="steps_keywords_topic",
)

In [None]:
# Export: steps digital
upload_csv_to_target_folder(
    topic_model=steps_digital_keywords_topic_model,
    documents=steps_digital_keywords,
    dataframe=steps_digital_keywords_df,
    name="steps_digital_keywords_topic",
)

In [None]:
# Export: process
upload_csv_to_target_folder(
    topic_model=process_keywords_topic_model,
    documents=process_keywords,
    dataframe=process_keywords_df,
    name="process_keywords_topic",
)

In [None]:
# Export: process evidences
upload_csv_to_target_folder(
    topic_model=process_evidences_keywords_topic_model,
    documents=process__evidences_keywords,
    dataframe=process_evidences_keywords_df,
    name="process_evidences_keywords_topic",
)

In [None]:
# Export: process conditions
upload_csv_to_target_folder(
    topic_model=process_conditions_keywords_topic_model,
    documents=process__conditions_keywords,
    dataframe=process_conditions_keywords_df,
    name="process_conditions_keywords_topic",
)

In [None]:
# Export: process rules
upload_csv_to_target_folder(
    topic_model=process_rules_keywords_topic_model,
    documents=process__rules_keywords,
    dataframe=process_rules_keywords_df,
    name="process_rules_keywords_topic",
)