# Connect to Sharepoint

In [1]:
!pip install office365
!pip install Office365-REST-Python-Client

Collecting office365
  Downloading office365-0.3.15-py3-none-any.whl (32 kB)
Collecting azure-storage-blob (from office365)
  Downloading azure_storage_blob-12.17.0-py3-none-any.whl (388 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m388.0/388.0 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting O365 (from office365)
  Downloading O365-2.0.27-py3-none-any.whl (164 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m164.2/164.2 kB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pymiscutils (from office365)
  Downloading pymiscutils-0.3.14-py3-none-any.whl (14 kB)
Collecting pathmagic (from office365)
  Downloading pathmagic-0.3.14-py3-none-any.whl (21 kB)
Collecting pyiotools (from office365)
  Downloading pyiotools-0.3.18-py3-none-any.whl (47 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.1/47.1 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pysubtypes (from office365)
  Downloading pysub

In [2]:
from office365.runtime.auth.authentication_context import AuthenticationContext
from office365.sharepoint.client_context import ClientContext
from office365.runtime.auth.client_credential import ClientCredential
from office365.sharepoint.files.file import File

####inputs########
# This will be the URL that points to your sharepoint site.
# Make sure you change only the parts of the link that start with "Your"
url_shrpt = 'https://ihuedu.sharepoint.com/sites/EDYTEProject2023/'
username_shrpt = '################'
password_shrpt = '################'
folder_files_url_shrpt = '/sites/EDYTEProject2023/Shared%20Documents/General/wp5_data/analysis_exports/topics_extraction/'
folder_analysis_url_shrpt = '/sites/EDYTEProject2023/Shared%20Documents/General/wp5_data/analysis_exports/'

In [3]:
###Authentication###For authenticating into your sharepoint site###
ctx_auth = AuthenticationContext(url_shrpt)
if ctx_auth.acquire_token_for_user(username_shrpt, password_shrpt):
  ctx = ClientContext(url_shrpt, ctx_auth)
  web = ctx.web
  ctx.load(web)
  ctx.execute_query()
  print('Authenticated into sharepoint as: ',web.properties['Title'])

else:
  print(ctx_auth.get_last_error())
############################

Authenticated into sharepoint as:  EDYTE Project 2023


In [4]:
####Function for extracting the file names of a folder in sharepoint###
###If you want to extract the folder names instead of file names, you have to change "sub_folders = folder.files" to "sub_folders = folder.folders" in the below function

global print_folder_contents
def print_folder_contents(ctx, folder_url):
    try:

        folder = ctx.web.get_folder_by_server_relative_url(folder_url)
        fold_names = []
        sub_folders = folder.files #Replace files with folders for getting list of folders
        ctx.load(sub_folders)
        ctx.execute_query()

        for s_folder in sub_folders:

            fold_names.append(s_folder.properties["Name"])

        return fold_names

    except Exception as e:
        print('Problem printing out library contents: ', e)
######################################################

# Call the function by giving your folder URL as input
filelist_shrpt=print_folder_contents(ctx,folder_files_url_shrpt)
#Print the list of files present in the folder
print(filelist_shrpt)

['Topic Modeling - Per File.ipynb', 'conditions_topics.csv', 'steps_digital_topics_dic.csv', 'step_topics.csv', 'evidences_topics_dic.csv', 'step_topics_dic.csv', 'process_title_topics_dic.csv', 'steps_digital_topics.csv', 'evidences_topics.csv', 'conditions_topics_dic.csv', 'process_title_topics.csv']


# Data


In [5]:
import pandas as pd
import csv
import numpy as np
import io
import os
import tempfile

In [6]:
def process_csv_file(file_name):
    file_url= folder_files_url_shrpt+file_name
    response = File.open_binary(ctx, file_url)  # Assuming File is imported and ctx is defined
    df = pd.read_csv(io.BytesIO(response.content))
    return df

In [67]:
steps_topics = process_csv_file("step_topics.csv")
steps_digital_topics = process_csv_file("steps_digital_topics.csv")
process_title_topics = process_csv_file("process_title_topics.csv")
evidences_topics = process_csv_file("evidences_topics.csv")
conditions_topics = process_csv_file("conditions_topics.csv")

# **Semantic Similarity**



## FAISS optimized


In [9]:
!pip install faiss-cpu
!pip install sentence-transformers

Collecting faiss-cpu
  Downloading faiss_cpu-1.7.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m87.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.7.4
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0 (from sentence-transformers)
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m68.7 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from sentence-transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)


In [66]:
import torch
from sentence_transformers import SentenceTransformer, util
import numpy as np
import faiss
from tqdm import tqdm

def get_sentence_embeddings(sentences, model):
    sentence_embeddings = model.encode(sentences, convert_to_tensor=True)
    return sentence_embeddings.cpu()  # Move the embeddings to the CPU

def build_faiss_index(embeddings):
    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings)
    return index

def find_most_similar_sentence(query_embedding, index, embeddings, similarity_threshold, kept_sentences, kept_ids):
    similarity_scores = util.pytorch_cos_sim(query_embedding, embeddings)[0]
    top_similarities = similarity_scores[similarity_scores >= similarity_threshold]
    if len(top_similarities) > 0:
        max_similarity = top_similarities.max().item()
        max_similarity_index = np.argmax(similarity_scores)
        if max_similarity >= similarity_threshold:
            return kept_sentences[max_similarity_index], max_similarity, kept_ids[max_similarity_index]
    return '', 0.0, None

def filter_similar_sentences(dataframe, id_column_name, similarity_threshold=0.95):
    model_name = 'paraphrase-multilingual-mpnet-base-v2'
    model = SentenceTransformer(model_name)

    # Create new columns to store the most similar sentence, similarity measure, and the ID of the most similar sentence
    dataframe['Most_Similar_Sentence'] = ''
    dataframe['Similarity_Measure'] = 0.0
    dataframe[id_column_name + '_mss'] = None

    # Initialize the kept_sentences list, embeddings, and IDs
    kept_sentences = []
    kept_embeddings = []
    kept_ids = []
    kept_index = None

    # Iterate over the rows in the dataframe
    for i, row in tqdm(dataframe.iterrows(), total=len(dataframe), desc="Processing"):
        curr_id = row[id_column_name]
        curr_sentence = row['Document']

        if isinstance(curr_sentence, str):  # Check if the sentence is a valid string
            curr_embedding = get_sentence_embeddings([curr_sentence], model)[0]  # Pass the model as an argument
            most_similar_sentence = ''
            max_similarity = 0.0
            most_similar_id = None

            # Compare with the previously kept sentences using FAISS index
            if kept_index is not None:
                most_similar_sentence, max_similarity, most_similar_id = find_most_similar_sentence(
                    curr_embedding, kept_index, kept_embeddings, similarity_threshold, kept_sentences, kept_ids
                )

            # If a similar sentence is found, update the most similar sentence, similarity measure, and ID columns
            if most_similar_sentence != '':
                dataframe.loc[i, 'Most_Similar_Sentence'] = most_similar_sentence
                dataframe.loc[i, 'Similarity_Measure'] = max_similarity
                dataframe.loc[i, id_column_name + '_mss'] = most_similar_id
            else:
                # If no similar sentence is found, keep the original text and ID instead of the embedding
                kept_sentences.append(curr_sentence)
                kept_ids.append(curr_id)
                dataframe.loc[i, 'Most_Similar_Sentence'] = curr_sentence
                dataframe.loc[i, 'Similarity_Measure'] = 1.0  # Set similarity measure to 1.0 for the original sentence
                dataframe.loc[i, id_column_name + '_mss'] = most_similar_id


            # Check if the current sentence is similar to the kept sentences
            if most_similar_sentence == '' or max_similarity < similarity_threshold:
                kept_embeddings.append(curr_embedding.cpu().numpy().tolist())  # Convert numpy array to Python list
                if kept_index is None:
                    kept_index = build_faiss_index(np.array([curr_embedding.cpu().numpy()]))
                else:
                    kept_index.add(np.array([curr_embedding.cpu().numpy()]))

        else:
            dataframe.loc[i, 'Most_Similar_Sentence'] = np.nan
            dataframe.loc[i, 'Similarity_Measure'] = np.nan
            dataframe.loc[i, id_column_name + '_mss'] = None

    return dataframe

In [68]:
def process_topics(result, column_name):
    topics_num = result["Topic"].nunique() - 1
    final_df = pd.DataFrame()

    for j in range(-1, topics_num):
        sentences = result.loc[result['Topic'] == j]
        filtered_df = filter_similar_sentences(sentences, column_name, similarity_threshold=0.93)

        final_df = pd.concat([final_df, filtered_df], ignore_index=True)
        #print(j, final_steps)

    return final_df

# Run the process

## Steps Topic

In [76]:
steps_similarity_process = process_topics(steps_topics, "ihu_unique_step_id")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe['Most_Similar_Sentence'] = ''
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe['Similarity_Measure'] = 0.0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe[id_column_name + '_mss'] = None
Processing: 100%|██████████| 1995/1995 [03:23<00:00,  9.82it/s]
A value is trying to 

In [77]:
steps_similarity_process.dropna(subset=['ihu_unique_step_id_mss'], inplace=True)
steps_similarity = steps_similarity_process.reindex(columns=["ihu_unique_step_id", "Document", "Most_Similar_Sentence", "Similarity_Measure", "ihu_unique_step_id_mss"])

In [78]:
steps_similarity

Unnamed: 0,ihu_unique_step_id,Document,Most_Similar_Sentence,Similarity_Measure,ihu_unique_step_id_mss
12,466053_stp_05,Αναγνώριση πλασματικού χρόνου,Αναγνώριση πλασματικού χρόνου,1.000000,214376_stp_05
24,915874_stp_06,Επιτυχία εισόδου στο σύστημα,Επιτυχία εισόδου στο σύστημα,1.000000,732020_stp_02
25,915874_stp_07,Αποτυχία εισόδου στο σύστημα,Αποτυχία εισόδου στο σύστημα,1.000000,732020_stp_03
38,468536_stp_10,Αρνητικό αποτέλεσμα ελέγχου,Αρνητικό αποτέλεσμα ελέγχου,1.000000,991597_stp_08
39,854144_stp_09,Αρνητικό αποτέλεσμα ελέγχου,Αρνητικό αποτέλεσμα ελέγχου,1.000000,991597_stp_08
...,...,...,...,...,...
22993,295685_stp_08,Μη έκδοση πιστοποιητικού - Ενημέρωση ενδιαφερο...,Μη έκδοση του πιστοποιητικού - Ενημέρωση ενδια...,0.995112,491108_stp_06
22994,673186_stp_08,Μη έκδοση πιστοποιητικού - Ενημέρωση ενδιαφερο...,Μη έκδοση του πιστοποιητικού - Ενημέρωση ενδια...,0.995112,491108_stp_06
22995,554361_stp_10,Μη έκδοση του πιστοποιητικού εξαγωγής εκτός τη...,Μη έκδοση του πιστοποιητικού εξαγωγής εκτός τη...,1.000000,701482_stp_10
22996,137462_stp_08,Μη έκδοση πιστοποιητικού - Ενημέρωση ενδιαφερο...,Μη έκδοση του πιστοποιητικού - Ενημέρωση ενδια...,0.995112,491108_stp_06


## Steps Digital

In [69]:
steps_digital_similarity_process = process_topics(steps_digital_topics, "ihu_unique_step_digital_id")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe['Most_Similar_Sentence'] = ''
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe['Similarity_Measure'] = 0.0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe[id_column_name + '_mss'] = None
Processing: 100%|██████████| 274/274 [00:07<00:00, 35.66it/s]
A value is trying to be

In [70]:
steps_digital_similarity_process.dropna(subset=['ihu_unique_step_digital_id_mss'], inplace=True)
steps_digital_similarity = steps_digital_similarity_process.reindex(columns=["ihu_unique_step_digital_id", "Document", "Most_Similar_Sentence", "Similarity_Measure", "ihu_unique_step_digital_id_mss"])

In [74]:
steps_digital_similarity

Unnamed: 0,ihu_unique_step_digital_id,Document,Most_Similar_Sentence,Similarity_Measure,ihu_unique_step_digital_id_mss
12,533822_stdg_08,Εκτύπωση της Βεβαίωσης,Εκτύπωση Βεβαίωσης,0.948652,437870_stdg_08
18,819823_stdg_03,Επιλογή Περιφερειακού Γραφείου/Αυτοτελές Κλιμά...,Επιλογή Περιφερειακού Γραφείου/Αυτοτελές Κλιμά...,0.996515,977567_stdg_03
23,499131_stdg_03,Εκτύπωση Βεβαίωσης,Εκτύπωση Βεβαίωσης,1.000000,437870_stdg_08
31,386245_stdg_03,Επιλογή: Έκδοση Βεβαίωσης,Επιλογή: Έκδοση Βεβαίωσης,1.000000,374859_stdg_03
36,445356_stdg_03,Εκτύπωση Βεβαίωσης,Εκτύπωση Βεβαίωσης,1.000000,437870_stdg_08
...,...,...,...,...,...
1653,485754_stdg_07,Έλεγχος και Επιβεβαίωση Πληρωμής,Έλεγχος και επιβεβαίωση διατραπεζικής πληρωμής,0.952558,699244_stdg_07
1655,921188_stdg_05,Έγκριση και ανάρτηση βεβαίωσης από αρμόδιο υπά...,Έγκριση και ανάρτηση βεβαίωσης από αρμόδιο υπά...,1.000000,407009_stdg_05
1665,894344_stdg_11,Έκδοση ληξιαρχικής πράξης (απόσπασμα),Έκδοση ληξιαρχικής πράξης (απόσπασμα),1.000000,912456_stdg_11
1668,167470_stdg_13,Έκδοση ληξιαρχικής πράξης (απόσπασμα),Έκδοση ληξιαρχικής πράξης (απόσπασμα),1.000000,912456_stdg_11


## Process Title

In [80]:
process_title_similarity_process = process_topics(process_title_topics, "id")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe['Most_Similar_Sentence'] = ''
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe['Similarity_Measure'] = 0.0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe[id_column_name + '_mss'] = None
Processing: 100%|██████████| 395/395 [00:15<00:00, 25.07it/s]
A value is trying to be

In [81]:
process_title_similarity_process.dropna(subset=['id_mss'], inplace=True)
process_title_similarity = process_title_similarity_process.reindex(columns=["id", "Document", "Most_Similar_Sentence", "Similarity_Measure", "id_mss"])

In [82]:
process_title_similarity

Unnamed: 0,id,Document,Most_Similar_Sentence,Similarity_Measure,id_mss
29,963649,Εποπτεία Οργανισμών Αξιολόγησης της Συμμόρφωση...,Έγκριση (Αναγνώριση) Οργανισμών Αξιολόγησης τη...,0.978632,955783
35,926290,Άρση Απόσπασης Υπαλλήλου στο Γραφείο Έλληνα Επ...,Απόσπαση Υπαλλήλου στο Γραφείο Έλληνα Επιτρόπο...,0.939425,542776
75,224693,Δήλωση Συνέχισης Άσκησης Γεωτεχνικού Επαγγέλμα...,Αναγγελία Άσκησης Επαγγέλματος Γεωτεχνικού: Γε...,0.951507,945847
83,599736,Άδεια Λειτουργίας Μονάδας Φροντίδας Ηλικιωμένων,Αναθεώρηση Άδειας Λειτουργίας Μονάδας Φροντίδα...,0.947820,933967
129,310513,Χαρακτηρισμός Δημόσιας Βιβλιοθήκης ως «Ιστορικής»,Χαρακτηρισμός Δημόσιας Βιβλιοθήκης ως «Κεντρικής»,0.952064,325221
...,...,...,...,...,...
2980,690185,Άδεια για απευθείας προμήθεια πετρελαιοειδών α...,Άδεια για απευθείας προμήθεια πετρελαιοειδών α...,0.958805,702322
2981,894864,Άδεια για απευθείας προμήθεια πετρελαιοειδών α...,Άδεια για απευθείας προμήθεια πετρελαιοειδών α...,0.944799,702322
2982,454783,Άδεια εμπορίας πετρελαιοειδών προϊόντων για τα...,Άδεια εμπορίας πετρελαιοειδών προϊόντων για τα...,0.940588,512142
2983,739974,Άδεια για απευθείας προμήθεια πετρελαιοειδών α...,Άδεια για απευθείας προμήθεια πετρελαιοειδών α...,0.993456,702322


## Evidences

In [84]:
evidences_similarity_process = process_topics(evidences_topics, "ihu_unique_evidence_id")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe['Most_Similar_Sentence'] = ''
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe['Similarity_Measure'] = 0.0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe[id_column_name + '_mss'] = None
Processing: 100%|██████████| 866/866 [00:48<00:00, 17.86it/s]
A value is trying to be

In [88]:
evidences_similarity_process.dropna(subset=['ihu_unique_evidence_id_mss'], inplace=True)
evidences_similarity = evidences_similarity_process.reindex(columns=["ihu_unique_evidence_id", "Document", "Most_Similar_Sentence", "Similarity_Measure", "ihu_unique_evidence_id_mss"])

In [89]:
evidences_similarity

Unnamed: 0,ihu_unique_evidence_id,Document,Most_Similar_Sentence,Similarity_Measure,ihu_unique_evidence_id_mss
36,197055_ev_02,ΕΝΤΥΠΟ Β: Ωρολόγιο πρόγραμμα σεμιναρίου,ΕΝΤΥΠΟ Β: Ωρολόγιο πρόγραμμα σεμιναρίου,1.000000,194624_ev_04
46,716344_ev_06,Η γνώση χειρισμού Η/Υ στα αντικείμενα: (1) επε...,Η γνώση χειρισμού Η/Υ στα αντικείμενα: (1) επε...,1.000000,557451_ev_08
56,344649_ev_16,Βεβαίωση ανάληψης καθηκόντων εκπαιδευτικών που...,Βεβαίωση ανάληψης καθηκόντων εκπαιδευτικών που...,1.000000,814084_ev_13
57,344649_ev_17,Βεβαίωση εντοπιότητας των γονέων των εκπαιδευτ...,Βεβαίωση εντοπιότητας των γονέων των εκπαιδευτ...,1.000000,814084_ev_22
71,511484_ev_23,ΥΠΥ21 1 Απόφαση Παραπομπής σε διαδικασία προσδ...,ΥΠΥ21 Πράξη προσδιορισμού ηλικίας,0.945117,511484_ev_22
...,...,...,...,...,...
13905,883351_ev_02,Πιστοποιητικό της αρμόδιας για το επάγγελμα αρ...,Πιστοποιητικό της αρμόδιας για το επάγγελμα αρ...,0.973244,879124_ev_01
13906,535037_ev_02,Πιστοποιητικό της αρμόδιας για το επάγγελμα αρ...,Πιστοποιητικό της αρμόδιας για το επάγγελμα αρ...,0.973244,879124_ev_01
13907,477946_ev_02,Πιστοποιητικό της αρμόδιας για το επάγγελμα αρ...,Πιστοποιητικό της αρμόδιας για το επάγγελμα αρ...,0.973244,879124_ev_01
13909,962443_ev_23,Υπεύθυνη Δήλωσή του Διευθυντή Κατάρτιση ότι δε...,Υπεύθυνη Δήλωσή του Διευθυντή Κατάρτιση ότι δε...,1.000000,964979_ev_18


## Conditions

In [90]:
conditions_similarity_process = process_topics(conditions_topics, "ihu_unique_condition_id")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  dataframe[id_column_name + '_mss'] = None
Processing: 100%|██████████| 51/51 [00:01<00:00, 43.12it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe['Most_Similar_Sentence'] = ''
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe['Similarity_Measure'] = 0.0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/index

In [91]:
conditions_similarity_process.dropna(subset=['ihu_unique_condition_id_mss'], inplace=True)
conditions_similarity = conditions_similarity_process.reindex(columns=["ihu_unique_condition_id", "Document", "Most_Similar_Sentence", "Similarity_Measure", "ihu_unique_condition_id_mss"])

In [92]:
conditions_similarity

Unnamed: 0,ihu_unique_condition_id,Document,Most_Similar_Sentence,Similarity_Measure,ihu_unique_condition_id_mss
9,912456_con_07,Άμεσα ηλεκτρονικά διαθέσιμες είναι οι ληξιαρχι...,Άμεσα ηλεκτρονικά διαθέσιμες είναι οι ληξιαρχι...,0.979506,106576_con_05
22,421727_con_02,Εκούσια διακοπή συνεργασίας με τον αποσπαμένο ...,Εκούσια διακοπή συνεργασίας με τον αποσπασμένο...,0.993514,795899_con_02
23,347711_con_02,Εκούσια διακοπή συνεργασίας με τον αποσπασμένο...,Εκούσια διακοπή συνεργασίας με τον αποσπασμένο...,1.000000,795899_con_02
34,716344_con_08,Να έχουν γνώση χειρισμού Η/Υ στα αντικείμενα: ...,Να έχουν γνώση χειρισμού Η/Υ στα αντικείμενα: ...,1.000000,557451_con_10
39,488870_con_06,Τα τέκνα να είναι ανήλικα.,Τα τέκνα να είναι ανήλικα.,1.000000,449862_con_04
...,...,...,...,...,...
10473,515758_con_01,Η εκκαθαρίστρια υποχρεούται να έχει έδρα στην ...,Η εκκαθαρίστρια εταιρεία υποχρεούται να έχει έ...,0.974022,822349_con_01
10474,515758_con_10,"Ειδικά για την αναγνώριση αλλοδαπών εταιρειών,...","Ειδικά για την αναγνώριση αλλοδαπών εταιρειών,...",1.000000,822349_con_10
10487,838145_con_03,Ο έλεγχος αφορά σε υπαλλήλους του Υπουργείου Ο...,Ο έλεγχος αφορά σε υπαλλήλους του Υπουργείου Ο...,0.950781,838145_con_01
10506,561816_con_01,Σε όλες τις περιπτώσεις που τα υπο διαμετακόμι...,Σε όλες τις περιπτώσεις που τα υπό διαμετακόμι...,0.994622,725749_con_01


# Save

## Functions

In [72]:
def upload_csv_to_target_folder(similarity_name, dataframe):
  path = similarity_name+".csv"

  dataframe.to_csv(path, index=False)

  url=folder_analysis_url_shrpt+"semantic_similarity"
  target_folder = ctx.web.get_folder_by_server_relative_url(url)
  with open(path, "rb") as content_file:
      file_content = content_file.read()
      target_folder.upload_file(os.path.basename(path), file_content).execute_query()

## Save

Steps

In [79]:
upload_csv_to_target_folder("steps_similarity", steps_similarity)

Steps Digital

In [75]:
upload_csv_to_target_folder("steps_digital_similarity", steps_digital_similarity)

Process Title

In [83]:
upload_csv_to_target_folder("process_title_similarity", process_title_similarity)

Evidences

In [93]:
upload_csv_to_target_folder("evidences_similarity", evidences_similarity)

Conditions

In [94]:
upload_csv_to_target_folder("conditions_similarity", conditions_similarity)