#Connect to sharepoint

In [1]:
!pip install office365
!pip install Office365-REST-Python-Client

Collecting office365
  Downloading office365-0.3.15-py3-none-any.whl (32 kB)
Collecting azure-storage-blob (from office365)
  Downloading azure_storage_blob-12.17.0-py3-none-any.whl (388 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m388.0/388.0 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting O365 (from office365)
  Downloading O365-2.0.27-py3-none-any.whl (164 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m164.2/164.2 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pymiscutils (from office365)
  Downloading pymiscutils-0.3.14-py3-none-any.whl (14 kB)
Collecting pathmagic (from office365)
  Downloading pathmagic-0.3.14-py3-none-any.whl (21 kB)
Collecting pyiotools (from office365)
  Downloading pyiotools-0.3.18-py3-none-any.whl (47 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.1/47.1 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pysubtypes (from office365)
  Downloading pysub

In [2]:
from office365.runtime.auth.authentication_context import AuthenticationContext
from office365.sharepoint.client_context import ClientContext
from office365.runtime.auth.client_credential import ClientCredential
from office365.sharepoint.files.file import File

####inputs########
# This will be the URL that points to your sharepoint site.
# Make sure you change only the parts of the link that start with "Your"
url_shrpt = 'https://ihuedu.sharepoint.com/sites/EDYTEProject2023/'
username_shrpt = '###############'
password_shrpt = '###############'
folder_files_url_shrpt = '/sites/EDYTEProject2023/Shared%20Documents/General/wp5_data/raw_Data/'
folder_analysis_url_shrpt = '/sites/EDYTEProject2023/Shared%20Documents/General/wp5_data/analysis_exports/'

In [3]:
###Authentication###For authenticating into your sharepoint site###
ctx_auth = AuthenticationContext(url_shrpt)
if ctx_auth.acquire_token_for_user(username_shrpt, password_shrpt):
  ctx = ClientContext(url_shrpt, ctx_auth)
  web = ctx.web
  ctx.load(web)
  ctx.execute_query()
  print('Authenticated into sharepoint as: ',web.properties['Title'])

else:
  print(ctx_auth.get_last_error())
############################

Authenticated into sharepoint as:  EDYTE Project 2023


In [4]:
####Function for extracting the file names of a folder in sharepoint###
###If you want to extract the folder names instead of file names, you have to change "sub_folders = folder.files" to "sub_folders = folder.folders" in the below function

global print_folder_contents
def print_folder_contents(ctx, folder_url):
    try:

        folder = ctx.web.get_folder_by_server_relative_url(folder_url)
        fold_names = []
        sub_folders = folder.files #Replace files with folders for getting list of folders
        ctx.load(sub_folders)
        ctx.execute_query()

        for s_folder in sub_folders:

            fold_names.append(s_folder.properties["Name"])

        return fold_names

    except Exception as e:
        print('Problem printing out library contents: ', e)
######################################################

# Call the function by giving your folder URL as input
filelist_shrpt=print_folder_contents(ctx,folder_files_url_shrpt)
#Print the list of files present in the folder
print(filelist_shrpt)

['process-provision-digital-locations.csv', 'process-steps-digital.csv', 'process-steps.csv', 'process-evidences-cost.csv', 'process.csv', 'process-rules.csv', 'process-conditions.csv', 'process-evidences.csv']


# Import Data

In [5]:
import pandas as pd
import csv
import numpy as np
import io
import os
import tempfile

In [6]:
def process_csv_file(file_name, column_names):
    file_url= folder_files_url_shrpt+file_name
    response = File.open_binary(ctx, file_url)  # Assuming File is imported and ctx is defined

    df = pd.read_csv(io.BytesIO(response.content))
    df = df.loc[:, column_names]
    df = df.dropna(subset=column_names[-1])

    return df

In [7]:
process_description = process_csv_file("process.csv", ['id', 'title_el', 'description'])
process_remarks = process_csv_file("process.csv", ['id', 'title_el', 'remarks'])
process_evidences = process_csv_file("process-evidences.csv", ['process_id', 'evidence_num_id', 'ihu_unique_evidence_id', 'evidence_description'])
process_conditions = process_csv_file("process-conditions.csv", ['process_id', 'conditions_num_id', 'ihu_unique_condition_id', 'conditions_name'])

# Import Models

In [8]:
import re
import pandas as pd

In [9]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
greek_stopwords = stopwords.words('greek')

new_words = ['της', 'τη', 'του', 'από', 'την', 'και', 'εώς', 'εως']

for word in new_words:
  greek_stopwords.append(word)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Regulation Extraction

In [10]:
def remove_stopwords(regulation):
  words = regulation.split()
  regulation = ' '.join([word for word in words if word not in greek_stopwords])
  return regulation

In [11]:
def is_date(regulation):
    # Check if the regulation is only a date
    if re.match(r'\d{1,2}\/\d{1,2}(?:\/\d{2,4})?', regulation):
        return True
    return False

In [12]:
def extract_regulations(df, column_name, id):

    column = column_name
    id = id
    pattern = r'(\S+\s(?:\d+\/(?:\d+(?:\.\d+)*(?:\/\d+(?:-\d+)?)?)?(?:-\d+)?))'

    extracted_regulations = []

    # Iterate over the 'step_description' column
    for index, row in df.iterrows():
        target_column = row[column]

        # Use the pattern to extract regulations
        matches = re.findall(pattern, target_column)

        for regulation in matches:
          regulation = remove_stopwords(regulation)
          # Check if regulation is only a date after removing stopwords
          if is_date(regulation):
              continue

          extracted_regulations.append({id: row[id], 'regulation': regulation})

    results = pd.DataFrame(extracted_regulations)
    return results

#Analysis

In [13]:
results_process_description = extract_regulations(process_description, "description", "id")
regulations_process_description = results_process_description.drop_duplicates()

results_process_remarks = extract_regulations(process_remarks, "remarks", "id")
regulations_process_remarks = results_process_remarks.drop_duplicates()


results_process_evidences = extract_regulations(process_evidences, "evidence_description", "ihu_unique_evidence_id")
regulations_process_evidences = results_process_evidences.drop_duplicates()


results_process_conditions = extract_regulations(process_conditions, "conditions_name", "ihu_unique_condition_id")
regulations_process_conditions = results_process_conditions.drop_duplicates()

In [14]:
# Concatenate the data frames
combined = pd.concat([regulations_process_description, regulations_process_remarks, regulations_process_evidences, regulations_process_conditions], ignore_index=True, sort=False)

# Reorder the columns
combined = combined[['id', 'ihu_unique_evidence_id', 'ihu_unique_condition_id', 'regulation']]

# Assign ID

In [15]:
def add_entity_ids(df):
    # Extract unique keywords
    keywords = set(df['regulation'])

    # Assign IDs to keywords
    keyword_ids = {keyword: i for i, keyword in enumerate(keywords, start=1)}

    # Create the dictionary of unique keywords and IDs
    keyword_dict = {keyword: keyword_ids[keyword] for keyword in keywords}

    # Add the "Entity ID" column to the dataframe
    df['Regulation ID'] = df['regulation'].apply(lambda x: keyword_ids[x])

    return df, keyword_dict

In [16]:
final_regulations, regulations_dict = add_entity_ids(combined)
final_regulations = final_regulations.drop_duplicates()

In [28]:
final_regulations['id'] = final_regulations['id'].apply(lambda x: str(int(x)) if not pd.isna(x) else x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_regulations['id'] = final_regulations['id'].apply(lambda x: str(int(x)) if not pd.isna(x) else x)


In [17]:
def upload_ids_to_target_folder(dic, name):

  path = name+".csv"

  # open file for writing, "w" is writing
  w = csv.writer(open(path, "w"))

  # loop over dictionary keys and values
  for key, val in dic.items():

      # write every key and value to file
      w.writerow([key, val])

  url=folder_analysis_url_shrpt+"regulations_extraction"
  target_folder = ctx.web.get_folder_by_server_relative_url(url)

  with open(path, "rb") as content_file:
      file_content = content_file.read()
      target_folder.upload_file(os.path.basename(path), file_content).execute_query()

In [18]:
upload_ids_to_target_folder(regulations_dict, "regulations_dict")

split dataframe

In [19]:
def filter_dataframe_by_id(df, column_name):
    filtered_df = df[df[column_name].notnull()]

    # Select the desired columns for the new dataframe
    selected_columns = [column_name, "regulation", "Regulation ID"]
    new_df = filtered_df[selected_columns].reset_index(drop=True)

    return new_df

In [24]:
def upload_entity_files_to_target_folder(unique_id, name):

  result = filter_dataframe_by_id(final_regulations, unique_id)

  path = name+".csv"

  result.to_csv(path, index=False)

  url=folder_analysis_url_shrpt+"regulations_extraction"
  target_folder = ctx.web.get_folder_by_server_relative_url(url)
  with open(path, "rb") as content_file:
      file_content = content_file.read()
      target_folder.upload_file(os.path.basename(path), file_content).execute_query()

In [30]:
upload_entity_files_to_target_folder("id", "process_regulations")

In [26]:
upload_entity_files_to_target_folder("ihu_unique_evidence_id", "evidences_regulations")

In [27]:
upload_entity_files_to_target_folder("ihu_unique_condition_id", "conditions_regulations")