From a65319e8df833e6c11cfa238f3cc8816e450a4f7 Mon Sep 17 00:00:00 2001 From: praveen-elastic Date: Tue, 1 Mar 2022 20:17:49 +0530 Subject: [PATCH 1/9] added concurrency and other changes from stacked PRs --- ees_sharepoint/deletion_sync_command.py | 31 ++- ees_sharepoint/fetch_index.py | 313 ++++++++++++++---------- ees_sharepoint/schema.py | 8 +- ees_sharepoint/utils.py | 76 +++++- sharepoint_server_2016_connector.yml | 2 + 5 files changed, 287 insertions(+), 143 deletions(-) diff --git a/ees_sharepoint/deletion_sync_command.py b/ees_sharepoint/deletion_sync_command.py index 6bebb9d..1b2abb6 100644 --- a/ees_sharepoint/deletion_sync_command.py +++ b/ees_sharepoint/deletion_sync_command.py @@ -13,9 +13,11 @@ import requests from .base_command import BaseCommand - +from .utils import split_list_in_chunks IDS_PATH = os.path.join(os.path.dirname(__file__), 'doc_id.json') +# By default, Enterprise Search configuration has a maximum allowed limit set to 100 documents for an api request +BATCH_SIZE = 100 class DeletionSyncCommand(BaseCommand): @@ -38,7 +40,7 @@ def deindexing_items(self, collection, ids, key): logger = self.logger delete_ids_items = ids["delete_keys"][collection].get(key) - logger.info("Deindexing items...") + logger.info(f"Deindexing {key}...") if delete_ids_items: delete_site = [] global_ids_items = ids["global_keys"][collection][key] @@ -56,9 +58,10 @@ def deindexing_items(self, collection, ids, key): if resp.status_code == requests.codes['not_found'] or result == []: doc.append(item_id) if doc: - self.workplace_search_client.delete_documents( - content_source_id=self.ws_source, - document_ids=doc) + for chunk in split_list_in_chunks(doc, BATCH_SIZE): + self.workplace_search_client.delete_documents( + content_source_id=self.ws_source, + document_ids=chunk) updated_items = global_ids_items[site_url].get(list_id) if updated_items is None: continue @@ -96,11 +99,12 @@ def deindexing_lists(self, collection, ids): for list_id in list_details.keys(): url = f"{site_url}/_api/web/lists(guid\'{list_id}\')" resp = self.sharepoint_client.get(url, '', "deindex") - if resp and resp.status_code == requests.codes['not_found']: + if resp is not None and resp.status_code == requests.codes['not_found']: doc.append(list_id) - self.workplace_search_client.delete_documents( - content_source_id=self.ws_source, - document_ids=doc) + for chunk in split_list_in_chunks(doc, BATCH_SIZE): + self.workplace_search_client.delete_documents( + content_source_id=self.ws_source, + document_ids=chunk) for list_id in doc: if list_id in global_ids_lists[site_url]: global_ids_lists[site_url].pop(list_id) @@ -126,11 +130,12 @@ def deindexing_sites(self, collection, ids): for site_id, site_url in site_details.items(): url = f"{site_url}/_api/web" resp = self.sharepoint_client.get(url, '', "deindex") - if resp and resp.status_code == requests.codes['not_found']: + if resp is not None and resp.status_code == requests.codes['not_found']: doc.append(site_id) - self.workplace_search_client.delete_documents( - content_source_id=self.ws_source, - document_ids=doc) + for chunk in split_list_in_chunks(doc, BATCH_SIZE): + self.workplace_search_client.delete_documents( + content_source_id=self.ws_source, + document_ids=chunk) for site_id in doc: ids["global_keys"][collection]["sites"].pop(site_id) else: diff --git a/ees_sharepoint/fetch_index.py b/ees_sharepoint/fetch_index.py index 44ee221..36b32bb 100644 --- a/ees_sharepoint/fetch_index.py +++ b/ees_sharepoint/fetch_index.py @@ -16,10 +16,11 @@ from dateutil.parser import 
parse from tika.tika import TikaException +from multiprocessing.pool import ThreadPool from .checkpointing import Checkpoint from .usergroup_permissions import Permissions -from .utils import encode, extract +from .utils import encode, extract, partition_equal_share, split_list_in_chunks, get_partition_time, split_dict_in_chunks from . import adapter IDS_PATH = os.path.join(os.path.dirname(__file__), 'doc_id.json') @@ -31,8 +32,7 @@ LISTS = "lists" LIST_ITEMS = "list_items" DRIVE_ITEMS = "drive_items" -DOCUMENT_SIZE = 100 -DATETIME_FORMAT = "%Y-%m-%dT%H:%M:%SZ" +BATCH_SIZE = 100 def get_results(logger, response, entity_name): @@ -66,21 +66,20 @@ def __init__(self, config, logger, workplace_search_client, sharepoint_client, s self.enable_permission = config.get_value("enable_document_permission") self.start_time = start_time self.end_time = end_time + self.max_threads = config.get_value("max_threads") self.mapping_sheet_path = config.get_value("sharepoint_workplace_user_mapping") self.checkpoint = Checkpoint(config, logger) self.permissions = Permissions(self.sharepoint_client, self.workplace_search_client, logger) - def index_document(self, document, parent_object, param_name): + def index_document(self, document, param_name): """ This method indexes the documents to the workplace. :param document: document to be indexed - :param parent_object: parent of the objects to be indexed :param param_name: parameter name whether it is SITES, LISTS LIST_ITEMS OR DRIVE_ITEMS """ if document: total_documents_indexed = 0 - document_list = [document[i * DOCUMENT_SIZE:(i + 1) * DOCUMENT_SIZE] for i in range((len(document) + DOCUMENT_SIZE - 1) // DOCUMENT_SIZE)] - for chunk in document_list: + for chunk in split_list_in_chunks(document, BATCH_SIZE): response = self.workplace_search_client.index_documents( content_source_id=self.ws_source, documents=chunk @@ -90,8 +89,21 @@ def index_document(self, document, parent_object, param_name): total_documents_indexed += 1 else: self.logger.error("Error while indexing %s. Error: %s" % (each['id'], each['errors'])) - self.logger.info("Successfully indexed %s %s for %s to the workplace" % ( - total_documents_indexed, param_name, parent_object)) + self.logger.info("Successfully indexed %s %s to the workplace" % ( + total_documents_indexed, param_name)) + + def threaded_index_documents(self, document, param_name): + """ Applies multithreading on indexing functionality + :param document: documents to be indexed equally in each thread + :param param_name: parameter name whether it is SITES, LISTS LIST_ITEMS OR DRIVE_ITEMS + """ + chunk_documents = partition_equal_share(document, self.max_threads) + thread_pool = ThreadPool(self.max_threads) + for doc in chunk_documents: + thread_pool.apply_async(self.index_document, (doc, param_name)) + + thread_pool.close() + thread_pool.join() def get_schema_fields(self, document_name): """ returns the schema of all the include_fields or exclude_fields specified in the configuration file. @@ -112,7 +124,7 @@ def get_schema_fields(self, document_name): adapter_schema['id'] = field_id return adapter_schema - def index_sites(self, parent_site_url, sites, ids, index): + def fetch_sites(self, parent_site_url, sites, ids, index, start_time, end_time): """This method fetches sites from a collection and invokes the index permission method to get the document level permissions. If the fetching is not successful, it logs proper message. 
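For context on the batching and threading introduced above, here is a minimal standalone sketch of the fan-out used by threaded_index_documents, with a hypothetical index_batch callable standing in for workplace_search_client.index_documents: documents are split round-robin across the thread pool, and each worker then submits its share in batches of at most BATCH_SIZE (100, the Enterprise Search per-request limit noted in the patch).

from multiprocessing.pool import ThreadPool

BATCH_SIZE = 100   # Enterprise Search caps a single index/delete request at 100 documents
MAX_THREADS = 4

def split_list_in_chunks(input_list, chunk_size):
    # Same shape as the helper this patch adds to utils.py: fixed-size slices of at most chunk_size.
    return [input_list[i:i + chunk_size] for i in range(0, len(input_list), chunk_size)]

def index_batch(batch):
    # Hypothetical stand-in for workplace_search_client.index_documents(...).
    print("indexed %s documents" % len(batch))

def index_partition(docs):
    # Each worker thread sends its share of the documents in batches of BATCH_SIZE.
    for batch in split_list_in_chunks(docs, BATCH_SIZE):
        index_batch(batch)

def threaded_index(documents):
    # Round-robin split across threads, mirroring partition_equal_share(document, max_threads).
    partitions = [documents[i::MAX_THREADS] for i in range(MAX_THREADS)]
    pool = ThreadPool(MAX_THREADS)
    for partition in partitions:
        if partition:
            pool.apply_async(index_partition, (partition,))
    pool.close()
    pool.join()

if __name__ == "__main__":
    threaded_index([{"id": n} for n in range(250)])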
@@ -120,23 +132,24 @@ def index_sites(self, parent_site_url, sites, ids, index): :param sites: dictionary of site path and it's last updated time :param ids: structure containing id's of all objects :param index: index, boolean value + :param start_time: start time for fetching the data + :param end_time: end time for fetching the data Returns: document: response of sharepoint GET call, with fields specified in the schema """ rel_url = f"{parent_site_url}/_api/web/webs" self.logger.info("Fetching the sites detail from url: %s" % (rel_url)) query = self.sharepoint_client.get_query( - self.start_time, self.end_time, SITES) + start_time, end_time, SITES) response = self.sharepoint_client.get(rel_url, query, SITES) response_data = get_results(self.logger, response, SITES) if not response_data: - self.logger.info("No sites were created in %s for this interval: start time: %s and end time: %s" % (parent_site_url, self.start_time, self.end_time)) + self.logger.info("No sites were created in %s for this interval: start time: %s and end time: %s" % (parent_site_url, start_time, end_time)) return sites self.logger.info( "Successfully fetched and parsed %s sites response from SharePoint" % len(response_data) ) - self.logger.info("Indexing the sites to the Workplace") schema = self.get_schema_fields(SITES) document = [] @@ -153,18 +166,18 @@ def index_sites(self, parent_site_url, sites, ids, index): key=SITES, site=response_data[i]['ServerRelativeUrl']) document.append(doc) ids["sites"].update({doc["id"]: response_data[i]["ServerRelativeUrl"]}) - self.index_document(document, parent_site_url, SITES) for result in response_data: site_server_url = result.get("ServerRelativeUrl") sites.update({site_server_url: result.get("LastItemModifiedDate")}) - self.index_sites(site_server_url, sites, ids, index) - return sites + self.fetch_sites(site_server_url, sites, ids, index, start_time, end_time) + return sites, document - def index_lists(self, sites, ids, index): + def fetch_lists(self, sites, ids, index): """This method fetches lists from all sites in a collection and invokes the index permission method to get the document level permissions. If the fetching is not successful, it logs proper message. 
:param sites: dictionary of site path and it's last updated time + :param ids: structure containing id's of all objects :param index: index, boolean value Returns: document: response of sharepoint GET call, with fields specified in the schema @@ -176,70 +189,67 @@ def index_lists(self, sites, ids, index): self.logger.info("No list was created in this interval: start time: %s and end time: %s" % (self.start_time, self.end_time)) return [], [] schema_list = self.get_schema_fields(LISTS) - for site, time_modified in sites.items(): - if parse(self.start_time) > parse(time_modified): - continue - rel_url = f"{site}/_api/web/lists" - self.logger.info( - "Fetching the lists for site: %s from url: %s" - % (site, rel_url) - ) - - query = self.sharepoint_client.get_query( - self.start_time, self.end_time, LISTS) - response = self.sharepoint_client.get( - rel_url, query, LISTS) - - response_data = get_results(self.logger, response, LISTS) - if not response_data: - self.logger.info("No list was created for the site : %s in this interval: start time: %s and end time: %s" % (site, self.start_time, self.end_time)) - continue - self.logger.info( - "Successfully fetched and parsed %s list response for site: %s from SharePoint" - % (len(response_data), site) - ) + for site_details in sites: + for site, time_modified in site_details.items(): + if parse(self.start_time) > parse(time_modified): + continue + rel_url = f"{site}/_api/web/lists" + self.logger.info( + "Fetching the lists for site: %s from url: %s" + % (site, rel_url) + ) - base_list_url = f"{site}/Lists/" + query = self.sharepoint_client.get_query( + self.start_time, self.end_time, LISTS) + response = self.sharepoint_client.get( + rel_url, query, LISTS) - if index: - if not ids["lists"].get(site): - ids["lists"].update({site: {}}) - for i, _ in enumerate(response_data): - doc = {'type': LIST} - for field, response_field in schema_list.items(): - doc[field] = response_data[i].get( - response_field) - if self.enable_permission is True: - doc["_allow_permissions"] = self.index_permissions( - key=LISTS, site=site, list_id=doc["id"], list_url=response_data[i]['ParentWebUrl'], itemid=None) - doc["url"] = urljoin(base_list_url, re.sub( - r'[^ \w+]', '', response_data[i]["Title"])) - document.append(doc) - ids["lists"][site].update({doc["id"]: response_data[i]["Title"]}) + response_data = get_results(self.logger, response, LISTS) + if not response_data: + self.logger.info("No list was created for the site : %s in this interval: start time: %s and end time: %s" % (site, self.start_time, self.end_time)) + continue self.logger.info( - "Indexing the list for site: %s to the Workplace" % (site) + "Successfully fetched and parsed %s list response for site: %s from SharePoint" + % (len(response_data), site) ) - self.index_document(document, site, LISTS) - - responses.append(response_data) - lists = {} - libraries = {} - for response in responses: - for result in response: - if result.get('BaseType') == 1: - libraries[result.get("Id")] = [result.get( - "ParentWebUrl"), result.get("Title"), result.get("LastItemModifiedDate")] - else: - lists[result.get("Id")] = [result.get( - "ParentWebUrl"), result.get("Title"), result.get("LastItemModifiedDate")] - return lists, libraries - - def index_items(self, lists, ids): + base_list_url = f"{site}/Lists/" + + if index: + if not ids["lists"].get(site): + ids["lists"].update({site: {}}) + for i, _ in enumerate(response_data): + doc = {'type': LIST} + for field, response_field in schema_list.items(): + doc[field] = 
response_data[i].get( + response_field) + if self.enable_permission is True: + doc["_allow_permissions"] = self.index_permissions( + key=LISTS, site=site, list_id=doc["id"], list_url=response_data[i]['ParentWebUrl'], itemid=None) + doc["url"] = urljoin(base_list_url, re.sub( + r'[^ \w+]', '', response_data[i]["Title"])) + document.append(doc) + ids["lists"][site].update({doc["id"]: response_data[i]["Title"]}) + + responses.append(response_data) + lists = {} + libraries = {} + for response in responses: + for result in response: + if result.get('BaseType') == 1: + libraries[result.get("Id")] = [result.get( + "ParentWebUrl"), result.get("Title"), result.get("LastItemModifiedDate")] + else: + lists[result.get("Id")] = [result.get( + "ParentWebUrl"), result.get("Title"), result.get("LastItemModifiedDate")] + return lists, libraries, document + + def fetch_items(self, lists, ids): """This method fetches items from all the lists in a collection and invokes theindex permission method to get the document level permissions. If the fetching is not successful, it logs proper message. :param lists: document lists + :param ids: structure containing id's of all objects Returns: document: response of sharepoint GET call, with fields specified in the schema """ @@ -316,23 +326,17 @@ def index_items(self, lists, ids): if response_data[i].get("GUID") not in ids["list_items"][value[0]][list_content]: ids["list_items"][value[0]][list_content].append( response_data[i].get("GUID")) - self.logger.info( - "Indexing the listitem for list: %s to the Workplace" - % (value[1]) - ) - - self.index_document(document, value[1], LIST_ITEMS) - - responses.append(document) + responses.extend(document) return responses - def index_drive_items(self, libraries, ids): + def fetch_drive_items(self, libraries, ids): """This method fetches items from all the lists in a collection and invokes theindex permission method to get the document level permissions. If the fetching is not successful, it logs proper message. :param libraries: document lists :param ids: structure containing id's of all objects """ + responses = [] # here value is a list of url and title of the library self.logger.info("Fetching all the files for the library") if not libraries: @@ -390,12 +394,8 @@ def index_drive_items(self, libraries, ids): document.append(doc) if doc['id'] not in ids["drive_items"][value[0]][lib_content]: ids["drive_items"][value[0]][lib_content].append(doc['id']) - if document: - self.logger.info("Indexing the drive items for library: %s to the Workplace" % (value[1])) - self.index_document(document, value[1], DRIVE_ITEMS) - else: - self.logger.info("No item was present in the library %s for the interval: start time: %s and end time: %s" % ( - value[1], self.start_time, self.end_time)) + responses.extend(document) + return responses def get_roles(self, key, site, list_url, list_id, itemid): """ Checks the permissions and returns the user roles. 
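The index_* helpers added in the next hunk submit each partition to a ThreadPool with apply_async; a later commit in this series changes them to collect the AsyncResult handles first and resolve them afterwards so the workers actually run in parallel. A minimal sketch of that submit-then-collect pattern, assuming a hypothetical fetch_partition function in place of fetch_items / fetch_drive_items:

from multiprocessing.pool import ThreadPool

MAX_THREADS = 4

def fetch_partition(partition):
    # Hypothetical stand-in for fetch_items / fetch_drive_items: returns a list of documents.
    return [{"id": guid} for guid in partition]

def fetch_all(lists_details):
    # Split the dict of lists into chunks and submit every chunk before resolving any
    # result; calling get() inside the submit loop would serialize the work.
    items = list(lists_details.items())
    chunk_size = max(1, len(items) // MAX_THREADS)
    partitions = [dict(items[i:i + chunk_size]) for i in range(0, len(items), chunk_size)]

    pool = ThreadPool(MAX_THREADS)
    async_results = [pool.apply_async(fetch_partition, (partition,)) for partition in partitions]
    documents = []
    for result in async_results:
        fetched = result.get()  # re-raises any exception from the worker instead of losing it
        if fetched:
            documents.extend(fetched)
    pool.close()
    pool.join()
    return documents

if __name__ == "__main__":
    details = {"guid-%s" % n: ["/sites/demo", "List %s" % n] for n in range(10)}
    print(len(fetch_all(details)))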
@@ -458,40 +458,113 @@ def index_permissions( groups.append(title) return groups + def index_sites(self, parent_site_url, ids, sites_path): + """ Indexes the site details to the Workplace Search + :param parent_site_url: parent site relative path + :param ids: id collection of the all the objects + :param sites_path: dictionary of site path and it's last updated time + """ + _, datelist = get_partition_time(self.max_threads, self.start_time, self.end_time) + results = [] + thread_pool = ThreadPool(self.max_threads) + for num in range(0, self.max_threads): + start_time_partition = datelist[num] + end_time_partition = datelist[num + 1] + thread = thread_pool.apply_async( + self.fetch_sites, (parent_site_url, {}, ids, (SITES in self.objects), + start_time_partition, end_time_partition)) + results.append(thread.get()) + + sites, documents = [], [] + for result in results: + if result: + sites.append(result[0]) + documents.extend(result[1]) + thread_pool.close() + thread_pool.join() + self.threaded_index_documents(documents, SITES) + sites_path.extend(sites) + + def index_lists(self, sites_path, ids, lists_details, libraries_details): + """ Indexes the list details to the Workplace Search + :param sites_path: dictionary of site path and it's last updated time + :param ids: id collection of the all the objects + :param lists_details: dictionary containing list name, list path and id + :param libraries_details: dictionary containing library name, library path and id + """ + results = [] + thread_pool = ThreadPool(self.max_threads) + partitioned_sites = partition_equal_share(sites_path, self.max_threads) + for site in partitioned_sites: + thread = thread_pool.apply_async(self.fetch_lists, (site, ids, (LISTS in self.objects))) + results.append(thread.get()) + documents = [] + for result in results: + if result: + lists_details.update(result[0]) + libraries_details.update(result[1]) + documents.extend(result[2]) + thread_pool.close() + thread_pool.join() + self.threaded_index_documents(documents, LISTS) + + def index_items(self, job_type, lists_details, libraries_details, ids): + """ Indexes the list_items and drive_items to the Workplace Search + :param job_type: denotes the type of sharepoint object being fetched in a particular process + :param lists_details: dictionary containing list name, list path and id + :param libraries_details: dictionary containing library name, library path and id + :param ids: id collection of the all the objects + """ + results = [] + partition = [] + if job_type == "list_items" and LIST_ITEMS in self.objects: + thread_pool = ThreadPool(self.max_threads) + func = self.fetch_items + partition = split_dict_in_chunks(lists_details, self.max_threads) + elif job_type == "drive_items" and DRIVE_ITEMS in self.objects: + thread_pool = ThreadPool(self.max_threads) + func = self.fetch_drive_items + partition = split_dict_in_chunks(libraries_details, self.max_threads) + for list_data in partition: + thread = thread_pool.apply_async(func, (list_data, ids)) + results.append(thread.get()) + documents = [] + for result in results: + if result: + documents.extend(result) + thread_pool.close() + thread_pool.join() + self.threaded_index_documents(documents, job_type) + def indexing(self, collection, ids, storage, job_type, parent_site_url, sites_path, lists_details, libraries_details): """This method fetches all the objects from sharepoint server and ingests them into the workplace search :param collection: collection name :param ids: id collection of the all the objects :param 
storage: temporary storage for storing all the documents - :job_type: denotes the type of sharepoint object being fetched in a particular process - :parent_site_url: parent site relative path - :sites_path: dictionary of site path and it's last updated time - :lists_details: dictionary containing list name, list path and id - :library_details: dictionary containing library name, library path and id + :param job_type: denotes the type of sharepoint object being fetched in a particular process + :param parent_site_url: parent site relative path + :param sites_path: dictionary of site path and it's last updated time + :param lists_details: dictionary containing list name, list path and id + :param libraries_details: dictionary containing library name, library path and id """ if job_type == "sites": - sites = self.index_sites(parent_site_url, {}, ids, index=(SITES in self.objects)) - sites_path.update(sites) + self.index_sites(parent_site_url, ids, sites_path) + elif job_type == "lists": - lists, libraries = self.index_lists(sites_path, ids, index=(LISTS in self.objects)) - lists_details.update(lists) - libraries_details.update(libraries) - elif job_type == "list_items": - if LIST_ITEMS in self.objects: - self.index_items(lists_details, ids) - else: - if DRIVE_ITEMS in self.objects: - self.index_drive_items(libraries_details, ids) + self.index_lists(sites_path, ids, lists_details, libraries_details) - self.logger.info( - "Completed fetching all the objects for site collection: %s" - % (collection) - ) + elif job_type in ["list_items", "drive_items"]: + self.index_items(job_type, lists_details, libraries_details, ids) - self.logger.info( - "Saving the checkpoint for the site collection: %s" % (collection) - ) + self.logger.info( + "Completed fetching all the objects for site collection: %s" + % (collection) + ) + + self.logger.info( + "Saving the checkpoint for the site collection: %s" % (collection) + ) if ids.get(job_type): prev_ids = storage[job_type] if job_type == 'sites': @@ -507,21 +580,6 @@ def indexing(self, collection, ids, storage, job_type, parent_site_url, sites_pa storage[job_type] = prev_ids -def datetime_partitioning(start_time, end_time, processes): - """ Divides the timerange in equal partitions by number of processors - :param start_time: start time of the interval - :param end_time: end time of the interval - :param processes: number of processors the device have - """ - start_time = datetime.strptime(start_time, DATETIME_FORMAT) - end_time = datetime.strptime(end_time, DATETIME_FORMAT) - - diff = (end_time - start_time) / processes - for idx in range(processes): - yield start_time + diff * idx - yield end_time - - def start(indexing_type, config, logger, workplace_search_client, sharepoint_client): """Runs the indexing logic :param indexing_type: The type of the indexing i.e. 
incremental or full @@ -568,7 +626,7 @@ def start(indexing_type, config, logger, workplace_search_client, sharepoint_cli "sites": {}, "lists": {}, "list_items": {}, "drive_items": {}} parent_site_url = f"/sites/{collection}" - sites_path = {parent_site_url: end_time} + sites_path = [{parent_site_url: end_time}] lists_details = {} libraries_details = {} logger.info( @@ -592,9 +650,8 @@ def start(indexing_type, config, logger, workplace_search_client, sharepoint_cli storage_with_collection["global_keys"][collection] = storage.copy() - check.set_checkpoint(collection, start_time, indexing_type) + check.set_checkpoint(collection, end_time, indexing_type) except Exception as exception: - check.set_checkpoint(collection, end_time, indexing_type) raise exception with open(IDS_PATH, "w") as file: diff --git a/ees_sharepoint/schema.py b/ees_sharepoint/schema.py index 5505da1..bf63ad0 100644 --- a/ees_sharepoint/schema.py +++ b/ees_sharepoint/schema.py @@ -156,7 +156,7 @@ def coerce_rfc_3339_date(input_date): 'log_level': { 'required': False, 'type': 'string', - 'default': 'info', + 'default': 'INFO', 'allowed': ['DEBUG', 'INFO', 'WARN', 'ERROR'] }, 'retry_count': { @@ -165,6 +165,12 @@ def coerce_rfc_3339_date(input_date): 'default': 3, 'min': 1 }, + 'max_threads': { + 'required': False, + 'type': 'integer', + 'default': 40, + 'min': 1 + }, 'sharepoint_workplace_user_mapping': { 'required': False, 'type': 'string' diff --git a/ees_sharepoint/utils.py b/ees_sharepoint/utils.py index 89e41a0..ab3a932 100644 --- a/ees_sharepoint/utils.py +++ b/ees_sharepoint/utils.py @@ -3,11 +3,13 @@ # or more contributor license agreements. Licensed under the Elastic License 2.0; # you may not use this file except in compliance with the Elastic License 2.0. # -"""This module contains uncategorisied utility methods.""" +"""This module contains uncategorized utility methods.""" import urllib.parse from tika import parser +from datetime import datetime +DATETIME_FORMAT = "%Y-%m-%dT%H:%M:%SZ" def extract(content): @@ -28,3 +30,75 @@ def encode(object_name): :param object_name: name that contains special characters""" name = urllib.parse.quote(object_name, safe="'") return name.replace("'", "''") + + +def partition_equal_share(object_list, total_groups): + """ Divides the list in groups of approximately equal sizes + :param object_list: list to be partitioned + :param total_groups: number of groups to be formed + """ + if object_list: + groups = min(total_groups, len(object_list)) + group_list = [] + for i in range(groups): + group_list.append(object_list[i::groups]) + return group_list + else: + return [] + + +def split_list_in_chunks(input_list, chunk_size): + """ This method splits a list into separate chunks with maximum size + as chunk_size + :param input_list: List to be partitioned into chunks + :param chunk_size: Maximum size of a chunk + Returns: + list_of_chunks: List containing the chunks + """ + list_of_chunks = [] + for i in range(0, len(input_list), chunk_size): + list_of_chunks.append(input_list[i:i + chunk_size]) + return list_of_chunks + + +def split_dict_in_chunks(input_dict, chunk_size): + """ This method splits a dictionary into separate chunks with maximum size + as chunk_size + :param input_dict: Dictionary to be partitioned into chunks + :param chunk_size: Maximum size of a chunk + Returns: + list_of_chunks: List containing the chunks + """ + list_of_chunks = [] + for i in range(0, len(input_dict), chunk_size): + partitioned_chunk = list(input_dict.items())[i:i + chunk_size] + 
list_of_chunks.append(dict(partitioned_chunk)) + return list_of_chunks + + +def datetime_partitioning(start_time, end_time, processes): + """ Divides the timerange in equal partitions by number of processors + :param start_time: start time of the interval + :param end_time: end time of the interval + :param processes: number of processors the device have + """ + start_time = datetime.strptime(start_time, DATETIME_FORMAT) + end_time = datetime.strptime(end_time, DATETIME_FORMAT) + + diff = (end_time - start_time) / processes + for idx in range(processes): + yield start_time + diff * idx + yield end_time + + +def get_partition_time(max_threads, start_time, end_time): + """ Divides the time range of indexing into partitions based on number of processes. + :param max_threads: Number of threads in multithreading + :param start_time: Start time of a time range + :param end_time: End time of a time range + """ + partitions = list(datetime_partitioning(start_time, end_time, max_threads)) + datelist = [] + for sub in partitions: + datelist.append(sub.strftime(DATETIME_FORMAT)) + return end_time, datelist diff --git a/sharepoint_server_2016_connector.yml b/sharepoint_server_2016_connector.yml index 5899064..948b328 100644 --- a/sharepoint_server_2016_connector.yml +++ b/sharepoint_server_2016_connector.yml @@ -44,5 +44,7 @@ end_time : log_level: INFO #The number of retries to perform in case of server error. The connector will use exponential backoff for retry mechanism retry_count: 3 +#Number of threads to be used in multithreading for the connector. +max_threads: 40 #the path of csv file containing mapping of sharepoint user ID to Workplace user ID sharepoint_workplace_user_mapping: "C:/Users/abc/folder_name/file_name.csv" From 40d94b7a57bfc493f8d73e5ab745014d4d1dbd08 Mon Sep 17 00:00:00 2001 From: praveen-elastic Date: Fri, 4 Mar 2022 16:07:09 +0530 Subject: [PATCH 2/9] updated multithreading --- ees_sharepoint/fetch_index.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ees_sharepoint/fetch_index.py b/ees_sharepoint/fetch_index.py index 36b32bb..9816e27 100644 --- a/ees_sharepoint/fetch_index.py +++ b/ees_sharepoint/fetch_index.py @@ -473,10 +473,10 @@ def index_sites(self, parent_site_url, ids, sites_path): thread = thread_pool.apply_async( self.fetch_sites, (parent_site_url, {}, ids, (SITES in self.objects), start_time_partition, end_time_partition)) - results.append(thread.get()) + results.append(thread) sites, documents = [], [] - for result in results: + for result in [r.get() for r in results]: if result: sites.append(result[0]) documents.extend(result[1]) @@ -497,9 +497,9 @@ def index_lists(self, sites_path, ids, lists_details, libraries_details): partitioned_sites = partition_equal_share(sites_path, self.max_threads) for site in partitioned_sites: thread = thread_pool.apply_async(self.fetch_lists, (site, ids, (LISTS in self.objects))) - results.append(thread.get()) + results.append(thread) documents = [] - for result in results: + for result in [r.get() for r in results]: if result: lists_details.update(result[0]) libraries_details.update(result[1]) @@ -527,9 +527,9 @@ def index_items(self, job_type, lists_details, libraries_details, ids): partition = split_dict_in_chunks(libraries_details, self.max_threads) for list_data in partition: thread = thread_pool.apply_async(func, (list_data, ids)) - results.append(thread.get()) + results.append(thread) documents = [] - for result in results: + for result in [r.get() for r in results]: if result: 
documents.extend(result) thread_pool.close() From b472d3773cf6ef75636d84c36e5394bfc103ae32 Mon Sep 17 00:00:00 2001 From: praveen-elastic Date: Fri, 11 Mar 2022 15:50:28 +0530 Subject: [PATCH 3/9] resolve PR comments --- ees_sharepoint/deletion_sync_command.py | 8 +- ees_sharepoint/fetch_index.py | 104 +++++++++++++----------- ees_sharepoint/utils.py | 49 ++++------- 3 files changed, 78 insertions(+), 83 deletions(-) diff --git a/ees_sharepoint/deletion_sync_command.py b/ees_sharepoint/deletion_sync_command.py index 1b2abb6..800a9be 100644 --- a/ees_sharepoint/deletion_sync_command.py +++ b/ees_sharepoint/deletion_sync_command.py @@ -13,7 +13,7 @@ import requests from .base_command import BaseCommand -from .utils import split_list_in_chunks +from .utils import split_list_into_buckets IDS_PATH = os.path.join(os.path.dirname(__file__), 'doc_id.json') # By default, Enterprise Search configuration has a maximum allowed limit set to 100 documents for an api request @@ -58,7 +58,7 @@ def deindexing_items(self, collection, ids, key): if resp.status_code == requests.codes['not_found'] or result == []: doc.append(item_id) if doc: - for chunk in split_list_in_chunks(doc, BATCH_SIZE): + for chunk in split_list_into_buckets(doc, BATCH_SIZE): self.workplace_search_client.delete_documents( content_source_id=self.ws_source, document_ids=chunk) @@ -101,7 +101,7 @@ def deindexing_lists(self, collection, ids): resp = self.sharepoint_client.get(url, '', "deindex") if resp is not None and resp.status_code == requests.codes['not_found']: doc.append(list_id) - for chunk in split_list_in_chunks(doc, BATCH_SIZE): + for chunk in split_list_into_buckets(doc, BATCH_SIZE): self.workplace_search_client.delete_documents( content_source_id=self.ws_source, document_ids=chunk) @@ -132,7 +132,7 @@ def deindexing_sites(self, collection, ids): resp = self.sharepoint_client.get(url, '', "deindex") if resp is not None and resp.status_code == requests.codes['not_found']: doc.append(site_id) - for chunk in split_list_in_chunks(doc, BATCH_SIZE): + for chunk in split_list_into_buckets(doc, BATCH_SIZE): self.workplace_search_client.delete_documents( content_source_id=self.ws_source, document_ids=chunk) diff --git a/ees_sharepoint/fetch_index.py b/ees_sharepoint/fetch_index.py index 9816e27..fc31c38 100644 --- a/ees_sharepoint/fetch_index.py +++ b/ees_sharepoint/fetch_index.py @@ -16,11 +16,10 @@ from dateutil.parser import parse from tika.tika import TikaException -from multiprocessing.pool import ThreadPool from .checkpointing import Checkpoint from .usergroup_permissions import Permissions -from .utils import encode, extract, partition_equal_share, split_list_in_chunks, get_partition_time, split_dict_in_chunks +from .utils import encode, extract, split_list_into_buckets, split_date_range_into_chunks, split_dict_in_chunks, spawn_threads from . 
import adapter IDS_PATH = os.path.join(os.path.dirname(__file__), 'doc_id.json') @@ -79,7 +78,7 @@ def index_document(self, document, param_name): """ if document: total_documents_indexed = 0 - for chunk in split_list_in_chunks(document, BATCH_SIZE): + for chunk in split_list_into_buckets(document, BATCH_SIZE): response = self.workplace_search_client.index_documents( content_source_id=self.ws_source, documents=chunk @@ -97,8 +96,8 @@ def threaded_index_documents(self, document, param_name): :param document: documents to be indexed equally in each thread :param param_name: parameter name whether it is SITES, LISTS LIST_ITEMS OR DRIVE_ITEMS """ - chunk_documents = partition_equal_share(document, self.max_threads) - thread_pool = ThreadPool(self.max_threads) + chunk_documents = split_list_into_buckets(document, self.max_threads) + thread_pool = spawn_threads(self.max_threads) for doc in chunk_documents: thread_pool.apply_async(self.index_document, (doc, param_name)) @@ -458,15 +457,17 @@ def index_permissions( groups.append(title) return groups - def index_sites(self, parent_site_url, ids, sites_path): + def index_sites(self, ids, end_time, collection): """ Indexes the site details to the Workplace Search - :param parent_site_url: parent site relative path :param ids: id collection of the all the objects - :param sites_path: dictionary of site path and it's last updated time + :param end_time: end time for fetching the data + :param collection: collection name """ - _, datelist = get_partition_time(self.max_threads, self.start_time, self.end_time) + _, datelist = split_date_range_into_chunks(self.start_time, self.end_time, self.max_threads) results = [] - thread_pool = ThreadPool(self.max_threads) + parent_site_url = f"/sites/{collection}" + sites_path = [{parent_site_url: end_time}] + thread_pool = spawn_threads(self.max_threads) for num in range(0, self.max_threads): start_time_partition = datelist[num] end_time_partition = datelist[num + 1] @@ -484,17 +485,16 @@ def index_sites(self, parent_site_url, ids, sites_path): thread_pool.join() self.threaded_index_documents(documents, SITES) sites_path.extend(sites) + return sites_path - def index_lists(self, sites_path, ids, lists_details, libraries_details): + def index_lists(self, sites_path, ids): """ Indexes the list details to the Workplace Search :param sites_path: dictionary of site path and it's last updated time :param ids: id collection of the all the objects - :param lists_details: dictionary containing list name, list path and id - :param libraries_details: dictionary containing library name, library path and id """ - results = [] - thread_pool = ThreadPool(self.max_threads) - partitioned_sites = partition_equal_share(sites_path, self.max_threads) + results, lists_details, libraries_details = [], {}, {} + thread_pool = spawn_threads(self.max_threads) + partitioned_sites = split_list_into_buckets(sites_path, self.max_threads) for site in partitioned_sites: thread = thread_pool.apply_async(self.fetch_lists, (site, ids, (LISTS in self.objects))) results.append(thread) @@ -507,26 +507,39 @@ def index_lists(self, sites_path, ids, lists_details, libraries_details): thread_pool.close() thread_pool.join() self.threaded_index_documents(documents, LISTS) + return [lists_details, libraries_details] - def index_items(self, job_type, lists_details, libraries_details, ids): - """ Indexes the list_items and drive_items to the Workplace Search - :param job_type: denotes the type of sharepoint object being fetched in a particular process + def 
index_list_items(self, lists_details, ids): + """ Indexes the list_items to the Workplace Search :param lists_details: dictionary containing list name, list path and id + :param ids: id collection of the all the objects + """ + results = [] + partition = [] + thread_pool = spawn_threads(self.max_threads) + partition = split_dict_in_chunks(lists_details, self.max_threads) + for list_data in partition: + thread = thread_pool.apply_async(self.fetch_items, (list_data, ids)) + results.append(thread) + documents = [] + for result in [r.get() for r in results]: + if result: + documents.extend(result) + thread_pool.close() + thread_pool.join() + self.threaded_index_documents(documents, LIST_ITEMS) + + def index_drive_items(self, libraries_details, ids): + """ Indexes the drive_items to the Workplace Search :param libraries_details: dictionary containing library name, library path and id :param ids: id collection of the all the objects """ results = [] partition = [] - if job_type == "list_items" and LIST_ITEMS in self.objects: - thread_pool = ThreadPool(self.max_threads) - func = self.fetch_items - partition = split_dict_in_chunks(lists_details, self.max_threads) - elif job_type == "drive_items" and DRIVE_ITEMS in self.objects: - thread_pool = ThreadPool(self.max_threads) - func = self.fetch_drive_items - partition = split_dict_in_chunks(libraries_details, self.max_threads) + thread_pool = spawn_threads(self.max_threads) + partition = split_dict_in_chunks(libraries_details, self.max_threads) for list_data in partition: - thread = thread_pool.apply_async(func, (list_data, ids)) + thread = thread_pool.apply_async(self.fetch_drive_items, (list_data, ids)) results.append(thread) documents = [] for result in [r.get() for r in results]: @@ -534,28 +547,29 @@ def index_items(self, job_type, lists_details, libraries_details, ids): documents.extend(result) thread_pool.close() thread_pool.join() - self.threaded_index_documents(documents, job_type) + self.threaded_index_documents(documents, DRIVE_ITEMS) - def indexing(self, collection, ids, storage, job_type, parent_site_url, sites_path, lists_details, libraries_details): + def indexing(self, collection, ids, storage, job_type, collected_objects, end_time): """This method fetches all the objects from sharepoint server and ingests them into the workplace search :param collection: collection name :param ids: id collection of the all the objects :param storage: temporary storage for storing all the documents :param job_type: denotes the type of sharepoint object being fetched in a particular process - :param parent_site_url: parent site relative path - :param sites_path: dictionary of site path and it's last updated time - :param lists_details: dictionary containing list name, list path and id - :param libraries_details: dictionary containing library name, library path and id + :param collected_objects: helper variable to provide the data to children object + :param end_time: end time for fetching the data """ if job_type == "sites": - self.index_sites(parent_site_url, ids, sites_path) + collected_objects = self.index_sites(ids, end_time, collection) elif job_type == "lists": - self.index_lists(sites_path, ids, lists_details, libraries_details) + collected_objects = self.index_lists(collected_objects, ids) + + elif job_type == LIST_ITEMS and LIST_ITEMS in self.objects: + self.index_list_items(collected_objects[0], ids) - elif job_type in ["list_items", "drive_items"]: - self.index_items(job_type, lists_details, libraries_details, ids) + elif job_type == 
DRIVE_ITEMS and DRIVE_ITEMS in self.objects: + self.index_drive_items(collected_objects[1], ids) self.logger.info( "Completed fetching all the objects for site collection: %s" @@ -578,6 +592,7 @@ def indexing(self, collection, ids, storage, job_type, parent_site_url, sites_pa for list_name in list_content.keys(): prev_ids[site][list_name] = list(set([*prev_ids[site].get(list_name, []), *ids[job_type][site][list_name]])) storage[job_type] = prev_ids + return collected_objects def start(indexing_type, config, logger, workplace_search_client, sharepoint_client): @@ -625,27 +640,22 @@ def start(indexing_type, config, logger, workplace_search_client, sharepoint_cli ids_collection["global_keys"][collection] = { "sites": {}, "lists": {}, "list_items": {}, "drive_items": {}} - parent_site_url = f"/sites/{collection}" - sites_path = [{parent_site_url: end_time}] - lists_details = {} - libraries_details = {} logger.info( "Starting to index all the objects configured in the object field: %s" % (str(config.get_value("objects"))) ) indexer = FetchIndex(config, logger, workplace_search_client, sharepoint_client, start_time, end_time) + returned_documents = None for job_type in ["sites", "lists", "list_items", "drive_items"]: logger.info(f"Indexing {job_type}") - indexer.indexing( + returned_documents = indexer.indexing( collection, ids_collection["global_keys"][collection], storage, job_type, - parent_site_url, - sites_path, - lists_details, - libraries_details + returned_documents, + end_time ) storage_with_collection["global_keys"][collection] = storage.copy() diff --git a/ees_sharepoint/utils.py b/ees_sharepoint/utils.py index ab3a932..ceba89f 100644 --- a/ees_sharepoint/utils.py +++ b/ees_sharepoint/utils.py @@ -9,6 +9,7 @@ from tika import parser from datetime import datetime +from multiprocessing.pool import ThreadPool DATETIME_FORMAT = "%Y-%m-%dT%H:%M:%SZ" @@ -32,7 +33,7 @@ def encode(object_name): return name.replace("'", "''") -def partition_equal_share(object_list, total_groups): +def split_list_into_buckets(object_list, total_groups): """ Divides the list in groups of approximately equal sizes :param object_list: list to be partitioned :param total_groups: number of groups to be formed @@ -47,20 +48,6 @@ def partition_equal_share(object_list, total_groups): return [] -def split_list_in_chunks(input_list, chunk_size): - """ This method splits a list into separate chunks with maximum size - as chunk_size - :param input_list: List to be partitioned into chunks - :param chunk_size: Maximum size of a chunk - Returns: - list_of_chunks: List containing the chunks - """ - list_of_chunks = [] - for i in range(0, len(input_list), chunk_size): - list_of_chunks.append(input_list[i:i + chunk_size]) - return list_of_chunks - - def split_dict_in_chunks(input_dict, chunk_size): """ This method splits a dictionary into separate chunks with maximum size as chunk_size @@ -76,29 +63,27 @@ def split_dict_in_chunks(input_dict, chunk_size): return list_of_chunks -def datetime_partitioning(start_time, end_time, processes): - """ Divides the timerange in equal partitions by number of processors +def split_date_range_into_chunks(start_time, end_time, number_of_threads): + """ Divides the timerange in equal partitions by number of threads :param start_time: start time of the interval :param end_time: end time of the interval - :param processes: number of processors the device have + :param number_of_threads: number of threads defined by user in config file """ start_time = datetime.strptime(start_time, DATETIME_FORMAT) 
end_time = datetime.strptime(end_time, DATETIME_FORMAT) - diff = (end_time - start_time) / processes - for idx in range(processes): - yield start_time + diff * idx - yield end_time + diff = (end_time - start_time) / number_of_threads + datelist = [] + for idx in range(number_of_threads): + date_time = start_time + diff * idx + datelist.append(date_time.strftime(DATETIME_FORMAT)) + formatted_end_time = end_time.strftime(DATETIME_FORMAT) + datelist.append(formatted_end_time) + return formatted_end_time, datelist -def get_partition_time(max_threads, start_time, end_time): - """ Divides the time range of indexing into partitions based on number of processes. - :param max_threads: Number of threads in multithreading - :param start_time: Start time of a time range - :param end_time: End time of a time range +def spawn_threads(max_threads): + """ Spawns number of threads provided by user in the config file + :param max_threads: maximum number of threads defined by user """ - partitions = list(datetime_partitioning(start_time, end_time, max_threads)) - datelist = [] - for sub in partitions: - datelist.append(sub.strftime(DATETIME_FORMAT)) - return end_time, datelist + return ThreadPool(max_threads) From 08e79e504db797f70d8283d1a73fdf9e4a102554 Mon Sep 17 00:00:00 2001 From: praveen-elastic Date: Thu, 24 Mar 2022 20:03:15 +0530 Subject: [PATCH 4/9] modify multiprocessing approach to introduce queue --- ees_sharepoint/connector_queue.py | 49 ++ ees_sharepoint/fetch_index.py | 672 --------------------- ees_sharepoint/full_sync_command.py | 26 +- ees_sharepoint/incremental_sync_command.py | 26 +- ees_sharepoint/permission_sync_command.py | 49 +- ees_sharepoint/schema.py | 10 +- ees_sharepoint/sync_enterprise_search.py | 76 +++ ees_sharepoint/sync_sharepoint.py | 644 ++++++++++++++++++++ ees_sharepoint/utils.py | 74 ++- sharepoint_server_connector.yml | 6 +- 10 files changed, 885 insertions(+), 747 deletions(-) create mode 100644 ees_sharepoint/connector_queue.py delete mode 100644 ees_sharepoint/fetch_index.py create mode 100644 ees_sharepoint/sync_enterprise_search.py create mode 100644 ees_sharepoint/sync_sharepoint.py diff --git a/ees_sharepoint/connector_queue.py b/ees_sharepoint/connector_queue.py new file mode 100644 index 0000000..71792a7 --- /dev/null +++ b/ees_sharepoint/connector_queue.py @@ -0,0 +1,49 @@ +# +# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +# or more contributor license agreements. Licensed under the Elastic License 2.0; +# you may not use this file except in compliance with the Elastic License 2.0. +# +import multiprocessing +from multiprocessing.queues import Queue +from .utils import split_documents_into_equal_chunks + + +BATCH_SIZE = 100 + + +class ConnectorQueue(Queue): + """Class to support additional queue operations specific to the connector""" + + def __init__(self): + ctx = multiprocessing.get_context() + super(ConnectorQueue, self).__init__(ctx=ctx) + + def end_signal(self): + """Send an terminate signal to indicate the queue can be closed""" + + signal_close = {"type": "signal_close"} + self.put(signal_close) + + def put_checkpoint(self, key, checkpoint_time, indexing_type): + """Put the checkpoint object in the queue which will be used by the consumer to update the checkpoint file + + :param key: The key of the checkpoint dictionary + :param checkpoint_time: The end time that will be stored in the checkpoint as {'key': 'checkpoint_time'} + :param indexing_type: The type of the indexing i.e. 
Full or Incremental + """ + + checkpoint = {"type": "checkpoint", "data": (key, checkpoint_time, indexing_type)} + self.put(checkpoint) + + def append_to_queue(self, documents): + """Append documents to the shared queue + :param documents: documents fetched from sharepoint + """ + if documents: + results = documents + # In case documents is object of tuple + if isinstance(documents, tuple): + results = documents[-1] + for chunk in split_documents_into_equal_chunks(results.get("data"), BATCH_SIZE): + document = {"type": results.get("type"), "data": chunk} + self.put(document) diff --git a/ees_sharepoint/fetch_index.py b/ees_sharepoint/fetch_index.py deleted file mode 100644 index fc31c38..0000000 --- a/ees_sharepoint/fetch_index.py +++ /dev/null @@ -1,672 +0,0 @@ -# -# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one -# or more contributor license agreements. Licensed under the Elastic License 2.0; -# you may not use this file except in compliance with the Elastic License 2.0. -# -"""fetch_index module allows to sync data to Elastic Enterprise Search. - -It's possible to run full syncs and incremental syncs with this module.""" - -import copy -import os -import re -import json -from datetime import datetime -from urllib.parse import urljoin -from dateutil.parser import parse - -from tika.tika import TikaException - -from .checkpointing import Checkpoint -from .usergroup_permissions import Permissions -from .utils import encode, extract, split_list_into_buckets, split_date_range_into_chunks, split_dict_in_chunks, spawn_threads -from . import adapter - -IDS_PATH = os.path.join(os.path.dirname(__file__), 'doc_id.json') - -SITE = "site" -LIST = "list" -ITEM = "item" -SITES = "sites" -LISTS = "lists" -LIST_ITEMS = "list_items" -DRIVE_ITEMS = "drive_items" -BATCH_SIZE = 100 - - -def get_results(logger, response, entity_name): - """ Attempts to fetch results from a Sharepoint Server response - :param response: response from the sharepoint client - :param entity_name: entity name whether it is SITES, LISTS, LIST_ITEMS OR DRIVE_ITEMS - Returns: - Parsed response - """ - if not response: - logger.error(f"Empty response when fetching {entity_name}") # TODO: should it be an error? - return None - - if entity_name == "attachment" and not response.get("d", {}).get("results"): - logger.info("Failed to fetch attachment") # TODO: not sure if it's the right message - return None - return response.get("d", {}).get("results") - - -class FetchIndex: - """This class allows ingesting data from Sharepoint Server to Elastic Enterprise Search.""" - def __init__(self, config, logger, workplace_search_client, sharepoint_client, start_time, end_time): - self.config = config - self.logger = logger - self.workplace_search_client = workplace_search_client - self.sharepoint_client = sharepoint_client - - self.ws_source = config.get_value("workplace_search.source_id") - self.objects = config.get_value("objects") - self.site_collections = config.get_value("sharepoint.site_collections") - self.enable_permission = config.get_value("enable_document_permission") - self.start_time = start_time - self.end_time = end_time - self.max_threads = config.get_value("max_threads") - self.mapping_sheet_path = config.get_value("sharepoint_workplace_user_mapping") - - self.checkpoint = Checkpoint(config, logger) - self.permissions = Permissions(self.sharepoint_client, self.workplace_search_client, logger) - - def index_document(self, document, param_name): - """ This method indexes the documents to the workplace. 
- :param document: document to be indexed - :param param_name: parameter name whether it is SITES, LISTS LIST_ITEMS OR DRIVE_ITEMS - """ - if document: - total_documents_indexed = 0 - for chunk in split_list_into_buckets(document, BATCH_SIZE): - response = self.workplace_search_client.index_documents( - content_source_id=self.ws_source, - documents=chunk - ) - for each in response['results']: - if not each['errors']: - total_documents_indexed += 1 - else: - self.logger.error("Error while indexing %s. Error: %s" % (each['id'], each['errors'])) - self.logger.info("Successfully indexed %s %s to the workplace" % ( - total_documents_indexed, param_name)) - - def threaded_index_documents(self, document, param_name): - """ Applies multithreading on indexing functionality - :param document: documents to be indexed equally in each thread - :param param_name: parameter name whether it is SITES, LISTS LIST_ITEMS OR DRIVE_ITEMS - """ - chunk_documents = split_list_into_buckets(document, self.max_threads) - thread_pool = spawn_threads(self.max_threads) - for doc in chunk_documents: - thread_pool.apply_async(self.index_document, (doc, param_name)) - - thread_pool.close() - thread_pool.join() - - def get_schema_fields(self, document_name): - """ returns the schema of all the include_fields or exclude_fields specified in the configuration file. - :param document_name: document name from SITES, LISTS, LIST_ITEMS OR DRIVE_ITEMS - Returns: - schema: included and excluded fields schema - """ - fields = self.objects.get(document_name) - adapter_schema = adapter.DEFAULT_SCHEMA[document_name] - field_id = adapter_schema['id'] - if fields: - include_fields = fields.get("include_fields") - exclude_fields = fields.get("exclude_fields") - if include_fields: - adapter_schema = {key: val for key, val in adapter_schema.items() if val in include_fields} - elif exclude_fields: - adapter_schema = {key: val for key, val in adapter_schema.items() if val not in exclude_fields} - adapter_schema['id'] = field_id - return adapter_schema - - def fetch_sites(self, parent_site_url, sites, ids, index, start_time, end_time): - """This method fetches sites from a collection and invokes the - index permission method to get the document level permissions. - If the fetching is not successful, it logs proper message. 
- :param parent_site_url: parent site relative path - :param sites: dictionary of site path and it's last updated time - :param ids: structure containing id's of all objects - :param index: index, boolean value - :param start_time: start time for fetching the data - :param end_time: end time for fetching the data - Returns: - document: response of sharepoint GET call, with fields specified in the schema - """ - rel_url = f"{parent_site_url}/_api/web/webs" - self.logger.info("Fetching the sites detail from url: %s" % (rel_url)) - query = self.sharepoint_client.get_query( - start_time, end_time, SITES) - response = self.sharepoint_client.get(rel_url, query, SITES) - - response_data = get_results(self.logger, response, SITES) - if not response_data: - self.logger.info("No sites were created in %s for this interval: start time: %s and end time: %s" % (parent_site_url, start_time, end_time)) - return sites - self.logger.info( - "Successfully fetched and parsed %s sites response from SharePoint" % len(response_data) - ) - - schema = self.get_schema_fields(SITES) - document = [] - - if index: - for i, _ in enumerate(response_data): - doc = {'type': SITE} - # need to convert date to iso else workplace search throws error on date format Invalid field value: Value '2021-09-29T08:13:00' cannot be parsed as a date (RFC 3339)"]} - response_data[i]['Created'] += 'Z' - for field, response_field in schema.items(): - doc[field] = response_data[i].get(response_field) - if self.enable_permission is True: - doc["_allow_permissions"] = self.index_permissions( - key=SITES, site=response_data[i]['ServerRelativeUrl']) - document.append(doc) - ids["sites"].update({doc["id"]: response_data[i]["ServerRelativeUrl"]}) - for result in response_data: - site_server_url = result.get("ServerRelativeUrl") - sites.update({site_server_url: result.get("LastItemModifiedDate")}) - self.fetch_sites(site_server_url, sites, ids, index, start_time, end_time) - return sites, document - - def fetch_lists(self, sites, ids, index): - """This method fetches lists from all sites in a collection and invokes the - index permission method to get the document level permissions. - If the fetching is not successful, it logs proper message. 
- :param sites: dictionary of site path and it's last updated time - :param ids: structure containing id's of all objects - :param index: index, boolean value - Returns: - document: response of sharepoint GET call, with fields specified in the schema - """ - self.logger.info("Fetching lists for all the sites") - responses = [] - document = [] - if not sites: - self.logger.info("No list was created in this interval: start time: %s and end time: %s" % (self.start_time, self.end_time)) - return [], [] - schema_list = self.get_schema_fields(LISTS) - for site_details in sites: - for site, time_modified in site_details.items(): - if parse(self.start_time) > parse(time_modified): - continue - rel_url = f"{site}/_api/web/lists" - self.logger.info( - "Fetching the lists for site: %s from url: %s" - % (site, rel_url) - ) - - query = self.sharepoint_client.get_query( - self.start_time, self.end_time, LISTS) - response = self.sharepoint_client.get( - rel_url, query, LISTS) - - response_data = get_results(self.logger, response, LISTS) - if not response_data: - self.logger.info("No list was created for the site : %s in this interval: start time: %s and end time: %s" % (site, self.start_time, self.end_time)) - continue - self.logger.info( - "Successfully fetched and parsed %s list response for site: %s from SharePoint" - % (len(response_data), site) - ) - - base_list_url = f"{site}/Lists/" - - if index: - if not ids["lists"].get(site): - ids["lists"].update({site: {}}) - for i, _ in enumerate(response_data): - doc = {'type': LIST} - for field, response_field in schema_list.items(): - doc[field] = response_data[i].get( - response_field) - if self.enable_permission is True: - doc["_allow_permissions"] = self.index_permissions( - key=LISTS, site=site, list_id=doc["id"], list_url=response_data[i]['ParentWebUrl'], itemid=None) - doc["url"] = urljoin(base_list_url, re.sub( - r'[^ \w+]', '', response_data[i]["Title"])) - document.append(doc) - ids["lists"][site].update({doc["id"]: response_data[i]["Title"]}) - - responses.append(response_data) - lists = {} - libraries = {} - for response in responses: - for result in response: - if result.get('BaseType') == 1: - libraries[result.get("Id")] = [result.get( - "ParentWebUrl"), result.get("Title"), result.get("LastItemModifiedDate")] - else: - lists[result.get("Id")] = [result.get( - "ParentWebUrl"), result.get("Title"), result.get("LastItemModifiedDate")] - return lists, libraries, document - - def fetch_items(self, lists, ids): - """This method fetches items from all the lists in a collection and - invokes theindex permission method to get the document level permissions. - If the fetching is not successful, it logs proper message. 
- :param lists: document lists - :param ids: structure containing id's of all objects - Returns: - document: response of sharepoint GET call, with fields specified in the schema - """ - responses = [] - # here value is a list of url and title - self.logger.info("Fetching all the items for the lists") - if not lists: - self.logger.info("No item was created in this interval: start time: %s and end time: %s" % (self.start_time, self.end_time)) - else: - for value in lists.values(): - if not ids["list_items"].get(value[0]): - ids["list_items"].update({value[0]: {}}) - schema_item = self.get_schema_fields(LIST_ITEMS) - for list_content, value in lists.items(): - if parse(self.start_time) > parse(value[2]): - continue - rel_url = f"{value[0]}/_api/web/lists(guid'{list_content}')/items" - self.logger.info( - "Fetching the items for list: %s from url: %s" - % (value[1], rel_url) - ) - - query = self.sharepoint_client.get_query( - self.start_time, self.end_time, LIST_ITEMS) - response = self.sharepoint_client.get(rel_url, query, LIST_ITEMS) - - response_data = get_results(self.logger, response, LIST_ITEMS) - if not response_data: - self.logger.info("No item was created for the list %s in this interval: start time: %s and end time: %s" % (value[1], self.start_time, self.end_time)) - continue - self.logger.info( - "Successfully fetched and parsed %s listitem response for list: %s from SharePoint" - % (len(response_data), value[1]) - ) - - list_name = re.sub(r'[^ \w+]', '', value[1]) - base_item_url = f"{value[0]}/Lists/{list_name}/DispForm.aspx?ID=" - document = [] - if not ids["list_items"][value[0]].get(list_content): - ids["list_items"][value[0]].update({list_content: []}) - rel_url = f'{value[0]}/_api/web/lists(guid\'{list_content}\')/items?$select=Attachments,AttachmentFiles,Title&$expand=AttachmentFiles' - - new_query = "&" + query.split("?")[1] - file_response_data = self.sharepoint_client.get(rel_url, query=new_query, param_name="attachment") - if file_response_data: - file_response_data = get_results(self.logger, file_response_data.json(), "attachment") - - for i, _ in enumerate(response_data): - doc = {'type': ITEM} - if response_data[i].get('Attachments') and file_response_data: - for data in file_response_data: - if response_data[i].get('Title') == data['Title']: - file_relative_url = data[ - 'AttachmentFiles']['results'][0]['ServerRelativeUrl'] - url_s = f"{value[0]}/_api/web/GetFileByServerRelativeUrl(\'{encode(file_relative_url)}\')/$value" - response = self.sharepoint_client.get( - url_s, query='', param_name="attachment") - doc['body'] = {} - if response and response.ok: - try: - doc['body'] = extract(response.content) - except TikaException as exception: - self.logger.error('Error while extracting the contents from the attachment, Error %s' % (exception)) - - break - for field, response_field in schema_item.items(): - doc[field] = response_data[i].get( - response_field) - if self.enable_permission is True: - doc["_allow_permissions"] = self.index_permissions( - key=LIST_ITEMS, list_id=list_content, list_url=value[0], itemid=str(response_data[i]["Id"])) - doc["url"] = base_item_url + str(response_data[i]["Id"]) - document.append(doc) - if response_data[i].get("GUID") not in ids["list_items"][value[0]][list_content]: - ids["list_items"][value[0]][list_content].append( - response_data[i].get("GUID")) - responses.extend(document) - return responses - - def fetch_drive_items(self, libraries, ids): - """This method fetches items from all the lists in a collection and - invokes theindex 
permission method to get the document level permissions. - If the fetching is not successful, it logs proper message. - :param libraries: document lists - :param ids: structure containing id's of all objects - """ - responses = [] - # here value is a list of url and title of the library - self.logger.info("Fetching all the files for the library") - if not libraries: - self.logger.info("No file was created in this interval: start time: %s and end time: %s" % (self.start_time, self.end_time)) - else: - schema_drive = self.get_schema_fields(DRIVE_ITEMS) - for lib_content, value in libraries.items(): - if parse(self.start_time) > parse(value[2]): - continue - if not ids["drive_items"].get(value[0]): - ids["drive_items"].update({value[0]: {}}) - rel_url = f"{value[0]}/_api/web/lists(guid'{lib_content}')/items?$select=Modified,Id,GUID,File,Folder&$expand=File,Folder" - self.logger.info( - "Fetching the items for libraries: %s from url: %s" - % (value[1], rel_url) - ) - query = self.sharepoint_client.get_query( - self.start_time, self.end_time, DRIVE_ITEMS) - response = self.sharepoint_client.get(rel_url, query, DRIVE_ITEMS) - response_data = get_results(self.logger, response, DRIVE_ITEMS) - if not response_data: - self.logger.info("No item was created for the library %s in this interval: start time: %s and end time: %s" % (value[1], self.start_time, self.end_time)) - continue - self.logger.info( - "Successfully fetched and parsed %s drive item response for library: %s from SharePoint" - % (len(response_data), value[1]) - ) - document = [] - if not ids["drive_items"][value[0]].get(lib_content): - ids["drive_items"][value[0]].update({lib_content: []}) - for i, _ in enumerate(response_data): - if response_data[i]['File'].get('TimeLastModified'): - obj_type = 'File' - doc = {'type': "file"} - file_relative_url = response_data[i]['File']['ServerRelativeUrl'] - url_s = f"{value[0]}/_api/web/GetFileByServerRelativeUrl(\'{encode(file_relative_url)}\')/$value" - response = self.sharepoint_client.get(url_s, query='', param_name="attachment") - doc['body'] = {} - if response and response.ok: - try: - doc['body'] = extract(response.content) - except TikaException as exception: - self.logger.error('Error while extracting the contents from the file at %s, Error %s' % (response_data[i].get('Url'), exception)) - else: - obj_type = 'Folder' - doc = {'type': "folder"} - for field, response_field in schema_drive.items(): - doc[field] = response_data[i][obj_type].get( - response_field) - doc['id'] = response_data[i].get("GUID") - if self.enable_permission is True: - doc["_allow_permissions"] = self.index_permissions( - key=DRIVE_ITEMS, list_id=lib_content, list_url=value[0], itemid=str(response_data[i].get("ID"))) - doc["url"] = response_data[i][obj_type]["ServerRelativeUrl"] - document.append(doc) - if doc['id'] not in ids["drive_items"][value[0]][lib_content]: - ids["drive_items"][value[0]][lib_content].append(doc['id']) - responses.extend(document) - return responses - - def get_roles(self, key, site, list_url, list_id, itemid): - """ Checks the permissions and returns the user roles. 
- :param key: key, a string value - :param site: site name to check the permission - :param list_url: list url to access the list - :param list_id: list id to check the permission - :param itemid: item id to check the permission - Returns: - roles: user roles - """ - if key == SITES: - rel_url = site - roles = self.permissions.fetch_users(key, rel_url) - - elif key == LISTS: - rel_url = list_url - roles = self.permissions.fetch_users( - key, rel_url, list_id=list_id - ) - - else: - rel_url = list_url - roles = self.permissions.fetch_users( - key, rel_url, list_id=list_id, item_id=itemid - ) - - return roles - - def index_permissions( - self, - key, - site=None, - list_id=None, - list_url=None, - itemid=None, - ): - """This method when invoked, checks the permission inheritance of each object. - If the object has unique permissions, the list of users having access to it - is fetched using sharepoint api else the permission levels of the that object - is taken same as the permission level of the site collection. - :param key: key, a string value - :param site: site name to index the permission for the site - :param list_id: list id to index the permission for the list - :param list_url: url of the list - :param itemid: item id to index the permission for the item - Returns: - groups: list of users having access to the given object - """ - roles = self.get_roles(key, site, list_url, list_id, itemid) - - groups = [] - - if not roles: - return [] - roles = get_results(self.logger, roles.json(), "roles") - - for role in roles: - title = role["Member"]["Title"] - groups.append(title) - return groups - - def index_sites(self, ids, end_time, collection): - """ Indexes the site details to the Workplace Search - :param ids: id collection of the all the objects - :param end_time: end time for fetching the data - :param collection: collection name - """ - _, datelist = split_date_range_into_chunks(self.start_time, self.end_time, self.max_threads) - results = [] - parent_site_url = f"/sites/{collection}" - sites_path = [{parent_site_url: end_time}] - thread_pool = spawn_threads(self.max_threads) - for num in range(0, self.max_threads): - start_time_partition = datelist[num] - end_time_partition = datelist[num + 1] - thread = thread_pool.apply_async( - self.fetch_sites, (parent_site_url, {}, ids, (SITES in self.objects), - start_time_partition, end_time_partition)) - results.append(thread) - - sites, documents = [], [] - for result in [r.get() for r in results]: - if result: - sites.append(result[0]) - documents.extend(result[1]) - thread_pool.close() - thread_pool.join() - self.threaded_index_documents(documents, SITES) - sites_path.extend(sites) - return sites_path - - def index_lists(self, sites_path, ids): - """ Indexes the list details to the Workplace Search - :param sites_path: dictionary of site path and it's last updated time - :param ids: id collection of the all the objects - """ - results, lists_details, libraries_details = [], {}, {} - thread_pool = spawn_threads(self.max_threads) - partitioned_sites = split_list_into_buckets(sites_path, self.max_threads) - for site in partitioned_sites: - thread = thread_pool.apply_async(self.fetch_lists, (site, ids, (LISTS in self.objects))) - results.append(thread) - documents = [] - for result in [r.get() for r in results]: - if result: - lists_details.update(result[0]) - libraries_details.update(result[1]) - documents.extend(result[2]) - thread_pool.close() - thread_pool.join() - self.threaded_index_documents(documents, LISTS) - return [lists_details, 
libraries_details] - - def index_list_items(self, lists_details, ids): - """ Indexes the list_items to the Workplace Search - :param lists_details: dictionary containing list name, list path and id - :param ids: id collection of the all the objects - """ - results = [] - partition = [] - thread_pool = spawn_threads(self.max_threads) - partition = split_dict_in_chunks(lists_details, self.max_threads) - for list_data in partition: - thread = thread_pool.apply_async(self.fetch_items, (list_data, ids)) - results.append(thread) - documents = [] - for result in [r.get() for r in results]: - if result: - documents.extend(result) - thread_pool.close() - thread_pool.join() - self.threaded_index_documents(documents, LIST_ITEMS) - - def index_drive_items(self, libraries_details, ids): - """ Indexes the drive_items to the Workplace Search - :param libraries_details: dictionary containing library name, library path and id - :param ids: id collection of the all the objects - """ - results = [] - partition = [] - thread_pool = spawn_threads(self.max_threads) - partition = split_dict_in_chunks(libraries_details, self.max_threads) - for list_data in partition: - thread = thread_pool.apply_async(self.fetch_drive_items, (list_data, ids)) - results.append(thread) - documents = [] - for result in [r.get() for r in results]: - if result: - documents.extend(result) - thread_pool.close() - thread_pool.join() - self.threaded_index_documents(documents, DRIVE_ITEMS) - - def indexing(self, collection, ids, storage, job_type, collected_objects, end_time): - """This method fetches all the objects from sharepoint server and - ingests them into the workplace search - :param collection: collection name - :param ids: id collection of the all the objects - :param storage: temporary storage for storing all the documents - :param job_type: denotes the type of sharepoint object being fetched in a particular process - :param collected_objects: helper variable to provide the data to children object - :param end_time: end time for fetching the data - """ - if job_type == "sites": - collected_objects = self.index_sites(ids, end_time, collection) - - elif job_type == "lists": - collected_objects = self.index_lists(collected_objects, ids) - - elif job_type == LIST_ITEMS and LIST_ITEMS in self.objects: - self.index_list_items(collected_objects[0], ids) - - elif job_type == DRIVE_ITEMS and DRIVE_ITEMS in self.objects: - self.index_drive_items(collected_objects[1], ids) - - self.logger.info( - "Completed fetching all the objects for site collection: %s" - % (collection) - ) - - self.logger.info( - "Saving the checkpoint for the site collection: %s" % (collection) - ) - if ids.get(job_type): - prev_ids = storage[job_type] - if job_type == 'sites': - prev_ids.update(ids[job_type]) - elif job_type == "lists": - for site, list_content in ids[job_type].items(): - prev_ids[site] = {**prev_ids.get(site, {}), **ids[job_type][site]} - else: - for site, list_content in ids[job_type].items(): - prev_ids[site] = ids[job_type][site] if not prev_ids.get(site) else prev_ids[site] - for list_name in list_content.keys(): - prev_ids[site][list_name] = list(set([*prev_ids[site].get(list_name, []), *ids[job_type][site][list_name]])) - storage[job_type] = prev_ids - return collected_objects - - -def start(indexing_type, config, logger, workplace_search_client, sharepoint_client): - """Runs the indexing logic - :param indexing_type: The type of the indexing i.e. 
incremental or full - :param config: instance of Configuration class - :param logger: instance of Logger class - :param workplace_search_client: instance of WorkplaceSearch - :param sharepoint_client: instance of SharePoint - """ - logger.info(f"Starting the {indexing_type} indexing..") - current_time = (datetime.utcnow()).strftime("%Y-%m-%dT%H:%M:%SZ") - ids_collection = {"global_keys": {}} - storage_with_collection = {"global_keys": {}, "delete_keys": {}} - - if (os.path.exists(IDS_PATH) and os.path.getsize(IDS_PATH) > 0): - with open(IDS_PATH) as ids_store: - try: - ids_collection = json.load(ids_store) - except ValueError as exception: - logger.exception( - "Error while parsing the json file of the ids store from path: %s. Error: %s" - % (IDS_PATH, exception) - ) - - storage_with_collection["delete_keys"] = copy.deepcopy(ids_collection.get("global_keys")) - check = Checkpoint(config, logger) - - try: - for collection in config.get_value("sharepoint.site_collections"): - storage = {"sites": {}, "lists": {}, "list_items": {}, "drive_items": {}} - logger.info( - "Starting the data fetching for site collection: %s" - % (collection) - ) - - if indexing_type == "incremental": - start_time, end_time = check.get_checkpoint( - collection, current_time) - else: - start_time = config.get_value("start_time") - end_time = current_time - - if not ids_collection["global_keys"].get(collection): - ids_collection["global_keys"][collection] = { - "sites": {}, "lists": {}, "list_items": {}, "drive_items": {}} - - logger.info( - "Starting to index all the objects configured in the object field: %s" - % (str(config.get_value("objects"))) - ) - - indexer = FetchIndex(config, logger, workplace_search_client, sharepoint_client, start_time, end_time) - returned_documents = None - for job_type in ["sites", "lists", "list_items", "drive_items"]: - logger.info(f"Indexing {job_type}") - returned_documents = indexer.indexing( - collection, - ids_collection["global_keys"][collection], - storage, - job_type, - returned_documents, - end_time - ) - - storage_with_collection["global_keys"][collection] = storage.copy() - - check.set_checkpoint(collection, end_time, indexing_type) - except Exception as exception: - raise exception - - with open(IDS_PATH, "w") as file: - try: - json.dump(storage_with_collection, file, indent=4) - except ValueError as exception: - logger.warning( - 'Error while adding ids to json file. 
Error: %s' % (exception)) diff --git a/ees_sharepoint/full_sync_command.py b/ees_sharepoint/full_sync_command.py index 99bfc4a..2d7bdd3 100644 --- a/ees_sharepoint/full_sync_command.py +++ b/ees_sharepoint/full_sync_command.py @@ -8,14 +8,36 @@ It will attempt to sync absolutely all documents that are available in the third-party system and ingest them into Enterprise Search instance.""" from .base_command import BaseCommand -from .fetch_index import start +from .sync_sharepoint import init_sharepoint_sync +from .connector_queue import ConnectorQueue +from .sync_enterprise_search import init_enterprise_search_sync +from multiprocessing import Process class FullSyncCommand(BaseCommand): + """This class start execution of fullsync feature.""" + def execute(self): + """This function execute the start function.""" config = self.config logger = self.logger workplace_search_client = self.workplace_search_client sharepoint_client = self.sharepoint_client - start("full", config, logger, workplace_search_client, sharepoint_client) + queue = ConnectorQueue() + producer = Process( + name="producer", + target=init_sharepoint_sync, + args=("full", config, logger, workplace_search_client, sharepoint_client, queue), + ) + producer.start() + + consumer = Process( + name="consumer", + target=init_enterprise_search_sync, + args=(config, logger, workplace_search_client, queue), + ) + consumer.start() + + producer.join() + consumer.join() diff --git a/ees_sharepoint/incremental_sync_command.py b/ees_sharepoint/incremental_sync_command.py index 604d8f3..d29d1dc 100644 --- a/ees_sharepoint/incremental_sync_command.py +++ b/ees_sharepoint/incremental_sync_command.py @@ -11,14 +11,36 @@ Recency is determined by the time when the last successful incremental or full job was ran.""" from .base_command import BaseCommand -from .fetch_index import start +from .sync_sharepoint import init_sharepoint_sync +from .connector_queue import ConnectorQueue +from .sync_enterprise_search import init_enterprise_search_sync +from multiprocessing import Process class IncrementalSyncCommand(BaseCommand): + """This class start execution of incrementalsync feature.""" + def execute(self): + """This function execute the start function.""" config = self.config logger = self.logger workplace_search_client = self.workplace_search_client sharepoint_client = self.sharepoint_client - start("incremental", config, logger, workplace_search_client, sharepoint_client) + queue = ConnectorQueue() + producer = Process( + name="producer", + target=init_sharepoint_sync, + args=("incremental", config, logger, workplace_search_client, sharepoint_client, queue), + ) + producer.start() + + consumer = Process( + name="consumer", + target=init_enterprise_search_sync, + args=(config, logger, workplace_search_client, queue), + ) + consumer.start() + + producer.join() + consumer.join() diff --git a/ees_sharepoint/permission_sync_command.py b/ees_sharepoint/permission_sync_command.py index 03cc70d..3332458 100644 --- a/ees_sharepoint/permission_sync_command.py +++ b/ees_sharepoint/permission_sync_command.py @@ -7,13 +7,14 @@ It will attempt to remove from Enterprise Search instance the documents that have been deleted from the third-party system.""" -import os import csv +import os + +from ees_sharepoint.base_command import BaseCommand from .checkpointing import Checkpoint +from .sync_sharepoint import get_results from .usergroup_permissions import Permissions -from .fetch_index import get_results -from ees_sharepoint.base_command import BaseCommand class 
PermissionSyncDisabledException(Exception): @@ -33,6 +34,7 @@ class PermissionSyncCommand(BaseCommand): It can be used to run the job that will periodically sync permissions from Sharepoint Server to Elastic Enteprise Search.""" + def __init__(self, args): super().__init__(args) @@ -47,8 +49,8 @@ def __init__(self, args): self.permissions = Permissions(self.sharepoint_client, self.workplace_search_client, self.logger) def get_users_id(self): - """ This method returns the dictionary of dictionaries containing users and their id - as a key value pair for all the site-collections.""" + """This method returns the dictionary of dictionaries containing users and their id + as a key value pair for all the site-collections.""" user_ids = {} for collection in self.site_collections: user_id_collection = {} @@ -65,8 +67,8 @@ def get_users_id(self): return user_ids def get_user_groups(self, user_ids): - """ This method returns the groups of each user in all the site-collections - :param user_ids: user ids to fetch the groups of the specific user""" + """This method returns the groups of each user in all the site-collections + :param user_ids: user ids to fetch the groups of the specific user""" user_group = {} for collection in self.site_collections: user_group_collection = {} @@ -76,41 +78,34 @@ def get_user_groups(self, user_ids): if response: groups = get_results(self.logger, response.json(), "user_groups") if groups: - user_group_collection[name] = [group['Title'] for group in groups] + user_group_collection[name] = [group["Title"] for group in groups] user_group.update({collection: user_group_collection}) return user_group def workplace_add_permission(self, permissions): - """ This method when invoked would index the permission provided in the paramater - for the user in paramter user_name - :param permissions: dictionary of dictionaries containing permissions of all the users in each site-collection.""" + """This method when invoked would index the permission provided in the paramater + for the user in paramter user_name + :param permissions: dictionary of dictionaries containing permissions of all the users in each site-collection.""" for collection in self.site_collections: for user_name, permission_list in permissions[collection].items(): try: self.workplace_search_client.add_user_permissions( content_source_id=self.ws_source, user=user_name, - body={ - "permissions": permission_list - }, - ) - self.logger.info( - "Successfully indexed the permissions for user %s to the workplace" % ( - user_name - ) + body={"permissions": permission_list}, ) + self.logger.info("Successfully indexed the permissions for user %s to the workplace" % (user_name)) except Exception as exception: self.logger.exception( - "Error while indexing the permissions for user: %s to the workplace. Error: %s" % ( - user_name, exception - ) + "Error while indexing the permissions for user: %s to the workplace. 
Error: %s" + % (user_name, exception) ) def sync_permissions(self): - """ This method when invoked, checks the permission of SharePoint users and update those user - permissions in the Workplace Search.""" + """This method when invoked, checks the permission of SharePoint users and update those user + permissions in the Workplace Search.""" rows = {} - if (os.path.exists(self.mapping_sheet_path) and os.path.getsize(self.mapping_sheet_path) > 0): + if os.path.exists(self.mapping_sheet_path) and os.path.getsize(self.mapping_sheet_path) > 0: with open(self.mapping_sheet_path) as file: csvreader = csv.reader(file) for row in csvreader: @@ -133,7 +128,7 @@ def sync_permissions(self): self.workplace_add_permission(user_groups) def execute(self): - """ Runs the permission indexing logic""" + """Runs the permission indexing logic""" logger = self.logger config = self.config @@ -141,6 +136,6 @@ def execute(self): enable_permission = config.get_value("enable_document_permission") if not enable_permission: - logger.warn('Exiting as the enable permission flag is set to False') + logger.warn("Exiting as the enable permission flag is set to False") raise PermissionSyncDisabledException self.sync_permissions() diff --git a/ees_sharepoint/schema.py b/ees_sharepoint/schema.py index bf63ad0..807c6ca 100644 --- a/ees_sharepoint/schema.py +++ b/ees_sharepoint/schema.py @@ -165,10 +165,16 @@ def coerce_rfc_3339_date(input_date): 'default': 3, 'min': 1 }, - 'max_threads': { + 'sharepoint_sync_thread_count': { 'required': False, 'type': 'integer', - 'default': 40, + 'default': 5, + 'min': 1 + }, + 'enterprise_search_sync_thread_count': { + 'required': False, + 'type': 'integer', + 'default': 5, 'min': 1 }, 'sharepoint_workplace_user_mapping': { diff --git a/ees_sharepoint/sync_enterprise_search.py b/ees_sharepoint/sync_enterprise_search.py new file mode 100644 index 0000000..87e8bb4 --- /dev/null +++ b/ees_sharepoint/sync_enterprise_search.py @@ -0,0 +1,76 @@ +# +# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +# or more contributor license agreements. Licensed under the Elastic License 2.0; +# you may not use this file except in compliance with the Elastic License 2.0. +# +from multiprocessing.pool import ThreadPool +from .utils import split_documents_into_equal_chunks +from .checkpointing import Checkpoint + +BATCH_SIZE = 100 + + +class SyncEnterpriseSearch: + """This class allows ingesting documents to Elastic Enterprise Search.""" + + def __init__(self, config, logger, workplace_search_client, queue): + self.config = config + self.logger = logger + self.workplace_search_client = workplace_search_client + self.ws_source = config.get_value("workplace_search.source_id") + self.enterprise_search_thread_count = config.get_value("enterprise_search_sync_thread_count") + self.thread_pool = ThreadPool(self.enterprise_search_thread_count) + self.queue = queue + + def index_documents(self, documents): + """This method indexes the documents to the Enterprise Search. + :param documents: documents to be indexed + """ + total_documents_indexed = 0 + if documents: + responses = self.workplace_search_client.index_documents( + content_source_id=self.ws_source, documents=documents + ) + for response in responses["results"]: + if not response["errors"]: + total_documents_indexed += 1 + else: + self.logger.error("Error while indexing %s. 
Error: %s" % (response["id"], response["errors"])) + self.logger.info("Successfully indexed %s documents to the workplace" % (total_documents_indexed)) + + def perform_sync(self): + """Pull documents from the queue and synchronize it to the Enterprise Search.""" + checkpoint = Checkpoint(self.config, self.logger) + signal_open = True + while signal_open: + for _ in range(0, self.enterprise_search_thread_count): + documents_to_index = [] + while len(documents_to_index) < BATCH_SIZE: + documents = self.queue.get() + if documents.get("type") == "signal_close": + signal_open = False + break + elif documents.get("type") == "checkpoint": + checkpoint.set_checkpoint( + documents.get("data")[0], documents.get("data")[1], documents.get("data")[2] + ) + break + else: + documents_to_index.extend(documents.get("data")) + for chunk in split_documents_into_equal_chunks(documents_to_index, BATCH_SIZE): + self.thread_pool.apply_async(self.index_documents, (chunk,)) + if not signal_open: + break + self.thread_pool.close() + self.thread_pool.join() + + +def init_enterprise_search_sync(config, logger, workplace_search_client, queue): + """Runs the indexing logic + :param config: instance of Configuration class + :param logger: instance of Logger class + :param workplace_search_client: instance of WorkplaceSearch + :param queue: Shared queue to push the objects fetched from SharePoint + """ + indexer = SyncEnterpriseSearch(config, logger, workplace_search_client, queue) + indexer.perform_sync() diff --git a/ees_sharepoint/sync_sharepoint.py b/ees_sharepoint/sync_sharepoint.py new file mode 100644 index 0000000..1884c79 --- /dev/null +++ b/ees_sharepoint/sync_sharepoint.py @@ -0,0 +1,644 @@ +# +# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +# or more contributor license agreements. Licensed under the Elastic License 2.0; +# you may not use this file except in compliance with the Elastic License 2.0. +# +"""sync_sharepoint module allows to sync data to Elastic Enterprise Search. + +It's possible to run full syncs and incremental syncs with this module.""" + +import copy +import json +import os +import re +from datetime import datetime +from urllib.parse import urljoin +from dateutil.parser import parse +from multiprocessing.pool import ThreadPool +from tika.tika import TikaException + +from . import adapter +from .checkpointing import Checkpoint +from .usergroup_permissions import Permissions +from .utils import ( + encode, + extract, + split_list_into_buckets, + split_date_range_into_chunks, + split_documents_into_equal_chunks, +) + +IDS_PATH = os.path.join(os.path.dirname(__file__), "doc_id.json") + +SITE = "site" +LIST = "list" +ITEM = "item" +SITES = "sites" +LISTS = "lists" +LIST_ITEMS = "list_items" +DRIVE_ITEMS = "drive_items" + + +def get_results(logger, response, entity_name): + """Attempts to fetch results from a Sharepoint Server response + :param response: response from the sharepoint client + :param entity_name: entity name whether it is SITES, LISTS, LIST_ITEMS OR DRIVE_ITEMS + Returns: + Parsed response + """ + if not response: + logger.error(f"Empty response when fetching {entity_name}") # TODO: should it be an error? 
+ return None + + if entity_name == "attachment" and not response.get("d", {}).get("results"): + logger.info("Failed to fetch attachment") # TODO: not sure if it's the right message + return None + return response.get("d", {}).get("results") + + +class SyncSharepoint: + """This class allows synching objects from the SharePoint Server.""" + + def __init__(self, config, logger, workplace_search_client, sharepoint_client, start_time, end_time, queue): + self.config = config + self.logger = logger + self.workplace_search_client = workplace_search_client + self.sharepoint_client = sharepoint_client + + self.ws_source = config.get_value("workplace_search.source_id") + self.objects = config.get_value("objects") + self.site_collections = config.get_value("sharepoint.site_collections") + self.enable_permission = config.get_value("enable_document_permission") + self.start_time = start_time + self.end_time = end_time + self.sharepoint_thread_count = config.get_value("sharepoint_sync_thread_count") + self.mapping_sheet_path = config.get_value("sharepoint_workplace_user_mapping") + + self.checkpoint = Checkpoint(config, logger) + self.permissions = Permissions(self.sharepoint_client, self.workplace_search_client, logger) + + self.thread_pool = ThreadPool(self.sharepoint_thread_count) + self.queue = queue + + def get_schema_fields(self, document_name): + """returns the schema of all the include_fields or exclude_fields specified in the configuration file. + :param document_name: document name from SITES, LISTS, LIST_ITEMS OR DRIVE_ITEMS + Returns: + schema: included and excluded fields schema + """ + fields = self.objects.get(document_name) + adapter_schema = adapter.DEFAULT_SCHEMA[document_name] + field_id = adapter_schema["id"] + if fields: + include_fields = fields.get("include_fields") + exclude_fields = fields.get("exclude_fields") + if include_fields: + adapter_schema = {key: val for key, val in adapter_schema.items() if val in include_fields} + elif exclude_fields: + adapter_schema = {key: val for key, val in adapter_schema.items() if val not in exclude_fields} + adapter_schema["id"] = field_id + return adapter_schema + + def fetch_sites(self, parent_site_url, sites, ids, index, start_time, end_time): + """This method fetches sites from a collection and invokes the + index permission method to get the document level permissions. + If the fetching is not successful, it logs proper message. 
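+        Illustrative shape of the return value (placeholder values):
+            sites     -> {"/sites/collection/subsite": "2022-03-01T10:00:00Z", ...}
+            documents -> {"type": "sites", "data": [{"type": "site", ...}, ...]}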
+        :param parent_site_url: parent site relative path
+        :param sites: dictionary of site path and its last updated time
+        :param ids: structure containing ids of all objects
+        :param index: index, boolean value
+        :param start_time: start time for fetching the data
+        :param end_time: end time for fetching the data
+        Returns:
+            document: response of sharepoint GET call, with fields specified in the schema
+        """
+        rel_url = f"{parent_site_url}/_api/web/webs"
+        self.logger.info("Fetching the site details from url: %s" % (rel_url))
+        query = self.sharepoint_client.get_query(start_time, end_time, SITES)
+        response = self.sharepoint_client.get(rel_url, query, SITES)
+
+        response_data = get_results(self.logger, response, SITES)
+        if not response_data:
+            self.logger.info(
+                "No sites were created in %s for this interval: start time: %s and end time: %s"
+                % (parent_site_url, start_time, end_time)
+            )
+            return sites
+        self.logger.info("Successfully fetched and parsed %s sites response from SharePoint" % len(response_data))
+
+        schema = self.get_schema_fields(SITES)
+        document = []
+
+        if index:
+            for i, _ in enumerate(response_data):
+                doc = {"type": SITE}
+                # Convert the created date to ISO (RFC 3339) format, otherwise Workplace Search rejects it:
+                # Invalid field value: Value '2021-09-29T08:13:00' cannot be parsed as a date
+                response_data[i]["Created"] += "Z"
+                for field, response_field in schema.items():
+                    doc[field] = response_data[i].get(response_field)
+                if self.enable_permission is True:
+                    doc["_allow_permissions"] = self.fetch_permissions(
+                        key=SITES, site=response_data[i]["ServerRelativeUrl"]
+                    )
+                document.append(doc)
+                ids["sites"].update({doc["id"]: response_data[i]["ServerRelativeUrl"]})
+        for result in response_data:
+            site_server_url = result.get("ServerRelativeUrl")
+            sites.update({site_server_url: result.get("LastItemModifiedDate")})
+            self.fetch_sites(site_server_url, sites, ids, index, start_time, end_time)
+
+        documents = {"type": SITES, "data": document}
+        return sites, documents
+
+    def fetch_lists(self, sites, ids, index):
+        """This method fetches lists from all sites in a collection and invokes the
+        index permission method to get the document level permissions.
+        If the fetching is not successful, it logs proper message.
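+        Illustrative shape of the return values (placeholder values):
+            lists/libraries -> {"<list-guid>": ["/sites/collection/subsite", "Documents", "2022-03-01T10:00:00Z"]}
+            documents       -> {"type": "lists", "data": [{"type": "list", ...}, ...]}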
+ :param sites: dictionary of site path and it's last updated time + :param ids: structure containing id's of all objects + :param index: index, boolean value + Returns: + document: response of sharepoint GET call, with fields specified in the schema + """ + self.logger.info("Fetching lists for all the sites") + responses = [] + document = [] + if not sites: + self.logger.info( + "No list was created in this interval: start time: %s and end time: %s" + % (self.start_time, self.end_time) + ) + return [], [] + schema_list = self.get_schema_fields(LISTS) + for site_details in sites: + for site, time_modified in site_details.items(): + if parse(self.start_time) > parse(time_modified): + continue + rel_url = f"{site}/_api/web/lists" + self.logger.info("Fetching the lists for site: %s from url: %s" % (site, rel_url)) + + query = self.sharepoint_client.get_query(self.start_time, self.end_time, LISTS) + response = self.sharepoint_client.get(rel_url, query, LISTS) + + response_data = get_results(self.logger, response, LISTS) + if not response_data: + self.logger.info( + "No list was created for the site : %s in this interval: start time: %s and end time: %s" + % (site, self.start_time, self.end_time) + ) + continue + self.logger.info( + "Successfully fetched and parsed %s list response for site: %s from SharePoint" + % (len(response_data), site) + ) + + base_list_url = f"{site}/Lists/" + + if index: + if not ids["lists"].get(site): + ids["lists"].update({site: {}}) + for i, _ in enumerate(response_data): + doc = {"type": LIST} + for field, response_field in schema_list.items(): + doc[field] = response_data[i].get(response_field) + if self.enable_permission is True: + doc["_allow_permissions"] = self.fetch_permissions( + key=LISTS, + site=site, + list_id=doc["id"], + list_url=response_data[i]["ParentWebUrl"], + itemid=None, + ) + doc["url"] = urljoin(base_list_url, re.sub(r"[^ \w+]", "", response_data[i]["Title"])) + document.append(doc) + ids["lists"][site].update({doc["id"]: response_data[i]["Title"]}) + + responses.append(response_data) + lists = {} + libraries = {} + for response in responses: + for result in response: + if result.get("BaseType") == 1: + libraries[result.get("Id")] = [ + result.get("ParentWebUrl"), + result.get("Title"), + result.get("LastItemModifiedDate"), + ] + else: + lists[result.get("Id")] = [ + result.get("ParentWebUrl"), + result.get("Title"), + result.get("LastItemModifiedDate"), + ] + documents = {"type": LISTS, "data": document} + return lists, libraries, documents + + def fetch_items(self, lists, ids): + """This method fetches items from all the lists in a collection and + invokes theindex permission method to get the document level permissions. + If the fetching is not successful, it logs proper message. 
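+        Illustrative shape of the returned documents (placeholder values):
+            {"type": "list_items", "data": [{"type": "item", "url": ".../Lists/Tasks/DispForm.aspx?ID=1", ...}, ...]}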
+ :param lists: document lists + :param ids: structure containing id's of all objects + Returns: + document: response of sharepoint GET call, with fields specified in the schema + """ + responses = [] + # here value is a list of url and title + self.logger.info("Fetching all the items for the lists") + if not lists: + self.logger.info( + "No item was created in this interval: start time: %s and end time: %s" + % (self.start_time, self.end_time) + ) + else: + for value in lists.values(): + if not ids["list_items"].get(value[0]): + ids["list_items"].update({value[0]: {}}) + schema_item = self.get_schema_fields(LIST_ITEMS) + for list_content, value in lists.items(): + if parse(self.start_time) > parse(value[2]): + continue + rel_url = f"{value[0]}/_api/web/lists(guid'{list_content}')/items" + self.logger.info("Fetching the items for list: %s from url: %s" % (value[1], rel_url)) + + query = self.sharepoint_client.get_query(self.start_time, self.end_time, LIST_ITEMS) + response = self.sharepoint_client.get(rel_url, query, LIST_ITEMS) + + response_data = get_results(self.logger, response, LIST_ITEMS) + if not response_data: + self.logger.info( + "No item was created for the list %s in this interval: start time: %s and end time: %s" + % (value[1], self.start_time, self.end_time) + ) + continue + self.logger.info( + "Successfully fetched and parsed %s listitem response for list: %s from SharePoint" + % (len(response_data), value[1]) + ) + + list_name = re.sub(r"[^ \w+]", "", value[1]) + base_item_url = f"{value[0]}/Lists/{list_name}/DispForm.aspx?ID=" + document = [] + if not ids["list_items"][value[0]].get(list_content): + ids["list_items"][value[0]].update({list_content: []}) + rel_url = f"{value[0]}/_api/web/lists(guid'{list_content}')/items?$select=Attachments,AttachmentFiles,Title&$expand=AttachmentFiles" + + new_query = "&" + query.split("?")[1] + file_response_data = self.sharepoint_client.get(rel_url, query=new_query, param_name="attachment") + if file_response_data: + file_response_data = get_results(self.logger, file_response_data.json(), "attachment") + + for i, _ in enumerate(response_data): + doc = {"type": ITEM} + if response_data[i].get("Attachments") and file_response_data: + for data in file_response_data: + if response_data[i].get("Title") == data["Title"]: + file_relative_url = data["AttachmentFiles"]["results"][0]["ServerRelativeUrl"] + url_s = f"{value[0]}/_api/web/GetFileByServerRelativeUrl('{encode(file_relative_url)}')/$value" + response = self.sharepoint_client.get(url_s, query="", param_name="attachment") + doc["body"] = {} + if response and response.ok: + try: + doc["body"] = extract(response.content) + except TikaException as exception: + self.logger.error( + "Error while extracting the contents from the attachment, Error %s" + % (exception) + ) + + break + for field, response_field in schema_item.items(): + doc[field] = response_data[i].get(response_field) + if self.enable_permission is True: + doc["_allow_permissions"] = self.fetch_permissions( + key=LIST_ITEMS, list_id=list_content, list_url=value[0], itemid=str(response_data[i]["Id"]) + ) + doc["url"] = base_item_url + str(response_data[i]["Id"]) + document.append(doc) + if response_data[i].get("GUID") not in ids["list_items"][value[0]][list_content]: + ids["list_items"][value[0]][list_content].append(response_data[i].get("GUID")) + responses.extend(document) + documents = {"type": LIST_ITEMS, "data": responses} + return documents + + def fetch_drive_items(self, libraries, ids): + """This method fetches items from 
all the lists in a collection and + invokes theindex permission method to get the document level permissions. + If the fetching is not successful, it logs proper message. + :param libraries: document lists + :param ids: structure containing id's of all objects + """ + responses = [] + # here value is a list of url and title of the library + self.logger.info("Fetching all the files for the library") + if not libraries: + self.logger.info( + "No file was created in this interval: start time: %s and end time: %s" + % (self.start_time, self.end_time) + ) + else: + schema_drive = self.get_schema_fields(DRIVE_ITEMS) + for lib_content, value in libraries.items(): + if parse(self.start_time) > parse(value[2]): + continue + if not ids["drive_items"].get(value[0]): + ids["drive_items"].update({value[0]: {}}) + rel_url = f"{value[0]}/_api/web/lists(guid'{lib_content}')/items?$select=Modified,Id,GUID,File,Folder&$expand=File,Folder" + self.logger.info("Fetching the items for libraries: %s from url: %s" % (value[1], rel_url)) + query = self.sharepoint_client.get_query(self.start_time, self.end_time, DRIVE_ITEMS) + response = self.sharepoint_client.get(rel_url, query, DRIVE_ITEMS) + response_data = get_results(self.logger, response, DRIVE_ITEMS) + if not response_data: + self.logger.info( + "No item was created for the library %s in this interval: start time: %s and end time: %s" + % (value[1], self.start_time, self.end_time) + ) + continue + self.logger.info( + "Successfully fetched and parsed %s drive item response for library: %s from SharePoint" + % (len(response_data), value[1]) + ) + document = [] + if not ids["drive_items"][value[0]].get(lib_content): + ids["drive_items"][value[0]].update({lib_content: []}) + for i, _ in enumerate(response_data): + if response_data[i]["File"].get("TimeLastModified"): + obj_type = "File" + doc = {"type": "file"} + file_relative_url = response_data[i]["File"]["ServerRelativeUrl"] + url_s = f"{value[0]}/_api/web/GetFileByServerRelativeUrl('{encode(file_relative_url)}')/$value" + response = self.sharepoint_client.get(url_s, query="", param_name="attachment") + doc["body"] = {} + if response and response.ok: + try: + doc["body"] = extract(response.content) + except TikaException as exception: + self.logger.error( + "Error while extracting the contents from the file at %s, Error %s" + % (response_data[i].get("Url"), exception) + ) + else: + obj_type = "Folder" + doc = {"type": "folder"} + for field, response_field in schema_drive.items(): + doc[field] = response_data[i][obj_type].get(response_field) + doc["id"] = response_data[i].get("GUID") + if self.enable_permission is True: + doc["_allow_permissions"] = self.fetch_permissions( + key=DRIVE_ITEMS, + list_id=lib_content, + list_url=value[0], + itemid=str(response_data[i].get("ID")), + ) + doc["url"] = response_data[i][obj_type]["ServerRelativeUrl"] + document.append(doc) + if doc["id"] not in ids["drive_items"][value[0]][lib_content]: + ids["drive_items"][value[0]][lib_content].append(doc["id"]) + responses.extend(document) + documents = {"type": DRIVE_ITEMS, "data": responses} + return documents + + def get_roles(self, key, site, list_url, list_id, itemid): + """Checks the permissions and returns the user roles. 
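+        Illustrative calls (placeholder arguments):
+            self.get_roles(SITES, "/sites/collection/subsite", None, None, None)
+            self.get_roles(LIST_ITEMS, None, "/sites/collection/subsite", "<list-guid>", "42")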
+ :param key: key, a string value + :param site: site name to check the permission + :param list_url: list url to access the list + :param list_id: list id to check the permission + :param itemid: item id to check the permission + Returns: + roles: user roles + """ + if key == SITES: + rel_url = site + roles = self.permissions.fetch_users(key, rel_url) + + elif key == LISTS: + rel_url = list_url + roles = self.permissions.fetch_users(key, rel_url, list_id=list_id) + + else: + rel_url = list_url + roles = self.permissions.fetch_users(key, rel_url, list_id=list_id, item_id=itemid) + + return roles + + def fetch_permissions( + self, + key, + site=None, + list_id=None, + list_url=None, + itemid=None, + ): + """This method when invoked, checks the permission inheritance of each object. + If the object has unique permissions, the list of users having access to it + is fetched using sharepoint api else the permission levels of the that object + is taken same as the permission level of the site collection. + :param key: key, a string value + :param site: site name to index the permission for the site + :param list_id: list id to index the permission for the list + :param list_url: url of the list + :param itemid: item id to index the permission for the item + Returns: + groups: list of users having access to the given object + """ + roles = self.get_roles(key, site, list_url, list_id, itemid) + + groups = [] + + if not roles: + return [] + roles = get_results(self.logger, roles.json(), "roles") + + for role in roles: + title = role["Member"]["Title"] + groups.append(title) + return groups + + def fetch_and_append_sites_to_queue(self, ids, end_time, collection): + """Fetches and appends site details to queue + :param ids: id collection of the all the objects + :param end_time: end time for fetching the data + :param collection: collection name + """ + _, datelist = split_date_range_into_chunks(self.start_time, self.end_time, self.sharepoint_thread_count) + results = [] + parent_site_url = f"/sites/{collection}" + sites_path = [{parent_site_url: end_time}] + for num in range(0, self.sharepoint_thread_count): + start_time_partition = datelist[num] + end_time_partition = datelist[num + 1] + thread = self.thread_pool.apply_async( + self.fetch_sites, + (parent_site_url, {}, ids, (SITES in self.objects), start_time_partition, end_time_partition), + callback=self.queue.append_to_queue, + ) + results.append(thread) + + sites = [] + for result in [r.get() for r in results]: + if result: + sites.append(result[0]) + + sites_path.extend(sites) + return sites_path + + def fetch_and_append_lists_to_queue(self, sites_path, ids): + """Fetches and appends list details to queue + :param sites_path: dictionary of site path and it's last updated time + :param ids: id collection of the all the objects + """ + results, lists_details, libraries_details = [], {}, {} + partitioned_sites = split_list_into_buckets(sites_path, self.sharepoint_thread_count) + for site in partitioned_sites: + thread = self.thread_pool.apply_async( + self.fetch_lists, (site, ids, (LISTS in self.objects)), callback=self.queue.append_to_queue + ) + results.append(thread) + for result in [r.get() for r in results]: + if result: + lists_details.update(result[0]) + libraries_details.update(result[1]) + return [lists_details, libraries_details] + + def fetch_and_append_list_items_to_queue(self, lists_details, ids): + """Fetches and appends list_items to the queue + :param lists_details: dictionary containing list name, list path and id + :param ids: 
id collection of all the objects
+        """
+        partition = []
+        partition = split_documents_into_equal_chunks(lists_details, self.sharepoint_thread_count)
+        for list_data in partition:
+            self.thread_pool.apply_async(self.fetch_items, (list_data, ids), callback=self.queue.append_to_queue)
+
+    def fetch_and_append_drive_items_to_queue(self, libraries_details, ids):
+        """Fetches and appends the drive items to the queue
+        :param libraries_details: dictionary containing library name, library path and id
+        :param ids: id collection of all the objects
+        """
+        partition = []
+        partition = split_documents_into_equal_chunks(libraries_details, self.sharepoint_thread_count)
+        for list_data in partition:
+            self.thread_pool.apply_async(self.fetch_drive_items, (list_data, ids), callback=self.queue.append_to_queue)
+
+    def perform_sync(self, collection, ids, storage, job_type, collected_objects, end_time):
+        """This method fetches all the objects from the SharePoint server
+        :param collection: collection name
+        :param ids: id collection of all the objects
+        :param storage: temporary storage for storing all the documents
+        :param job_type: denotes the type of sharepoint object being fetched in a particular process
+        :param collected_objects: helper variable to provide the data to child objects
+        :param end_time: end time for fetching the data
+        """
+        if job_type == "sites":
+            collected_objects = self.fetch_and_append_sites_to_queue(ids, end_time, collection)
+
+        elif job_type == "lists":
+            collected_objects = self.fetch_and_append_lists_to_queue(collected_objects, ids)
+
+        elif job_type == LIST_ITEMS and LIST_ITEMS in self.objects:
+            self.fetch_and_append_list_items_to_queue(collected_objects[0], ids)
+
+        elif job_type == DRIVE_ITEMS and DRIVE_ITEMS in self.objects:
+            self.fetch_and_append_drive_items_to_queue(collected_objects[1], ids)
+
+        self.logger.info("Completed fetching all the objects for site collection: %s" % (collection))
+
+        self.logger.info("Saving the checkpoint for the site collection: %s" % (collection))
+        if ids.get(job_type):
+            prev_ids = storage[job_type]
+            if job_type == "sites":
+                prev_ids.update(ids[job_type])
+            elif job_type == "lists":
+                for site, list_content in ids[job_type].items():
+                    prev_ids[site] = {**prev_ids.get(site, {}), **ids[job_type][site]}
+            else:
+                for site, list_content in ids[job_type].items():
+                    prev_ids[site] = ids[job_type][site] if not prev_ids.get(site) else prev_ids[site]
+                    for list_name in list_content.keys():
+                        prev_ids[site][list_name] = list(
+                            set([*prev_ids[site].get(list_name, []), *ids[job_type][site][list_name]])
+                        )
+            storage[job_type] = prev_ids
+        return collected_objects
+
+
+def init_sharepoint_sync(indexing_type, config, logger, workplace_search_client, sharepoint_client, queue):
+    """Initialize the process for syncing
+    :param indexing_type: The type of the indexing i.e.
incremental or full + :param config: instance of Configuration class + :param logger: instance of Logger class + :param workplace_search_client: instance of WorkplaceSearch + :param sharepoint_client: instance of SharePoint + :param queue: Shared queue to push the objects fetched from SharePoint + """ + logger.info(f"Starting the {indexing_type} indexing..") + current_time = (datetime.utcnow()).strftime("%Y-%m-%dT%H:%M:%SZ") + ids_collection = {"global_keys": {}} + storage_with_collection = {"global_keys": {}, "delete_keys": {}} + + if os.path.exists(IDS_PATH) and os.path.getsize(IDS_PATH) > 0: + with open(IDS_PATH) as ids_store: + try: + ids_collection = json.load(ids_store) + except ValueError as exception: + logger.exception( + "Error while parsing the json file of the ids store from path: %s. Error: %s" + % (IDS_PATH, exception) + ) + + storage_with_collection["delete_keys"] = copy.deepcopy(ids_collection.get("global_keys")) + check = Checkpoint(config, logger) + + try: + for collection in config.get_value("sharepoint.site_collections"): + storage = {"sites": {}, "lists": {}, "list_items": {}, "drive_items": {}} + logger.info("Starting the data fetching for site collection: %s" % (collection)) + + if indexing_type == "incremental": + start_time, end_time = check.get_checkpoint(collection, current_time) + else: + start_time = config.get_value("start_time") + end_time = current_time + + if not ids_collection["global_keys"].get(collection): + ids_collection["global_keys"][collection] = { + "sites": {}, + "lists": {}, + "list_items": {}, + "drive_items": {}, + } + + logger.info( + "Starting to index all the objects configured in the object field: %s" + % (str(config.get_value("objects"))) + ) + + sync_sharepoint = SyncSharepoint( + config, logger, workplace_search_client, sharepoint_client, start_time, end_time, queue + ) + returned_documents = None + for job_type in ["sites", "lists", "list_items", "drive_items"]: + logger.info(f"Indexing {job_type}") + returned_documents = sync_sharepoint.perform_sync( + collection, + ids_collection["global_keys"][collection], + storage, + job_type, + returned_documents, + end_time, + ) + sync_sharepoint.thread_pool.close() + sync_sharepoint.thread_pool.join() + queue.put_checkpoint(collection, end_time, indexing_type) + + storage_with_collection["global_keys"][collection] = storage.copy() + queue.end_signal() + except Exception as exception: + raise exception + + with open(IDS_PATH, "w") as file: + try: + json.dump(storage_with_collection, file, indent=4) + except ValueError as exception: + logger.warning("Error while adding ids to json file. 
Error: %s" % (exception)) diff --git a/ees_sharepoint/utils.py b/ees_sharepoint/utils.py index ceba89f..8f77b87 100644 --- a/ees_sharepoint/utils.py +++ b/ees_sharepoint/utils.py @@ -6,68 +6,69 @@ """This module contains uncategorized utility methods.""" import urllib.parse - -from tika import parser from datetime import datetime -from multiprocessing.pool import ThreadPool +from tika import parser + DATETIME_FORMAT = "%Y-%m-%dT%H:%M:%SZ" def extract(content): - """ Extracts the contents - :param content: content to be extracted - Returns: - parsed_test: parsed text""" + """Extracts the contents + :param content: content to be extracted + Returns: + parsed_test: parsed text""" parsed = parser.from_buffer(content) - parsed_text = parsed['content'] + parsed_text = parsed["content"] return parsed_text def encode(object_name): """Performs encoding on the name of objects - containing special characters in their url, and - replaces single quote with two single quote since quote - is treated as an escape character in odata - :param object_name: name that contains special characters""" + containing special characters in their url, and + replaces single quote with two single quote since quote + is treated as an escape character in odata + :param object_name: name that contains special characters""" name = urllib.parse.quote(object_name, safe="'") return name.replace("'", "''") -def split_list_into_buckets(object_list, total_groups): - """ Divides the list in groups of approximately equal sizes - :param object_list: list to be partitioned - :param total_groups: number of groups to be formed +def split_list_into_buckets(documents, total_buckets): + """Divide large number of documents amongst the total buckets + :param documents: list to be partitioned + :param total_buckets: number of groups to be formed """ - if object_list: - groups = min(total_groups, len(object_list)) + if documents: + groups = min(total_buckets, len(documents)) group_list = [] for i in range(groups): - group_list.append(object_list[i::groups]) + group_list.append(documents[i::groups]) return group_list else: return [] -def split_dict_in_chunks(input_dict, chunk_size): - """ This method splits a dictionary into separate chunks with maximum size - as chunk_size - :param input_dict: Dictionary to be partitioned into chunks - :param chunk_size: Maximum size of a chunk - Returns: - list_of_chunks: List containing the chunks +def split_documents_into_equal_chunks(documents, chunk_size): + """This method splits a list or dictionary into equal chunks size + :param documents: List or Dictionary to be partitioned into chunks + :param chunk_size: Maximum size of a chunk + Returns: + list_of_chunks: List containing the chunks """ list_of_chunks = [] - for i in range(0, len(input_dict), chunk_size): - partitioned_chunk = list(input_dict.items())[i:i + chunk_size] - list_of_chunks.append(dict(partitioned_chunk)) + for i in range(0, len(documents), chunk_size): + if type(documents) is dict: + partitioned_chunk = list(documents.items())[i: i + chunk_size] + list_of_chunks.append(dict(partitioned_chunk)) + else: + list_of_chunks.append(documents[i: i + chunk_size]) return list_of_chunks def split_date_range_into_chunks(start_time, end_time, number_of_threads): - """ Divides the timerange in equal partitions by number of threads - :param start_time: start time of the interval - :param end_time: end time of the interval - :param number_of_threads: number of threads defined by user in config file + """Divides the timerange in equal partitions by 
number of threads + :param start_time: start time of the interval + :param end_time: end time of the interval + :param number_of_threads: number of threads defined by user in config file """ start_time = datetime.strptime(start_time, DATETIME_FORMAT) end_time = datetime.strptime(end_time, DATETIME_FORMAT) @@ -80,10 +81,3 @@ def split_date_range_into_chunks(start_time, end_time, number_of_threads): formatted_end_time = end_time.strftime(DATETIME_FORMAT) datelist.append(formatted_end_time) return formatted_end_time, datelist - - -def spawn_threads(max_threads): - """ Spawns number of threads provided by user in the config file - :param max_threads: maximum number of threads defined by user - """ - return ThreadPool(max_threads) diff --git a/sharepoint_server_connector.yml b/sharepoint_server_connector.yml index 948b328..7131e9b 100644 --- a/sharepoint_server_connector.yml +++ b/sharepoint_server_connector.yml @@ -44,7 +44,9 @@ end_time : log_level: INFO #The number of retries to perform in case of server error. The connector will use exponential backoff for retry mechanism retry_count: 3 -#Number of threads to be used in multithreading for the connector. -max_threads: 40 +#Number of threads to be used in multithreading for the sharepoint sync. +sharepoint_sync_thread_count: 5 +#Number of threads to be used in multithreading for the enterprise search sync. +enterprise_search_sync_thread_count: 5 #the path of csv file containing mapping of sharepoint user ID to Workplace user ID sharepoint_workplace_user_mapping: "C:/Users/abc/folder_name/file_name.csv" From 2b60766df3af6a1f7f2341c72e6c40a362883f5a Mon Sep 17 00:00:00 2001 From: praveen-elastic Date: Sat, 26 Mar 2022 05:38:23 +0530 Subject: [PATCH 5/9] Fixing the issue of Pickling on Windows --- ees_sharepoint/full_sync_command.py | 7 +++---- ees_sharepoint/incremental_sync_command.py | 7 +++---- ees_sharepoint/sync_enterprise_search.py | 8 ++++++-- ees_sharepoint/sync_sharepoint.py | 10 +++++++--- 4 files changed, 19 insertions(+), 13 deletions(-) diff --git a/ees_sharepoint/full_sync_command.py b/ees_sharepoint/full_sync_command.py index 2d7bdd3..4e6e4df 100644 --- a/ees_sharepoint/full_sync_command.py +++ b/ees_sharepoint/full_sync_command.py @@ -21,21 +21,20 @@ def execute(self): """This function execute the start function.""" config = self.config logger = self.logger - workplace_search_client = self.workplace_search_client - sharepoint_client = self.sharepoint_client + args = self.args queue = ConnectorQueue() producer = Process( name="producer", target=init_sharepoint_sync, - args=("full", config, logger, workplace_search_client, sharepoint_client, queue), + args=("full", config, logger, queue, args), ) producer.start() consumer = Process( name="consumer", target=init_enterprise_search_sync, - args=(config, logger, workplace_search_client, queue), + args=(config, logger, queue, args), ) consumer.start() diff --git a/ees_sharepoint/incremental_sync_command.py b/ees_sharepoint/incremental_sync_command.py index d29d1dc..1112c77 100644 --- a/ees_sharepoint/incremental_sync_command.py +++ b/ees_sharepoint/incremental_sync_command.py @@ -24,21 +24,20 @@ def execute(self): """This function execute the start function.""" config = self.config logger = self.logger - workplace_search_client = self.workplace_search_client - sharepoint_client = self.sharepoint_client + args = self.args queue = ConnectorQueue() producer = Process( name="producer", target=init_sharepoint_sync, - args=("incremental", config, logger, workplace_search_client, 
sharepoint_client, queue), + args=("incremental", config, logger, queue, args), ) producer.start() consumer = Process( name="consumer", target=init_enterprise_search_sync, - args=(config, logger, workplace_search_client, queue), + args=(config, logger, queue, args), ) consumer.start() diff --git a/ees_sharepoint/sync_enterprise_search.py b/ees_sharepoint/sync_enterprise_search.py index 87e8bb4..604c5e0 100644 --- a/ees_sharepoint/sync_enterprise_search.py +++ b/ees_sharepoint/sync_enterprise_search.py @@ -5,6 +5,7 @@ # from multiprocessing.pool import ThreadPool from .utils import split_documents_into_equal_chunks +from .base_command import BaseCommand from .checkpointing import Checkpoint BATCH_SIZE = 100 @@ -65,12 +66,15 @@ def perform_sync(self): self.thread_pool.join() -def init_enterprise_search_sync(config, logger, workplace_search_client, queue): +def init_enterprise_search_sync(config, logger, queue, args): """Runs the indexing logic :param config: instance of Configuration class :param logger: instance of Logger class - :param workplace_search_client: instance of WorkplaceSearch :param queue: Shared queue to push the objects fetched from SharePoint + :param args: The command line arguments passed from the base command """ + # Added this workaround of initializing the base_command since workplace_search_client and sharepoint_client cannot be passed in the Process argument as doing so would throw pickling errors on Windows + base_command = BaseCommand(args) + workplace_search_client = base_command.workplace_search_client indexer = SyncEnterpriseSearch(config, logger, workplace_search_client, queue) indexer.perform_sync() diff --git a/ees_sharepoint/sync_sharepoint.py b/ees_sharepoint/sync_sharepoint.py index 1884c79..8b8af37 100644 --- a/ees_sharepoint/sync_sharepoint.py +++ b/ees_sharepoint/sync_sharepoint.py @@ -18,6 +18,7 @@ from tika.tika import TikaException from . import adapter +from .base_command import BaseCommand from .checkpointing import Checkpoint from .usergroup_permissions import Permissions from .utils import ( @@ -563,19 +564,22 @@ def perform_sync(self, collection, ids, storage, job_type, collected_objects, en return collected_objects -def init_sharepoint_sync(indexing_type, config, logger, workplace_search_client, sharepoint_client, queue): +def init_sharepoint_sync(indexing_type, config, logger, queue, args): """Initialize the process for synching :param indexing_type: The type of the indexing i.e. 
incremental or full :param config: instance of Configuration class :param logger: instance of Logger class - :param workplace_search_client: instance of WorkplaceSearch - :param sharepoint_client: instance of SharePoint :param queue: Shared queue to push the objects fetched from SharePoint + :param args: The command line arguments passed from the base command """ logger.info(f"Starting the {indexing_type} indexing..") current_time = (datetime.utcnow()).strftime("%Y-%m-%dT%H:%M:%SZ") ids_collection = {"global_keys": {}} storage_with_collection = {"global_keys": {}, "delete_keys": {}} + # Added this workaround of initializing the base_command since workplace_search_client and sharepoint_client cannot be passed in the Process argument as doing so would throw pickling errors on Windows + base_command = BaseCommand(args) + workplace_search_client = base_command.workplace_search_client + sharepoint_client = base_command.sharepoint_client if os.path.exists(IDS_PATH) and os.path.getsize(IDS_PATH) > 0: with open(IDS_PATH) as ids_store: From 229ab29474ecd125f85d37775b6581720f73275d Mon Sep 17 00:00:00 2001 From: praveen-elastic Date: Thu, 7 Apr 2022 18:19:21 +0530 Subject: [PATCH 6/9] update multithreading approach --- ees_sharepoint/base_command.py | 44 ++- ees_sharepoint/connector_queue.py | 23 +- ees_sharepoint/full_sync_command.py | 89 +++-- ees_sharepoint/incremental_sync_command.py | 93 +++-- ees_sharepoint/local_storage.py | 39 ++ ees_sharepoint/sync_enterprise_search.py | 82 ++-- ees_sharepoint/sync_sharepoint.py | 414 ++++++++++----------- ees_sharepoint/utils.py | 21 ++ 8 files changed, 471 insertions(+), 334 deletions(-) create mode 100644 ees_sharepoint/local_storage.py diff --git a/ees_sharepoint/base_command.py b/ees_sharepoint/base_command.py index 5b3b691..0574fd4 100644 --- a/ees_sharepoint/base_command.py +++ b/ees_sharepoint/base_command.py @@ -16,9 +16,13 @@ from functools import cached_property except ImportError: from cached_property import cached_property + +from concurrent.futures import ThreadPoolExecutor, as_completed + from elastic_enterprise_search import WorkplaceSearch from .configuration import Configuration +from .local_storage import LocalStorage from .sharepoint_client import SharePoint @@ -27,13 +31,14 @@ class BaseCommand: Inherit from it and implement 'execute' method, then add code to cli.py to register this command.""" + def __init__(self, args): self.args = args def execute(self): """Run the command. - This method is overriden by actual commands with logic + This method is overridden by actual commands with logic that is specific to each command implementing it.""" raise NotImplementedError @@ -44,7 +49,7 @@ def logger(self): log level will be determined by the configuration setting log_level. 
""" - log_level = self.config.get_value('log_level') + log_level = self.config.get_value("log_level") logger = logging.getLogger(__name__) logger.propagate = False logger.setLevel(log_level) @@ -69,13 +74,14 @@ def workplace_search_client(self): args = self.args host = self.config.get_value("enterprise_search.host_url") - if hasattr(args, 'user') and args.user: + if hasattr(args, "user") and args.user: return WorkplaceSearch( f"{host}/api/ws/v1/sources", http_auth=(args.user, args.password) ) else: return WorkplaceSearch( - f"{host}/api/ws/v1/sources", http_auth=self.config.get_value("workplace_search.api_key") + f"{host}/api/ws/v1/sources", + http_auth=self.config.get_value("workplace_search.api_key"), ) @cached_property @@ -88,3 +94,33 @@ def config(self): def sharepoint_client(self): """Get the sharepoint client instance for the running command.""" return SharePoint(self.config, self.logger) + + @staticmethod + def producer(thread_count, func, args, items, wait=False): + """Apply async calls using multithreading to the targeted function + :param thread_count: Total number of threads to be spawned + :param func: The target function on which the async calls would be made + :param args: Arguments for the targeted function + :param items: iterator of partition + :param wait: wait until job completes if true, otherwise returns immediately + """ + with ThreadPoolExecutor(max_workers=thread_count) as executor: + futures = (executor.submit(func, *args, item) for item in items) + if wait: + result = [future.result() for future in as_completed(futures)] + return result + + @staticmethod + def consumer(thread_count, func): + """Apply async calls using multithreading to the targeted function + :param thread_count: Total number of threads to be spawned + :param func: The target function on which the async calls would be made + """ + with ThreadPoolExecutor(max_workers=thread_count) as executor: + for _ in range(thread_count): + executor.submit(func) + + @cached_property + def local_storage(self): + """Get the object for local storage to fetch and update ids stored locally""" + return LocalStorage(self.logger) diff --git a/ees_sharepoint/connector_queue.py b/ees_sharepoint/connector_queue.py index 71792a7..f772db8 100644 --- a/ees_sharepoint/connector_queue.py +++ b/ees_sharepoint/connector_queue.py @@ -5,8 +5,6 @@ # import multiprocessing from multiprocessing.queues import Queue -from .utils import split_documents_into_equal_chunks - BATCH_SIZE = 100 @@ -14,8 +12,9 @@ class ConnectorQueue(Queue): """Class to support additional queue operations specific to the connector""" - def __init__(self): + def __init__(self, logger): ctx = multiprocessing.get_context() + self.logger = logger super(ConnectorQueue, self).__init__(ctx=ctx) def end_signal(self): @@ -32,18 +31,8 @@ def put_checkpoint(self, key, checkpoint_time, indexing_type): :param indexing_type: The type of the indexing i.e. 
Full or Incremental """ - checkpoint = {"type": "checkpoint", "data": (key, checkpoint_time, indexing_type)} + checkpoint = { + "type": "checkpoint", + "data": (key, checkpoint_time, indexing_type), + } self.put(checkpoint) - - def append_to_queue(self, documents): - """Append documents to the shared queue - :param documents: documents fetched from sharepoint - """ - if documents: - results = documents - # In case documents is object of tuple - if isinstance(documents, tuple): - results = documents[-1] - for chunk in split_documents_into_equal_chunks(results.get("data"), BATCH_SIZE): - document = {"type": results.get("type"), "data": chunk} - self.put(document) diff --git a/ees_sharepoint/full_sync_command.py b/ees_sharepoint/full_sync_command.py index 4e6e4df..0fd3643 100644 --- a/ees_sharepoint/full_sync_command.py +++ b/ees_sharepoint/full_sync_command.py @@ -7,36 +7,77 @@ It will attempt to sync absolutely all documents that are available in the third-party system and ingest them into Enterprise Search instance.""" +from datetime import datetime + from .base_command import BaseCommand -from .sync_sharepoint import init_sharepoint_sync from .connector_queue import ConnectorQueue -from .sync_enterprise_search import init_enterprise_search_sync -from multiprocessing import Process +from .sync_enterprise_search import SyncEnterpriseSearch +from .sync_sharepoint import SyncSharepoint +from .utils import get_storage_with_collection, split_date_range_into_chunks class FullSyncCommand(BaseCommand): """This class start execution of fullsync feature.""" + def start_producer(self, queue): + """This method starts async calls for the producer which is responsible for fetching documents from + the SharePoint and pushing them in the shared queue + :param queue: Shared queue to fetch the stored documents + """ + self.logger.debug("Starting the full indexing..") + current_time = (datetime.utcnow()).strftime("%Y-%m-%dT%H:%M:%SZ") + + thread_count = self.config.get_value("sharepoint_sync_thread_count") + + start_time, end_time = self.config.get_value("start_time"), current_time + try: + sync_sharepoint = SyncSharepoint( + self.config, + self.logger, + self.workplace_search_client, + self.sharepoint_client, + start_time, + end_time, + queue, + ) + _, datelist = split_date_range_into_chunks( + start_time, + end_time, + thread_count, + ) + for collection in self.config.get_value("sharepoint.site_collections"): + storage_with_collection = get_storage_with_collection(self.local_storage, collection) + self.logger.info( + "Starting to index all the objects configured in the object field: %s" + % (str(self.config.get_value("objects"))) + ) + + ids = storage_with_collection["global_keys"][collection] + storage_with_collection["global_keys"][collection] = sync_sharepoint.fetch_records_from_sharepoint(self.producer, datelist, thread_count, ids, collection) + + queue.put_checkpoint(collection, end_time, "full") + + enterprise_thread_count = self.config.get_value("enterprise_search_sync_thread_count") + for _ in range(enterprise_thread_count): + queue.end_signal() + except Exception as exception: + self.logger.exception(f"Error while fetching the objects . 
Error {exception}") + raise exception + self.local_storage.update_storage(storage_with_collection) + + def start_consumer(self, queue): + """This method starts async calls for the consumer which is responsible for indexing documents to the + Enterprise Search + :param queue: Shared queue to fetch the stored documents + """ + thread_count = self.config.get_value("enterprise_search_sync_thread_count") + sync_es = SyncEnterpriseSearch(self.config, self.logger, self.workplace_search_client, queue) + + self.consumer(thread_count, sync_es.perform_sync) + def execute(self): """This function execute the start function.""" - config = self.config - logger = self.logger - args = self.args - - queue = ConnectorQueue() - producer = Process( - name="producer", - target=init_sharepoint_sync, - args=("full", config, logger, queue, args), - ) - producer.start() - - consumer = Process( - name="consumer", - target=init_enterprise_search_sync, - args=(config, logger, queue, args), - ) - consumer.start() - - producer.join() - consumer.join() + queue = ConnectorQueue(self.logger) + + self.start_producer(queue) + self.start_consumer(queue) diff --git a/ees_sharepoint/incremental_sync_command.py b/ees_sharepoint/incremental_sync_command.py index 1112c77..67f7e82 100644 --- a/ees_sharepoint/incremental_sync_command.py +++ b/ees_sharepoint/incremental_sync_command.py @@ -10,36 +10,79 @@ Recency is determined by the time when the last successful incremental or full job was ran.""" +from datetime import datetime + from .base_command import BaseCommand -from .sync_sharepoint import init_sharepoint_sync +from .checkpointing import Checkpoint from .connector_queue import ConnectorQueue -from .sync_enterprise_search import init_enterprise_search_sync -from multiprocessing import Process +from .sync_enterprise_search import SyncEnterpriseSearch +from .sync_sharepoint import SyncSharepoint +from .utils import get_storage_with_collection, split_date_range_into_chunks class IncrementalSyncCommand(BaseCommand): - """This class start execution of incrementalsync feature.""" + """This class start execution of incremental sync feature.""" + + def start_producer(self, queue): + """This method starts async calls for the producer which is responsible for fetching documents from the + SharePoint and pushing them in the shared queue + :param queue: Shared queue to fetch the stored documents + """ + self.logger.debug("Starting the incremental indexing..") + current_time = (datetime.utcnow()).strftime("%Y-%m-%dT%H:%M:%SZ") + + thread_count = self.config.get_value("sharepoint_sync_thread_count") + + checkpoint = Checkpoint(self.config, self.logger) + try: + for collection in self.config.get_value("sharepoint.site_collections"): + start_time, end_time = checkpoint.get_checkpoint(collection, current_time) + sync_sharepoint = SyncSharepoint( + self.config, + self.logger, + self.workplace_search_client, + self.sharepoint_client, + start_time, + end_time, + queue, + ) + _, datelist = split_date_range_into_chunks( + start_time, + end_time, + thread_count, + ) + storage_with_collection = get_storage_with_collection(self.local_storage, collection) + self.logger.info( + "Starting to index all the objects configured in the object field: %s" + % (str(self.config.get_value("objects"))) + ) + + ids = storage_with_collection["global_keys"][collection] + storage_with_collection["global_keys"][collection] = sync_sharepoint.fetch_records_from_sharepoint(self.producer, datelist, thread_count, ids, collection) + + queue.put_checkpoint(collection, 
end_time, "incremental") + + enterprise_thread_count = self.config.get_value("enterprise_search_sync_thread_count") + for _ in range(enterprise_thread_count): + queue.end_signal() + except Exception as exception: + self.logger.exception(f"Error while fetching the objects . Error {exception}") + raise exception + self.local_storage.update_storage(storage_with_collection) + + def start_consumer(self, queue): + """This method starts async calls for the consumer which is responsible for indexing documents to the + Enterprise Search + :param queue: Shared queue to fetch the stored documents + """ + thread_count = self.config.get_value("enterprise_search_sync_thread_count") + sync_es = SyncEnterpriseSearch(self.config, self.logger, self.workplace_search_client, queue) + + self.consumer(thread_count, sync_es.perform_sync) def execute(self): """This function execute the start function.""" - config = self.config - logger = self.logger - args = self.args - - queue = ConnectorQueue() - producer = Process( - name="producer", - target=init_sharepoint_sync, - args=("incremental", config, logger, queue, args), - ) - producer.start() - - consumer = Process( - name="consumer", - target=init_enterprise_search_sync, - args=(config, logger, queue, args), - ) - consumer.start() - - producer.join() - consumer.join() + queue = ConnectorQueue(self.logger) + + self.start_producer(queue) + self.start_consumer(queue) diff --git a/ees_sharepoint/local_storage.py b/ees_sharepoint/local_storage.py new file mode 100644 index 0000000..71166e4 --- /dev/null +++ b/ees_sharepoint/local_storage.py @@ -0,0 +1,39 @@ +import json +import os + +IDS_PATH = os.path.join(os.path.dirname(__file__), 'doc_id.json') + + +class LocalStorage: + """This class contains all the methods to do operations on doc_id json file + """ + + def __init__(self, logger): + self.logger = logger + + def load_storage(self): + """This method fetches the contents of doc_id.json(local ids storage) + """ + try: + with open(IDS_PATH, encoding='utf-8') as ids_file: + try: + return json.load(ids_file) + except ValueError as exception: + self.logger.exception( + f"Error while parsing the json file of the ids store from path: {IDS_PATH}. Error: {exception}" + ) + except FileNotFoundError: + self.logger.debug("Local storage for ids was not found.") + return {"global_keys": {}} + + def update_storage(self, ids): + """This method is used to update the ids stored in doc_id.json file + :param ids: updated ids to be stored in the doc_id.json file + """ + with open(IDS_PATH, "w", encoding='utf-8') as ids_file: + try: + json.dump(ids, ids_file, indent=4) + except ValueError as exception: + self.logger.exception( + f"Error while updating the doc_id json file. Error: {exception}" + ) diff --git a/ees_sharepoint/sync_enterprise_search.py b/ees_sharepoint/sync_enterprise_search.py index 604c5e0..cf199ba 100644 --- a/ees_sharepoint/sync_enterprise_search.py +++ b/ees_sharepoint/sync_enterprise_search.py @@ -3,10 +3,10 @@ # or more contributor license agreements. Licensed under the Elastic License 2.0; # you may not use this file except in compliance with the Elastic License 2.0. 
# -from multiprocessing.pool import ThreadPool -from .utils import split_documents_into_equal_chunks -from .base_command import BaseCommand +import threading + from .checkpointing import Checkpoint +from .utils import split_documents_into_equal_chunks BATCH_SIZE = 100 @@ -19,62 +19,64 @@ def __init__(self, config, logger, workplace_search_client, queue): self.logger = logger self.workplace_search_client = workplace_search_client self.ws_source = config.get_value("workplace_search.source_id") - self.enterprise_search_thread_count = config.get_value("enterprise_search_sync_thread_count") - self.thread_pool = ThreadPool(self.enterprise_search_thread_count) self.queue = queue def index_documents(self, documents): """This method indexes the documents to the Enterprise Search. :param documents: documents to be indexed """ - total_documents_indexed = 0 - if documents: - responses = self.workplace_search_client.index_documents( - content_source_id=self.ws_source, documents=documents + try: + total_documents_indexed = 0 + if documents: + responses = self.workplace_search_client.index_documents( + content_source_id=self.ws_source, + documents=documents, + request_timeout=1000, + ) + for response in responses["results"]: + if not response["errors"]: + total_documents_indexed += 1 + else: + self.logger.error( + "Error while indexing %s. Error: %s" + % (response["id"], response["errors"]) + ) + self.logger.info( + f"[{threading.get_ident()}] Successfully indexed {total_documents_indexed} documents to the workplace" ) - for response in responses["results"]: - if not response["errors"]: - total_documents_indexed += 1 - else: - self.logger.error("Error while indexing %s. Error: %s" % (response["id"], response["errors"])) - self.logger.info("Successfully indexed %s documents to the workplace" % (total_documents_indexed)) + except Exception as exception: + self.logger.exception(f"Error while indexing the files. Error: {exception}") + raise exception def perform_sync(self): """Pull documents from the queue and synchronize it to the Enterprise Search.""" - checkpoint = Checkpoint(self.config, self.logger) - signal_open = True - while signal_open: - for _ in range(0, self.enterprise_search_thread_count): + try: + checkpoint = Checkpoint(self.config, self.logger) + signal_open = True + while signal_open: documents_to_index = [] while len(documents_to_index) < BATCH_SIZE: documents = self.queue.get() if documents.get("type") == "signal_close": + self.logger.info( + f"Found an end signal in the queue. 
Closing Thread ID {threading.get_ident()}" + ) signal_open = False break elif documents.get("type") == "checkpoint": checkpoint.set_checkpoint( - documents.get("data")[0], documents.get("data")[1], documents.get("data")[2] + documents.get("data")[0], + documents.get("data")[1], + documents.get("data")[2], ) break else: documents_to_index.extend(documents.get("data")) - for chunk in split_documents_into_equal_chunks(documents_to_index, BATCH_SIZE): - self.thread_pool.apply_async(self.index_documents, (chunk,)) - if not signal_open: - break - self.thread_pool.close() - self.thread_pool.join() - - -def init_enterprise_search_sync(config, logger, queue, args): - """Runs the indexing logic - :param config: instance of Configuration class - :param logger: instance of Logger class - :param queue: Shared queue to push the objects fetched from SharePoint - :param args: The command line arguments passed from the base command - """ - # Added this workaround of initializing the base_command since workplace_search_client and sharepoint_client cannot be passed in the Process argument as doing so would throw pickling errors on Windows - base_command = BaseCommand(args) - workplace_search_client = base_command.workplace_search_client - indexer = SyncEnterpriseSearch(config, logger, workplace_search_client, queue) - indexer.perform_sync() + # This loop is to ensure if the last document fetched from the queue exceeds the size of + # documents_to_index to more than the permitted chunk size, then we split the documents as per the limit + for chunk in split_documents_into_equal_chunks( + documents_to_index, BATCH_SIZE + ): + self.index_documents(chunk) + except Exception as exception: + self.logger.error(f"Error while indexing the documents to the Enterprise Search. Error {exception}") diff --git a/ees_sharepoint/sync_sharepoint.py b/ees_sharepoint/sync_sharepoint.py index 8b8af37..e51f7ac 100644 --- a/ees_sharepoint/sync_sharepoint.py +++ b/ees_sharepoint/sync_sharepoint.py @@ -6,28 +6,18 @@ """sync_sharepoint module allows to sync data to Elastic Enterprise Search. It's possible to run full syncs and incremental syncs with this module.""" - -import copy -import json import os import re -from datetime import datetime +import threading from urllib.parse import urljoin + from dateutil.parser import parse -from multiprocessing.pool import ThreadPool from tika.tika import TikaException from . import adapter -from .base_command import BaseCommand from .checkpointing import Checkpoint from .usergroup_permissions import Permissions -from .utils import ( - encode, - extract, - split_list_into_buckets, - split_date_range_into_chunks, - split_documents_into_equal_chunks, -) +from .utils import encode, extract, split_documents_into_equal_chunks, split_list_into_buckets IDS_PATH = os.path.join(os.path.dirname(__file__), "doc_id.json") @@ -48,19 +38,28 @@ def get_results(logger, response, entity_name): Parsed response """ if not response: - logger.error(f"Empty response when fetching {entity_name}") # TODO: should it be an error? 
+ logger.error(f"Empty response when fetching {entity_name}") return None if entity_name == "attachment" and not response.get("d", {}).get("results"): - logger.info("Failed to fetch attachment") # TODO: not sure if it's the right message + logger.info("Failed to fetch attachment") return None return response.get("d", {}).get("results") class SyncSharepoint: - """This class allows synching objects from the SharePoint Server.""" - - def __init__(self, config, logger, workplace_search_client, sharepoint_client, start_time, end_time, queue): + """This class allows syncing objects from the SharePoint Server.""" + + def __init__( + self, + config, + logger, + workplace_search_client, + sharepoint_client, + start_time, + end_time, + queue, + ): self.config = config self.logger = logger self.workplace_search_client = workplace_search_client @@ -74,11 +73,11 @@ def __init__(self, config, logger, workplace_search_client, sharepoint_client, s self.end_time = end_time self.sharepoint_thread_count = config.get_value("sharepoint_sync_thread_count") self.mapping_sheet_path = config.get_value("sharepoint_workplace_user_mapping") - + self.sharepoint_host = config.get_value("sharepoint.host_url") self.checkpoint = Checkpoint(config, logger) - self.permissions = Permissions(self.sharepoint_client, self.workplace_search_client, logger) - - self.thread_pool = ThreadPool(self.sharepoint_thread_count) + self.permissions = Permissions( + self.sharepoint_client, self.workplace_search_client, logger + ) self.queue = queue def get_schema_fields(self, document_name): @@ -94,9 +93,17 @@ def get_schema_fields(self, document_name): include_fields = fields.get("include_fields") exclude_fields = fields.get("exclude_fields") if include_fields: - adapter_schema = {key: val for key, val in adapter_schema.items() if val in include_fields} + adapter_schema = { + key: val + for key, val in adapter_schema.items() + if val in include_fields + } elif exclude_fields: - adapter_schema = {key: val for key, val in adapter_schema.items() if val not in exclude_fields} + adapter_schema = { + key: val + for key, val in adapter_schema.items() + if val not in exclude_fields + } adapter_schema["id"] = field_id return adapter_schema @@ -124,16 +131,19 @@ def fetch_sites(self, parent_site_url, sites, ids, index, start_time, end_time): "No sites were created in %s for this interval: start time: %s and end time: %s" % (parent_site_url, start_time, end_time) ) - return sites - self.logger.info("Successfully fetched and parsed %s sites response from SharePoint" % len(response_data)) - + return sites, {} + self.logger.info( + "Successfully fetched and parsed %s sites response from SharePoint" + % len(response_data) + ) schema = self.get_schema_fields(SITES) document = [] if index: for i, _ in enumerate(response_data): doc = {"type": SITE} - # need to convert date to iso else workplace search throws error on date format Invalid field value: Value '2021-09-29T08:13:00' cannot be parsed as a date (RFC 3339)"]} + # need to convert date to iso else workplace search throws error on date format Invalid field + # value: Value '2021-09-29T08:13:00' cannot be parsed as a date (RFC 3339)"]} response_data[i]["Created"] += "Z" for field, response_field in schema.items(): doc[field] = response_data[i].get(response_field) @@ -169,16 +179,20 @@ def fetch_lists(self, sites, ids, index): "No list was created in this interval: start time: %s and end time: %s" % (self.start_time, self.end_time) ) - return [], [] + return [], [], {} schema_list = 
self.get_schema_fields(LISTS) for site_details in sites: for site, time_modified in site_details.items(): if parse(self.start_time) > parse(time_modified): continue rel_url = f"{site}/_api/web/lists" - self.logger.info("Fetching the lists for site: %s from url: %s" % (site, rel_url)) + self.logger.info( + "Fetching the lists for site: %s from url: %s" % (site, rel_url) + ) - query = self.sharepoint_client.get_query(self.start_time, self.end_time, LISTS) + query = self.sharepoint_client.get_query( + self.start_time, self.end_time, LISTS + ) response = self.sharepoint_client.get(rel_url, query, LISTS) response_data = get_results(self.logger, response, LISTS) @@ -193,7 +207,7 @@ def fetch_lists(self, sites, ids, index): % (len(response_data), site) ) - base_list_url = f"{site}/Lists/" + base_list_url = urljoin(self.sharepoint_host, f"{site}/Lists/") if index: if not ids["lists"].get(site): @@ -210,9 +224,14 @@ def fetch_lists(self, sites, ids, index): list_url=response_data[i]["ParentWebUrl"], itemid=None, ) - doc["url"] = urljoin(base_list_url, re.sub(r"[^ \w+]", "", response_data[i]["Title"])) + doc["url"] = urljoin( + base_list_url, + re.sub(r"[^ \w+]", "", response_data[i]["Title"]), + ) document.append(doc) - ids["lists"][site].update({doc["id"]: response_data[i]["Title"]}) + ids["lists"][site].update( + {doc["id"]: response_data[i]["Title"]} + ) responses.append(response_data) lists = {} @@ -260,9 +279,13 @@ def fetch_items(self, lists, ids): if parse(self.start_time) > parse(value[2]): continue rel_url = f"{value[0]}/_api/web/lists(guid'{list_content}')/items" - self.logger.info("Fetching the items for list: %s from url: %s" % (value[1], rel_url)) + self.logger.info( + "Fetching the items for list: %s from url: %s" % (value[1], rel_url) + ) - query = self.sharepoint_client.get_query(self.start_time, self.end_time, LIST_ITEMS) + query = self.sharepoint_client.get_query( + self.start_time, self.end_time, LIST_ITEMS + ) response = self.sharepoint_client.get(rel_url, query, LIST_ITEMS) response_data = get_results(self.logger, response, LIST_ITEMS) @@ -278,25 +301,36 @@ def fetch_items(self, lists, ids): ) list_name = re.sub(r"[^ \w+]", "", value[1]) - base_item_url = f"{value[0]}/Lists/{list_name}/DispForm.aspx?ID=" + base_item_url = urljoin( + self.sharepoint_host, + f"{value[0]}/Lists/{list_name}/DispForm.aspx?ID=", + ) document = [] if not ids["list_items"][value[0]].get(list_content): ids["list_items"][value[0]].update({list_content: []}) rel_url = f"{value[0]}/_api/web/lists(guid'{list_content}')/items?$select=Attachments,AttachmentFiles,Title&$expand=AttachmentFiles" new_query = "&" + query.split("?")[1] - file_response_data = self.sharepoint_client.get(rel_url, query=new_query, param_name="attachment") + file_response_data = self.sharepoint_client.get( + rel_url, query=new_query, param_name="attachment" + ) if file_response_data: - file_response_data = get_results(self.logger, file_response_data.json(), "attachment") + file_response_data = get_results( + self.logger, file_response_data.json(), "attachment" + ) for i, _ in enumerate(response_data): doc = {"type": ITEM} if response_data[i].get("Attachments") and file_response_data: for data in file_response_data: if response_data[i].get("Title") == data["Title"]: - file_relative_url = data["AttachmentFiles"]["results"][0]["ServerRelativeUrl"] + file_relative_url = data["AttachmentFiles"]["results"][ + 0 + ]["ServerRelativeUrl"] url_s = f"{value[0]}/_api/web/GetFileByServerRelativeUrl('{encode(file_relative_url)}')/$value" - response = 
self.sharepoint_client.get(url_s, query="", param_name="attachment") + response = self.sharepoint_client.get( + url_s, query="", param_name="attachment" + ) doc["body"] = {} if response and response.ok: try: @@ -312,19 +346,27 @@ def fetch_items(self, lists, ids): doc[field] = response_data[i].get(response_field) if self.enable_permission is True: doc["_allow_permissions"] = self.fetch_permissions( - key=LIST_ITEMS, list_id=list_content, list_url=value[0], itemid=str(response_data[i]["Id"]) + key=LIST_ITEMS, + list_id=list_content, + list_url=value[0], + itemid=str(response_data[i]["Id"]), ) doc["url"] = base_item_url + str(response_data[i]["Id"]) document.append(doc) - if response_data[i].get("GUID") not in ids["list_items"][value[0]][list_content]: - ids["list_items"][value[0]][list_content].append(response_data[i].get("GUID")) + if ( + response_data[i].get("GUID") + not in ids["list_items"][value[0]][list_content] + ): + ids["list_items"][value[0]][list_content].append( + response_data[i].get("GUID") + ) responses.extend(document) documents = {"type": LIST_ITEMS, "data": responses} return documents def fetch_drive_items(self, libraries, ids): """This method fetches items from all the lists in a collection and - invokes theindex permission method to get the document level permissions. + invokes the index permission method to get the document level permissions. If the fetching is not successful, it logs proper message. :param libraries: document lists :param ids: structure containing id's of all objects @@ -345,8 +387,13 @@ def fetch_drive_items(self, libraries, ids): if not ids["drive_items"].get(value[0]): ids["drive_items"].update({value[0]: {}}) rel_url = f"{value[0]}/_api/web/lists(guid'{lib_content}')/items?$select=Modified,Id,GUID,File,Folder&$expand=File,Folder" - self.logger.info("Fetching the items for libraries: %s from url: %s" % (value[1], rel_url)) - query = self.sharepoint_client.get_query(self.start_time, self.end_time, DRIVE_ITEMS) + self.logger.info( + "Fetching the items for libraries: %s from url: %s" + % (value[1], rel_url) + ) + query = self.sharepoint_client.get_query( + self.start_time, self.end_time, DRIVE_ITEMS + ) response = self.sharepoint_client.get(rel_url, query, DRIVE_ITEMS) response_data = get_results(self.logger, response, DRIVE_ITEMS) if not response_data: @@ -366,9 +413,13 @@ def fetch_drive_items(self, libraries, ids): if response_data[i]["File"].get("TimeLastModified"): obj_type = "File" doc = {"type": "file"} - file_relative_url = response_data[i]["File"]["ServerRelativeUrl"] + file_relative_url = response_data[i]["File"][ + "ServerRelativeUrl" + ] url_s = f"{value[0]}/_api/web/GetFileByServerRelativeUrl('{encode(file_relative_url)}')/$value" - response = self.sharepoint_client.get(url_s, query="", param_name="attachment") + response = self.sharepoint_client.get( + url_s, query="", param_name="attachment" + ) doc["body"] = {} if response and response.ok: try: @@ -391,7 +442,10 @@ def fetch_drive_items(self, libraries, ids): list_url=value[0], itemid=str(response_data[i].get("ID")), ) - doc["url"] = response_data[i][obj_type]["ServerRelativeUrl"] + doc["url"] = urljoin( + self.sharepoint_host, + response_data[i][obj_type]["ServerRelativeUrl"], + ) document.append(doc) if doc["id"] not in ids["drive_items"][value[0]][lib_content]: ids["drive_items"][value[0]][lib_content].append(doc["id"]) @@ -419,17 +473,19 @@ def get_roles(self, key, site, list_url, list_id, itemid): else: rel_url = list_url - roles = self.permissions.fetch_users(key, rel_url, 
list_id=list_id, item_id=itemid) + roles = self.permissions.fetch_users( + key, rel_url, list_id=list_id, item_id=itemid + ) return roles def fetch_permissions( - self, - key, - site=None, - list_id=None, - list_url=None, - itemid=None, + self, + key, + site=None, + list_id=None, + list_url=None, + itemid=None, ): """This method when invoked, checks the permission inheritance of each object. If the object has unique permissions, the list of users having access to it @@ -456,193 +512,103 @@ def fetch_permissions( groups.append(title) return groups - def fetch_and_append_sites_to_queue(self, ids, end_time, collection): + def fetch_and_append_sites_to_queue( + self, ids, collection, duration + ): """Fetches and appends site details to queue :param ids: id collection of the all the objects - :param end_time: end time for fetching the data :param collection: collection name + :param duration: List of time range consisting of the [start_time, end_time] """ - _, datelist = split_date_range_into_chunks(self.start_time, self.end_time, self.sharepoint_thread_count) - results = [] + start_time, end_time = duration[0], duration[1] parent_site_url = f"/sites/{collection}" - sites_path = [{parent_site_url: end_time}] - for num in range(0, self.sharepoint_thread_count): - start_time_partition = datelist[num] - end_time_partition = datelist[num + 1] - thread = self.thread_pool.apply_async( - self.fetch_sites, - (parent_site_url, {}, ids, (SITES in self.objects), start_time_partition, end_time_partition), - callback=self.queue.append_to_queue, + sites_path = [{parent_site_url: self.end_time}] + sites, documents = self.fetch_sites( + parent_site_url, + {}, + ids, + (SITES in self.objects), + start_time, + end_time, + ) + if documents: + self.queue.put(documents) + self.logger.debug( + f"Thread ID {threading.get_ident()} added list of {len(documents.get('data'))} sites into the queue" ) - results.append(thread) - - sites = [] - for result in [r.get() for r in results]: - if result: - sites.append(result[0]) - - sites_path.extend(sites) + sites_path.append(sites) return sites_path - def fetch_and_append_lists_to_queue(self, sites_path, ids): + def fetch_and_append_lists_to_queue(self, ids, sites_path): """Fetches and appends list details to queue - :param sites_path: dictionary of site path and it's last updated time :param ids: id collection of the all the objects + :param sites_path: dictionary of site path and it's last updated time """ - results, lists_details, libraries_details = [], {}, {} - partitioned_sites = split_list_into_buckets(sites_path, self.sharepoint_thread_count) - for site in partitioned_sites: - thread = self.thread_pool.apply_async( - self.fetch_lists, (site, ids, (LISTS in self.objects)), callback=self.queue.append_to_queue + lists_details, libraries_details, documents = self.fetch_lists( + sites_path, ids, (LISTS in self.objects) + ) + if documents: + self.queue.put(documents) + self.logger.debug( + f"Thread ID {threading.get_ident()} added list of {len(documents.get('data'))} lists into the queue" ) - results.append(thread) - for result in [r.get() for r in results]: - if result: - lists_details.update(result[0]) - libraries_details.update(result[1]) return [lists_details, libraries_details] - def fetch_and_append_list_items_to_queue(self, lists_details, ids): + def fetch_and_append_list_items_to_queue(self, ids, lists_details): """Fetches and appends list_items to the queue - :param lists_details: dictionary containing list name, list path and id :param ids: id collection of the all 
the objects + :param lists_details: dictionary containing list name, list path and id """ - partition = [] - partition = split_documents_into_equal_chunks(lists_details, self.sharepoint_thread_count) - for list_data in partition: - self.thread_pool.apply_async(self.fetch_items, (list_data, ids), callback=self.queue.append_to_queue) + documents = self.fetch_items(lists_details, ids) + if documents: + self.queue.put(documents) + self.logger.debug( + f"Thread ID {threading.get_ident()} added list of {len(documents.get('data'))} list items into the queue" + ) - def fetch_and_append_drive_items_to_queue(self, libraries_details, ids): + def fetch_and_append_drive_items_to_queue(self, ids, libraries_details): """Fetches and appends the drive items to the queue - :param libraries_details: dictionary containing library name, library path and id :param ids: id collection of the all the objects + :param libraries_details: dictionary containing library name, library path and id """ - partition = [] - partition = split_documents_into_equal_chunks(libraries_details, self.sharepoint_thread_count) - for list_data in partition: - self.thread_pool.apply_async(self.fetch_drive_items, (list_data, ids), callback=self.queue.append_to_queue) - - def perform_sync(self, collection, ids, storage, job_type, collected_objects, end_time): - """This method fetches all the objects from sharepoint server - :param collection: collection name - :param ids: id collection of the all the objects - :param storage: temporary storage for storing all the documents - :param job_type: denotes the type of sharepoint object being fetched in a particular process - :param collected_objects: helper variable to provide the data to children object - :param end_time: end time for fetching the data - """ - if job_type == "sites": - collected_objects = self.fetch_and_append_sites_to_queue(ids, end_time, collection) - - elif job_type == "lists": - collected_objects = self.fetch_and_append_lists_to_queue(collected_objects, ids) - - elif job_type == LIST_ITEMS and LIST_ITEMS in self.objects: - self.fetch_and_append_list_items_to_queue(collected_objects[0], ids) - - elif job_type == DRIVE_ITEMS and DRIVE_ITEMS in self.objects: - self.fetch_and_append_drive_items_to_queue(collected_objects[1], ids) - - self.logger.info("Completed fetching all the objects for site collection: %s" % (collection)) - - self.logger.info("Saving the checkpoint for the site collection: %s" % (collection)) - if ids.get(job_type): - prev_ids = storage[job_type] - if job_type == "sites": - prev_ids.update(ids[job_type]) - elif job_type == "lists": - for site, list_content in ids[job_type].items(): - prev_ids[site] = {**prev_ids.get(site, {}), **ids[job_type][site]} - else: - for site, list_content in ids[job_type].items(): - prev_ids[site] = ids[job_type][site] if not prev_ids.get(site) else prev_ids[site] - for list_name in list_content.keys(): - prev_ids[site][list_name] = list( - set([*prev_ids[site].get(list_name, []), *ids[job_type][site][list_name]]) - ) - storage[job_type] = prev_ids - return collected_objects - - -def init_sharepoint_sync(indexing_type, config, logger, queue, args): - """Initialize the process for synching - :param indexing_type: The type of the indexing i.e. 
incremental or full - :param config: instance of Configuration class - :param logger: instance of Logger class - :param queue: Shared queue to push the objects fetched from SharePoint - :param args: The command line arguments passed from the base command - """ - logger.info(f"Starting the {indexing_type} indexing..") - current_time = (datetime.utcnow()).strftime("%Y-%m-%dT%H:%M:%SZ") - ids_collection = {"global_keys": {}} - storage_with_collection = {"global_keys": {}, "delete_keys": {}} - # Added this workaround of initializing the base_command since workplace_search_client and sharepoint_client cannot be passed in the Process argument as doing so would throw pickling errors on Windows - base_command = BaseCommand(args) - workplace_search_client = base_command.workplace_search_client - sharepoint_client = base_command.sharepoint_client - - if os.path.exists(IDS_PATH) and os.path.getsize(IDS_PATH) > 0: - with open(IDS_PATH) as ids_store: - try: - ids_collection = json.load(ids_store) - except ValueError as exception: - logger.exception( - "Error while parsing the json file of the ids store from path: %s. Error: %s" - % (IDS_PATH, exception) - ) - - storage_with_collection["delete_keys"] = copy.deepcopy(ids_collection.get("global_keys")) - check = Checkpoint(config, logger) - - try: - for collection in config.get_value("sharepoint.site_collections"): - storage = {"sites": {}, "lists": {}, "list_items": {}, "drive_items": {}} - logger.info("Starting the data fetching for site collection: %s" % (collection)) - - if indexing_type == "incremental": - start_time, end_time = check.get_checkpoint(collection, current_time) - else: - start_time = config.get_value("start_time") - end_time = current_time - - if not ids_collection["global_keys"].get(collection): - ids_collection["global_keys"][collection] = { - "sites": {}, - "lists": {}, - "list_items": {}, - "drive_items": {}, - } - - logger.info( - "Starting to index all the objects configured in the object field: %s" - % (str(config.get_value("objects"))) + documents = self.fetch_drive_items(libraries_details, ids) + if documents: + self.queue.put(documents) + self.logger.debug( + f"Thread ID {threading.get_ident()} added list of {len(documents.get('data'))} drive items into the queue" ) - sync_sharepoint = SyncSharepoint( - config, logger, workplace_search_client, sharepoint_client, start_time, end_time, queue - ) - returned_documents = None - for job_type in ["sites", "lists", "list_items", "drive_items"]: - logger.info(f"Indexing {job_type}") - returned_documents = sync_sharepoint.perform_sync( - collection, - ids_collection["global_keys"][collection], - storage, - job_type, - returned_documents, - end_time, - ) - sync_sharepoint.thread_pool.close() - sync_sharepoint.thread_pool.join() - queue.put_checkpoint(collection, end_time, indexing_type) - - storage_with_collection["global_keys"][collection] = storage.copy() - queue.end_signal() - except Exception as exception: - raise exception - - with open(IDS_PATH, "w") as file: - try: - json.dump(storage_with_collection, file, indent=4) - except ValueError as exception: - logger.warning("Error while adding ids to json file. Error: %s" % (exception)) + def fetch_records_from_sharepoint(self, producer, date_ranges, thread_count, ids, collection): + """Fetches Sites, Lists, List Items and Drive Items from sharepoint. 
+ :param producer: Producer function + :param date_ranges: Partition of time range + :param thread_count: Thread count + :param ids: Content of the local storage + :param collection: SharePoint server Collection name + """ + # Fetch sites + time_range_list = [(date_ranges[num], date_ranges[num + 1]) for num in range(0, thread_count)] + sites = producer(thread_count, self.fetch_and_append_sites_to_queue, + [ids, collection], time_range_list, wait=True) + all_sites = [] + for site in sites: + all_sites.extend(site) + + # Fetch lists + partitioned_sites = split_list_into_buckets(all_sites, thread_count) + + lists = producer(thread_count, self.fetch_and_append_lists_to_queue, [ids], partitioned_sites, wait=True) + + # Fetch list items + lists_details, libraries_details = {}, {} + for result in lists: + lists_details.update(result[0]) + libraries_details.update(result[1]) + + list_items = split_documents_into_equal_chunks(lists_details, thread_count) + producer(thread_count, self.fetch_and_append_list_items_to_queue, [ids], list_items, wait=True) + + # Fetch library details + libraries_items = split_documents_into_equal_chunks(libraries_details, thread_count) + producer(thread_count, self.fetch_and_append_drive_items_to_queue, [ids], libraries_items, wait=True) + return ids diff --git a/ees_sharepoint/utils.py b/ees_sharepoint/utils.py index 8f77b87..6e5ca65 100644 --- a/ees_sharepoint/utils.py +++ b/ees_sharepoint/utils.py @@ -6,6 +6,7 @@ """This module contains uncategorized utility methods.""" import urllib.parse +import copy from datetime import datetime from tika import parser @@ -81,3 +82,23 @@ def split_date_range_into_chunks(start_time, end_time, number_of_threads): formatted_end_time = end_time.strftime(DATETIME_FORMAT) datelist.append(formatted_end_time) return formatted_end_time, datelist + + +def get_storage_with_collection(local_storage, collection): + """Returns a dictionary containing the locally stored IDs of files fetched from network drives + :param local_storage: The object of the local storage used to store the indexed document IDs + :param collection: The SharePoint server collection which is currently being fetched + """ + storage_with_collection = {"global_keys": {}, "delete_keys": {}} + ids_collection = local_storage.load_storage() + storage_with_collection["delete_keys"] = copy.deepcopy(ids_collection.get("global_keys")) + if not ids_collection["global_keys"].get(collection): + ids_collection["global_keys"][collection] = { + "sites": {}, + "lists": {}, + "list_items": {}, + "drive_items": {}, + } + storage_with_collection["global_keys"][collection] = copy.deepcopy(ids_collection["global_keys"][collection]) + + return storage_with_collection From 05218492bd234c17423bdc0782e89570d28d7eaa Mon Sep 17 00:00:00 2001 From: praveen-elastic Date: Thu, 7 Apr 2022 19:37:26 +0530 Subject: [PATCH 7/9] add last updated field --- ees_sharepoint/adapter.py | 2 ++ ees_sharepoint/sync_enterprise_search.py | 10 ++++++---- ees_sharepoint/utils.py | 11 ++++++++--- 3 files changed, 16 insertions(+), 7 deletions(-) diff --git a/ees_sharepoint/adapter.py b/ees_sharepoint/adapter.py index b20f3b4..aafda0f 100644 --- a/ees_sharepoint/adapter.py +++ b/ees_sharepoint/adapter.py @@ -23,6 +23,7 @@ 'lists': { 'created_at': 'Created', 'id': 'Id', + 'last_updated': 'LastItemModifiedDate', 'relative_url': 'ParentWebUrl', 'title': 'Title' }, @@ -30,6 +31,7 @@ 'title': 'Title', 'id': 'GUID', 'created_at': 'Created', + 'last_updated': 'Modified', 'author_id': 'AuthorId' }, 'drive_items': { diff --git 
a/ees_sharepoint/sync_enterprise_search.py b/ees_sharepoint/sync_enterprise_search.py index cf199ba..124c40d 100644 --- a/ees_sharepoint/sync_enterprise_search.py +++ b/ees_sharepoint/sync_enterprise_search.py @@ -41,9 +41,9 @@ def index_documents(self, documents): "Error while indexing %s. Error: %s" % (response["id"], response["errors"]) ) - self.logger.info( - f"[{threading.get_ident()}] Successfully indexed {total_documents_indexed} documents to the workplace" - ) + self.logger.info( + f"[{threading.get_ident()}] Successfully indexed {total_documents_indexed} documents to the workplace" + ) except Exception as exception: self.logger.exception(f"Error while indexing the files. Error: {exception}") raise exception @@ -79,4 +79,6 @@ def perform_sync(self): ): self.index_documents(chunk) except Exception as exception: - self.logger.error(f"Error while indexing the documents to the Enterprise Search. Error {exception}") + self.logger.error( + f"Error while indexing the documents to the Enterprise Search. Error {exception}" + ) diff --git a/ees_sharepoint/utils.py b/ees_sharepoint/utils.py index 6e5ca65..7f55f14 100644 --- a/ees_sharepoint/utils.py +++ b/ees_sharepoint/utils.py @@ -5,9 +5,10 @@ # """This module contains uncategorized utility methods.""" -import urllib.parse import copy +import urllib.parse from datetime import datetime + from tika import parser DATETIME_FORMAT = "%Y-%m-%dT%H:%M:%SZ" @@ -91,7 +92,9 @@ def get_storage_with_collection(local_storage, collection): """ storage_with_collection = {"global_keys": {}, "delete_keys": {}} ids_collection = local_storage.load_storage() - storage_with_collection["delete_keys"] = copy.deepcopy(ids_collection.get("global_keys")) + storage_with_collection["delete_keys"] = copy.deepcopy( + ids_collection.get("global_keys") + ) if not ids_collection["global_keys"].get(collection): ids_collection["global_keys"][collection] = { "sites": {}, @@ -99,6 +102,8 @@ def get_storage_with_collection(local_storage, collection): "list_items": {}, "drive_items": {}, } - storage_with_collection["global_keys"][collection] = copy.deepcopy(ids_collection["global_keys"][collection]) + storage_with_collection["global_keys"][collection] = copy.deepcopy( + ids_collection["global_keys"][collection] + ) return storage_with_collection From 99849981338e267221aaa84dd5870da3959d8c00 Mon Sep 17 00:00:00 2001 From: praveen-elastic Date: Mon, 11 Apr 2022 16:45:36 +0530 Subject: [PATCH 8/9] readme.md in place of readme.rst --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b750303..26d216a 100644 --- a/setup.py +++ b/setup.py @@ -26,7 +26,7 @@ description = "" -with open("README.rst") as f: +with open("README.md") as f: description += f.read() + "\n\n" From 7053f7ce4e4dabcb899d36996891886d5589a773 Mon Sep 17 00:00:00 2001 From: praveen-elastic Date: Tue, 12 Apr 2022 18:20:09 +0530 Subject: [PATCH 9/9] Addressing review comments --- ees_sharepoint/full_sync_command.py | 6 ++--- ees_sharepoint/incremental_sync_command.py | 6 ++--- ees_sharepoint/local_storage.py | 28 ++++++++++++++++++++++ ees_sharepoint/utils.py | 27 +-------------------- 4 files changed, 35 insertions(+), 32 deletions(-) diff --git a/ees_sharepoint/full_sync_command.py b/ees_sharepoint/full_sync_command.py index 0fd3643..766a573 100644 --- a/ees_sharepoint/full_sync_command.py +++ b/ees_sharepoint/full_sync_command.py @@ -13,7 +13,7 @@ from .connector_queue import ConnectorQueue from .sync_enterprise_search import SyncEnterpriseSearch from .sync_sharepoint 
import SyncSharepoint -from .utils import get_storage_with_collection, split_date_range_into_chunks +from .utils import split_date_range_into_chunks class FullSyncCommand(BaseCommand): @@ -40,13 +40,13 @@ def start_producer(self, queue): end_time, queue, ) - _, datelist = split_date_range_into_chunks( + datelist = split_date_range_into_chunks( start_time, end_time, thread_count, ) for collection in self.config.get_value("sharepoint.site_collections"): - storage_with_collection = get_storage_with_collection(self.local_storage, collection) + storage_with_collection = self.local_storage.get_storage_with_collection(collection) self.logger.info( "Starting to index all the objects configured in the object field: %s" % (str(self.config.get_value("objects"))) diff --git a/ees_sharepoint/incremental_sync_command.py b/ees_sharepoint/incremental_sync_command.py index 67f7e82..adffb86 100644 --- a/ees_sharepoint/incremental_sync_command.py +++ b/ees_sharepoint/incremental_sync_command.py @@ -17,7 +17,7 @@ from .connector_queue import ConnectorQueue from .sync_enterprise_search import SyncEnterpriseSearch from .sync_sharepoint import SyncSharepoint -from .utils import get_storage_with_collection, split_date_range_into_chunks +from .utils import split_date_range_into_chunks class IncrementalSyncCommand(BaseCommand): @@ -46,12 +46,12 @@ def start_producer(self, queue): end_time, queue, ) - _, datelist = split_date_range_into_chunks( + datelist = split_date_range_into_chunks( start_time, end_time, thread_count, ) - storage_with_collection = get_storage_with_collection(self.local_storage, collection) + storage_with_collection = self.local_storage.get_storage_with_collection(collection) self.logger.info( "Starting to index all the objects configured in the object field: %s" % (str(self.config.get_value("objects"))) diff --git a/ees_sharepoint/local_storage.py b/ees_sharepoint/local_storage.py index 71166e4..af28f93 100644 --- a/ees_sharepoint/local_storage.py +++ b/ees_sharepoint/local_storage.py @@ -1,3 +1,9 @@ +# +# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +# or more contributor license agreements. Licensed under the Elastic License 2.0; +# you may not use this file except in compliance with the Elastic License 2.0. +# +import copy import json import os @@ -37,3 +43,25 @@ def update_storage(self, ids): self.logger.exception( f"Error while updating the doc_id json file. 
Error: {exception}" ) + + def get_storage_with_collection(self, collection): + """Returns a dictionary containing the locally stored IDs of files fetched from SharePoint + :param collection: The SharePoint server collection which is currently being fetched + """ + storage_with_collection = {"global_keys": {}, "delete_keys": {}} + ids_collection = self.load_storage() + storage_with_collection["delete_keys"] = copy.deepcopy( + ids_collection.get("global_keys") + ) + if not ids_collection["global_keys"].get(collection): + ids_collection["global_keys"][collection] = { + "sites": {}, + "lists": {}, + "list_items": {}, + "drive_items": {}, + } + storage_with_collection["global_keys"][collection] = copy.deepcopy( + ids_collection["global_keys"][collection] + ) + + return storage_with_collection diff --git a/ees_sharepoint/utils.py b/ees_sharepoint/utils.py index 7f55f14..314ee01 100644 --- a/ees_sharepoint/utils.py +++ b/ees_sharepoint/utils.py @@ -5,7 +5,6 @@ # """This module contains uncategorized utility methods.""" -import copy import urllib.parse from datetime import datetime @@ -82,28 +81,4 @@ def split_date_range_into_chunks(start_time, end_time, number_of_threads): datelist.append(date_time.strftime(DATETIME_FORMAT)) formatted_end_time = end_time.strftime(DATETIME_FORMAT) datelist.append(formatted_end_time) - return formatted_end_time, datelist - - -def get_storage_with_collection(local_storage, collection): - """Returns a dictionary containing the locally stored IDs of files fetched from network drives - :param local_storage: The object of the local storage used to store the indexed document IDs - :param collection: The SharePoint server collection which is currently being fetched - """ - storage_with_collection = {"global_keys": {}, "delete_keys": {}} - ids_collection = local_storage.load_storage() - storage_with_collection["delete_keys"] = copy.deepcopy( - ids_collection.get("global_keys") - ) - if not ids_collection["global_keys"].get(collection): - ids_collection["global_keys"][collection] = { - "sites": {}, - "lists": {}, - "list_items": {}, - "drive_items": {}, - } - storage_with_collection["global_keys"][collection] = copy.deepcopy( - ids_collection["global_keys"][collection] - ) - - return storage_with_collection + return datelist