From a65319e8df833e6c11cfa238f3cc8816e450a4f7 Mon Sep 17 00:00:00 2001 From: praveen-elastic Date: Tue, 1 Mar 2022 20:17:49 +0530 Subject: [PATCH 1/9] added concurrency and other changes from stacked PRs --- ees_sharepoint/deletion_sync_command.py | 31 ++- ees_sharepoint/fetch_index.py | 313 ++++++++++++++---------- ees_sharepoint/schema.py | 8 +- ees_sharepoint/utils.py | 76 +++++- sharepoint_server_2016_connector.yml | 2 + 5 files changed, 287 insertions(+), 143 deletions(-) diff --git a/ees_sharepoint/deletion_sync_command.py b/ees_sharepoint/deletion_sync_command.py index 6bebb9d..1b2abb6 100644 --- a/ees_sharepoint/deletion_sync_command.py +++ b/ees_sharepoint/deletion_sync_command.py @@ -13,9 +13,11 @@ import requests from .base_command import BaseCommand - +from .utils import split_list_in_chunks IDS_PATH = os.path.join(os.path.dirname(__file__), 'doc_id.json') +# By default, Enterprise Search configuration has a maximum allowed limit set to 100 documents for an api request +BATCH_SIZE = 100 class DeletionSyncCommand(BaseCommand): @@ -38,7 +40,7 @@ def deindexing_items(self, collection, ids, key): logger = self.logger delete_ids_items = ids["delete_keys"][collection].get(key) - logger.info("Deindexing items...") + logger.info(f"Deindexing {key}...") if delete_ids_items: delete_site = [] global_ids_items = ids["global_keys"][collection][key] @@ -56,9 +58,10 @@ def deindexing_items(self, collection, ids, key): if resp.status_code == requests.codes['not_found'] or result == []: doc.append(item_id) if doc: - self.workplace_search_client.delete_documents( - content_source_id=self.ws_source, - document_ids=doc) + for chunk in split_list_in_chunks(doc, BATCH_SIZE): + self.workplace_search_client.delete_documents( + content_source_id=self.ws_source, + document_ids=chunk) updated_items = global_ids_items[site_url].get(list_id) if updated_items is None: continue @@ -96,11 +99,12 @@ def deindexing_lists(self, collection, ids): for list_id in list_details.keys(): url = f"{site_url}/_api/web/lists(guid\'{list_id}\')" resp = self.sharepoint_client.get(url, '', "deindex") - if resp and resp.status_code == requests.codes['not_found']: + if resp is not None and resp.status_code == requests.codes['not_found']: doc.append(list_id) - self.workplace_search_client.delete_documents( - content_source_id=self.ws_source, - document_ids=doc) + for chunk in split_list_in_chunks(doc, BATCH_SIZE): + self.workplace_search_client.delete_documents( + content_source_id=self.ws_source, + document_ids=chunk) for list_id in doc: if list_id in global_ids_lists[site_url]: global_ids_lists[site_url].pop(list_id) @@ -126,11 +130,12 @@ def deindexing_sites(self, collection, ids): for site_id, site_url in site_details.items(): url = f"{site_url}/_api/web" resp = self.sharepoint_client.get(url, '', "deindex") - if resp and resp.status_code == requests.codes['not_found']: + if resp is not None and resp.status_code == requests.codes['not_found']: doc.append(site_id) - self.workplace_search_client.delete_documents( - content_source_id=self.ws_source, - document_ids=doc) + for chunk in split_list_in_chunks(doc, BATCH_SIZE): + self.workplace_search_client.delete_documents( + content_source_id=self.ws_source, + document_ids=chunk) for site_id in doc: ids["global_keys"][collection]["sites"].pop(site_id) else: diff --git a/ees_sharepoint/fetch_index.py b/ees_sharepoint/fetch_index.py index 44ee221..36b32bb 100644 --- a/ees_sharepoint/fetch_index.py +++ b/ees_sharepoint/fetch_index.py @@ -16,10 +16,11 @@ from dateutil.parser import 
parse from tika.tika import TikaException +from multiprocessing.pool import ThreadPool from .checkpointing import Checkpoint from .usergroup_permissions import Permissions -from .utils import encode, extract +from .utils import encode, extract, partition_equal_share, split_list_in_chunks, get_partition_time, split_dict_in_chunks from . import adapter IDS_PATH = os.path.join(os.path.dirname(__file__), 'doc_id.json') @@ -31,8 +32,7 @@ LISTS = "lists" LIST_ITEMS = "list_items" DRIVE_ITEMS = "drive_items" -DOCUMENT_SIZE = 100 -DATETIME_FORMAT = "%Y-%m-%dT%H:%M:%SZ" +BATCH_SIZE = 100 def get_results(logger, response, entity_name): @@ -66,21 +66,20 @@ def __init__(self, config, logger, workplace_search_client, sharepoint_client, s self.enable_permission = config.get_value("enable_document_permission") self.start_time = start_time self.end_time = end_time + self.max_threads = config.get_value("max_threads") self.mapping_sheet_path = config.get_value("sharepoint_workplace_user_mapping") self.checkpoint = Checkpoint(config, logger) self.permissions = Permissions(self.sharepoint_client, self.workplace_search_client, logger) - def index_document(self, document, parent_object, param_name): + def index_document(self, document, param_name): """ This method indexes the documents to the workplace. :param document: document to be indexed - :param parent_object: parent of the objects to be indexed :param param_name: parameter name whether it is SITES, LISTS LIST_ITEMS OR DRIVE_ITEMS """ if document: total_documents_indexed = 0 - document_list = [document[i * DOCUMENT_SIZE:(i + 1) * DOCUMENT_SIZE] for i in range((len(document) + DOCUMENT_SIZE - 1) // DOCUMENT_SIZE)] - for chunk in document_list: + for chunk in split_list_in_chunks(document, BATCH_SIZE): response = self.workplace_search_client.index_documents( content_source_id=self.ws_source, documents=chunk @@ -90,8 +89,21 @@ def index_document(self, document, parent_object, param_name): total_documents_indexed += 1 else: self.logger.error("Error while indexing %s. Error: %s" % (each['id'], each['errors'])) - self.logger.info("Successfully indexed %s %s for %s to the workplace" % ( - total_documents_indexed, param_name, parent_object)) + self.logger.info("Successfully indexed %s %s to the workplace" % ( + total_documents_indexed, param_name)) + + def threaded_index_documents(self, document, param_name): + """ Applies multithreading on indexing functionality + :param document: documents to be indexed equally in each thread + :param param_name: parameter name whether it is SITES, LISTS LIST_ITEMS OR DRIVE_ITEMS + """ + chunk_documents = partition_equal_share(document, self.max_threads) + thread_pool = ThreadPool(self.max_threads) + for doc in chunk_documents: + thread_pool.apply_async(self.index_document, (doc, param_name)) + + thread_pool.close() + thread_pool.join() def get_schema_fields(self, document_name): """ returns the schema of all the include_fields or exclude_fields specified in the configuration file. @@ -112,7 +124,7 @@ def get_schema_fields(self, document_name): adapter_schema['id'] = field_id return adapter_schema - def index_sites(self, parent_site_url, sites, ids, index): + def fetch_sites(self, parent_site_url, sites, ids, index, start_time, end_time): """This method fetches sites from a collection and invokes the index permission method to get the document level permissions. If the fetching is not successful, it logs proper message. 
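For context on the batching and threading introduced above, here is a minimal standalone sketch of the fan-out used by threaded_index_documents, with a hypothetical index_batch callable standing in for workplace_search_client.index_documents: documents are split round-robin across the thread pool, and each worker then submits its share in batches of at most BATCH_SIZE (100, the Enterprise Search per-request limit noted in the patch).

from multiprocessing.pool import ThreadPool

BATCH_SIZE = 100   # Enterprise Search caps a single index/delete request at 100 documents
MAX_THREADS = 4

def split_list_in_chunks(input_list, chunk_size):
    # Same shape as the helper this patch adds to utils.py: fixed-size slices of at most chunk_size.
    return [input_list[i:i + chunk_size] for i in range(0, len(input_list), chunk_size)]

def index_batch(batch):
    # Hypothetical stand-in for workplace_search_client.index_documents(...).
    print("indexed %s documents" % len(batch))

def index_partition(docs):
    # Each worker thread sends its share of the documents in batches of BATCH_SIZE.
    for batch in split_list_in_chunks(docs, BATCH_SIZE):
        index_batch(batch)

def threaded_index(documents):
    # Round-robin split across threads, mirroring partition_equal_share(document, max_threads).
    partitions = [documents[i::MAX_THREADS] for i in range(MAX_THREADS)]
    pool = ThreadPool(MAX_THREADS)
    for partition in partitions:
        if partition:
            pool.apply_async(index_partition, (partition,))
    pool.close()
    pool.join()

if __name__ == "__main__":
    threaded_index([{"id": n} for n in range(250)])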
@@ -120,23 +132,24 @@ def index_sites(self, parent_site_url, sites, ids, index): :param sites: dictionary of site path and it's last updated time :param ids: structure containing id's of all objects :param index: index, boolean value + :param start_time: start time for fetching the data + :param end_time: end time for fetching the data Returns: document: response of sharepoint GET call, with fields specified in the schema """ rel_url = f"{parent_site_url}/_api/web/webs" self.logger.info("Fetching the sites detail from url: %s" % (rel_url)) query = self.sharepoint_client.get_query( - self.start_time, self.end_time, SITES) + start_time, end_time, SITES) response = self.sharepoint_client.get(rel_url, query, SITES) response_data = get_results(self.logger, response, SITES) if not response_data: - self.logger.info("No sites were created in %s for this interval: start time: %s and end time: %s" % (parent_site_url, self.start_time, self.end_time)) + self.logger.info("No sites were created in %s for this interval: start time: %s and end time: %s" % (parent_site_url, start_time, end_time)) return sites self.logger.info( "Successfully fetched and parsed %s sites response from SharePoint" % len(response_data) ) - self.logger.info("Indexing the sites to the Workplace") schema = self.get_schema_fields(SITES) document = [] @@ -153,18 +166,18 @@ def index_sites(self, parent_site_url, sites, ids, index): key=SITES, site=response_data[i]['ServerRelativeUrl']) document.append(doc) ids["sites"].update({doc["id"]: response_data[i]["ServerRelativeUrl"]}) - self.index_document(document, parent_site_url, SITES) for result in response_data: site_server_url = result.get("ServerRelativeUrl") sites.update({site_server_url: result.get("LastItemModifiedDate")}) - self.index_sites(site_server_url, sites, ids, index) - return sites + self.fetch_sites(site_server_url, sites, ids, index, start_time, end_time) + return sites, document - def index_lists(self, sites, ids, index): + def fetch_lists(self, sites, ids, index): """This method fetches lists from all sites in a collection and invokes the index permission method to get the document level permissions. If the fetching is not successful, it logs proper message. 
:param sites: dictionary of site path and it's last updated time + :param ids: structure containing id's of all objects :param index: index, boolean value Returns: document: response of sharepoint GET call, with fields specified in the schema @@ -176,70 +189,67 @@ def index_lists(self, sites, ids, index): self.logger.info("No list was created in this interval: start time: %s and end time: %s" % (self.start_time, self.end_time)) return [], [] schema_list = self.get_schema_fields(LISTS) - for site, time_modified in sites.items(): - if parse(self.start_time) > parse(time_modified): - continue - rel_url = f"{site}/_api/web/lists" - self.logger.info( - "Fetching the lists for site: %s from url: %s" - % (site, rel_url) - ) - - query = self.sharepoint_client.get_query( - self.start_time, self.end_time, LISTS) - response = self.sharepoint_client.get( - rel_url, query, LISTS) - - response_data = get_results(self.logger, response, LISTS) - if not response_data: - self.logger.info("No list was created for the site : %s in this interval: start time: %s and end time: %s" % (site, self.start_time, self.end_time)) - continue - self.logger.info( - "Successfully fetched and parsed %s list response for site: %s from SharePoint" - % (len(response_data), site) - ) + for site_details in sites: + for site, time_modified in site_details.items(): + if parse(self.start_time) > parse(time_modified): + continue + rel_url = f"{site}/_api/web/lists" + self.logger.info( + "Fetching the lists for site: %s from url: %s" + % (site, rel_url) + ) - base_list_url = f"{site}/Lists/" + query = self.sharepoint_client.get_query( + self.start_time, self.end_time, LISTS) + response = self.sharepoint_client.get( + rel_url, query, LISTS) - if index: - if not ids["lists"].get(site): - ids["lists"].update({site: {}}) - for i, _ in enumerate(response_data): - doc = {'type': LIST} - for field, response_field in schema_list.items(): - doc[field] = response_data[i].get( - response_field) - if self.enable_permission is True: - doc["_allow_permissions"] = self.index_permissions( - key=LISTS, site=site, list_id=doc["id"], list_url=response_data[i]['ParentWebUrl'], itemid=None) - doc["url"] = urljoin(base_list_url, re.sub( - r'[^ \w+]', '', response_data[i]["Title"])) - document.append(doc) - ids["lists"][site].update({doc["id"]: response_data[i]["Title"]}) + response_data = get_results(self.logger, response, LISTS) + if not response_data: + self.logger.info("No list was created for the site : %s in this interval: start time: %s and end time: %s" % (site, self.start_time, self.end_time)) + continue self.logger.info( - "Indexing the list for site: %s to the Workplace" % (site) + "Successfully fetched and parsed %s list response for site: %s from SharePoint" + % (len(response_data), site) ) - self.index_document(document, site, LISTS) - - responses.append(response_data) - lists = {} - libraries = {} - for response in responses: - for result in response: - if result.get('BaseType') == 1: - libraries[result.get("Id")] = [result.get( - "ParentWebUrl"), result.get("Title"), result.get("LastItemModifiedDate")] - else: - lists[result.get("Id")] = [result.get( - "ParentWebUrl"), result.get("Title"), result.get("LastItemModifiedDate")] - return lists, libraries - - def index_items(self, lists, ids): + base_list_url = f"{site}/Lists/" + + if index: + if not ids["lists"].get(site): + ids["lists"].update({site: {}}) + for i, _ in enumerate(response_data): + doc = {'type': LIST} + for field, response_field in schema_list.items(): + doc[field] = 
response_data[i].get( + response_field) + if self.enable_permission is True: + doc["_allow_permissions"] = self.index_permissions( + key=LISTS, site=site, list_id=doc["id"], list_url=response_data[i]['ParentWebUrl'], itemid=None) + doc["url"] = urljoin(base_list_url, re.sub( + r'[^ \w+]', '', response_data[i]["Title"])) + document.append(doc) + ids["lists"][site].update({doc["id"]: response_data[i]["Title"]}) + + responses.append(response_data) + lists = {} + libraries = {} + for response in responses: + for result in response: + if result.get('BaseType') == 1: + libraries[result.get("Id")] = [result.get( + "ParentWebUrl"), result.get("Title"), result.get("LastItemModifiedDate")] + else: + lists[result.get("Id")] = [result.get( + "ParentWebUrl"), result.get("Title"), result.get("LastItemModifiedDate")] + return lists, libraries, document + + def fetch_items(self, lists, ids): """This method fetches items from all the lists in a collection and invokes theindex permission method to get the document level permissions. If the fetching is not successful, it logs proper message. :param lists: document lists + :param ids: structure containing id's of all objects Returns: document: response of sharepoint GET call, with fields specified in the schema """ @@ -316,23 +326,17 @@ def index_items(self, lists, ids): if response_data[i].get("GUID") not in ids["list_items"][value[0]][list_content]: ids["list_items"][value[0]][list_content].append( response_data[i].get("GUID")) - self.logger.info( - "Indexing the listitem for list: %s to the Workplace" - % (value[1]) - ) - - self.index_document(document, value[1], LIST_ITEMS) - - responses.append(document) + responses.extend(document) return responses - def index_drive_items(self, libraries, ids): + def fetch_drive_items(self, libraries, ids): """This method fetches items from all the lists in a collection and invokes theindex permission method to get the document level permissions. If the fetching is not successful, it logs proper message. :param libraries: document lists :param ids: structure containing id's of all objects """ + responses = [] # here value is a list of url and title of the library self.logger.info("Fetching all the files for the library") if not libraries: @@ -390,12 +394,8 @@ def index_drive_items(self, libraries, ids): document.append(doc) if doc['id'] not in ids["drive_items"][value[0]][lib_content]: ids["drive_items"][value[0]][lib_content].append(doc['id']) - if document: - self.logger.info("Indexing the drive items for library: %s to the Workplace" % (value[1])) - self.index_document(document, value[1], DRIVE_ITEMS) - else: - self.logger.info("No item was present in the library %s for the interval: start time: %s and end time: %s" % ( - value[1], self.start_time, self.end_time)) + responses.extend(document) + return responses def get_roles(self, key, site, list_url, list_id, itemid): """ Checks the permissions and returns the user roles. 
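The index_* helpers added in the next hunk submit each partition to a ThreadPool with apply_async; a later commit in this series changes them to collect the AsyncResult handles first and resolve them afterwards so the workers actually run in parallel. A minimal sketch of that submit-then-collect pattern, assuming a hypothetical fetch_partition function in place of fetch_items / fetch_drive_items:

from multiprocessing.pool import ThreadPool

MAX_THREADS = 4

def fetch_partition(partition):
    # Hypothetical stand-in for fetch_items / fetch_drive_items: returns a list of documents.
    return [{"id": guid} for guid in partition]

def fetch_all(lists_details):
    # Split the dict of lists into chunks and submit every chunk before resolving any
    # result; calling get() inside the submit loop would serialize the work.
    items = list(lists_details.items())
    chunk_size = max(1, len(items) // MAX_THREADS)
    partitions = [dict(items[i:i + chunk_size]) for i in range(0, len(items), chunk_size)]

    pool = ThreadPool(MAX_THREADS)
    async_results = [pool.apply_async(fetch_partition, (partition,)) for partition in partitions]
    documents = []
    for result in async_results:
        fetched = result.get()  # re-raises any exception from the worker instead of losing it
        if fetched:
            documents.extend(fetched)
    pool.close()
    pool.join()
    return documents

if __name__ == "__main__":
    details = {"guid-%s" % n: ["/sites/demo", "List %s" % n] for n in range(10)}
    print(len(fetch_all(details)))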
@@ -458,40 +458,113 @@ def index_permissions( groups.append(title) return groups + def index_sites(self, parent_site_url, ids, sites_path): + """ Indexes the site details to the Workplace Search + :param parent_site_url: parent site relative path + :param ids: id collection of the all the objects + :param sites_path: dictionary of site path and it's last updated time + """ + _, datelist = get_partition_time(self.max_threads, self.start_time, self.end_time) + results = [] + thread_pool = ThreadPool(self.max_threads) + for num in range(0, self.max_threads): + start_time_partition = datelist[num] + end_time_partition = datelist[num + 1] + thread = thread_pool.apply_async( + self.fetch_sites, (parent_site_url, {}, ids, (SITES in self.objects), + start_time_partition, end_time_partition)) + results.append(thread.get()) + + sites, documents = [], [] + for result in results: + if result: + sites.append(result[0]) + documents.extend(result[1]) + thread_pool.close() + thread_pool.join() + self.threaded_index_documents(documents, SITES) + sites_path.extend(sites) + + def index_lists(self, sites_path, ids, lists_details, libraries_details): + """ Indexes the list details to the Workplace Search + :param sites_path: dictionary of site path and it's last updated time + :param ids: id collection of the all the objects + :param lists_details: dictionary containing list name, list path and id + :param libraries_details: dictionary containing library name, library path and id + """ + results = [] + thread_pool = ThreadPool(self.max_threads) + partitioned_sites = partition_equal_share(sites_path, self.max_threads) + for site in partitioned_sites: + thread = thread_pool.apply_async(self.fetch_lists, (site, ids, (LISTS in self.objects))) + results.append(thread.get()) + documents = [] + for result in results: + if result: + lists_details.update(result[0]) + libraries_details.update(result[1]) + documents.extend(result[2]) + thread_pool.close() + thread_pool.join() + self.threaded_index_documents(documents, LISTS) + + def index_items(self, job_type, lists_details, libraries_details, ids): + """ Indexes the list_items and drive_items to the Workplace Search + :param job_type: denotes the type of sharepoint object being fetched in a particular process + :param lists_details: dictionary containing list name, list path and id + :param libraries_details: dictionary containing library name, library path and id + :param ids: id collection of the all the objects + """ + results = [] + partition = [] + if job_type == "list_items" and LIST_ITEMS in self.objects: + thread_pool = ThreadPool(self.max_threads) + func = self.fetch_items + partition = split_dict_in_chunks(lists_details, self.max_threads) + elif job_type == "drive_items" and DRIVE_ITEMS in self.objects: + thread_pool = ThreadPool(self.max_threads) + func = self.fetch_drive_items + partition = split_dict_in_chunks(libraries_details, self.max_threads) + for list_data in partition: + thread = thread_pool.apply_async(func, (list_data, ids)) + results.append(thread.get()) + documents = [] + for result in results: + if result: + documents.extend(result) + thread_pool.close() + thread_pool.join() + self.threaded_index_documents(documents, job_type) + def indexing(self, collection, ids, storage, job_type, parent_site_url, sites_path, lists_details, libraries_details): """This method fetches all the objects from sharepoint server and ingests them into the workplace search :param collection: collection name :param ids: id collection of the all the objects :param 
storage: temporary storage for storing all the documents - :job_type: denotes the type of sharepoint object being fetched in a particular process - :parent_site_url: parent site relative path - :sites_path: dictionary of site path and it's last updated time - :lists_details: dictionary containing list name, list path and id - :library_details: dictionary containing library name, library path and id + :param job_type: denotes the type of sharepoint object being fetched in a particular process + :param parent_site_url: parent site relative path + :param sites_path: dictionary of site path and it's last updated time + :param lists_details: dictionary containing list name, list path and id + :param libraries_details: dictionary containing library name, library path and id """ if job_type == "sites": - sites = self.index_sites(parent_site_url, {}, ids, index=(SITES in self.objects)) - sites_path.update(sites) + self.index_sites(parent_site_url, ids, sites_path) + elif job_type == "lists": - lists, libraries = self.index_lists(sites_path, ids, index=(LISTS in self.objects)) - lists_details.update(lists) - libraries_details.update(libraries) - elif job_type == "list_items": - if LIST_ITEMS in self.objects: - self.index_items(lists_details, ids) - else: - if DRIVE_ITEMS in self.objects: - self.index_drive_items(libraries_details, ids) + self.index_lists(sites_path, ids, lists_details, libraries_details) - self.logger.info( - "Completed fetching all the objects for site collection: %s" - % (collection) - ) + elif job_type in ["list_items", "drive_items"]: + self.index_items(job_type, lists_details, libraries_details, ids) - self.logger.info( - "Saving the checkpoint for the site collection: %s" % (collection) - ) + self.logger.info( + "Completed fetching all the objects for site collection: %s" + % (collection) + ) + + self.logger.info( + "Saving the checkpoint for the site collection: %s" % (collection) + ) if ids.get(job_type): prev_ids = storage[job_type] if job_type == 'sites': @@ -507,21 +580,6 @@ def indexing(self, collection, ids, storage, job_type, parent_site_url, sites_pa storage[job_type] = prev_ids -def datetime_partitioning(start_time, end_time, processes): - """ Divides the timerange in equal partitions by number of processors - :param start_time: start time of the interval - :param end_time: end time of the interval - :param processes: number of processors the device have - """ - start_time = datetime.strptime(start_time, DATETIME_FORMAT) - end_time = datetime.strptime(end_time, DATETIME_FORMAT) - - diff = (end_time - start_time) / processes - for idx in range(processes): - yield start_time + diff * idx - yield end_time - - def start(indexing_type, config, logger, workplace_search_client, sharepoint_client): """Runs the indexing logic :param indexing_type: The type of the indexing i.e. 
incremental or full @@ -568,7 +626,7 @@ def start(indexing_type, config, logger, workplace_search_client, sharepoint_cli "sites": {}, "lists": {}, "list_items": {}, "drive_items": {}} parent_site_url = f"/sites/{collection}" - sites_path = {parent_site_url: end_time} + sites_path = [{parent_site_url: end_time}] lists_details = {} libraries_details = {} logger.info( @@ -592,9 +650,8 @@ def start(indexing_type, config, logger, workplace_search_client, sharepoint_cli storage_with_collection["global_keys"][collection] = storage.copy() - check.set_checkpoint(collection, start_time, indexing_type) + check.set_checkpoint(collection, end_time, indexing_type) except Exception as exception: - check.set_checkpoint(collection, end_time, indexing_type) raise exception with open(IDS_PATH, "w") as file: diff --git a/ees_sharepoint/schema.py b/ees_sharepoint/schema.py index 5505da1..bf63ad0 100644 --- a/ees_sharepoint/schema.py +++ b/ees_sharepoint/schema.py @@ -156,7 +156,7 @@ def coerce_rfc_3339_date(input_date): 'log_level': { 'required': False, 'type': 'string', - 'default': 'info', + 'default': 'INFO', 'allowed': ['DEBUG', 'INFO', 'WARN', 'ERROR'] }, 'retry_count': { @@ -165,6 +165,12 @@ def coerce_rfc_3339_date(input_date): 'default': 3, 'min': 1 }, + 'max_threads': { + 'required': False, + 'type': 'integer', + 'default': 40, + 'min': 1 + }, 'sharepoint_workplace_user_mapping': { 'required': False, 'type': 'string' diff --git a/ees_sharepoint/utils.py b/ees_sharepoint/utils.py index 89e41a0..ab3a932 100644 --- a/ees_sharepoint/utils.py +++ b/ees_sharepoint/utils.py @@ -3,11 +3,13 @@ # or more contributor license agreements. Licensed under the Elastic License 2.0; # you may not use this file except in compliance with the Elastic License 2.0. # -"""This module contains uncategorisied utility methods.""" +"""This module contains uncategorized utility methods.""" import urllib.parse from tika import parser +from datetime import datetime +DATETIME_FORMAT = "%Y-%m-%dT%H:%M:%SZ" def extract(content): @@ -28,3 +30,75 @@ def encode(object_name): :param object_name: name that contains special characters""" name = urllib.parse.quote(object_name, safe="'") return name.replace("'", "''") + + +def partition_equal_share(object_list, total_groups): + """ Divides the list in groups of approximately equal sizes + :param object_list: list to be partitioned + :param total_groups: number of groups to be formed + """ + if object_list: + groups = min(total_groups, len(object_list)) + group_list = [] + for i in range(groups): + group_list.append(object_list[i::groups]) + return group_list + else: + return [] + + +def split_list_in_chunks(input_list, chunk_size): + """ This method splits a list into separate chunks with maximum size + as chunk_size + :param input_list: List to be partitioned into chunks + :param chunk_size: Maximum size of a chunk + Returns: + list_of_chunks: List containing the chunks + """ + list_of_chunks = [] + for i in range(0, len(input_list), chunk_size): + list_of_chunks.append(input_list[i:i + chunk_size]) + return list_of_chunks + + +def split_dict_in_chunks(input_dict, chunk_size): + """ This method splits a dictionary into separate chunks with maximum size + as chunk_size + :param input_dict: Dictionary to be partitioned into chunks + :param chunk_size: Maximum size of a chunk + Returns: + list_of_chunks: List containing the chunks + """ + list_of_chunks = [] + for i in range(0, len(input_dict), chunk_size): + partitioned_chunk = list(input_dict.items())[i:i + chunk_size] + 
list_of_chunks.append(dict(partitioned_chunk)) + return list_of_chunks + + +def datetime_partitioning(start_time, end_time, processes): + """ Divides the timerange in equal partitions by number of processors + :param start_time: start time of the interval + :param end_time: end time of the interval + :param processes: number of processors the device have + """ + start_time = datetime.strptime(start_time, DATETIME_FORMAT) + end_time = datetime.strptime(end_time, DATETIME_FORMAT) + + diff = (end_time - start_time) / processes + for idx in range(processes): + yield start_time + diff * idx + yield end_time + + +def get_partition_time(max_threads, start_time, end_time): + """ Divides the time range of indexing into partitions based on number of processes. + :param max_threads: Number of threads in multithreading + :param start_time: Start time of a time range + :param end_time: End time of a time range + """ + partitions = list(datetime_partitioning(start_time, end_time, max_threads)) + datelist = [] + for sub in partitions: + datelist.append(sub.strftime(DATETIME_FORMAT)) + return end_time, datelist diff --git a/sharepoint_server_2016_connector.yml b/sharepoint_server_2016_connector.yml index 5899064..948b328 100644 --- a/sharepoint_server_2016_connector.yml +++ b/sharepoint_server_2016_connector.yml @@ -44,5 +44,7 @@ end_time : log_level: INFO #The number of retries to perform in case of server error. The connector will use exponential backoff for retry mechanism retry_count: 3 +#Number of threads to be used in multithreading for the connector. +max_threads: 40 #the path of csv file containing mapping of sharepoint user ID to Workplace user ID sharepoint_workplace_user_mapping: "C:/Users/abc/folder_name/file_name.csv" From 40d94b7a57bfc493f8d73e5ab745014d4d1dbd08 Mon Sep 17 00:00:00 2001 From: praveen-elastic Date: Fri, 4 Mar 2022 16:07:09 +0530 Subject: [PATCH 2/9] updated multithreading --- ees_sharepoint/fetch_index.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ees_sharepoint/fetch_index.py b/ees_sharepoint/fetch_index.py index 36b32bb..9816e27 100644 --- a/ees_sharepoint/fetch_index.py +++ b/ees_sharepoint/fetch_index.py @@ -473,10 +473,10 @@ def index_sites(self, parent_site_url, ids, sites_path): thread = thread_pool.apply_async( self.fetch_sites, (parent_site_url, {}, ids, (SITES in self.objects), start_time_partition, end_time_partition)) - results.append(thread.get()) + results.append(thread) sites, documents = [], [] - for result in results: + for result in [r.get() for r in results]: if result: sites.append(result[0]) documents.extend(result[1]) @@ -497,9 +497,9 @@ def index_lists(self, sites_path, ids, lists_details, libraries_details): partitioned_sites = partition_equal_share(sites_path, self.max_threads) for site in partitioned_sites: thread = thread_pool.apply_async(self.fetch_lists, (site, ids, (LISTS in self.objects))) - results.append(thread.get()) + results.append(thread) documents = [] - for result in results: + for result in [r.get() for r in results]: if result: lists_details.update(result[0]) libraries_details.update(result[1]) @@ -527,9 +527,9 @@ def index_items(self, job_type, lists_details, libraries_details, ids): partition = split_dict_in_chunks(libraries_details, self.max_threads) for list_data in partition: thread = thread_pool.apply_async(func, (list_data, ids)) - results.append(thread.get()) + results.append(thread) documents = [] - for result in results: + for result in [r.get() for r in results]: if result: 
documents.extend(result) thread_pool.close() From b472d3773cf6ef75636d84c36e5394bfc103ae32 Mon Sep 17 00:00:00 2001 From: praveen-elastic Date: Fri, 11 Mar 2022 15:50:28 +0530 Subject: [PATCH 3/9] resolve PR comments --- ees_sharepoint/deletion_sync_command.py | 8 +- ees_sharepoint/fetch_index.py | 104 +++++++++++++----------- ees_sharepoint/utils.py | 49 ++++------- 3 files changed, 78 insertions(+), 83 deletions(-) diff --git a/ees_sharepoint/deletion_sync_command.py b/ees_sharepoint/deletion_sync_command.py index 1b2abb6..800a9be 100644 --- a/ees_sharepoint/deletion_sync_command.py +++ b/ees_sharepoint/deletion_sync_command.py @@ -13,7 +13,7 @@ import requests from .base_command import BaseCommand -from .utils import split_list_in_chunks +from .utils import split_list_into_buckets IDS_PATH = os.path.join(os.path.dirname(__file__), 'doc_id.json') # By default, Enterprise Search configuration has a maximum allowed limit set to 100 documents for an api request @@ -58,7 +58,7 @@ def deindexing_items(self, collection, ids, key): if resp.status_code == requests.codes['not_found'] or result == []: doc.append(item_id) if doc: - for chunk in split_list_in_chunks(doc, BATCH_SIZE): + for chunk in split_list_into_buckets(doc, BATCH_SIZE): self.workplace_search_client.delete_documents( content_source_id=self.ws_source, document_ids=chunk) @@ -101,7 +101,7 @@ def deindexing_lists(self, collection, ids): resp = self.sharepoint_client.get(url, '', "deindex") if resp is not None and resp.status_code == requests.codes['not_found']: doc.append(list_id) - for chunk in split_list_in_chunks(doc, BATCH_SIZE): + for chunk in split_list_into_buckets(doc, BATCH_SIZE): self.workplace_search_client.delete_documents( content_source_id=self.ws_source, document_ids=chunk) @@ -132,7 +132,7 @@ def deindexing_sites(self, collection, ids): resp = self.sharepoint_client.get(url, '', "deindex") if resp is not None and resp.status_code == requests.codes['not_found']: doc.append(site_id) - for chunk in split_list_in_chunks(doc, BATCH_SIZE): + for chunk in split_list_into_buckets(doc, BATCH_SIZE): self.workplace_search_client.delete_documents( content_source_id=self.ws_source, document_ids=chunk) diff --git a/ees_sharepoint/fetch_index.py b/ees_sharepoint/fetch_index.py index 9816e27..fc31c38 100644 --- a/ees_sharepoint/fetch_index.py +++ b/ees_sharepoint/fetch_index.py @@ -16,11 +16,10 @@ from dateutil.parser import parse from tika.tika import TikaException -from multiprocessing.pool import ThreadPool from .checkpointing import Checkpoint from .usergroup_permissions import Permissions -from .utils import encode, extract, partition_equal_share, split_list_in_chunks, get_partition_time, split_dict_in_chunks +from .utils import encode, extract, split_list_into_buckets, split_date_range_into_chunks, split_dict_in_chunks, spawn_threads from . 
import adapter IDS_PATH = os.path.join(os.path.dirname(__file__), 'doc_id.json') @@ -79,7 +78,7 @@ def index_document(self, document, param_name): """ if document: total_documents_indexed = 0 - for chunk in split_list_in_chunks(document, BATCH_SIZE): + for chunk in split_list_into_buckets(document, BATCH_SIZE): response = self.workplace_search_client.index_documents( content_source_id=self.ws_source, documents=chunk @@ -97,8 +96,8 @@ def threaded_index_documents(self, document, param_name): :param document: documents to be indexed equally in each thread :param param_name: parameter name whether it is SITES, LISTS LIST_ITEMS OR DRIVE_ITEMS """ - chunk_documents = partition_equal_share(document, self.max_threads) - thread_pool = ThreadPool(self.max_threads) + chunk_documents = split_list_into_buckets(document, self.max_threads) + thread_pool = spawn_threads(self.max_threads) for doc in chunk_documents: thread_pool.apply_async(self.index_document, (doc, param_name)) @@ -458,15 +457,17 @@ def index_permissions( groups.append(title) return groups - def index_sites(self, parent_site_url, ids, sites_path): + def index_sites(self, ids, end_time, collection): """ Indexes the site details to the Workplace Search - :param parent_site_url: parent site relative path :param ids: id collection of the all the objects - :param sites_path: dictionary of site path and it's last updated time + :param end_time: end time for fetching the data + :param collection: collection name """ - _, datelist = get_partition_time(self.max_threads, self.start_time, self.end_time) + _, datelist = split_date_range_into_chunks(self.start_time, self.end_time, self.max_threads) results = [] - thread_pool = ThreadPool(self.max_threads) + parent_site_url = f"/sites/{collection}" + sites_path = [{parent_site_url: end_time}] + thread_pool = spawn_threads(self.max_threads) for num in range(0, self.max_threads): start_time_partition = datelist[num] end_time_partition = datelist[num + 1] @@ -484,17 +485,16 @@ def index_sites(self, parent_site_url, ids, sites_path): thread_pool.join() self.threaded_index_documents(documents, SITES) sites_path.extend(sites) + return sites_path - def index_lists(self, sites_path, ids, lists_details, libraries_details): + def index_lists(self, sites_path, ids): """ Indexes the list details to the Workplace Search :param sites_path: dictionary of site path and it's last updated time :param ids: id collection of the all the objects - :param lists_details: dictionary containing list name, list path and id - :param libraries_details: dictionary containing library name, library path and id """ - results = [] - thread_pool = ThreadPool(self.max_threads) - partitioned_sites = partition_equal_share(sites_path, self.max_threads) + results, lists_details, libraries_details = [], {}, {} + thread_pool = spawn_threads(self.max_threads) + partitioned_sites = split_list_into_buckets(sites_path, self.max_threads) for site in partitioned_sites: thread = thread_pool.apply_async(self.fetch_lists, (site, ids, (LISTS in self.objects))) results.append(thread) @@ -507,26 +507,39 @@ def index_lists(self, sites_path, ids, lists_details, libraries_details): thread_pool.close() thread_pool.join() self.threaded_index_documents(documents, LISTS) + return [lists_details, libraries_details] - def index_items(self, job_type, lists_details, libraries_details, ids): - """ Indexes the list_items and drive_items to the Workplace Search - :param job_type: denotes the type of sharepoint object being fetched in a particular process + def 
index_list_items(self, lists_details, ids): + """ Indexes the list_items to the Workplace Search :param lists_details: dictionary containing list name, list path and id + :param ids: id collection of the all the objects + """ + results = [] + partition = [] + thread_pool = spawn_threads(self.max_threads) + partition = split_dict_in_chunks(lists_details, self.max_threads) + for list_data in partition: + thread = thread_pool.apply_async(self.fetch_items, (list_data, ids)) + results.append(thread) + documents = [] + for result in [r.get() for r in results]: + if result: + documents.extend(result) + thread_pool.close() + thread_pool.join() + self.threaded_index_documents(documents, LIST_ITEMS) + + def index_drive_items(self, libraries_details, ids): + """ Indexes the drive_items to the Workplace Search :param libraries_details: dictionary containing library name, library path and id :param ids: id collection of the all the objects """ results = [] partition = [] - if job_type == "list_items" and LIST_ITEMS in self.objects: - thread_pool = ThreadPool(self.max_threads) - func = self.fetch_items - partition = split_dict_in_chunks(lists_details, self.max_threads) - elif job_type == "drive_items" and DRIVE_ITEMS in self.objects: - thread_pool = ThreadPool(self.max_threads) - func = self.fetch_drive_items - partition = split_dict_in_chunks(libraries_details, self.max_threads) + thread_pool = spawn_threads(self.max_threads) + partition = split_dict_in_chunks(libraries_details, self.max_threads) for list_data in partition: - thread = thread_pool.apply_async(func, (list_data, ids)) + thread = thread_pool.apply_async(self.fetch_drive_items, (list_data, ids)) results.append(thread) documents = [] for result in [r.get() for r in results]: @@ -534,28 +547,29 @@ def index_items(self, job_type, lists_details, libraries_details, ids): documents.extend(result) thread_pool.close() thread_pool.join() - self.threaded_index_documents(documents, job_type) + self.threaded_index_documents(documents, DRIVE_ITEMS) - def indexing(self, collection, ids, storage, job_type, parent_site_url, sites_path, lists_details, libraries_details): + def indexing(self, collection, ids, storage, job_type, collected_objects, end_time): """This method fetches all the objects from sharepoint server and ingests them into the workplace search :param collection: collection name :param ids: id collection of the all the objects :param storage: temporary storage for storing all the documents :param job_type: denotes the type of sharepoint object being fetched in a particular process - :param parent_site_url: parent site relative path - :param sites_path: dictionary of site path and it's last updated time - :param lists_details: dictionary containing list name, list path and id - :param libraries_details: dictionary containing library name, library path and id + :param collected_objects: helper variable to provide the data to children object + :param end_time: end time for fetching the data """ if job_type == "sites": - self.index_sites(parent_site_url, ids, sites_path) + collected_objects = self.index_sites(ids, end_time, collection) elif job_type == "lists": - self.index_lists(sites_path, ids, lists_details, libraries_details) + collected_objects = self.index_lists(collected_objects, ids) + + elif job_type == LIST_ITEMS and LIST_ITEMS in self.objects: + self.index_list_items(collected_objects[0], ids) - elif job_type in ["list_items", "drive_items"]: - self.index_items(job_type, lists_details, libraries_details, ids) + elif job_type == 
DRIVE_ITEMS and DRIVE_ITEMS in self.objects: + self.index_drive_items(collected_objects[1], ids) self.logger.info( "Completed fetching all the objects for site collection: %s" @@ -578,6 +592,7 @@ def indexing(self, collection, ids, storage, job_type, parent_site_url, sites_pa for list_name in list_content.keys(): prev_ids[site][list_name] = list(set([*prev_ids[site].get(list_name, []), *ids[job_type][site][list_name]])) storage[job_type] = prev_ids + return collected_objects def start(indexing_type, config, logger, workplace_search_client, sharepoint_client): @@ -625,27 +640,22 @@ def start(indexing_type, config, logger, workplace_search_client, sharepoint_cli ids_collection["global_keys"][collection] = { "sites": {}, "lists": {}, "list_items": {}, "drive_items": {}} - parent_site_url = f"/sites/{collection}" - sites_path = [{parent_site_url: end_time}] - lists_details = {} - libraries_details = {} logger.info( "Starting to index all the objects configured in the object field: %s" % (str(config.get_value("objects"))) ) indexer = FetchIndex(config, logger, workplace_search_client, sharepoint_client, start_time, end_time) + returned_documents = None for job_type in ["sites", "lists", "list_items", "drive_items"]: logger.info(f"Indexing {job_type}") - indexer.indexing( + returned_documents = indexer.indexing( collection, ids_collection["global_keys"][collection], storage, job_type, - parent_site_url, - sites_path, - lists_details, - libraries_details + returned_documents, + end_time ) storage_with_collection["global_keys"][collection] = storage.copy() diff --git a/ees_sharepoint/utils.py b/ees_sharepoint/utils.py index ab3a932..ceba89f 100644 --- a/ees_sharepoint/utils.py +++ b/ees_sharepoint/utils.py @@ -9,6 +9,7 @@ from tika import parser from datetime import datetime +from multiprocessing.pool import ThreadPool DATETIME_FORMAT = "%Y-%m-%dT%H:%M:%SZ" @@ -32,7 +33,7 @@ def encode(object_name): return name.replace("'", "''") -def partition_equal_share(object_list, total_groups): +def split_list_into_buckets(object_list, total_groups): """ Divides the list in groups of approximately equal sizes :param object_list: list to be partitioned :param total_groups: number of groups to be formed @@ -47,20 +48,6 @@ def partition_equal_share(object_list, total_groups): return [] -def split_list_in_chunks(input_list, chunk_size): - """ This method splits a list into separate chunks with maximum size - as chunk_size - :param input_list: List to be partitioned into chunks - :param chunk_size: Maximum size of a chunk - Returns: - list_of_chunks: List containing the chunks - """ - list_of_chunks = [] - for i in range(0, len(input_list), chunk_size): - list_of_chunks.append(input_list[i:i + chunk_size]) - return list_of_chunks - - def split_dict_in_chunks(input_dict, chunk_size): """ This method splits a dictionary into separate chunks with maximum size as chunk_size @@ -76,29 +63,27 @@ def split_dict_in_chunks(input_dict, chunk_size): return list_of_chunks -def datetime_partitioning(start_time, end_time, processes): - """ Divides the timerange in equal partitions by number of processors +def split_date_range_into_chunks(start_time, end_time, number_of_threads): + """ Divides the timerange in equal partitions by number of threads :param start_time: start time of the interval :param end_time: end time of the interval - :param processes: number of processors the device have + :param number_of_threads: number of threads defined by user in config file """ start_time = datetime.strptime(start_time, DATETIME_FORMAT) 
end_time = datetime.strptime(end_time, DATETIME_FORMAT) - diff = (end_time - start_time) / processes - for idx in range(processes): - yield start_time + diff * idx - yield end_time + diff = (end_time - start_time) / number_of_threads + datelist = [] + for idx in range(number_of_threads): + date_time = start_time + diff * idx + datelist.append(date_time.strftime(DATETIME_FORMAT)) + formatted_end_time = end_time.strftime(DATETIME_FORMAT) + datelist.append(formatted_end_time) + return formatted_end_time, datelist -def get_partition_time(max_threads, start_time, end_time): - """ Divides the time range of indexing into partitions based on number of processes. - :param max_threads: Number of threads in multithreading - :param start_time: Start time of a time range - :param end_time: End time of a time range +def spawn_threads(max_threads): + """ Spawns number of threads provided by user in the config file + :param max_threads: maximum number of threads defined by user """ - partitions = list(datetime_partitioning(start_time, end_time, max_threads)) - datelist = [] - for sub in partitions: - datelist.append(sub.strftime(DATETIME_FORMAT)) - return end_time, datelist + return ThreadPool(max_threads) From 08e79e504db797f70d8283d1a73fdf9e4a102554 Mon Sep 17 00:00:00 2001 From: praveen-elastic Date: Thu, 24 Mar 2022 20:03:15 +0530 Subject: [PATCH 4/9] modify multiprocessing approach to introduce queue --- ees_sharepoint/connector_queue.py | 49 ++ ees_sharepoint/fetch_index.py | 672 --------------------- ees_sharepoint/full_sync_command.py | 26 +- ees_sharepoint/incremental_sync_command.py | 26 +- ees_sharepoint/permission_sync_command.py | 49 +- ees_sharepoint/schema.py | 10 +- ees_sharepoint/sync_enterprise_search.py | 76 +++ ees_sharepoint/sync_sharepoint.py | 644 ++++++++++++++++++++ ees_sharepoint/utils.py | 74 ++- sharepoint_server_connector.yml | 6 +- 10 files changed, 885 insertions(+), 747 deletions(-) create mode 100644 ees_sharepoint/connector_queue.py delete mode 100644 ees_sharepoint/fetch_index.py create mode 100644 ees_sharepoint/sync_enterprise_search.py create mode 100644 ees_sharepoint/sync_sharepoint.py diff --git a/ees_sharepoint/connector_queue.py b/ees_sharepoint/connector_queue.py new file mode 100644 index 0000000..71792a7 --- /dev/null +++ b/ees_sharepoint/connector_queue.py @@ -0,0 +1,49 @@ +# +# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +# or more contributor license agreements. Licensed under the Elastic License 2.0; +# you may not use this file except in compliance with the Elastic License 2.0. +# +import multiprocessing +from multiprocessing.queues import Queue +from .utils import split_documents_into_equal_chunks + + +BATCH_SIZE = 100 + + +class ConnectorQueue(Queue): + """Class to support additional queue operations specific to the connector""" + + def __init__(self): + ctx = multiprocessing.get_context() + super(ConnectorQueue, self).__init__(ctx=ctx) + + def end_signal(self): + """Send an terminate signal to indicate the queue can be closed""" + + signal_close = {"type": "signal_close"} + self.put(signal_close) + + def put_checkpoint(self, key, checkpoint_time, indexing_type): + """Put the checkpoint object in the queue which will be used by the consumer to update the checkpoint file + + :param key: The key of the checkpoint dictionary + :param checkpoint_time: The end time that will be stored in the checkpoint as {'key': 'checkpoint_time'} + :param indexing_type: The type of the indexing i.e. 
Full or Incremental + """ + + checkpoint = {"type": "checkpoint", "data": (key, checkpoint_time, indexing_type)} + self.put(checkpoint) + + def append_to_queue(self, documents): + """Append documents to the shared queue + :param documents: documents fetched from sharepoint + """ + if documents: + results = documents + # In case documents is object of tuple + if isinstance(documents, tuple): + results = documents[-1] + for chunk in split_documents_into_equal_chunks(results.get("data"), BATCH_SIZE): + document = {"type": results.get("type"), "data": chunk} + self.put(document) diff --git a/ees_sharepoint/fetch_index.py b/ees_sharepoint/fetch_index.py deleted file mode 100644 index fc31c38..0000000 --- a/ees_sharepoint/fetch_index.py +++ /dev/null @@ -1,672 +0,0 @@ -# -# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one -# or more contributor license agreements. Licensed under the Elastic License 2.0; -# you may not use this file except in compliance with the Elastic License 2.0. -# -"""fetch_index module allows to sync data to Elastic Enterprise Search. - -It's possible to run full syncs and incremental syncs with this module.""" - -import copy -import os -import re -import json -from datetime import datetime -from urllib.parse import urljoin -from dateutil.parser import parse - -from tika.tika import TikaException - -from .checkpointing import Checkpoint -from .usergroup_permissions import Permissions -from .utils import encode, extract, split_list_into_buckets, split_date_range_into_chunks, split_dict_in_chunks, spawn_threads -from . import adapter - -IDS_PATH = os.path.join(os.path.dirname(__file__), 'doc_id.json') - -SITE = "site" -LIST = "list" -ITEM = "item" -SITES = "sites" -LISTS = "lists" -LIST_ITEMS = "list_items" -DRIVE_ITEMS = "drive_items" -BATCH_SIZE = 100 - - -def get_results(logger, response, entity_name): - """ Attempts to fetch results from a Sharepoint Server response - :param response: response from the sharepoint client - :param entity_name: entity name whether it is SITES, LISTS, LIST_ITEMS OR DRIVE_ITEMS - Returns: - Parsed response - """ - if not response: - logger.error(f"Empty response when fetching {entity_name}") # TODO: should it be an error? - return None - - if entity_name == "attachment" and not response.get("d", {}).get("results"): - logger.info("Failed to fetch attachment") # TODO: not sure if it's the right message - return None - return response.get("d", {}).get("results") - - -class FetchIndex: - """This class allows ingesting data from Sharepoint Server to Elastic Enterprise Search.""" - def __init__(self, config, logger, workplace_search_client, sharepoint_client, start_time, end_time): - self.config = config - self.logger = logger - self.workplace_search_client = workplace_search_client - self.sharepoint_client = sharepoint_client - - self.ws_source = config.get_value("workplace_search.source_id") - self.objects = config.get_value("objects") - self.site_collections = config.get_value("sharepoint.site_collections") - self.enable_permission = config.get_value("enable_document_permission") - self.start_time = start_time - self.end_time = end_time - self.max_threads = config.get_value("max_threads") - self.mapping_sheet_path = config.get_value("sharepoint_workplace_user_mapping") - - self.checkpoint = Checkpoint(config, logger) - self.permissions = Permissions(self.sharepoint_client, self.workplace_search_client, logger) - - def index_document(self, document, param_name): - """ This method indexes the documents to the workplace. 
- :param document: document to be indexed - :param param_name: parameter name whether it is SITES, LISTS LIST_ITEMS OR DRIVE_ITEMS - """ - if document: - total_documents_indexed = 0 - for chunk in split_list_into_buckets(document, BATCH_SIZE): - response = self.workplace_search_client.index_documents( - content_source_id=self.ws_source, - documents=chunk - ) - for each in response['results']: - if not each['errors']: - total_documents_indexed += 1 - else: - self.logger.error("Error while indexing %s. Error: %s" % (each['id'], each['errors'])) - self.logger.info("Successfully indexed %s %s to the workplace" % ( - total_documents_indexed, param_name)) - - def threaded_index_documents(self, document, param_name): - """ Applies multithreading on indexing functionality - :param document: documents to be indexed equally in each thread - :param param_name: parameter name whether it is SITES, LISTS LIST_ITEMS OR DRIVE_ITEMS - """ - chunk_documents = split_list_into_buckets(document, self.max_threads) - thread_pool = spawn_threads(self.max_threads) - for doc in chunk_documents: - thread_pool.apply_async(self.index_document, (doc, param_name)) - - thread_pool.close() - thread_pool.join() - - def get_schema_fields(self, document_name): - """ returns the schema of all the include_fields or exclude_fields specified in the configuration file. - :param document_name: document name from SITES, LISTS, LIST_ITEMS OR DRIVE_ITEMS - Returns: - schema: included and excluded fields schema - """ - fields = self.objects.get(document_name) - adapter_schema = adapter.DEFAULT_SCHEMA[document_name] - field_id = adapter_schema['id'] - if fields: - include_fields = fields.get("include_fields") - exclude_fields = fields.get("exclude_fields") - if include_fields: - adapter_schema = {key: val for key, val in adapter_schema.items() if val in include_fields} - elif exclude_fields: - adapter_schema = {key: val for key, val in adapter_schema.items() if val not in exclude_fields} - adapter_schema['id'] = field_id - return adapter_schema - - def fetch_sites(self, parent_site_url, sites, ids, index, start_time, end_time): - """This method fetches sites from a collection and invokes the - index permission method to get the document level permissions. - If the fetching is not successful, it logs proper message. 
- :param parent_site_url: parent site relative path - :param sites: dictionary of site path and it's last updated time - :param ids: structure containing id's of all objects - :param index: index, boolean value - :param start_time: start time for fetching the data - :param end_time: end time for fetching the data - Returns: - document: response of sharepoint GET call, with fields specified in the schema - """ - rel_url = f"{parent_site_url}/_api/web/webs" - self.logger.info("Fetching the sites detail from url: %s" % (rel_url)) - query = self.sharepoint_client.get_query( - start_time, end_time, SITES) - response = self.sharepoint_client.get(rel_url, query, SITES) - - response_data = get_results(self.logger, response, SITES) - if not response_data: - self.logger.info("No sites were created in %s for this interval: start time: %s and end time: %s" % (parent_site_url, start_time, end_time)) - return sites - self.logger.info( - "Successfully fetched and parsed %s sites response from SharePoint" % len(response_data) - ) - - schema = self.get_schema_fields(SITES) - document = [] - - if index: - for i, _ in enumerate(response_data): - doc = {'type': SITE} - # need to convert date to iso else workplace search throws error on date format Invalid field value: Value '2021-09-29T08:13:00' cannot be parsed as a date (RFC 3339)"]} - response_data[i]['Created'] += 'Z' - for field, response_field in schema.items(): - doc[field] = response_data[i].get(response_field) - if self.enable_permission is True: - doc["_allow_permissions"] = self.index_permissions( - key=SITES, site=response_data[i]['ServerRelativeUrl']) - document.append(doc) - ids["sites"].update({doc["id"]: response_data[i]["ServerRelativeUrl"]}) - for result in response_data: - site_server_url = result.get("ServerRelativeUrl") - sites.update({site_server_url: result.get("LastItemModifiedDate")}) - self.fetch_sites(site_server_url, sites, ids, index, start_time, end_time) - return sites, document - - def fetch_lists(self, sites, ids, index): - """This method fetches lists from all sites in a collection and invokes the - index permission method to get the document level permissions. - If the fetching is not successful, it logs proper message. 
- :param sites: dictionary of site path and it's last updated time - :param ids: structure containing id's of all objects - :param index: index, boolean value - Returns: - document: response of sharepoint GET call, with fields specified in the schema - """ - self.logger.info("Fetching lists for all the sites") - responses = [] - document = [] - if not sites: - self.logger.info("No list was created in this interval: start time: %s and end time: %s" % (self.start_time, self.end_time)) - return [], [] - schema_list = self.get_schema_fields(LISTS) - for site_details in sites: - for site, time_modified in site_details.items(): - if parse(self.start_time) > parse(time_modified): - continue - rel_url = f"{site}/_api/web/lists" - self.logger.info( - "Fetching the lists for site: %s from url: %s" - % (site, rel_url) - ) - - query = self.sharepoint_client.get_query( - self.start_time, self.end_time, LISTS) - response = self.sharepoint_client.get( - rel_url, query, LISTS) - - response_data = get_results(self.logger, response, LISTS) - if not response_data: - self.logger.info("No list was created for the site : %s in this interval: start time: %s and end time: %s" % (site, self.start_time, self.end_time)) - continue - self.logger.info( - "Successfully fetched and parsed %s list response for site: %s from SharePoint" - % (len(response_data), site) - ) - - base_list_url = f"{site}/Lists/" - - if index: - if not ids["lists"].get(site): - ids["lists"].update({site: {}}) - for i, _ in enumerate(response_data): - doc = {'type': LIST} - for field, response_field in schema_list.items(): - doc[field] = response_data[i].get( - response_field) - if self.enable_permission is True: - doc["_allow_permissions"] = self.index_permissions( - key=LISTS, site=site, list_id=doc["id"], list_url=response_data[i]['ParentWebUrl'], itemid=None) - doc["url"] = urljoin(base_list_url, re.sub( - r'[^ \w+]', '', response_data[i]["Title"])) - document.append(doc) - ids["lists"][site].update({doc["id"]: response_data[i]["Title"]}) - - responses.append(response_data) - lists = {} - libraries = {} - for response in responses: - for result in response: - if result.get('BaseType') == 1: - libraries[result.get("Id")] = [result.get( - "ParentWebUrl"), result.get("Title"), result.get("LastItemModifiedDate")] - else: - lists[result.get("Id")] = [result.get( - "ParentWebUrl"), result.get("Title"), result.get("LastItemModifiedDate")] - return lists, libraries, document - - def fetch_items(self, lists, ids): - """This method fetches items from all the lists in a collection and - invokes theindex permission method to get the document level permissions. - If the fetching is not successful, it logs proper message. 
- :param lists: document lists - :param ids: structure containing id's of all objects - Returns: - document: response of sharepoint GET call, with fields specified in the schema - """ - responses = [] - # here value is a list of url and title - self.logger.info("Fetching all the items for the lists") - if not lists: - self.logger.info("No item was created in this interval: start time: %s and end time: %s" % (self.start_time, self.end_time)) - else: - for value in lists.values(): - if not ids["list_items"].get(value[0]): - ids["list_items"].update({value[0]: {}}) - schema_item = self.get_schema_fields(LIST_ITEMS) - for list_content, value in lists.items(): - if parse(self.start_time) > parse(value[2]): - continue - rel_url = f"{value[0]}/_api/web/lists(guid'{list_content}')/items" - self.logger.info( - "Fetching the items for list: %s from url: %s" - % (value[1], rel_url) - ) - - query = self.sharepoint_client.get_query( - self.start_time, self.end_time, LIST_ITEMS) - response = self.sharepoint_client.get(rel_url, query, LIST_ITEMS) - - response_data = get_results(self.logger, response, LIST_ITEMS) - if not response_data: - self.logger.info("No item was created for the list %s in this interval: start time: %s and end time: %s" % (value[1], self.start_time, self.end_time)) - continue - self.logger.info( - "Successfully fetched and parsed %s listitem response for list: %s from SharePoint" - % (len(response_data), value[1]) - ) - - list_name = re.sub(r'[^ \w+]', '', value[1]) - base_item_url = f"{value[0]}/Lists/{list_name}/DispForm.aspx?ID=" - document = [] - if not ids["list_items"][value[0]].get(list_content): - ids["list_items"][value[0]].update({list_content: []}) - rel_url = f'{value[0]}/_api/web/lists(guid\'{list_content}\')/items?$select=Attachments,AttachmentFiles,Title&$expand=AttachmentFiles' - - new_query = "&" + query.split("?")[1] - file_response_data = self.sharepoint_client.get(rel_url, query=new_query, param_name="attachment") - if file_response_data: - file_response_data = get_results(self.logger, file_response_data.json(), "attachment") - - for i, _ in enumerate(response_data): - doc = {'type': ITEM} - if response_data[i].get('Attachments') and file_response_data: - for data in file_response_data: - if response_data[i].get('Title') == data['Title']: - file_relative_url = data[ - 'AttachmentFiles']['results'][0]['ServerRelativeUrl'] - url_s = f"{value[0]}/_api/web/GetFileByServerRelativeUrl(\'{encode(file_relative_url)}\')/$value" - response = self.sharepoint_client.get( - url_s, query='', param_name="attachment") - doc['body'] = {} - if response and response.ok: - try: - doc['body'] = extract(response.content) - except TikaException as exception: - self.logger.error('Error while extracting the contents from the attachment, Error %s' % (exception)) - - break - for field, response_field in schema_item.items(): - doc[field] = response_data[i].get( - response_field) - if self.enable_permission is True: - doc["_allow_permissions"] = self.index_permissions( - key=LIST_ITEMS, list_id=list_content, list_url=value[0], itemid=str(response_data[i]["Id"])) - doc["url"] = base_item_url + str(response_data[i]["Id"]) - document.append(doc) - if response_data[i].get("GUID") not in ids["list_items"][value[0]][list_content]: - ids["list_items"][value[0]][list_content].append( - response_data[i].get("GUID")) - responses.extend(document) - return responses - - def fetch_drive_items(self, libraries, ids): - """This method fetches items from all the lists in a collection and - invokes theindex 
permission method to get the document level permissions. - If the fetching is not successful, it logs proper message. - :param libraries: document lists - :param ids: structure containing id's of all objects - """ - responses = [] - # here value is a list of url and title of the library - self.logger.info("Fetching all the files for the library") - if not libraries: - self.logger.info("No file was created in this interval: start time: %s and end time: %s" % (self.start_time, self.end_time)) - else: - schema_drive = self.get_schema_fields(DRIVE_ITEMS) - for lib_content, value in libraries.items(): - if parse(self.start_time) > parse(value[2]): - continue - if not ids["drive_items"].get(value[0]): - ids["drive_items"].update({value[0]: {}}) - rel_url = f"{value[0]}/_api/web/lists(guid'{lib_content}')/items?$select=Modified,Id,GUID,File,Folder&$expand=File,Folder" - self.logger.info( - "Fetching the items for libraries: %s from url: %s" - % (value[1], rel_url) - ) - query = self.sharepoint_client.get_query( - self.start_time, self.end_time, DRIVE_ITEMS) - response = self.sharepoint_client.get(rel_url, query, DRIVE_ITEMS) - response_data = get_results(self.logger, response, DRIVE_ITEMS) - if not response_data: - self.logger.info("No item was created for the library %s in this interval: start time: %s and end time: %s" % (value[1], self.start_time, self.end_time)) - continue - self.logger.info( - "Successfully fetched and parsed %s drive item response for library: %s from SharePoint" - % (len(response_data), value[1]) - ) - document = [] - if not ids["drive_items"][value[0]].get(lib_content): - ids["drive_items"][value[0]].update({lib_content: []}) - for i, _ in enumerate(response_data): - if response_data[i]['File'].get('TimeLastModified'): - obj_type = 'File' - doc = {'type': "file"} - file_relative_url = response_data[i]['File']['ServerRelativeUrl'] - url_s = f"{value[0]}/_api/web/GetFileByServerRelativeUrl(\'{encode(file_relative_url)}\')/$value" - response = self.sharepoint_client.get(url_s, query='', param_name="attachment") - doc['body'] = {} - if response and response.ok: - try: - doc['body'] = extract(response.content) - except TikaException as exception: - self.logger.error('Error while extracting the contents from the file at %s, Error %s' % (response_data[i].get('Url'), exception)) - else: - obj_type = 'Folder' - doc = {'type': "folder"} - for field, response_field in schema_drive.items(): - doc[field] = response_data[i][obj_type].get( - response_field) - doc['id'] = response_data[i].get("GUID") - if self.enable_permission is True: - doc["_allow_permissions"] = self.index_permissions( - key=DRIVE_ITEMS, list_id=lib_content, list_url=value[0], itemid=str(response_data[i].get("ID"))) - doc["url"] = response_data[i][obj_type]["ServerRelativeUrl"] - document.append(doc) - if doc['id'] not in ids["drive_items"][value[0]][lib_content]: - ids["drive_items"][value[0]][lib_content].append(doc['id']) - responses.extend(document) - return responses - - def get_roles(self, key, site, list_url, list_id, itemid): - """ Checks the permissions and returns the user roles. 
- :param key: key, a string value - :param site: site name to check the permission - :param list_url: list url to access the list - :param list_id: list id to check the permission - :param itemid: item id to check the permission - Returns: - roles: user roles - """ - if key == SITES: - rel_url = site - roles = self.permissions.fetch_users(key, rel_url) - - elif key == LISTS: - rel_url = list_url - roles = self.permissions.fetch_users( - key, rel_url, list_id=list_id - ) - - else: - rel_url = list_url - roles = self.permissions.fetch_users( - key, rel_url, list_id=list_id, item_id=itemid - ) - - return roles - - def index_permissions( - self, - key, - site=None, - list_id=None, - list_url=None, - itemid=None, - ): - """This method when invoked, checks the permission inheritance of each object. - If the object has unique permissions, the list of users having access to it - is fetched using sharepoint api else the permission levels of the that object - is taken same as the permission level of the site collection. - :param key: key, a string value - :param site: site name to index the permission for the site - :param list_id: list id to index the permission for the list - :param list_url: url of the list - :param itemid: item id to index the permission for the item - Returns: - groups: list of users having access to the given object - """ - roles = self.get_roles(key, site, list_url, list_id, itemid) - - groups = [] - - if not roles: - return [] - roles = get_results(self.logger, roles.json(), "roles") - - for role in roles: - title = role["Member"]["Title"] - groups.append(title) - return groups - - def index_sites(self, ids, end_time, collection): - """ Indexes the site details to the Workplace Search - :param ids: id collection of the all the objects - :param end_time: end time for fetching the data - :param collection: collection name - """ - _, datelist = split_date_range_into_chunks(self.start_time, self.end_time, self.max_threads) - results = [] - parent_site_url = f"/sites/{collection}" - sites_path = [{parent_site_url: end_time}] - thread_pool = spawn_threads(self.max_threads) - for num in range(0, self.max_threads): - start_time_partition = datelist[num] - end_time_partition = datelist[num + 1] - thread = thread_pool.apply_async( - self.fetch_sites, (parent_site_url, {}, ids, (SITES in self.objects), - start_time_partition, end_time_partition)) - results.append(thread) - - sites, documents = [], [] - for result in [r.get() for r in results]: - if result: - sites.append(result[0]) - documents.extend(result[1]) - thread_pool.close() - thread_pool.join() - self.threaded_index_documents(documents, SITES) - sites_path.extend(sites) - return sites_path - - def index_lists(self, sites_path, ids): - """ Indexes the list details to the Workplace Search - :param sites_path: dictionary of site path and it's last updated time - :param ids: id collection of the all the objects - """ - results, lists_details, libraries_details = [], {}, {} - thread_pool = spawn_threads(self.max_threads) - partitioned_sites = split_list_into_buckets(sites_path, self.max_threads) - for site in partitioned_sites: - thread = thread_pool.apply_async(self.fetch_lists, (site, ids, (LISTS in self.objects))) - results.append(thread) - documents = [] - for result in [r.get() for r in results]: - if result: - lists_details.update(result[0]) - libraries_details.update(result[1]) - documents.extend(result[2]) - thread_pool.close() - thread_pool.join() - self.threaded_index_documents(documents, LISTS) - return [lists_details, 
libraries_details] - - def index_list_items(self, lists_details, ids): - """ Indexes the list_items to the Workplace Search - :param lists_details: dictionary containing list name, list path and id - :param ids: id collection of the all the objects - """ - results = [] - partition = [] - thread_pool = spawn_threads(self.max_threads) - partition = split_dict_in_chunks(lists_details, self.max_threads) - for list_data in partition: - thread = thread_pool.apply_async(self.fetch_items, (list_data, ids)) - results.append(thread) - documents = [] - for result in [r.get() for r in results]: - if result: - documents.extend(result) - thread_pool.close() - thread_pool.join() - self.threaded_index_documents(documents, LIST_ITEMS) - - def index_drive_items(self, libraries_details, ids): - """ Indexes the drive_items to the Workplace Search - :param libraries_details: dictionary containing library name, library path and id - :param ids: id collection of the all the objects - """ - results = [] - partition = [] - thread_pool = spawn_threads(self.max_threads) - partition = split_dict_in_chunks(libraries_details, self.max_threads) - for list_data in partition: - thread = thread_pool.apply_async(self.fetch_drive_items, (list_data, ids)) - results.append(thread) - documents = [] - for result in [r.get() for r in results]: - if result: - documents.extend(result) - thread_pool.close() - thread_pool.join() - self.threaded_index_documents(documents, DRIVE_ITEMS) - - def indexing(self, collection, ids, storage, job_type, collected_objects, end_time): - """This method fetches all the objects from sharepoint server and - ingests them into the workplace search - :param collection: collection name - :param ids: id collection of the all the objects - :param storage: temporary storage for storing all the documents - :param job_type: denotes the type of sharepoint object being fetched in a particular process - :param collected_objects: helper variable to provide the data to children object - :param end_time: end time for fetching the data - """ - if job_type == "sites": - collected_objects = self.index_sites(ids, end_time, collection) - - elif job_type == "lists": - collected_objects = self.index_lists(collected_objects, ids) - - elif job_type == LIST_ITEMS and LIST_ITEMS in self.objects: - self.index_list_items(collected_objects[0], ids) - - elif job_type == DRIVE_ITEMS and DRIVE_ITEMS in self.objects: - self.index_drive_items(collected_objects[1], ids) - - self.logger.info( - "Completed fetching all the objects for site collection: %s" - % (collection) - ) - - self.logger.info( - "Saving the checkpoint for the site collection: %s" % (collection) - ) - if ids.get(job_type): - prev_ids = storage[job_type] - if job_type == 'sites': - prev_ids.update(ids[job_type]) - elif job_type == "lists": - for site, list_content in ids[job_type].items(): - prev_ids[site] = {**prev_ids.get(site, {}), **ids[job_type][site]} - else: - for site, list_content in ids[job_type].items(): - prev_ids[site] = ids[job_type][site] if not prev_ids.get(site) else prev_ids[site] - for list_name in list_content.keys(): - prev_ids[site][list_name] = list(set([*prev_ids[site].get(list_name, []), *ids[job_type][site][list_name]])) - storage[job_type] = prev_ids - return collected_objects - - -def start(indexing_type, config, logger, workplace_search_client, sharepoint_client): - """Runs the indexing logic - :param indexing_type: The type of the indexing i.e. 
incremental or full - :param config: instance of Configuration class - :param logger: instance of Logger class - :param workplace_search_client: instance of WorkplaceSearch - :param sharepoint_client: instance of SharePoint - """ - logger.info(f"Starting the {indexing_type} indexing..") - current_time = (datetime.utcnow()).strftime("%Y-%m-%dT%H:%M:%SZ") - ids_collection = {"global_keys": {}} - storage_with_collection = {"global_keys": {}, "delete_keys": {}} - - if (os.path.exists(IDS_PATH) and os.path.getsize(IDS_PATH) > 0): - with open(IDS_PATH) as ids_store: - try: - ids_collection = json.load(ids_store) - except ValueError as exception: - logger.exception( - "Error while parsing the json file of the ids store from path: %s. Error: %s" - % (IDS_PATH, exception) - ) - - storage_with_collection["delete_keys"] = copy.deepcopy(ids_collection.get("global_keys")) - check = Checkpoint(config, logger) - - try: - for collection in config.get_value("sharepoint.site_collections"): - storage = {"sites": {}, "lists": {}, "list_items": {}, "drive_items": {}} - logger.info( - "Starting the data fetching for site collection: %s" - % (collection) - ) - - if indexing_type == "incremental": - start_time, end_time = check.get_checkpoint( - collection, current_time) - else: - start_time = config.get_value("start_time") - end_time = current_time - - if not ids_collection["global_keys"].get(collection): - ids_collection["global_keys"][collection] = { - "sites": {}, "lists": {}, "list_items": {}, "drive_items": {}} - - logger.info( - "Starting to index all the objects configured in the object field: %s" - % (str(config.get_value("objects"))) - ) - - indexer = FetchIndex(config, logger, workplace_search_client, sharepoint_client, start_time, end_time) - returned_documents = None - for job_type in ["sites", "lists", "list_items", "drive_items"]: - logger.info(f"Indexing {job_type}") - returned_documents = indexer.indexing( - collection, - ids_collection["global_keys"][collection], - storage, - job_type, - returned_documents, - end_time - ) - - storage_with_collection["global_keys"][collection] = storage.copy() - - check.set_checkpoint(collection, end_time, indexing_type) - except Exception as exception: - raise exception - - with open(IDS_PATH, "w") as file: - try: - json.dump(storage_with_collection, file, indent=4) - except ValueError as exception: - logger.warning( - 'Error while adding ids to json file. 
Error: %s' % (exception)) diff --git a/ees_sharepoint/full_sync_command.py b/ees_sharepoint/full_sync_command.py index 99bfc4a..2d7bdd3 100644 --- a/ees_sharepoint/full_sync_command.py +++ b/ees_sharepoint/full_sync_command.py @@ -8,14 +8,36 @@ It will attempt to sync absolutely all documents that are available in the third-party system and ingest them into Enterprise Search instance.""" from .base_command import BaseCommand -from .fetch_index import start +from .sync_sharepoint import init_sharepoint_sync +from .connector_queue import ConnectorQueue +from .sync_enterprise_search import init_enterprise_search_sync +from multiprocessing import Process class FullSyncCommand(BaseCommand): + """This class start execution of fullsync feature.""" + def execute(self): + """This function execute the start function.""" config = self.config logger = self.logger workplace_search_client = self.workplace_search_client sharepoint_client = self.sharepoint_client - start("full", config, logger, workplace_search_client, sharepoint_client) + queue = ConnectorQueue() + producer = Process( + name="producer", + target=init_sharepoint_sync, + args=("full", config, logger, workplace_search_client, sharepoint_client, queue), + ) + producer.start() + + consumer = Process( + name="consumer", + target=init_enterprise_search_sync, + args=(config, logger, workplace_search_client, queue), + ) + consumer.start() + + producer.join() + consumer.join() diff --git a/ees_sharepoint/incremental_sync_command.py b/ees_sharepoint/incremental_sync_command.py index 604d8f3..d29d1dc 100644 --- a/ees_sharepoint/incremental_sync_command.py +++ b/ees_sharepoint/incremental_sync_command.py @@ -11,14 +11,36 @@ Recency is determined by the time when the last successful incremental or full job was ran.""" from .base_command import BaseCommand -from .fetch_index import start +from .sync_sharepoint import init_sharepoint_sync +from .connector_queue import ConnectorQueue +from .sync_enterprise_search import init_enterprise_search_sync +from multiprocessing import Process class IncrementalSyncCommand(BaseCommand): + """This class start execution of incrementalsync feature.""" + def execute(self): + """This function execute the start function.""" config = self.config logger = self.logger workplace_search_client = self.workplace_search_client sharepoint_client = self.sharepoint_client - start("incremental", config, logger, workplace_search_client, sharepoint_client) + queue = ConnectorQueue() + producer = Process( + name="producer", + target=init_sharepoint_sync, + args=("incremental", config, logger, workplace_search_client, sharepoint_client, queue), + ) + producer.start() + + consumer = Process( + name="consumer", + target=init_enterprise_search_sync, + args=(config, logger, workplace_search_client, queue), + ) + consumer.start() + + producer.join() + consumer.join() diff --git a/ees_sharepoint/permission_sync_command.py b/ees_sharepoint/permission_sync_command.py index 03cc70d..3332458 100644 --- a/ees_sharepoint/permission_sync_command.py +++ b/ees_sharepoint/permission_sync_command.py @@ -7,13 +7,14 @@ It will attempt to remove from Enterprise Search instance the documents that have been deleted from the third-party system.""" -import os import csv +import os + +from ees_sharepoint.base_command import BaseCommand from .checkpointing import Checkpoint +from .sync_sharepoint import get_results from .usergroup_permissions import Permissions -from .fetch_index import get_results -from ees_sharepoint.base_command import BaseCommand class 
PermissionSyncDisabledException(Exception): @@ -33,6 +34,7 @@ class PermissionSyncCommand(BaseCommand): It can be used to run the job that will periodically sync permissions from Sharepoint Server to Elastic Enteprise Search.""" + def __init__(self, args): super().__init__(args) @@ -47,8 +49,8 @@ def __init__(self, args): self.permissions = Permissions(self.sharepoint_client, self.workplace_search_client, self.logger) def get_users_id(self): - """ This method returns the dictionary of dictionaries containing users and their id - as a key value pair for all the site-collections.""" + """This method returns the dictionary of dictionaries containing users and their id + as a key value pair for all the site-collections.""" user_ids = {} for collection in self.site_collections: user_id_collection = {} @@ -65,8 +67,8 @@ def get_users_id(self): return user_ids def get_user_groups(self, user_ids): - """ This method returns the groups of each user in all the site-collections - :param user_ids: user ids to fetch the groups of the specific user""" + """This method returns the groups of each user in all the site-collections + :param user_ids: user ids to fetch the groups of the specific user""" user_group = {} for collection in self.site_collections: user_group_collection = {} @@ -76,41 +78,34 @@ def get_user_groups(self, user_ids): if response: groups = get_results(self.logger, response.json(), "user_groups") if groups: - user_group_collection[name] = [group['Title'] for group in groups] + user_group_collection[name] = [group["Title"] for group in groups] user_group.update({collection: user_group_collection}) return user_group def workplace_add_permission(self, permissions): - """ This method when invoked would index the permission provided in the paramater - for the user in paramter user_name - :param permissions: dictionary of dictionaries containing permissions of all the users in each site-collection.""" + """This method when invoked would index the permission provided in the paramater + for the user in paramter user_name + :param permissions: dictionary of dictionaries containing permissions of all the users in each site-collection.""" for collection in self.site_collections: for user_name, permission_list in permissions[collection].items(): try: self.workplace_search_client.add_user_permissions( content_source_id=self.ws_source, user=user_name, - body={ - "permissions": permission_list - }, - ) - self.logger.info( - "Successfully indexed the permissions for user %s to the workplace" % ( - user_name - ) + body={"permissions": permission_list}, ) + self.logger.info("Successfully indexed the permissions for user %s to the workplace" % (user_name)) except Exception as exception: self.logger.exception( - "Error while indexing the permissions for user: %s to the workplace. Error: %s" % ( - user_name, exception - ) + "Error while indexing the permissions for user: %s to the workplace. 
Error: %s" + % (user_name, exception) ) def sync_permissions(self): - """ This method when invoked, checks the permission of SharePoint users and update those user - permissions in the Workplace Search.""" + """This method when invoked, checks the permission of SharePoint users and update those user + permissions in the Workplace Search.""" rows = {} - if (os.path.exists(self.mapping_sheet_path) and os.path.getsize(self.mapping_sheet_path) > 0): + if os.path.exists(self.mapping_sheet_path) and os.path.getsize(self.mapping_sheet_path) > 0: with open(self.mapping_sheet_path) as file: csvreader = csv.reader(file) for row in csvreader: @@ -133,7 +128,7 @@ def sync_permissions(self): self.workplace_add_permission(user_groups) def execute(self): - """ Runs the permission indexing logic""" + """Runs the permission indexing logic""" logger = self.logger config = self.config @@ -141,6 +136,6 @@ def execute(self): enable_permission = config.get_value("enable_document_permission") if not enable_permission: - logger.warn('Exiting as the enable permission flag is set to False') + logger.warn("Exiting as the enable permission flag is set to False") raise PermissionSyncDisabledException self.sync_permissions() diff --git a/ees_sharepoint/schema.py b/ees_sharepoint/schema.py index bf63ad0..807c6ca 100644 --- a/ees_sharepoint/schema.py +++ b/ees_sharepoint/schema.py @@ -165,10 +165,16 @@ def coerce_rfc_3339_date(input_date): 'default': 3, 'min': 1 }, - 'max_threads': { + 'sharepoint_sync_thread_count': { 'required': False, 'type': 'integer', - 'default': 40, + 'default': 5, + 'min': 1 + }, + 'enterprise_search_sync_thread_count': { + 'required': False, + 'type': 'integer', + 'default': 5, 'min': 1 }, 'sharepoint_workplace_user_mapping': { diff --git a/ees_sharepoint/sync_enterprise_search.py b/ees_sharepoint/sync_enterprise_search.py new file mode 100644 index 0000000..87e8bb4 --- /dev/null +++ b/ees_sharepoint/sync_enterprise_search.py @@ -0,0 +1,76 @@ +# +# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +# or more contributor license agreements. Licensed under the Elastic License 2.0; +# you may not use this file except in compliance with the Elastic License 2.0. +# +from multiprocessing.pool import ThreadPool +from .utils import split_documents_into_equal_chunks +from .checkpointing import Checkpoint + +BATCH_SIZE = 100 + + +class SyncEnterpriseSearch: + """This class allows ingesting documents to Elastic Enterprise Search.""" + + def __init__(self, config, logger, workplace_search_client, queue): + self.config = config + self.logger = logger + self.workplace_search_client = workplace_search_client + self.ws_source = config.get_value("workplace_search.source_id") + self.enterprise_search_thread_count = config.get_value("enterprise_search_sync_thread_count") + self.thread_pool = ThreadPool(self.enterprise_search_thread_count) + self.queue = queue + + def index_documents(self, documents): + """This method indexes the documents to the Enterprise Search. + :param documents: documents to be indexed + """ + total_documents_indexed = 0 + if documents: + responses = self.workplace_search_client.index_documents( + content_source_id=self.ws_source, documents=documents + ) + for response in responses["results"]: + if not response["errors"]: + total_documents_indexed += 1 + else: + self.logger.error("Error while indexing %s. 
Error: %s" % (response["id"], response["errors"])) + self.logger.info("Successfully indexed %s documents to the workplace" % (total_documents_indexed)) + + def perform_sync(self): + """Pull documents from the queue and synchronize it to the Enterprise Search.""" + checkpoint = Checkpoint(self.config, self.logger) + signal_open = True + while signal_open: + for _ in range(0, self.enterprise_search_thread_count): + documents_to_index = [] + while len(documents_to_index) < BATCH_SIZE: + documents = self.queue.get() + if documents.get("type") == "signal_close": + signal_open = False + break + elif documents.get("type") == "checkpoint": + checkpoint.set_checkpoint( + documents.get("data")[0], documents.get("data")[1], documents.get("data")[2] + ) + break + else: + documents_to_index.extend(documents.get("data")) + for chunk in split_documents_into_equal_chunks(documents_to_index, BATCH_SIZE): + self.thread_pool.apply_async(self.index_documents, (chunk,)) + if not signal_open: + break + self.thread_pool.close() + self.thread_pool.join() + + +def init_enterprise_search_sync(config, logger, workplace_search_client, queue): + """Runs the indexing logic + :param config: instance of Configuration class + :param logger: instance of Logger class + :param workplace_search_client: instance of WorkplaceSearch + :param queue: Shared queue to push the objects fetched from SharePoint + """ + indexer = SyncEnterpriseSearch(config, logger, workplace_search_client, queue) + indexer.perform_sync() diff --git a/ees_sharepoint/sync_sharepoint.py b/ees_sharepoint/sync_sharepoint.py new file mode 100644 index 0000000..1884c79 --- /dev/null +++ b/ees_sharepoint/sync_sharepoint.py @@ -0,0 +1,644 @@ +# +# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +# or more contributor license agreements. Licensed under the Elastic License 2.0; +# you may not use this file except in compliance with the Elastic License 2.0. +# +"""sync_sharepoint module allows to sync data to Elastic Enterprise Search. + +It's possible to run full syncs and incremental syncs with this module.""" + +import copy +import json +import os +import re +from datetime import datetime +from urllib.parse import urljoin +from dateutil.parser import parse +from multiprocessing.pool import ThreadPool +from tika.tika import TikaException + +from . import adapter +from .checkpointing import Checkpoint +from .usergroup_permissions import Permissions +from .utils import ( + encode, + extract, + split_list_into_buckets, + split_date_range_into_chunks, + split_documents_into_equal_chunks, +) + +IDS_PATH = os.path.join(os.path.dirname(__file__), "doc_id.json") + +SITE = "site" +LIST = "list" +ITEM = "item" +SITES = "sites" +LISTS = "lists" +LIST_ITEMS = "list_items" +DRIVE_ITEMS = "drive_items" + + +def get_results(logger, response, entity_name): + """Attempts to fetch results from a Sharepoint Server response + :param response: response from the sharepoint client + :param entity_name: entity name whether it is SITES, LISTS, LIST_ITEMS OR DRIVE_ITEMS + Returns: + Parsed response + """ + if not response: + logger.error(f"Empty response when fetching {entity_name}") # TODO: should it be an error? 
+ return None + + if entity_name == "attachment" and not response.get("d", {}).get("results"): + logger.info("Failed to fetch attachment") # TODO: not sure if it's the right message + return None + return response.get("d", {}).get("results") + + +class SyncSharepoint: + """This class allows synching objects from the SharePoint Server.""" + + def __init__(self, config, logger, workplace_search_client, sharepoint_client, start_time, end_time, queue): + self.config = config + self.logger = logger + self.workplace_search_client = workplace_search_client + self.sharepoint_client = sharepoint_client + + self.ws_source = config.get_value("workplace_search.source_id") + self.objects = config.get_value("objects") + self.site_collections = config.get_value("sharepoint.site_collections") + self.enable_permission = config.get_value("enable_document_permission") + self.start_time = start_time + self.end_time = end_time + self.sharepoint_thread_count = config.get_value("sharepoint_sync_thread_count") + self.mapping_sheet_path = config.get_value("sharepoint_workplace_user_mapping") + + self.checkpoint = Checkpoint(config, logger) + self.permissions = Permissions(self.sharepoint_client, self.workplace_search_client, logger) + + self.thread_pool = ThreadPool(self.sharepoint_thread_count) + self.queue = queue + + def get_schema_fields(self, document_name): + """returns the schema of all the include_fields or exclude_fields specified in the configuration file. + :param document_name: document name from SITES, LISTS, LIST_ITEMS OR DRIVE_ITEMS + Returns: + schema: included and excluded fields schema + """ + fields = self.objects.get(document_name) + adapter_schema = adapter.DEFAULT_SCHEMA[document_name] + field_id = adapter_schema["id"] + if fields: + include_fields = fields.get("include_fields") + exclude_fields = fields.get("exclude_fields") + if include_fields: + adapter_schema = {key: val for key, val in adapter_schema.items() if val in include_fields} + elif exclude_fields: + adapter_schema = {key: val for key, val in adapter_schema.items() if val not in exclude_fields} + adapter_schema["id"] = field_id + return adapter_schema + + def fetch_sites(self, parent_site_url, sites, ids, index, start_time, end_time): + """This method fetches sites from a collection and invokes the + index permission method to get the document level permissions. + If the fetching is not successful, it logs proper message. 
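+        Illustrative shape of the return value (placeholder values):
+            sites     -> {"/sites/collection/subsite": "2022-03-01T10:00:00Z", ...}
+            documents -> {"type": "sites", "data": [{"type": "site", ...}, ...]}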
+        :param parent_site_url: parent site relative path
+        :param sites: dictionary of site path and its last updated time
+        :param ids: structure containing ids of all objects
+        :param index: index, boolean value
+        :param start_time: start time for fetching the data
+        :param end_time: end time for fetching the data
+        Returns:
+            document: response of sharepoint GET call, with fields specified in the schema
+        """
+        rel_url = f"{parent_site_url}/_api/web/webs"
+        self.logger.info("Fetching the site details from url: %s" % (rel_url))
+        query = self.sharepoint_client.get_query(start_time, end_time, SITES)
+        response = self.sharepoint_client.get(rel_url, query, SITES)
+
+        response_data = get_results(self.logger, response, SITES)
+        if not response_data:
+            self.logger.info(
+                "No sites were created in %s for this interval: start time: %s and end time: %s"
+                % (parent_site_url, start_time, end_time)
+            )
+            return sites
+        self.logger.info("Successfully fetched and parsed %s sites response from SharePoint" % len(response_data))
+
+        schema = self.get_schema_fields(SITES)
+        document = []
+
+        if index:
+            for i, _ in enumerate(response_data):
+                doc = {"type": SITE}
+                # Convert the created date to ISO (RFC 3339) format, otherwise Workplace Search rejects it:
+                # Invalid field value: Value '2021-09-29T08:13:00' cannot be parsed as a date
+                response_data[i]["Created"] += "Z"
+                for field, response_field in schema.items():
+                    doc[field] = response_data[i].get(response_field)
+                if self.enable_permission is True:
+                    doc["_allow_permissions"] = self.fetch_permissions(
+                        key=SITES, site=response_data[i]["ServerRelativeUrl"]
+                    )
+                document.append(doc)
+                ids["sites"].update({doc["id"]: response_data[i]["ServerRelativeUrl"]})
+        for result in response_data:
+            site_server_url = result.get("ServerRelativeUrl")
+            sites.update({site_server_url: result.get("LastItemModifiedDate")})
+            self.fetch_sites(site_server_url, sites, ids, index, start_time, end_time)
+
+        documents = {"type": SITES, "data": document}
+        return sites, documents
+
+    def fetch_lists(self, sites, ids, index):
+        """This method fetches lists from all sites in a collection and invokes the
+        index permission method to get the document level permissions.
+        If the fetching is not successful, it logs proper message.
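+        Illustrative shape of the return values (placeholder values):
+            lists/libraries -> {"<list-guid>": ["/sites/collection/subsite", "Documents", "2022-03-01T10:00:00Z"]}
+            documents       -> {"type": "lists", "data": [{"type": "list", ...}, ...]}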
+ :param sites: dictionary of site path and it's last updated time + :param ids: structure containing id's of all objects + :param index: index, boolean value + Returns: + document: response of sharepoint GET call, with fields specified in the schema + """ + self.logger.info("Fetching lists for all the sites") + responses = [] + document = [] + if not sites: + self.logger.info( + "No list was created in this interval: start time: %s and end time: %s" + % (self.start_time, self.end_time) + ) + return [], [] + schema_list = self.get_schema_fields(LISTS) + for site_details in sites: + for site, time_modified in site_details.items(): + if parse(self.start_time) > parse(time_modified): + continue + rel_url = f"{site}/_api/web/lists" + self.logger.info("Fetching the lists for site: %s from url: %s" % (site, rel_url)) + + query = self.sharepoint_client.get_query(self.start_time, self.end_time, LISTS) + response = self.sharepoint_client.get(rel_url, query, LISTS) + + response_data = get_results(self.logger, response, LISTS) + if not response_data: + self.logger.info( + "No list was created for the site : %s in this interval: start time: %s and end time: %s" + % (site, self.start_time, self.end_time) + ) + continue + self.logger.info( + "Successfully fetched and parsed %s list response for site: %s from SharePoint" + % (len(response_data), site) + ) + + base_list_url = f"{site}/Lists/" + + if index: + if not ids["lists"].get(site): + ids["lists"].update({site: {}}) + for i, _ in enumerate(response_data): + doc = {"type": LIST} + for field, response_field in schema_list.items(): + doc[field] = response_data[i].get(response_field) + if self.enable_permission is True: + doc["_allow_permissions"] = self.fetch_permissions( + key=LISTS, + site=site, + list_id=doc["id"], + list_url=response_data[i]["ParentWebUrl"], + itemid=None, + ) + doc["url"] = urljoin(base_list_url, re.sub(r"[^ \w+]", "", response_data[i]["Title"])) + document.append(doc) + ids["lists"][site].update({doc["id"]: response_data[i]["Title"]}) + + responses.append(response_data) + lists = {} + libraries = {} + for response in responses: + for result in response: + if result.get("BaseType") == 1: + libraries[result.get("Id")] = [ + result.get("ParentWebUrl"), + result.get("Title"), + result.get("LastItemModifiedDate"), + ] + else: + lists[result.get("Id")] = [ + result.get("ParentWebUrl"), + result.get("Title"), + result.get("LastItemModifiedDate"), + ] + documents = {"type": LISTS, "data": document} + return lists, libraries, documents + + def fetch_items(self, lists, ids): + """This method fetches items from all the lists in a collection and + invokes theindex permission method to get the document level permissions. + If the fetching is not successful, it logs proper message. 
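+        Illustrative shape of the returned documents (placeholder values):
+            {"type": "list_items", "data": [{"type": "item", "url": ".../Lists/Tasks/DispForm.aspx?ID=1", ...}, ...]}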
+ :param lists: document lists + :param ids: structure containing id's of all objects + Returns: + document: response of sharepoint GET call, with fields specified in the schema + """ + responses = [] + # here value is a list of url and title + self.logger.info("Fetching all the items for the lists") + if not lists: + self.logger.info( + "No item was created in this interval: start time: %s and end time: %s" + % (self.start_time, self.end_time) + ) + else: + for value in lists.values(): + if not ids["list_items"].get(value[0]): + ids["list_items"].update({value[0]: {}}) + schema_item = self.get_schema_fields(LIST_ITEMS) + for list_content, value in lists.items(): + if parse(self.start_time) > parse(value[2]): + continue + rel_url = f"{value[0]}/_api/web/lists(guid'{list_content}')/items" + self.logger.info("Fetching the items for list: %s from url: %s" % (value[1], rel_url)) + + query = self.sharepoint_client.get_query(self.start_time, self.end_time, LIST_ITEMS) + response = self.sharepoint_client.get(rel_url, query, LIST_ITEMS) + + response_data = get_results(self.logger, response, LIST_ITEMS) + if not response_data: + self.logger.info( + "No item was created for the list %s in this interval: start time: %s and end time: %s" + % (value[1], self.start_time, self.end_time) + ) + continue + self.logger.info( + "Successfully fetched and parsed %s listitem response for list: %s from SharePoint" + % (len(response_data), value[1]) + ) + + list_name = re.sub(r"[^ \w+]", "", value[1]) + base_item_url = f"{value[0]}/Lists/{list_name}/DispForm.aspx?ID=" + document = [] + if not ids["list_items"][value[0]].get(list_content): + ids["list_items"][value[0]].update({list_content: []}) + rel_url = f"{value[0]}/_api/web/lists(guid'{list_content}')/items?$select=Attachments,AttachmentFiles,Title&$expand=AttachmentFiles" + + new_query = "&" + query.split("?")[1] + file_response_data = self.sharepoint_client.get(rel_url, query=new_query, param_name="attachment") + if file_response_data: + file_response_data = get_results(self.logger, file_response_data.json(), "attachment") + + for i, _ in enumerate(response_data): + doc = {"type": ITEM} + if response_data[i].get("Attachments") and file_response_data: + for data in file_response_data: + if response_data[i].get("Title") == data["Title"]: + file_relative_url = data["AttachmentFiles"]["results"][0]["ServerRelativeUrl"] + url_s = f"{value[0]}/_api/web/GetFileByServerRelativeUrl('{encode(file_relative_url)}')/$value" + response = self.sharepoint_client.get(url_s, query="", param_name="attachment") + doc["body"] = {} + if response and response.ok: + try: + doc["body"] = extract(response.content) + except TikaException as exception: + self.logger.error( + "Error while extracting the contents from the attachment, Error %s" + % (exception) + ) + + break + for field, response_field in schema_item.items(): + doc[field] = response_data[i].get(response_field) + if self.enable_permission is True: + doc["_allow_permissions"] = self.fetch_permissions( + key=LIST_ITEMS, list_id=list_content, list_url=value[0], itemid=str(response_data[i]["Id"]) + ) + doc["url"] = base_item_url + str(response_data[i]["Id"]) + document.append(doc) + if response_data[i].get("GUID") not in ids["list_items"][value[0]][list_content]: + ids["list_items"][value[0]][list_content].append(response_data[i].get("GUID")) + responses.extend(document) + documents = {"type": LIST_ITEMS, "data": responses} + return documents + + def fetch_drive_items(self, libraries, ids): + """This method fetches items from 
all the lists in a collection and + invokes theindex permission method to get the document level permissions. + If the fetching is not successful, it logs proper message. + :param libraries: document lists + :param ids: structure containing id's of all objects + """ + responses = [] + # here value is a list of url and title of the library + self.logger.info("Fetching all the files for the library") + if not libraries: + self.logger.info( + "No file was created in this interval: start time: %s and end time: %s" + % (self.start_time, self.end_time) + ) + else: + schema_drive = self.get_schema_fields(DRIVE_ITEMS) + for lib_content, value in libraries.items(): + if parse(self.start_time) > parse(value[2]): + continue + if not ids["drive_items"].get(value[0]): + ids["drive_items"].update({value[0]: {}}) + rel_url = f"{value[0]}/_api/web/lists(guid'{lib_content}')/items?$select=Modified,Id,GUID,File,Folder&$expand=File,Folder" + self.logger.info("Fetching the items for libraries: %s from url: %s" % (value[1], rel_url)) + query = self.sharepoint_client.get_query(self.start_time, self.end_time, DRIVE_ITEMS) + response = self.sharepoint_client.get(rel_url, query, DRIVE_ITEMS) + response_data = get_results(self.logger, response, DRIVE_ITEMS) + if not response_data: + self.logger.info( + "No item was created for the library %s in this interval: start time: %s and end time: %s" + % (value[1], self.start_time, self.end_time) + ) + continue + self.logger.info( + "Successfully fetched and parsed %s drive item response for library: %s from SharePoint" + % (len(response_data), value[1]) + ) + document = [] + if not ids["drive_items"][value[0]].get(lib_content): + ids["drive_items"][value[0]].update({lib_content: []}) + for i, _ in enumerate(response_data): + if response_data[i]["File"].get("TimeLastModified"): + obj_type = "File" + doc = {"type": "file"} + file_relative_url = response_data[i]["File"]["ServerRelativeUrl"] + url_s = f"{value[0]}/_api/web/GetFileByServerRelativeUrl('{encode(file_relative_url)}')/$value" + response = self.sharepoint_client.get(url_s, query="", param_name="attachment") + doc["body"] = {} + if response and response.ok: + try: + doc["body"] = extract(response.content) + except TikaException as exception: + self.logger.error( + "Error while extracting the contents from the file at %s, Error %s" + % (response_data[i].get("Url"), exception) + ) + else: + obj_type = "Folder" + doc = {"type": "folder"} + for field, response_field in schema_drive.items(): + doc[field] = response_data[i][obj_type].get(response_field) + doc["id"] = response_data[i].get("GUID") + if self.enable_permission is True: + doc["_allow_permissions"] = self.fetch_permissions( + key=DRIVE_ITEMS, + list_id=lib_content, + list_url=value[0], + itemid=str(response_data[i].get("ID")), + ) + doc["url"] = response_data[i][obj_type]["ServerRelativeUrl"] + document.append(doc) + if doc["id"] not in ids["drive_items"][value[0]][lib_content]: + ids["drive_items"][value[0]][lib_content].append(doc["id"]) + responses.extend(document) + documents = {"type": DRIVE_ITEMS, "data": responses} + return documents + + def get_roles(self, key, site, list_url, list_id, itemid): + """Checks the permissions and returns the user roles. 
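+        Illustrative calls (placeholder arguments):
+            self.get_roles(SITES, "/sites/collection/subsite", None, None, None)
+            self.get_roles(LIST_ITEMS, None, "/sites/collection/subsite", "<list-guid>", "42")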
+ :param key: key, a string value + :param site: site name to check the permission + :param list_url: list url to access the list + :param list_id: list id to check the permission + :param itemid: item id to check the permission + Returns: + roles: user roles + """ + if key == SITES: + rel_url = site + roles = self.permissions.fetch_users(key, rel_url) + + elif key == LISTS: + rel_url = list_url + roles = self.permissions.fetch_users(key, rel_url, list_id=list_id) + + else: + rel_url = list_url + roles = self.permissions.fetch_users(key, rel_url, list_id=list_id, item_id=itemid) + + return roles + + def fetch_permissions( + self, + key, + site=None, + list_id=None, + list_url=None, + itemid=None, + ): + """This method when invoked, checks the permission inheritance of each object. + If the object has unique permissions, the list of users having access to it + is fetched using sharepoint api else the permission levels of the that object + is taken same as the permission level of the site collection. + :param key: key, a string value + :param site: site name to index the permission for the site + :param list_id: list id to index the permission for the list + :param list_url: url of the list + :param itemid: item id to index the permission for the item + Returns: + groups: list of users having access to the given object + """ + roles = self.get_roles(key, site, list_url, list_id, itemid) + + groups = [] + + if not roles: + return [] + roles = get_results(self.logger, roles.json(), "roles") + + for role in roles: + title = role["Member"]["Title"] + groups.append(title) + return groups + + def fetch_and_append_sites_to_queue(self, ids, end_time, collection): + """Fetches and appends site details to queue + :param ids: id collection of the all the objects + :param end_time: end time for fetching the data + :param collection: collection name + """ + _, datelist = split_date_range_into_chunks(self.start_time, self.end_time, self.sharepoint_thread_count) + results = [] + parent_site_url = f"/sites/{collection}" + sites_path = [{parent_site_url: end_time}] + for num in range(0, self.sharepoint_thread_count): + start_time_partition = datelist[num] + end_time_partition = datelist[num + 1] + thread = self.thread_pool.apply_async( + self.fetch_sites, + (parent_site_url, {}, ids, (SITES in self.objects), start_time_partition, end_time_partition), + callback=self.queue.append_to_queue, + ) + results.append(thread) + + sites = [] + for result in [r.get() for r in results]: + if result: + sites.append(result[0]) + + sites_path.extend(sites) + return sites_path + + def fetch_and_append_lists_to_queue(self, sites_path, ids): + """Fetches and appends list details to queue + :param sites_path: dictionary of site path and it's last updated time + :param ids: id collection of the all the objects + """ + results, lists_details, libraries_details = [], {}, {} + partitioned_sites = split_list_into_buckets(sites_path, self.sharepoint_thread_count) + for site in partitioned_sites: + thread = self.thread_pool.apply_async( + self.fetch_lists, (site, ids, (LISTS in self.objects)), callback=self.queue.append_to_queue + ) + results.append(thread) + for result in [r.get() for r in results]: + if result: + lists_details.update(result[0]) + libraries_details.update(result[1]) + return [lists_details, libraries_details] + + def fetch_and_append_list_items_to_queue(self, lists_details, ids): + """Fetches and appends list_items to the queue + :param lists_details: dictionary containing list name, list path and id + :param ids: 
id collection of all the objects
+        """
+        partition = []
+        partition = split_documents_into_equal_chunks(lists_details, self.sharepoint_thread_count)
+        for list_data in partition:
+            self.thread_pool.apply_async(self.fetch_items, (list_data, ids), callback=self.queue.append_to_queue)
+
+    def fetch_and_append_drive_items_to_queue(self, libraries_details, ids):
+        """Fetches and appends the drive items to the queue
+        :param libraries_details: dictionary containing library name, library path and id
+        :param ids: id collection of all the objects
+        """
+        partition = []
+        partition = split_documents_into_equal_chunks(libraries_details, self.sharepoint_thread_count)
+        for list_data in partition:
+            self.thread_pool.apply_async(self.fetch_drive_items, (list_data, ids), callback=self.queue.append_to_queue)
+
+    def perform_sync(self, collection, ids, storage, job_type, collected_objects, end_time):
+        """This method fetches all the objects from the SharePoint server
+        :param collection: collection name
+        :param ids: id collection of all the objects
+        :param storage: temporary storage for storing all the documents
+        :param job_type: denotes the type of sharepoint object being fetched in a particular process
+        :param collected_objects: helper variable to provide the data to child objects
+        :param end_time: end time for fetching the data
+        """
+        if job_type == "sites":
+            collected_objects = self.fetch_and_append_sites_to_queue(ids, end_time, collection)
+
+        elif job_type == "lists":
+            collected_objects = self.fetch_and_append_lists_to_queue(collected_objects, ids)
+
+        elif job_type == LIST_ITEMS and LIST_ITEMS in self.objects:
+            self.fetch_and_append_list_items_to_queue(collected_objects[0], ids)
+
+        elif job_type == DRIVE_ITEMS and DRIVE_ITEMS in self.objects:
+            self.fetch_and_append_drive_items_to_queue(collected_objects[1], ids)
+
+        self.logger.info("Completed fetching all the objects for site collection: %s" % (collection))
+
+        self.logger.info("Saving the checkpoint for the site collection: %s" % (collection))
+        if ids.get(job_type):
+            prev_ids = storage[job_type]
+            if job_type == "sites":
+                prev_ids.update(ids[job_type])
+            elif job_type == "lists":
+                for site, list_content in ids[job_type].items():
+                    prev_ids[site] = {**prev_ids.get(site, {}), **ids[job_type][site]}
+            else:
+                for site, list_content in ids[job_type].items():
+                    prev_ids[site] = ids[job_type][site] if not prev_ids.get(site) else prev_ids[site]
+                    for list_name in list_content.keys():
+                        prev_ids[site][list_name] = list(
+                            set([*prev_ids[site].get(list_name, []), *ids[job_type][site][list_name]])
+                        )
+            storage[job_type] = prev_ids
+        return collected_objects
+
+
+def init_sharepoint_sync(indexing_type, config, logger, workplace_search_client, sharepoint_client, queue):
+    """Initialize the process for syncing
+    :param indexing_type: The type of the indexing i.e.
incremental or full + :param config: instance of Configuration class + :param logger: instance of Logger class + :param workplace_search_client: instance of WorkplaceSearch + :param sharepoint_client: instance of SharePoint + :param queue: Shared queue to push the objects fetched from SharePoint + """ + logger.info(f"Starting the {indexing_type} indexing..") + current_time = (datetime.utcnow()).strftime("%Y-%m-%dT%H:%M:%SZ") + ids_collection = {"global_keys": {}} + storage_with_collection = {"global_keys": {}, "delete_keys": {}} + + if os.path.exists(IDS_PATH) and os.path.getsize(IDS_PATH) > 0: + with open(IDS_PATH) as ids_store: + try: + ids_collection = json.load(ids_store) + except ValueError as exception: + logger.exception( + "Error while parsing the json file of the ids store from path: %s. Error: %s" + % (IDS_PATH, exception) + ) + + storage_with_collection["delete_keys"] = copy.deepcopy(ids_collection.get("global_keys")) + check = Checkpoint(config, logger) + + try: + for collection in config.get_value("sharepoint.site_collections"): + storage = {"sites": {}, "lists": {}, "list_items": {}, "drive_items": {}} + logger.info("Starting the data fetching for site collection: %s" % (collection)) + + if indexing_type == "incremental": + start_time, end_time = check.get_checkpoint(collection, current_time) + else: + start_time = config.get_value("start_time") + end_time = current_time + + if not ids_collection["global_keys"].get(collection): + ids_collection["global_keys"][collection] = { + "sites": {}, + "lists": {}, + "list_items": {}, + "drive_items": {}, + } + + logger.info( + "Starting to index all the objects configured in the object field: %s" + % (str(config.get_value("objects"))) + ) + + sync_sharepoint = SyncSharepoint( + config, logger, workplace_search_client, sharepoint_client, start_time, end_time, queue + ) + returned_documents = None + for job_type in ["sites", "lists", "list_items", "drive_items"]: + logger.info(f"Indexing {job_type}") + returned_documents = sync_sharepoint.perform_sync( + collection, + ids_collection["global_keys"][collection], + storage, + job_type, + returned_documents, + end_time, + ) + sync_sharepoint.thread_pool.close() + sync_sharepoint.thread_pool.join() + queue.put_checkpoint(collection, end_time, indexing_type) + + storage_with_collection["global_keys"][collection] = storage.copy() + queue.end_signal() + except Exception as exception: + raise exception + + with open(IDS_PATH, "w") as file: + try: + json.dump(storage_with_collection, file, indent=4) + except ValueError as exception: + logger.warning("Error while adding ids to json file. 
Error: %s" % (exception)) diff --git a/ees_sharepoint/utils.py b/ees_sharepoint/utils.py index ceba89f..8f77b87 100644 --- a/ees_sharepoint/utils.py +++ b/ees_sharepoint/utils.py @@ -6,68 +6,69 @@ """This module contains uncategorized utility methods.""" import urllib.parse - -from tika import parser from datetime import datetime -from multiprocessing.pool import ThreadPool +from tika import parser + DATETIME_FORMAT = "%Y-%m-%dT%H:%M:%SZ" def extract(content): - """ Extracts the contents - :param content: content to be extracted - Returns: - parsed_test: parsed text""" + """Extracts the contents + :param content: content to be extracted + Returns: + parsed_test: parsed text""" parsed = parser.from_buffer(content) - parsed_text = parsed['content'] + parsed_text = parsed["content"] return parsed_text def encode(object_name): """Performs encoding on the name of objects - containing special characters in their url, and - replaces single quote with two single quote since quote - is treated as an escape character in odata - :param object_name: name that contains special characters""" + containing special characters in their url, and + replaces single quote with two single quote since quote + is treated as an escape character in odata + :param object_name: name that contains special characters""" name = urllib.parse.quote(object_name, safe="'") return name.replace("'", "''") -def split_list_into_buckets(object_list, total_groups): - """ Divides the list in groups of approximately equal sizes - :param object_list: list to be partitioned - :param total_groups: number of groups to be formed +def split_list_into_buckets(documents, total_buckets): + """Divide large number of documents amongst the total buckets + :param documents: list to be partitioned + :param total_buckets: number of groups to be formed """ - if object_list: - groups = min(total_groups, len(object_list)) + if documents: + groups = min(total_buckets, len(documents)) group_list = [] for i in range(groups): - group_list.append(object_list[i::groups]) + group_list.append(documents[i::groups]) return group_list else: return [] -def split_dict_in_chunks(input_dict, chunk_size): - """ This method splits a dictionary into separate chunks with maximum size - as chunk_size - :param input_dict: Dictionary to be partitioned into chunks - :param chunk_size: Maximum size of a chunk - Returns: - list_of_chunks: List containing the chunks +def split_documents_into_equal_chunks(documents, chunk_size): + """This method splits a list or dictionary into equal chunks size + :param documents: List or Dictionary to be partitioned into chunks + :param chunk_size: Maximum size of a chunk + Returns: + list_of_chunks: List containing the chunks """ list_of_chunks = [] - for i in range(0, len(input_dict), chunk_size): - partitioned_chunk = list(input_dict.items())[i:i + chunk_size] - list_of_chunks.append(dict(partitioned_chunk)) + for i in range(0, len(documents), chunk_size): + if type(documents) is dict: + partitioned_chunk = list(documents.items())[i: i + chunk_size] + list_of_chunks.append(dict(partitioned_chunk)) + else: + list_of_chunks.append(documents[i: i + chunk_size]) return list_of_chunks def split_date_range_into_chunks(start_time, end_time, number_of_threads): - """ Divides the timerange in equal partitions by number of threads - :param start_time: start time of the interval - :param end_time: end time of the interval - :param number_of_threads: number of threads defined by user in config file + """Divides the timerange in equal partitions by 
number of threads + :param start_time: start time of the interval + :param end_time: end time of the interval + :param number_of_threads: number of threads defined by user in config file """ start_time = datetime.strptime(start_time, DATETIME_FORMAT) end_time = datetime.strptime(end_time, DATETIME_FORMAT) @@ -80,10 +81,3 @@ def split_date_range_into_chunks(start_time, end_time, number_of_threads): formatted_end_time = end_time.strftime(DATETIME_FORMAT) datelist.append(formatted_end_time) return formatted_end_time, datelist - - -def spawn_threads(max_threads): - """ Spawns number of threads provided by user in the config file - :param max_threads: maximum number of threads defined by user - """ - return ThreadPool(max_threads) diff --git a/sharepoint_server_connector.yml b/sharepoint_server_connector.yml index 948b328..7131e9b 100644 --- a/sharepoint_server_connector.yml +++ b/sharepoint_server_connector.yml @@ -44,7 +44,9 @@ end_time : log_level: INFO #The number of retries to perform in case of server error. The connector will use exponential backoff for retry mechanism retry_count: 3 -#Number of threads to be used in multithreading for the connector. -max_threads: 40 +#Number of threads to be used in multithreading for the sharepoint sync. +sharepoint_sync_thread_count: 5 +#Number of threads to be used in multithreading for the enterprise search sync. +enterprise_search_sync_thread_count: 5 #the path of csv file containing mapping of sharepoint user ID to Workplace user ID sharepoint_workplace_user_mapping: "C:/Users/abc/folder_name/file_name.csv" From 2b60766df3af6a1f7f2341c72e6c40a362883f5a Mon Sep 17 00:00:00 2001 From: praveen-elastic Date: Sat, 26 Mar 2022 05:38:23 +0530 Subject: [PATCH 5/9] Fixing the issue of Pickling on Windows --- ees_sharepoint/full_sync_command.py | 7 +++---- ees_sharepoint/incremental_sync_command.py | 7 +++---- ees_sharepoint/sync_enterprise_search.py | 8 ++++++-- ees_sharepoint/sync_sharepoint.py | 10 +++++++--- 4 files changed, 19 insertions(+), 13 deletions(-) diff --git a/ees_sharepoint/full_sync_command.py b/ees_sharepoint/full_sync_command.py index 2d7bdd3..4e6e4df 100644 --- a/ees_sharepoint/full_sync_command.py +++ b/ees_sharepoint/full_sync_command.py @@ -21,21 +21,20 @@ def execute(self): """This function execute the start function.""" config = self.config logger = self.logger - workplace_search_client = self.workplace_search_client - sharepoint_client = self.sharepoint_client + args = self.args queue = ConnectorQueue() producer = Process( name="producer", target=init_sharepoint_sync, - args=("full", config, logger, workplace_search_client, sharepoint_client, queue), + args=("full", config, logger, queue, args), ) producer.start() consumer = Process( name="consumer", target=init_enterprise_search_sync, - args=(config, logger, workplace_search_client, queue), + args=(config, logger, queue, args), ) consumer.start() diff --git a/ees_sharepoint/incremental_sync_command.py b/ees_sharepoint/incremental_sync_command.py index d29d1dc..1112c77 100644 --- a/ees_sharepoint/incremental_sync_command.py +++ b/ees_sharepoint/incremental_sync_command.py @@ -24,21 +24,20 @@ def execute(self): """This function execute the start function.""" config = self.config logger = self.logger - workplace_search_client = self.workplace_search_client - sharepoint_client = self.sharepoint_client + args = self.args queue = ConnectorQueue() producer = Process( name="producer", target=init_sharepoint_sync, - args=("incremental", config, logger, workplace_search_client, 
sharepoint_client, queue), + args=("incremental", config, logger, queue, args), ) producer.start() consumer = Process( name="consumer", target=init_enterprise_search_sync, - args=(config, logger, workplace_search_client, queue), + args=(config, logger, queue, args), ) consumer.start() diff --git a/ees_sharepoint/sync_enterprise_search.py b/ees_sharepoint/sync_enterprise_search.py index 87e8bb4..604c5e0 100644 --- a/ees_sharepoint/sync_enterprise_search.py +++ b/ees_sharepoint/sync_enterprise_search.py @@ -5,6 +5,7 @@ # from multiprocessing.pool import ThreadPool from .utils import split_documents_into_equal_chunks +from .base_command import BaseCommand from .checkpointing import Checkpoint BATCH_SIZE = 100 @@ -65,12 +66,15 @@ def perform_sync(self): self.thread_pool.join() -def init_enterprise_search_sync(config, logger, workplace_search_client, queue): +def init_enterprise_search_sync(config, logger, queue, args): """Runs the indexing logic :param config: instance of Configuration class :param logger: instance of Logger class - :param workplace_search_client: instance of WorkplaceSearch :param queue: Shared queue to push the objects fetched from SharePoint + :param args: The command line arguments passed from the base command """ + # Added this workaround of initializing the base_command since workplace_search_client and sharepoint_client cannot be passed in the Process argument as doing so would throw pickling errors on Windows + base_command = BaseCommand(args) + workplace_search_client = base_command.workplace_search_client indexer = SyncEnterpriseSearch(config, logger, workplace_search_client, queue) indexer.perform_sync() diff --git a/ees_sharepoint/sync_sharepoint.py b/ees_sharepoint/sync_sharepoint.py index 1884c79..8b8af37 100644 --- a/ees_sharepoint/sync_sharepoint.py +++ b/ees_sharepoint/sync_sharepoint.py @@ -18,6 +18,7 @@ from tika.tika import TikaException from . import adapter +from .base_command import BaseCommand from .checkpointing import Checkpoint from .usergroup_permissions import Permissions from .utils import ( @@ -563,19 +564,22 @@ def perform_sync(self, collection, ids, storage, job_type, collected_objects, en return collected_objects -def init_sharepoint_sync(indexing_type, config, logger, workplace_search_client, sharepoint_client, queue): +def init_sharepoint_sync(indexing_type, config, logger, queue, args): """Initialize the process for synching :param indexing_type: The type of the indexing i.e. 
incremental or full :param config: instance of Configuration class :param logger: instance of Logger class - :param workplace_search_client: instance of WorkplaceSearch - :param sharepoint_client: instance of SharePoint :param queue: Shared queue to push the objects fetched from SharePoint + :param args: The command line arguments passed from the base command """ logger.info(f"Starting the {indexing_type} indexing..") current_time = (datetime.utcnow()).strftime("%Y-%m-%dT%H:%M:%SZ") ids_collection = {"global_keys": {}} storage_with_collection = {"global_keys": {}, "delete_keys": {}} + # Added this workaround of initializing the base_command since workplace_search_client and sharepoint_client cannot be passed in the Process argument as doing so would throw pickling errors on Windows + base_command = BaseCommand(args) + workplace_search_client = base_command.workplace_search_client + sharepoint_client = base_command.sharepoint_client if os.path.exists(IDS_PATH) and os.path.getsize(IDS_PATH) > 0: with open(IDS_PATH) as ids_store: From 229ab29474ecd125f85d37775b6581720f73275d Mon Sep 17 00:00:00 2001 From: praveen-elastic Date: Thu, 7 Apr 2022 18:19:21 +0530 Subject: [PATCH 6/9] update multithreading approach --- ees_sharepoint/base_command.py | 44 ++- ees_sharepoint/connector_queue.py | 23 +- ees_sharepoint/full_sync_command.py | 89 +++-- ees_sharepoint/incremental_sync_command.py | 93 +++-- ees_sharepoint/local_storage.py | 39 ++ ees_sharepoint/sync_enterprise_search.py | 82 ++-- ees_sharepoint/sync_sharepoint.py | 414 ++++++++++----------- ees_sharepoint/utils.py | 21 ++ 8 files changed, 471 insertions(+), 334 deletions(-) create mode 100644 ees_sharepoint/local_storage.py diff --git a/ees_sharepoint/base_command.py b/ees_sharepoint/base_command.py index 5b3b691..0574fd4 100644 --- a/ees_sharepoint/base_command.py +++ b/ees_sharepoint/base_command.py @@ -16,9 +16,13 @@ from functools import cached_property except ImportError: from cached_property import cached_property + +from concurrent.futures import ThreadPoolExecutor, as_completed + from elastic_enterprise_search import WorkplaceSearch from .configuration import Configuration +from .local_storage import LocalStorage from .sharepoint_client import SharePoint @@ -27,13 +31,14 @@ class BaseCommand: Inherit from it and implement 'execute' method, then add code to cli.py to register this command.""" + def __init__(self, args): self.args = args def execute(self): """Run the command. - This method is overriden by actual commands with logic + This method is overridden by actual commands with logic that is specific to each command implementing it.""" raise NotImplementedError @@ -44,7 +49,7 @@ def logger(self): log level will be determined by the configuration setting log_level. 
""" - log_level = self.config.get_value('log_level') + log_level = self.config.get_value("log_level") logger = logging.getLogger(__name__) logger.propagate = False logger.setLevel(log_level) @@ -69,13 +74,14 @@ def workplace_search_client(self): args = self.args host = self.config.get_value("enterprise_search.host_url") - if hasattr(args, 'user') and args.user: + if hasattr(args, "user") and args.user: return WorkplaceSearch( f"{host}/api/ws/v1/sources", http_auth=(args.user, args.password) ) else: return WorkplaceSearch( - f"{host}/api/ws/v1/sources", http_auth=self.config.get_value("workplace_search.api_key") + f"{host}/api/ws/v1/sources", + http_auth=self.config.get_value("workplace_search.api_key"), ) @cached_property @@ -88,3 +94,33 @@ def config(self): def sharepoint_client(self): """Get the sharepoint client instance for the running command.""" return SharePoint(self.config, self.logger) + + @staticmethod + def producer(thread_count, func, args, items, wait=False): + """Apply async calls using multithreading to the targeted function + :param thread_count: Total number of threads to be spawned + :param func: The target function on which the async calls would be made + :param args: Arguments for the targeted function + :param items: iterator of partition + :param wait: wait until job completes if true, otherwise returns immediately + """ + with ThreadPoolExecutor(max_workers=thread_count) as executor: + futures = (executor.submit(func, *args, item) for item in items) + if wait: + result = [future.result() for future in as_completed(futures)] + return result + + @staticmethod + def consumer(thread_count, func): + """Apply async calls using multithreading to the targeted function + :param thread_count: Total number of threads to be spawned + :param func: The target function on which the async calls would be made + """ + with ThreadPoolExecutor(max_workers=thread_count) as executor: + for _ in range(thread_count): + executor.submit(func) + + @cached_property + def local_storage(self): + """Get the object for local storage to fetch and update ids stored locally""" + return LocalStorage(self.logger) diff --git a/ees_sharepoint/connector_queue.py b/ees_sharepoint/connector_queue.py index 71792a7..f772db8 100644 --- a/ees_sharepoint/connector_queue.py +++ b/ees_sharepoint/connector_queue.py @@ -5,8 +5,6 @@ # import multiprocessing from multiprocessing.queues import Queue -from .utils import split_documents_into_equal_chunks - BATCH_SIZE = 100 @@ -14,8 +12,9 @@ class ConnectorQueue(Queue): """Class to support additional queue operations specific to the connector""" - def __init__(self): + def __init__(self, logger): ctx = multiprocessing.get_context() + self.logger = logger super(ConnectorQueue, self).__init__(ctx=ctx) def end_signal(self): @@ -32,18 +31,8 @@ def put_checkpoint(self, key, checkpoint_time, indexing_type): :param indexing_type: The type of the indexing i.e. 
Full or Incremental """ - checkpoint = {"type": "checkpoint", "data": (key, checkpoint_time, indexing_type)} + checkpoint = { + "type": "checkpoint", + "data": (key, checkpoint_time, indexing_type), + } self.put(checkpoint) - - def append_to_queue(self, documents): - """Append documents to the shared queue - :param documents: documents fetched from sharepoint - """ - if documents: - results = documents - # In case documents is object of tuple - if isinstance(documents, tuple): - results = documents[-1] - for chunk in split_documents_into_equal_chunks(results.get("data"), BATCH_SIZE): - document = {"type": results.get("type"), "data": chunk} - self.put(document) diff --git a/ees_sharepoint/full_sync_command.py b/ees_sharepoint/full_sync_command.py index 4e6e4df..0fd3643 100644 --- a/ees_sharepoint/full_sync_command.py +++ b/ees_sharepoint/full_sync_command.py @@ -7,36 +7,77 @@ It will attempt to sync absolutely all documents that are available in the third-party system and ingest them into Enterprise Search instance.""" +from datetime import datetime + from .base_command import BaseCommand -from .sync_sharepoint import init_sharepoint_sync from .connector_queue import ConnectorQueue -from .sync_enterprise_search import init_enterprise_search_sync -from multiprocessing import Process +from .sync_enterprise_search import SyncEnterpriseSearch +from .sync_sharepoint import SyncSharepoint +from .utils import get_storage_with_collection, split_date_range_into_chunks class FullSyncCommand(BaseCommand): """This class start execution of fullsync feature.""" + def start_producer(self, queue): + """This method starts async calls for the producer which is responsible for fetching documents from + the SharePoint and pushing them in the shared queue + :param queue: Shared queue to fetch the stored documents + """ + self.logger.debug("Starting the full indexing..") + current_time = (datetime.utcnow()).strftime("%Y-%m-%dT%H:%M:%SZ") + + thread_count = self.config.get_value("sharepoint_sync_thread_count") + + start_time, end_time = self.config.get_value("start_time"), current_time + try: + sync_sharepoint = SyncSharepoint( + self.config, + self.logger, + self.workplace_search_client, + self.sharepoint_client, + start_time, + end_time, + queue, + ) + _, datelist = split_date_range_into_chunks( + start_time, + end_time, + thread_count, + ) + for collection in self.config.get_value("sharepoint.site_collections"): + storage_with_collection = get_storage_with_collection(self.local_storage, collection) + self.logger.info( + "Starting to index all the objects configured in the object field: %s" + % (str(self.config.get_value("objects"))) + ) + + ids = storage_with_collection["global_keys"][collection] + storage_with_collection["global_keys"][collection] = sync_sharepoint.fetch_records_from_sharepoint(self.producer, datelist, thread_count, ids, collection) + + queue.put_checkpoint(collection, end_time, "full") + + enterprise_thread_count = self.config.get_value("enterprise_search_sync_thread_count") + for _ in range(enterprise_thread_count): + queue.end_signal() + except Exception as exception: + self.logger.exception(f"Error while fetching the objects . 
Error {exception}") + raise exception + self.local_storage.update_storage(storage_with_collection) + + def start_consumer(self, queue): + """This method starts async calls for the consumer which is responsible for indexing documents to the + Enterprise Search + :param queue: Shared queue to fetch the stored documents + """ + thread_count = self.config.get_value("enterprise_search_sync_thread_count") + sync_es = SyncEnterpriseSearch(self.config, self.logger, self.workplace_search_client, queue) + + self.consumer(thread_count, sync_es.perform_sync) + def execute(self): """This function execute the start function.""" - config = self.config - logger = self.logger - args = self.args - - queue = ConnectorQueue() - producer = Process( - name="producer", - target=init_sharepoint_sync, - args=("full", config, logger, queue, args), - ) - producer.start() - - consumer = Process( - name="consumer", - target=init_enterprise_search_sync, - args=(config, logger, queue, args), - ) - consumer.start() - - producer.join() - consumer.join() + queue = ConnectorQueue(self.logger) + + self.start_producer(queue) + self.start_consumer(queue) diff --git a/ees_sharepoint/incremental_sync_command.py b/ees_sharepoint/incremental_sync_command.py index 1112c77..67f7e82 100644 --- a/ees_sharepoint/incremental_sync_command.py +++ b/ees_sharepoint/incremental_sync_command.py @@ -10,36 +10,79 @@ Recency is determined by the time when the last successful incremental or full job was ran.""" +from datetime import datetime + from .base_command import BaseCommand -from .sync_sharepoint import init_sharepoint_sync +from .checkpointing import Checkpoint from .connector_queue import ConnectorQueue -from .sync_enterprise_search import init_enterprise_search_sync -from multiprocessing import Process +from .sync_enterprise_search import SyncEnterpriseSearch +from .sync_sharepoint import SyncSharepoint +from .utils import get_storage_with_collection, split_date_range_into_chunks class IncrementalSyncCommand(BaseCommand): - """This class start execution of incrementalsync feature.""" + """This class start execution of incremental sync feature.""" + + def start_producer(self, queue): + """This method starts async calls for the producer which is responsible for fetching documents from the + SharePoint and pushing them in the shared queue + :param queue: Shared queue to fetch the stored documents + """ + self.logger.debug("Starting the incremental indexing..") + current_time = (datetime.utcnow()).strftime("%Y-%m-%dT%H:%M:%SZ") + + thread_count = self.config.get_value("sharepoint_sync_thread_count") + + checkpoint = Checkpoint(self.config, self.logger) + try: + for collection in self.config.get_value("sharepoint.site_collections"): + start_time, end_time = checkpoint.get_checkpoint(collection, current_time) + sync_sharepoint = SyncSharepoint( + self.config, + self.logger, + self.workplace_search_client, + self.sharepoint_client, + start_time, + end_time, + queue, + ) + _, datelist = split_date_range_into_chunks( + start_time, + end_time, + thread_count, + ) + storage_with_collection = get_storage_with_collection(self.local_storage, collection) + self.logger.info( + "Starting to index all the objects configured in the object field: %s" + % (str(self.config.get_value("objects"))) + ) + + ids = storage_with_collection["global_keys"][collection] + storage_with_collection["global_keys"][collection] = sync_sharepoint.fetch_records_from_sharepoint(self.producer, datelist, thread_count, ids, collection) + + queue.put_checkpoint(collection, 
end_time, "incremental") + + enterprise_thread_count = self.config.get_value("enterprise_search_sync_thread_count") + for _ in range(enterprise_thread_count): + queue.end_signal() + except Exception as exception: + self.logger.exception(f"Error while fetching the objects . Error {exception}") + raise exception + self.local_storage.update_storage(storage_with_collection) + + def start_consumer(self, queue): + """This method starts async calls for the consumer which is responsible for indexing documents to the + Enterprise Search + :param queue: Shared queue to fetch the stored documents + """ + thread_count = self.config.get_value("enterprise_search_sync_thread_count") + sync_es = SyncEnterpriseSearch(self.config, self.logger, self.workplace_search_client, queue) + + self.consumer(thread_count, sync_es.perform_sync) def execute(self): """This function execute the start function.""" - config = self.config - logger = self.logger - args = self.args - - queue = ConnectorQueue() - producer = Process( - name="producer", - target=init_sharepoint_sync, - args=("incremental", config, logger, queue, args), - ) - producer.start() - - consumer = Process( - name="consumer", - target=init_enterprise_search_sync, - args=(config, logger, queue, args), - ) - consumer.start() - - producer.join() - consumer.join() + queue = ConnectorQueue(self.logger) + + self.start_producer(queue) + self.start_consumer(queue) diff --git a/ees_sharepoint/local_storage.py b/ees_sharepoint/local_storage.py new file mode 100644 index 0000000..71166e4 --- /dev/null +++ b/ees_sharepoint/local_storage.py @@ -0,0 +1,39 @@ +import json +import os + +IDS_PATH = os.path.join(os.path.dirname(__file__), 'doc_id.json') + + +class LocalStorage: + """This class contains all the methods to do operations on doc_id json file + """ + + def __init__(self, logger): + self.logger = logger + + def load_storage(self): + """This method fetches the contents of doc_id.json(local ids storage) + """ + try: + with open(IDS_PATH, encoding='utf-8') as ids_file: + try: + return json.load(ids_file) + except ValueError as exception: + self.logger.exception( + f"Error while parsing the json file of the ids store from path: {IDS_PATH}. Error: {exception}" + ) + except FileNotFoundError: + self.logger.debug("Local storage for ids was not found.") + return {"global_keys": {}} + + def update_storage(self, ids): + """This method is used to update the ids stored in doc_id.json file + :param ids: updated ids to be stored in the doc_id.json file + """ + with open(IDS_PATH, "w", encoding='utf-8') as ids_file: + try: + json.dump(ids, ids_file, indent=4) + except ValueError as exception: + self.logger.exception( + f"Error while updating the doc_id json file. Error: {exception}" + ) diff --git a/ees_sharepoint/sync_enterprise_search.py b/ees_sharepoint/sync_enterprise_search.py index 604c5e0..cf199ba 100644 --- a/ees_sharepoint/sync_enterprise_search.py +++ b/ees_sharepoint/sync_enterprise_search.py @@ -3,10 +3,10 @@ # or more contributor license agreements. Licensed under the Elastic License 2.0; # you may not use this file except in compliance with the Elastic License 2.0. 
# -from multiprocessing.pool import ThreadPool -from .utils import split_documents_into_equal_chunks -from .base_command import BaseCommand +import threading + from .checkpointing import Checkpoint +from .utils import split_documents_into_equal_chunks BATCH_SIZE = 100 @@ -19,62 +19,64 @@ def __init__(self, config, logger, workplace_search_client, queue): self.logger = logger self.workplace_search_client = workplace_search_client self.ws_source = config.get_value("workplace_search.source_id") - self.enterprise_search_thread_count = config.get_value("enterprise_search_sync_thread_count") - self.thread_pool = ThreadPool(self.enterprise_search_thread_count) self.queue = queue def index_documents(self, documents): """This method indexes the documents to the Enterprise Search. :param documents: documents to be indexed """ - total_documents_indexed = 0 - if documents: - responses = self.workplace_search_client.index_documents( - content_source_id=self.ws_source, documents=documents + try: + total_documents_indexed = 0 + if documents: + responses = self.workplace_search_client.index_documents( + content_source_id=self.ws_source, + documents=documents, + request_timeout=1000, + ) + for response in responses["results"]: + if not response["errors"]: + total_documents_indexed += 1 + else: + self.logger.error( + "Error while indexing %s. Error: %s" + % (response["id"], response["errors"]) + ) + self.logger.info( + f"[{threading.get_ident()}] Successfully indexed {total_documents_indexed} documents to the workplace" ) - for response in responses["results"]: - if not response["errors"]: - total_documents_indexed += 1 - else: - self.logger.error("Error while indexing %s. Error: %s" % (response["id"], response["errors"])) - self.logger.info("Successfully indexed %s documents to the workplace" % (total_documents_indexed)) + except Exception as exception: + self.logger.exception(f"Error while indexing the files. Error: {exception}") + raise exception def perform_sync(self): """Pull documents from the queue and synchronize it to the Enterprise Search.""" - checkpoint = Checkpoint(self.config, self.logger) - signal_open = True - while signal_open: - for _ in range(0, self.enterprise_search_thread_count): + try: + checkpoint = Checkpoint(self.config, self.logger) + signal_open = True + while signal_open: documents_to_index = [] while len(documents_to_index) < BATCH_SIZE: documents = self.queue.get() if documents.get("type") == "signal_close": + self.logger.info( + f"Found an end signal in the queue. 
Closing Thread ID {threading.get_ident()}" + ) signal_open = False break elif documents.get("type") == "checkpoint": checkpoint.set_checkpoint( - documents.get("data")[0], documents.get("data")[1], documents.get("data")[2] + documents.get("data")[0], + documents.get("data")[1], + documents.get("data")[2], ) break else: documents_to_index.extend(documents.get("data")) - for chunk in split_documents_into_equal_chunks(documents_to_index, BATCH_SIZE): - self.thread_pool.apply_async(self.index_documents, (chunk,)) - if not signal_open: - break - self.thread_pool.close() - self.thread_pool.join() - - -def init_enterprise_search_sync(config, logger, queue, args): - """Runs the indexing logic - :param config: instance of Configuration class - :param logger: instance of Logger class - :param queue: Shared queue to push the objects fetched from SharePoint - :param args: The command line arguments passed from the base command - """ - # Added this workaround of initializing the base_command since workplace_search_client and sharepoint_client cannot be passed in the Process argument as doing so would throw pickling errors on Windows - base_command = BaseCommand(args) - workplace_search_client = base_command.workplace_search_client - indexer = SyncEnterpriseSearch(config, logger, workplace_search_client, queue) - indexer.perform_sync() + # This loop is to ensure if the last document fetched from the queue exceeds the size of + # documents_to_index to more than the permitted chunk size, then we split the documents as per the limit + for chunk in split_documents_into_equal_chunks( + documents_to_index, BATCH_SIZE + ): + self.index_documents(chunk) + except Exception as exception: + self.logger.error(f"Error while indexing the documents to the Enterprise Search. Error {exception}") diff --git a/ees_sharepoint/sync_sharepoint.py b/ees_sharepoint/sync_sharepoint.py index 8b8af37..e51f7ac 100644 --- a/ees_sharepoint/sync_sharepoint.py +++ b/ees_sharepoint/sync_sharepoint.py @@ -6,28 +6,18 @@ """sync_sharepoint module allows to sync data to Elastic Enterprise Search. It's possible to run full syncs and incremental syncs with this module.""" - -import copy -import json import os import re -from datetime import datetime +import threading from urllib.parse import urljoin + from dateutil.parser import parse -from multiprocessing.pool import ThreadPool from tika.tika import TikaException from . import adapter -from .base_command import BaseCommand from .checkpointing import Checkpoint from .usergroup_permissions import Permissions -from .utils import ( - encode, - extract, - split_list_into_buckets, - split_date_range_into_chunks, - split_documents_into_equal_chunks, -) +from .utils import encode, extract, split_documents_into_equal_chunks, split_list_into_buckets IDS_PATH = os.path.join(os.path.dirname(__file__), "doc_id.json") @@ -48,19 +38,28 @@ def get_results(logger, response, entity_name): Parsed response """ if not response: - logger.error(f"Empty response when fetching {entity_name}") # TODO: should it be an error? 
+ logger.error(f"Empty response when fetching {entity_name}") return None if entity_name == "attachment" and not response.get("d", {}).get("results"): - logger.info("Failed to fetch attachment") # TODO: not sure if it's the right message + logger.info("Failed to fetch attachment") return None return response.get("d", {}).get("results") class SyncSharepoint: - """This class allows synching objects from the SharePoint Server.""" - - def __init__(self, config, logger, workplace_search_client, sharepoint_client, start_time, end_time, queue): + """This class allows syncing objects from the SharePoint Server.""" + + def __init__( + self, + config, + logger, + workplace_search_client, + sharepoint_client, + start_time, + end_time, + queue, + ): self.config = config self.logger = logger self.workplace_search_client = workplace_search_client @@ -74,11 +73,11 @@ def __init__(self, config, logger, workplace_search_client, sharepoint_client, s self.end_time = end_time self.sharepoint_thread_count = config.get_value("sharepoint_sync_thread_count") self.mapping_sheet_path = config.get_value("sharepoint_workplace_user_mapping") - + self.sharepoint_host = config.get_value("sharepoint.host_url") self.checkpoint = Checkpoint(config, logger) - self.permissions = Permissions(self.sharepoint_client, self.workplace_search_client, logger) - - self.thread_pool = ThreadPool(self.sharepoint_thread_count) + self.permissions = Permissions( + self.sharepoint_client, self.workplace_search_client, logger + ) self.queue = queue def get_schema_fields(self, document_name): @@ -94,9 +93,17 @@ def get_schema_fields(self, document_name): include_fields = fields.get("include_fields") exclude_fields = fields.get("exclude_fields") if include_fields: - adapter_schema = {key: val for key, val in adapter_schema.items() if val in include_fields} + adapter_schema = { + key: val + for key, val in adapter_schema.items() + if val in include_fields + } elif exclude_fields: - adapter_schema = {key: val for key, val in adapter_schema.items() if val not in exclude_fields} + adapter_schema = { + key: val + for key, val in adapter_schema.items() + if val not in exclude_fields + } adapter_schema["id"] = field_id return adapter_schema @@ -124,16 +131,19 @@ def fetch_sites(self, parent_site_url, sites, ids, index, start_time, end_time): "No sites were created in %s for this interval: start time: %s and end time: %s" % (parent_site_url, start_time, end_time) ) - return sites - self.logger.info("Successfully fetched and parsed %s sites response from SharePoint" % len(response_data)) - + return sites, {} + self.logger.info( + "Successfully fetched and parsed %s sites response from SharePoint" + % len(response_data) + ) schema = self.get_schema_fields(SITES) document = [] if index: for i, _ in enumerate(response_data): doc = {"type": SITE} - # need to convert date to iso else workplace search throws error on date format Invalid field value: Value '2021-09-29T08:13:00' cannot be parsed as a date (RFC 3339)"]} + # need to convert date to iso else workplace search throws error on date format Invalid field + # value: Value '2021-09-29T08:13:00' cannot be parsed as a date (RFC 3339)"]} response_data[i]["Created"] += "Z" for field, response_field in schema.items(): doc[field] = response_data[i].get(response_field) @@ -169,16 +179,20 @@ def fetch_lists(self, sites, ids, index): "No list was created in this interval: start time: %s and end time: %s" % (self.start_time, self.end_time) ) - return [], [] + return [], [], {} schema_list = 
self.get_schema_fields(LISTS) for site_details in sites: for site, time_modified in site_details.items(): if parse(self.start_time) > parse(time_modified): continue rel_url = f"{site}/_api/web/lists" - self.logger.info("Fetching the lists for site: %s from url: %s" % (site, rel_url)) + self.logger.info( + "Fetching the lists for site: %s from url: %s" % (site, rel_url) + ) - query = self.sharepoint_client.get_query(self.start_time, self.end_time, LISTS) + query = self.sharepoint_client.get_query( + self.start_time, self.end_time, LISTS + ) response = self.sharepoint_client.get(rel_url, query, LISTS) response_data = get_results(self.logger, response, LISTS) @@ -193,7 +207,7 @@ def fetch_lists(self, sites, ids, index): % (len(response_data), site) ) - base_list_url = f"{site}/Lists/" + base_list_url = urljoin(self.sharepoint_host, f"{site}/Lists/") if index: if not ids["lists"].get(site): @@ -210,9 +224,14 @@ def fetch_lists(self, sites, ids, index): list_url=response_data[i]["ParentWebUrl"], itemid=None, ) - doc["url"] = urljoin(base_list_url, re.sub(r"[^ \w+]", "", response_data[i]["Title"])) + doc["url"] = urljoin( + base_list_url, + re.sub(r"[^ \w+]", "", response_data[i]["Title"]), + ) document.append(doc) - ids["lists"][site].update({doc["id"]: response_data[i]["Title"]}) + ids["lists"][site].update( + {doc["id"]: response_data[i]["Title"]} + ) responses.append(response_data) lists = {} @@ -260,9 +279,13 @@ def fetch_items(self, lists, ids): if parse(self.start_time) > parse(value[2]): continue rel_url = f"{value[0]}/_api/web/lists(guid'{list_content}')/items" - self.logger.info("Fetching the items for list: %s from url: %s" % (value[1], rel_url)) + self.logger.info( + "Fetching the items for list: %s from url: %s" % (value[1], rel_url) + ) - query = self.sharepoint_client.get_query(self.start_time, self.end_time, LIST_ITEMS) + query = self.sharepoint_client.get_query( + self.start_time, self.end_time, LIST_ITEMS + ) response = self.sharepoint_client.get(rel_url, query, LIST_ITEMS) response_data = get_results(self.logger, response, LIST_ITEMS) @@ -278,25 +301,36 @@ def fetch_items(self, lists, ids): ) list_name = re.sub(r"[^ \w+]", "", value[1]) - base_item_url = f"{value[0]}/Lists/{list_name}/DispForm.aspx?ID=" + base_item_url = urljoin( + self.sharepoint_host, + f"{value[0]}/Lists/{list_name}/DispForm.aspx?ID=", + ) document = [] if not ids["list_items"][value[0]].get(list_content): ids["list_items"][value[0]].update({list_content: []}) rel_url = f"{value[0]}/_api/web/lists(guid'{list_content}')/items?$select=Attachments,AttachmentFiles,Title&$expand=AttachmentFiles" new_query = "&" + query.split("?")[1] - file_response_data = self.sharepoint_client.get(rel_url, query=new_query, param_name="attachment") + file_response_data = self.sharepoint_client.get( + rel_url, query=new_query, param_name="attachment" + ) if file_response_data: - file_response_data = get_results(self.logger, file_response_data.json(), "attachment") + file_response_data = get_results( + self.logger, file_response_data.json(), "attachment" + ) for i, _ in enumerate(response_data): doc = {"type": ITEM} if response_data[i].get("Attachments") and file_response_data: for data in file_response_data: if response_data[i].get("Title") == data["Title"]: - file_relative_url = data["AttachmentFiles"]["results"][0]["ServerRelativeUrl"] + file_relative_url = data["AttachmentFiles"]["results"][ + 0 + ]["ServerRelativeUrl"] url_s = f"{value[0]}/_api/web/GetFileByServerRelativeUrl('{encode(file_relative_url)}')/$value" - response = 
self.sharepoint_client.get(url_s, query="", param_name="attachment") + response = self.sharepoint_client.get( + url_s, query="", param_name="attachment" + ) doc["body"] = {} if response and response.ok: try: @@ -312,19 +346,27 @@ def fetch_items(self, lists, ids): doc[field] = response_data[i].get(response_field) if self.enable_permission is True: doc["_allow_permissions"] = self.fetch_permissions( - key=LIST_ITEMS, list_id=list_content, list_url=value[0], itemid=str(response_data[i]["Id"]) + key=LIST_ITEMS, + list_id=list_content, + list_url=value[0], + itemid=str(response_data[i]["Id"]), ) doc["url"] = base_item_url + str(response_data[i]["Id"]) document.append(doc) - if response_data[i].get("GUID") not in ids["list_items"][value[0]][list_content]: - ids["list_items"][value[0]][list_content].append(response_data[i].get("GUID")) + if ( + response_data[i].get("GUID") + not in ids["list_items"][value[0]][list_content] + ): + ids["list_items"][value[0]][list_content].append( + response_data[i].get("GUID") + ) responses.extend(document) documents = {"type": LIST_ITEMS, "data": responses} return documents def fetch_drive_items(self, libraries, ids): """This method fetches items from all the lists in a collection and - invokes theindex permission method to get the document level permissions. + invokes the index permission method to get the document level permissions. If the fetching is not successful, it logs proper message. :param libraries: document lists :param ids: structure containing id's of all objects @@ -345,8 +387,13 @@ def fetch_drive_items(self, libraries, ids): if not ids["drive_items"].get(value[0]): ids["drive_items"].update({value[0]: {}}) rel_url = f"{value[0]}/_api/web/lists(guid'{lib_content}')/items?$select=Modified,Id,GUID,File,Folder&$expand=File,Folder" - self.logger.info("Fetching the items for libraries: %s from url: %s" % (value[1], rel_url)) - query = self.sharepoint_client.get_query(self.start_time, self.end_time, DRIVE_ITEMS) + self.logger.info( + "Fetching the items for libraries: %s from url: %s" + % (value[1], rel_url) + ) + query = self.sharepoint_client.get_query( + self.start_time, self.end_time, DRIVE_ITEMS + ) response = self.sharepoint_client.get(rel_url, query, DRIVE_ITEMS) response_data = get_results(self.logger, response, DRIVE_ITEMS) if not response_data: @@ -366,9 +413,13 @@ def fetch_drive_items(self, libraries, ids): if response_data[i]["File"].get("TimeLastModified"): obj_type = "File" doc = {"type": "file"} - file_relative_url = response_data[i]["File"]["ServerRelativeUrl"] + file_relative_url = response_data[i]["File"][ + "ServerRelativeUrl" + ] url_s = f"{value[0]}/_api/web/GetFileByServerRelativeUrl('{encode(file_relative_url)}')/$value" - response = self.sharepoint_client.get(url_s, query="", param_name="attachment") + response = self.sharepoint_client.get( + url_s, query="", param_name="attachment" + ) doc["body"] = {} if response and response.ok: try: @@ -391,7 +442,10 @@ def fetch_drive_items(self, libraries, ids): list_url=value[0], itemid=str(response_data[i].get("ID")), ) - doc["url"] = response_data[i][obj_type]["ServerRelativeUrl"] + doc["url"] = urljoin( + self.sharepoint_host, + response_data[i][obj_type]["ServerRelativeUrl"], + ) document.append(doc) if doc["id"] not in ids["drive_items"][value[0]][lib_content]: ids["drive_items"][value[0]][lib_content].append(doc["id"]) @@ -419,17 +473,19 @@ def get_roles(self, key, site, list_url, list_id, itemid): else: rel_url = list_url - roles = self.permissions.fetch_users(key, rel_url, 
list_id=list_id, item_id=itemid) + roles = self.permissions.fetch_users( + key, rel_url, list_id=list_id, item_id=itemid + ) return roles def fetch_permissions( - self, - key, - site=None, - list_id=None, - list_url=None, - itemid=None, + self, + key, + site=None, + list_id=None, + list_url=None, + itemid=None, ): """This method when invoked, checks the permission inheritance of each object. If the object has unique permissions, the list of users having access to it @@ -456,193 +512,103 @@ def fetch_permissions( groups.append(title) return groups - def fetch_and_append_sites_to_queue(self, ids, end_time, collection): + def fetch_and_append_sites_to_queue( + self, ids, collection, duration + ): """Fetches and appends site details to queue :param ids: id collection of the all the objects - :param end_time: end time for fetching the data :param collection: collection name + :param duration: List of time range consisting of the [start_time, end_time] """ - _, datelist = split_date_range_into_chunks(self.start_time, self.end_time, self.sharepoint_thread_count) - results = [] + start_time, end_time = duration[0], duration[1] parent_site_url = f"/sites/{collection}" - sites_path = [{parent_site_url: end_time}] - for num in range(0, self.sharepoint_thread_count): - start_time_partition = datelist[num] - end_time_partition = datelist[num + 1] - thread = self.thread_pool.apply_async( - self.fetch_sites, - (parent_site_url, {}, ids, (SITES in self.objects), start_time_partition, end_time_partition), - callback=self.queue.append_to_queue, + sites_path = [{parent_site_url: self.end_time}] + sites, documents = self.fetch_sites( + parent_site_url, + {}, + ids, + (SITES in self.objects), + start_time, + end_time, + ) + if documents: + self.queue.put(documents) + self.logger.debug( + f"Thread ID {threading.get_ident()} added list of {len(documents.get('data'))} sites into the queue" ) - results.append(thread) - - sites = [] - for result in [r.get() for r in results]: - if result: - sites.append(result[0]) - - sites_path.extend(sites) + sites_path.append(sites) return sites_path - def fetch_and_append_lists_to_queue(self, sites_path, ids): + def fetch_and_append_lists_to_queue(self, ids, sites_path): """Fetches and appends list details to queue - :param sites_path: dictionary of site path and it's last updated time :param ids: id collection of the all the objects + :param sites_path: dictionary of site path and it's last updated time """ - results, lists_details, libraries_details = [], {}, {} - partitioned_sites = split_list_into_buckets(sites_path, self.sharepoint_thread_count) - for site in partitioned_sites: - thread = self.thread_pool.apply_async( - self.fetch_lists, (site, ids, (LISTS in self.objects)), callback=self.queue.append_to_queue + lists_details, libraries_details, documents = self.fetch_lists( + sites_path, ids, (LISTS in self.objects) + ) + if documents: + self.queue.put(documents) + self.logger.debug( + f"Thread ID {threading.get_ident()} added list of {len(documents.get('data'))} lists into the queue" ) - results.append(thread) - for result in [r.get() for r in results]: - if result: - lists_details.update(result[0]) - libraries_details.update(result[1]) return [lists_details, libraries_details] - def fetch_and_append_list_items_to_queue(self, lists_details, ids): + def fetch_and_append_list_items_to_queue(self, ids, lists_details): """Fetches and appends list_items to the queue - :param lists_details: dictionary containing list name, list path and id :param ids: id collection of the all 
the objects + :param lists_details: dictionary containing list name, list path and id """ - partition = [] - partition = split_documents_into_equal_chunks(lists_details, self.sharepoint_thread_count) - for list_data in partition: - self.thread_pool.apply_async(self.fetch_items, (list_data, ids), callback=self.queue.append_to_queue) + documents = self.fetch_items(lists_details, ids) + if documents: + self.queue.put(documents) + self.logger.debug( + f"Thread ID {threading.get_ident()} added list of {len(documents.get('data'))} list items into the queue" + ) - def fetch_and_append_drive_items_to_queue(self, libraries_details, ids): + def fetch_and_append_drive_items_to_queue(self, ids, libraries_details): """Fetches and appends the drive items to the queue - :param libraries_details: dictionary containing library name, library path and id :param ids: id collection of the all the objects + :param libraries_details: dictionary containing library name, library path and id """ - partition = [] - partition = split_documents_into_equal_chunks(libraries_details, self.sharepoint_thread_count) - for list_data in partition: - self.thread_pool.apply_async(self.fetch_drive_items, (list_data, ids), callback=self.queue.append_to_queue) - - def perform_sync(self, collection, ids, storage, job_type, collected_objects, end_time): - """This method fetches all the objects from sharepoint server - :param collection: collection name - :param ids: id collection of the all the objects - :param storage: temporary storage for storing all the documents - :param job_type: denotes the type of sharepoint object being fetched in a particular process - :param collected_objects: helper variable to provide the data to children object - :param end_time: end time for fetching the data - """ - if job_type == "sites": - collected_objects = self.fetch_and_append_sites_to_queue(ids, end_time, collection) - - elif job_type == "lists": - collected_objects = self.fetch_and_append_lists_to_queue(collected_objects, ids) - - elif job_type == LIST_ITEMS and LIST_ITEMS in self.objects: - self.fetch_and_append_list_items_to_queue(collected_objects[0], ids) - - elif job_type == DRIVE_ITEMS and DRIVE_ITEMS in self.objects: - self.fetch_and_append_drive_items_to_queue(collected_objects[1], ids) - - self.logger.info("Completed fetching all the objects for site collection: %s" % (collection)) - - self.logger.info("Saving the checkpoint for the site collection: %s" % (collection)) - if ids.get(job_type): - prev_ids = storage[job_type] - if job_type == "sites": - prev_ids.update(ids[job_type]) - elif job_type == "lists": - for site, list_content in ids[job_type].items(): - prev_ids[site] = {**prev_ids.get(site, {}), **ids[job_type][site]} - else: - for site, list_content in ids[job_type].items(): - prev_ids[site] = ids[job_type][site] if not prev_ids.get(site) else prev_ids[site] - for list_name in list_content.keys(): - prev_ids[site][list_name] = list( - set([*prev_ids[site].get(list_name, []), *ids[job_type][site][list_name]]) - ) - storage[job_type] = prev_ids - return collected_objects - - -def init_sharepoint_sync(indexing_type, config, logger, queue, args): - """Initialize the process for synching - :param indexing_type: The type of the indexing i.e. 
incremental or full - :param config: instance of Configuration class - :param logger: instance of Logger class - :param queue: Shared queue to push the objects fetched from SharePoint - :param args: The command line arguments passed from the base command - """ - logger.info(f"Starting the {indexing_type} indexing..") - current_time = (datetime.utcnow()).strftime("%Y-%m-%dT%H:%M:%SZ") - ids_collection = {"global_keys": {}} - storage_with_collection = {"global_keys": {}, "delete_keys": {}} - # Added this workaround of initializing the base_command since workplace_search_client and sharepoint_client cannot be passed in the Process argument as doing so would throw pickling errors on Windows - base_command = BaseCommand(args) - workplace_search_client = base_command.workplace_search_client - sharepoint_client = base_command.sharepoint_client - - if os.path.exists(IDS_PATH) and os.path.getsize(IDS_PATH) > 0: - with open(IDS_PATH) as ids_store: - try: - ids_collection = json.load(ids_store) - except ValueError as exception: - logger.exception( - "Error while parsing the json file of the ids store from path: %s. Error: %s" - % (IDS_PATH, exception) - ) - - storage_with_collection["delete_keys"] = copy.deepcopy(ids_collection.get("global_keys")) - check = Checkpoint(config, logger) - - try: - for collection in config.get_value("sharepoint.site_collections"): - storage = {"sites": {}, "lists": {}, "list_items": {}, "drive_items": {}} - logger.info("Starting the data fetching for site collection: %s" % (collection)) - - if indexing_type == "incremental": - start_time, end_time = check.get_checkpoint(collection, current_time) - else: - start_time = config.get_value("start_time") - end_time = current_time - - if not ids_collection["global_keys"].get(collection): - ids_collection["global_keys"][collection] = { - "sites": {}, - "lists": {}, - "list_items": {}, - "drive_items": {}, - } - - logger.info( - "Starting to index all the objects configured in the object field: %s" - % (str(config.get_value("objects"))) + documents = self.fetch_drive_items(libraries_details, ids) + if documents: + self.queue.put(documents) + self.logger.debug( + f"Thread ID {threading.get_ident()} added list of {len(documents.get('data'))} drive items into the queue" ) - sync_sharepoint = SyncSharepoint( - config, logger, workplace_search_client, sharepoint_client, start_time, end_time, queue - ) - returned_documents = None - for job_type in ["sites", "lists", "list_items", "drive_items"]: - logger.info(f"Indexing {job_type}") - returned_documents = sync_sharepoint.perform_sync( - collection, - ids_collection["global_keys"][collection], - storage, - job_type, - returned_documents, - end_time, - ) - sync_sharepoint.thread_pool.close() - sync_sharepoint.thread_pool.join() - queue.put_checkpoint(collection, end_time, indexing_type) - - storage_with_collection["global_keys"][collection] = storage.copy() - queue.end_signal() - except Exception as exception: - raise exception - - with open(IDS_PATH, "w") as file: - try: - json.dump(storage_with_collection, file, indent=4) - except ValueError as exception: - logger.warning("Error while adding ids to json file. Error: %s" % (exception)) + def fetch_records_from_sharepoint(self, producer, date_ranges, thread_count, ids, collection): + """Fetches Sites, Lists, List Items and Drive Items from sharepoint. 
+ :param producer: Producer function + :param date_ranges: Partition of time range + :param thread_count: Thread count + :param ids: Content of the local storage + :param collection: SharePoint server Collection name + """ + # Fetch sites + time_range_list = [(date_ranges[num], date_ranges[num + 1]) for num in range(0, thread_count)] + sites = producer(thread_count, self.fetch_and_append_sites_to_queue, + [ids, collection], time_range_list, wait=True) + all_sites = [] + for site in sites: + all_sites.extend(site) + + # Fetch lists + partitioned_sites = split_list_into_buckets(all_sites, thread_count) + + lists = producer(thread_count, self.fetch_and_append_lists_to_queue, [ids], partitioned_sites, wait=True) + + # Fetch list items + lists_details, libraries_details = {}, {} + for result in lists: + lists_details.update(result[0]) + libraries_details.update(result[1]) + + list_items = split_documents_into_equal_chunks(lists_details, thread_count) + producer(thread_count, self.fetch_and_append_list_items_to_queue, [ids], list_items, wait=True) + + # Fetch library details + libraries_items = split_documents_into_equal_chunks(libraries_details, thread_count) + producer(thread_count, self.fetch_and_append_drive_items_to_queue, [ids], libraries_items, wait=True) + return ids diff --git a/ees_sharepoint/utils.py b/ees_sharepoint/utils.py index 8f77b87..6e5ca65 100644 --- a/ees_sharepoint/utils.py +++ b/ees_sharepoint/utils.py @@ -6,6 +6,7 @@ """This module contains uncategorized utility methods.""" import urllib.parse +import copy from datetime import datetime from tika import parser @@ -81,3 +82,23 @@ def split_date_range_into_chunks(start_time, end_time, number_of_threads): formatted_end_time = end_time.strftime(DATETIME_FORMAT) datelist.append(formatted_end_time) return formatted_end_time, datelist + + +def get_storage_with_collection(local_storage, collection): + """Returns a dictionary containing the locally stored IDs of files fetched from network drives + :param local_storage: The object of the local storage used to store the indexed document IDs + :param collection: The SharePoint server collection which is currently being fetched + """ + storage_with_collection = {"global_keys": {}, "delete_keys": {}} + ids_collection = local_storage.load_storage() + storage_with_collection["delete_keys"] = copy.deepcopy(ids_collection.get("global_keys")) + if not ids_collection["global_keys"].get(collection): + ids_collection["global_keys"][collection] = { + "sites": {}, + "lists": {}, + "list_items": {}, + "drive_items": {}, + } + storage_with_collection["global_keys"][collection] = copy.deepcopy(ids_collection["global_keys"][collection]) + + return storage_with_collection From 05218492bd234c17423bdc0782e89570d28d7eaa Mon Sep 17 00:00:00 2001 From: praveen-elastic Date: Thu, 7 Apr 2022 19:37:26 +0530 Subject: [PATCH 7/9] add last updated field --- ees_sharepoint/adapter.py | 2 ++ ees_sharepoint/sync_enterprise_search.py | 10 ++++++---- ees_sharepoint/utils.py | 11 ++++++++--- 3 files changed, 16 insertions(+), 7 deletions(-) diff --git a/ees_sharepoint/adapter.py b/ees_sharepoint/adapter.py index b20f3b4..aafda0f 100644 --- a/ees_sharepoint/adapter.py +++ b/ees_sharepoint/adapter.py @@ -23,6 +23,7 @@ 'lists': { 'created_at': 'Created', 'id': 'Id', + 'last_updated': 'LastItemModifiedDate', 'relative_url': 'ParentWebUrl', 'title': 'Title' }, @@ -30,6 +31,7 @@ 'title': 'Title', 'id': 'GUID', 'created_at': 'Created', + 'last_updated': 'Modified', 'author_id': 'AuthorId' }, 'drive_items': { diff --git 
a/ees_sharepoint/sync_enterprise_search.py b/ees_sharepoint/sync_enterprise_search.py index cf199ba..124c40d 100644 --- a/ees_sharepoint/sync_enterprise_search.py +++ b/ees_sharepoint/sync_enterprise_search.py @@ -41,9 +41,9 @@ def index_documents(self, documents): "Error while indexing %s. Error: %s" % (response["id"], response["errors"]) ) - self.logger.info( - f"[{threading.get_ident()}] Successfully indexed {total_documents_indexed} documents to the workplace" - ) + self.logger.info( + f"[{threading.get_ident()}] Successfully indexed {total_documents_indexed} documents to the workplace" + ) except Exception as exception: self.logger.exception(f"Error while indexing the files. Error: {exception}") raise exception @@ -79,4 +79,6 @@ def perform_sync(self): ): self.index_documents(chunk) except Exception as exception: - self.logger.error(f"Error while indexing the documents to the Enterprise Search. Error {exception}") + self.logger.error( + f"Error while indexing the documents to the Enterprise Search. Error {exception}" + ) diff --git a/ees_sharepoint/utils.py b/ees_sharepoint/utils.py index 6e5ca65..7f55f14 100644 --- a/ees_sharepoint/utils.py +++ b/ees_sharepoint/utils.py @@ -5,9 +5,10 @@ # """This module contains uncategorized utility methods.""" -import urllib.parse import copy +import urllib.parse from datetime import datetime + from tika import parser DATETIME_FORMAT = "%Y-%m-%dT%H:%M:%SZ" @@ -91,7 +92,9 @@ def get_storage_with_collection(local_storage, collection): """ storage_with_collection = {"global_keys": {}, "delete_keys": {}} ids_collection = local_storage.load_storage() - storage_with_collection["delete_keys"] = copy.deepcopy(ids_collection.get("global_keys")) + storage_with_collection["delete_keys"] = copy.deepcopy( + ids_collection.get("global_keys") + ) if not ids_collection["global_keys"].get(collection): ids_collection["global_keys"][collection] = { "sites": {}, @@ -99,6 +102,8 @@ def get_storage_with_collection(local_storage, collection): "list_items": {}, "drive_items": {}, } - storage_with_collection["global_keys"][collection] = copy.deepcopy(ids_collection["global_keys"][collection]) + storage_with_collection["global_keys"][collection] = copy.deepcopy( + ids_collection["global_keys"][collection] + ) return storage_with_collection From 99849981338e267221aaa84dd5870da3959d8c00 Mon Sep 17 00:00:00 2001 From: praveen-elastic Date: Mon, 11 Apr 2022 16:45:36 +0530 Subject: [PATCH 8/9] readme.md in place of readme.rst --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b750303..26d216a 100644 --- a/setup.py +++ b/setup.py @@ -26,7 +26,7 @@ description = "" -with open("README.rst") as f: +with open("README.md") as f: description += f.read() + "\n\n" From 7053f7ce4e4dabcb899d36996891886d5589a773 Mon Sep 17 00:00:00 2001 From: praveen-elastic Date: Tue, 12 Apr 2022 18:20:09 +0530 Subject: [PATCH 9/9] Addressing review comments --- ees_sharepoint/full_sync_command.py | 6 ++--- ees_sharepoint/incremental_sync_command.py | 6 ++--- ees_sharepoint/local_storage.py | 28 ++++++++++++++++++++++ ees_sharepoint/utils.py | 27 +-------------------- 4 files changed, 35 insertions(+), 32 deletions(-) diff --git a/ees_sharepoint/full_sync_command.py b/ees_sharepoint/full_sync_command.py index 0fd3643..766a573 100644 --- a/ees_sharepoint/full_sync_command.py +++ b/ees_sharepoint/full_sync_command.py @@ -13,7 +13,7 @@ from .connector_queue import ConnectorQueue from .sync_enterprise_search import SyncEnterpriseSearch from .sync_sharepoint 
import SyncSharepoint -from .utils import get_storage_with_collection, split_date_range_into_chunks +from .utils import split_date_range_into_chunks class FullSyncCommand(BaseCommand): @@ -40,13 +40,13 @@ def start_producer(self, queue): end_time, queue, ) - _, datelist = split_date_range_into_chunks( + datelist = split_date_range_into_chunks( start_time, end_time, thread_count, ) for collection in self.config.get_value("sharepoint.site_collections"): - storage_with_collection = get_storage_with_collection(self.local_storage, collection) + storage_with_collection = self.local_storage.get_storage_with_collection(collection) self.logger.info( "Starting to index all the objects configured in the object field: %s" % (str(self.config.get_value("objects"))) diff --git a/ees_sharepoint/incremental_sync_command.py b/ees_sharepoint/incremental_sync_command.py index 67f7e82..adffb86 100644 --- a/ees_sharepoint/incremental_sync_command.py +++ b/ees_sharepoint/incremental_sync_command.py @@ -17,7 +17,7 @@ from .connector_queue import ConnectorQueue from .sync_enterprise_search import SyncEnterpriseSearch from .sync_sharepoint import SyncSharepoint -from .utils import get_storage_with_collection, split_date_range_into_chunks +from .utils import split_date_range_into_chunks class IncrementalSyncCommand(BaseCommand): @@ -46,12 +46,12 @@ def start_producer(self, queue): end_time, queue, ) - _, datelist = split_date_range_into_chunks( + datelist = split_date_range_into_chunks( start_time, end_time, thread_count, ) - storage_with_collection = get_storage_with_collection(self.local_storage, collection) + storage_with_collection = self.local_storage.get_storage_with_collection(collection) self.logger.info( "Starting to index all the objects configured in the object field: %s" % (str(self.config.get_value("objects"))) diff --git a/ees_sharepoint/local_storage.py b/ees_sharepoint/local_storage.py index 71166e4..af28f93 100644 --- a/ees_sharepoint/local_storage.py +++ b/ees_sharepoint/local_storage.py @@ -1,3 +1,9 @@ +# +# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +# or more contributor license agreements. Licensed under the Elastic License 2.0; +# you may not use this file except in compliance with the Elastic License 2.0. +# +import copy import json import os @@ -37,3 +43,25 @@ def update_storage(self, ids): self.logger.exception( f"Error while updating the doc_id json file. 
Error: {exception}" ) + + def get_storage_with_collection(self, collection): + """Returns a dictionary containing the locally stored IDs of files fetched from SharePoint + :param collection: The SharePoint server collection which is currently being fetched + """ + storage_with_collection = {"global_keys": {}, "delete_keys": {}} + ids_collection = self.load_storage() + storage_with_collection["delete_keys"] = copy.deepcopy( + ids_collection.get("global_keys") + ) + if not ids_collection["global_keys"].get(collection): + ids_collection["global_keys"][collection] = { + "sites": {}, + "lists": {}, + "list_items": {}, + "drive_items": {}, + } + storage_with_collection["global_keys"][collection] = copy.deepcopy( + ids_collection["global_keys"][collection] + ) + + return storage_with_collection diff --git a/ees_sharepoint/utils.py b/ees_sharepoint/utils.py index 7f55f14..314ee01 100644 --- a/ees_sharepoint/utils.py +++ b/ees_sharepoint/utils.py @@ -5,7 +5,6 @@ # """This module contains uncategorized utility methods.""" -import copy import urllib.parse from datetime import datetime @@ -82,28 +81,4 @@ def split_date_range_into_chunks(start_time, end_time, number_of_threads): datelist.append(date_time.strftime(DATETIME_FORMAT)) formatted_end_time = end_time.strftime(DATETIME_FORMAT) datelist.append(formatted_end_time) - return formatted_end_time, datelist - - -def get_storage_with_collection(local_storage, collection): - """Returns a dictionary containing the locally stored IDs of files fetched from network drives - :param local_storage: The object of the local storage used to store the indexed document IDs - :param collection: The SharePoint server collection which is currently being fetched - """ - storage_with_collection = {"global_keys": {}, "delete_keys": {}} - ids_collection = local_storage.load_storage() - storage_with_collection["delete_keys"] = copy.deepcopy( - ids_collection.get("global_keys") - ) - if not ids_collection["global_keys"].get(collection): - ids_collection["global_keys"][collection] = { - "sites": {}, - "lists": {}, - "list_items": {}, - "drive_items": {}, - } - storage_with_collection["global_keys"][collection] = copy.deepcopy( - ids_collection["global_keys"][collection] - ) - - return storage_with_collection + return datelist