Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

added concurrency and other changes from stacked PRs #30

Merged
merged 13 commits into from
Apr 13, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions ees_sharepoint/adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,15 @@
'lists': {
'created_at': 'Created',
'id': 'Id',
'last_updated': 'LastItemModifiedDate',
'relative_url': 'ParentWebUrl',
'title': 'Title'
},
'list_items': {
'title': 'Title',
'id': 'GUID',
'created_at': 'Created',
'last_updated': 'Modified',
'author_id': 'AuthorId'
},
'drive_items': {
Expand Down
44 changes: 40 additions & 4 deletions ees_sharepoint/base_command.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,13 @@
from functools import cached_property
except ImportError:
from cached_property import cached_property

from concurrent.futures import ThreadPoolExecutor, as_completed

from elastic_enterprise_search import WorkplaceSearch

from .configuration import Configuration
from .local_storage import LocalStorage
from .sharepoint_client import SharePoint


Expand All @@ -27,13 +31,14 @@ class BaseCommand:

Inherit from it and implement 'execute' method, then add
code to cli.py to register this command."""

def __init__(self, args):
    """Store the parsed command-line arguments for use by the command.

    :param args: namespace of parsed CLI arguments (argparse.Namespace) —
        presumably produced by cli.py; attributes like `user`/`password`
        are probed later with hasattr.
    """
    self.args = args

def execute(self):
    """Run the command.

    Abstract hook: each concrete command subclass provides its own
    implementation with the logic specific to that command."""
    raise NotImplementedError

Expand All @@ -44,7 +49,7 @@ def logger(self):
log level will be determined by the configuration
setting log_level.
"""
log_level = self.config.get_value('log_level')
log_level = self.config.get_value("log_level")
logger = logging.getLogger(__name__)
logger.propagate = False
logger.setLevel(log_level)
Expand All @@ -69,13 +74,14 @@ def workplace_search_client(self):
args = self.args
host = self.config.get_value("enterprise_search.host_url")

if hasattr(args, 'user') and args.user:
if hasattr(args, "user") and args.user:
return WorkplaceSearch(
f"{host}/api/ws/v1/sources", http_auth=(args.user, args.password)
)
else:
return WorkplaceSearch(
f"{host}/api/ws/v1/sources", http_auth=self.config.get_value("workplace_search.api_key")
f"{host}/api/ws/v1/sources",
http_auth=self.config.get_value("workplace_search.api_key"),
)

@cached_property
Expand All @@ -88,3 +94,33 @@ def config(self):
def sharepoint_client(self):
"""Get the sharepoint client instance for the running command."""
return SharePoint(self.config, self.logger)

@staticmethod
def producer(thread_count, func, args, items, wait=False):
"""Apply async calls using multithreading to the targeted function
:param thread_count: Total number of threads to be spawned
:param func: The target function on which the async calls would be made
:param args: Arguments for the targeted function
:param items: iterator of partition
:param wait: wait until job completes if true, otherwise returns immediately
"""
with ThreadPoolExecutor(max_workers=thread_count) as executor:
futures = (executor.submit(func, *args, item) for item in items)
if wait:
result = [future.result() for future in as_completed(futures)]
return result

@staticmethod
def consumer(thread_count, func):
"""Apply async calls using multithreading to the targeted function
:param thread_count: Total number of threads to be spawned
:param func: The target function on which the async calls would be made
"""
with ThreadPoolExecutor(max_workers=thread_count) as executor:
for _ in range(thread_count):
executor.submit(func)

@cached_property
def local_storage(self):
    """Lazily build (and cache) the local-storage helper used to fetch
    and update document ids stored on disk."""
    storage = LocalStorage(self.logger)
    return storage
38 changes: 38 additions & 0 deletions ees_sharepoint/connector_queue.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#
# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
# or more contributor license agreements. Licensed under the Elastic License 2.0;
# you may not use this file except in compliance with the Elastic License 2.0.
#
import multiprocessing
from multiprocessing.queues import Queue

BATCH_SIZE = 100


class ConnectorQueue(Queue):
    """Multiprocessing queue extended with connector-specific message helpers."""

    def __init__(self, logger):
        context = multiprocessing.get_context()
        self.logger = logger
        super().__init__(ctx=context)

    def end_signal(self):
        """Enqueue a terminate signal indicating the queue can be closed."""
        self.put({"type": "signal_close"})

    def put_checkpoint(self, key, checkpoint_time, indexing_type):
        """Enqueue a checkpoint message the consumer uses to update the checkpoint file.

        :param key: the key of the checkpoint dictionary
        :param checkpoint_time: the end time stored in the checkpoint as {'key': 'checkpoint_time'}
        :param indexing_type: the type of the indexing, i.e. Full or Incremental
        """
        self.put(
            {
                "type": "checkpoint",
                "data": (key, checkpoint_time, indexing_type),
            }
        )
8 changes: 4 additions & 4 deletions ees_sharepoint/deletion_sync_command.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
import requests

from .base_command import BaseCommand
from .utils import split_in_chunks
from .utils import split_list_into_buckets

IDS_PATH = os.path.join(os.path.dirname(__file__), 'doc_id.json')
# By default, Enterprise Search configuration has a maximum allowed limit set to 100 documents for an api request
Expand Down Expand Up @@ -58,7 +58,7 @@ def deindexing_items(self, collection, ids, key):
if resp.status_code == requests.codes['not_found'] or result == []:
doc.append(item_id)
if doc:
for chunk in split_in_chunks(doc, BATCH_SIZE):
for chunk in split_list_into_buckets(doc, BATCH_SIZE):
self.workplace_search_client.delete_documents(
content_source_id=self.ws_source,
document_ids=chunk)
Expand Down Expand Up @@ -101,7 +101,7 @@ def deindexing_lists(self, collection, ids):
resp = self.sharepoint_client.get(url, '', "deindex")
if resp is not None and resp.status_code == requests.codes['not_found']:
doc.append(list_id)
for chunk in split_in_chunks(doc, BATCH_SIZE):
for chunk in split_list_into_buckets(doc, BATCH_SIZE):
self.workplace_search_client.delete_documents(
content_source_id=self.ws_source,
document_ids=chunk)
Expand Down Expand Up @@ -132,7 +132,7 @@ def deindexing_sites(self, collection, ids):
resp = self.sharepoint_client.get(url, '', "deindex")
if resp is not None and resp.status_code == requests.codes['not_found']:
doc.append(site_id)
for chunk in split_in_chunks(doc, BATCH_SIZE):
for chunk in split_list_into_buckets(doc, BATCH_SIZE):
self.workplace_search_client.delete_documents(
content_source_id=self.ws_source,
document_ids=chunk)
Expand Down
Loading