diff --git a/Makefile b/Makefile index 854270d..14f3b48 100644 --- a/Makefile +++ b/Makefile @@ -15,4 +15,13 @@ run_pytest: test: venv run_pytest clean -.PHONY: install test venv clean run_pytest +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +SOURCEDIR = docs/source +BUILDDIR = docs/build + +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: install test venv clean run_pytest help Makefile diff --git a/README.md b/README.md index e2346b1..583a393 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,8 @@ ## fabric8-analytics-rudra -This library is the collection of the common utils and tools required by various fabric8-analytics machine learning projects. +This library is the collection of the common utils and tools required by various fabric8-analytics machine learning projects. Documentation is available at [fabric8-analytics-rudra Docs](https://fabric8-analytics-rudra.readthedocs.io). Please follow this article [Docs Examples](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html) for documenting the code. ### Installation: -#### - ```bash $ pip install git+https://github.com/fabric8-analytics/fabric8-analytics-rudra ``` @@ -18,6 +16,14 @@ $ make install $ make test ``` +### Documentation: +```bash +$ export PYTHONPATH=/path/to/fabric8-analytics-rudra +``` +```bash +$ make html +``` + ### Footnotes #### Coding standards @@ -80,3 +86,4 @@ The script named `check-bashscripts.sh` can be used to check all BASH scripts (i Please see [the following link](https://github.com/koalaman/shellcheck) for further explanation, how the ShellCheck works and which issues can be detected. + diff --git a/docs/source/conf.py b/docs/source/conf.py index bee24df..16e1471 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -39,10 +39,14 @@ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. 
extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.mathjax', + 'sphinx.ext.ifconfig', + 'sphinx.ext.viewcode', ] # Add any paths that contain templates here, relative to this directory. -templates_path = ['rudratemplates'] +templates_path = ['_templates'] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: @@ -74,7 +78,7 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -html_theme = 'alabaster' +html_theme = 'sphinx_rtd_theme' # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the @@ -85,7 +89,7 @@ # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['rudrastatic'] +html_static_path = ['_static'] # Custom sidebar templates, must be a dictionary that maps document names # to template names. @@ -171,3 +175,6 @@ # A list of files that should not be packed into the epub file. epub_exclude_files = ['search.html'] + + +# -- Extension configuration ------------------------------------------------- diff --git a/docs/source/index.rst b/docs/source/index.rst index 77a6d4e..57f07b2 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -1,5 +1,5 @@ .. fabric8-analytics-rudra documentation master file, created by - sphinx-quickstart on Wed Mar 20 21:38:11 2019. + sphinx-quickstart on Wed Mar 20 23:00:30 2019. You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. @@ -18,3 +18,106 @@ Indices and tables * :ref:`genindex` * :ref:`modindex` * :ref:`search` + +.. toctree:: + :maxdepth: 3 + +.. automodule:: rudra.data_store + :members: + :undoc-members: + :show-inheritance: + +.. 
automodule:: rudra.data_store.aws + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: rudra.data_store.local_data_store + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: rudra.data_store.bigquery + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: rudra.data_store.bigquery.base + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: rudra.data_store.bigquery.maven_bigquery + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: rudra.data_store.bigquery.pypi_bigquery + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: rudra.data_store.bigquery.npm_bigquery + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: rudra.deployments + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: rudra.deployments.emr_scripts + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: rudra.deployments.emr_scripts.emr_config + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: rudra.deployments.emr_scripts.emr_script_builder + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: rudra.deployments.emr_scripts.maven_emr + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: rudra.deployments.emr_scripts.npm_emr + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: rudra.deployments.emr_scripts.pypi_emr + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: rudra.utils + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: rudra.utils.validation + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: rudra.utils.helper + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: rudra.utils.mercator + :members: + :undoc-members: + :show-inheritance: + +.. 
automodule:: rudra.utils.pypi_parser + :members: + :undoc-members: + :show-inheritance: diff --git a/requirements.txt b/requirements.txt index ba9bab5..7ee48a9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,9 @@ +alabaster==0.7.12 asn1crypto==0.24.0 atomicwrites==1.3.0 attrs==18.2.0 aws-xray-sdk==0.95 +Babel==2.6.0 beautifulsoup4==4.7.1 boto==2.49.0 boto3==1.7.84 @@ -26,6 +28,7 @@ google-cloud-core==0.29.1 google-resumable-media==0.3.2 googleapis-common-protos==1.5.8 idna==2.8 +imagesize==1.1.0 Jinja2==2.10 jmespath==0.9.3 jsondiff==1.1.1 @@ -36,6 +39,7 @@ mock==2.0.0 more-itertools==6.0.0 moto==1.3.6 numpy==1.16.1 +packaging==19.0 pbr==5.1.2 pluggy==0.8.1 protobuf==3.7.0 @@ -46,6 +50,8 @@ pyasn1-modules==0.2.4 pycodestyle==2.5.0 pycparser==2.19 pycryptodome==3.7.3 +Pygments==2.3.1 +pyparsing==2.3.1 pytest==4.3.0 python-dateutil==2.8.0 python-jose==2.0.2 @@ -59,7 +65,16 @@ ruamel.yaml==0.15.88 s3transfer==0.1.13 scipy==1.2.1 six==1.12.0 +snowballstemmer==1.2.1 soupsieve==1.8 +Sphinx==2.0.0 +sphinx-rtd-theme==0.4.3 +sphinxcontrib-applehelp==1.0.1 +sphinxcontrib-devhelp==1.0.1 +sphinxcontrib-htmlhelp==1.0.1 +sphinxcontrib-jsmath==1.0.1 +sphinxcontrib-qthelp==1.0.2 +sphinxcontrib-serializinghtml==1.1.1 urllib3==1.24.1 websocket-client==0.54.0 Werkzeug==0.14.1 diff --git a/rudra/data_store/bigquery/base.py b/rudra/data_store/bigquery/base.py index 2b4ec7e..868df2c 100644 --- a/rudra/data_store/bigquery/base.py +++ b/rudra/data_store/bigquery/base.py @@ -1,14 +1,10 @@ """Implementation Bigquery builder base.""" import os import time -from collections import Counter from google.cloud import bigquery -from requests import Session -from requests_futures.sessions import FuturesSession from rudra import logger -from rudra.utils.helper import CacheDict from rudra.data_store.aws import AmazonS3 @@ -88,40 +84,7 @@ class DataProcessing: def __init__(self, s3_client=None): """Initialize DataProcessing object.""" - self.data = None - self.cache = 
CacheDict(max_len=50000) - self.pkg_counter = Counter() self.s3_client = s3_client - self.req_session = FuturesSession(session=Session()) - - def async_fetch(self, url, - method='GET', - others=None): - """Fetch urls asynchronously.""" - if url in self.cache: - self.responses.append(self.cache[url]) - else: - self.process_queue.append( - (others, url, self.req_session.request(method, url))) - - def is_fetch_done(self, callback=lambda x: x): - """Check whether all the requests are processed or not.""" - _flag = True - for resp in self.process_queue: - _flag = False - others, url, req_obj = resp - logger.info("other:{}, url:{}, req_obj:{}".format(others, url, req_obj)) - - if url in self.cache: - req_obj.cancel() - self.process_queue.remove(resp) - self.responses.append(self.cache[url]) - elif req_obj.done(): - req_obj.cancel() - self.process_queue.remove(resp) - self.cache[url] = (others, callback(req_obj)) - self.responses.append((others, callback(req_obj))) - return _flag def update_s3_bucket(self, data, bucket_name, diff --git a/rudra/data_store/bigquery/pypi_bigquery.py b/rudra/data_store/bigquery/pypi_bigquery.py index ab6c88c..69cefd1 100644 --- a/rudra/data_store/bigquery/pypi_bigquery.py +++ b/rudra/data_store/bigquery/pypi_bigquery.py @@ -6,6 +6,7 @@ from rudra.data_store.bigquery.base import BigqueryBuilder from rudra.utils.pypi_parser import pip_req from rudra.data_store.bigquery.base import DataProcessing +from rudra.utils.validation import BQValidation from rudra import logger @@ -46,52 +47,31 @@ def __init__(self, big_query_instance=None, s3_client=None): self.filename = '{}/big-query-data/collated.json'.format( os.getenv('DEPLOYMENT_PREFIX', 'dev')) - def process(self): + def process(self, validate=False): """Process Pypi Bigquery response data.""" - start = time.monotonic() + bq_validation = BQValidation() logger.info("Running Bigquery for pypi synchronously") self.big_query_instance.run_query_sync() - - logger.info("fetching bigquery result.") - for 
content in self.big_query_instance.get_result(): - self.big_query_content.append(content) - logger.info("collected manifests: {}".format(len(self.big_query_content))) - logger.info("Succefully retrieved data from Bigquery, time:{}".format( - time.monotonic() - start)) - base_url_pypi = 'https://pypi.org/pypi/{pkg}/json' - logger.info("Starting package cleaning") start_process_time = time.monotonic() - for idx, obj in enumerate(self.big_query_content): + for idx, obj in enumerate(self.big_query_instance.get_result()): start = time.monotonic() content = obj.get('content') - self.process_queue = list() - self.responses = list() + packages = [] if content: try: - for name in pip_req.parse_requirements(content): - logger.info("searching pkg:`{}` in Python Package Index \ - Repository" .format(name)) - self.async_fetch(base_url_pypi.format(pkg=name), others=name) + packages = sorted({p for p in pip_req.parse_requirements(content)}) + if validate: + packages = sorted(bq_validation.validate_pypi(packages)) except Exception as _exc: logger.error("IGNORE: {}".format(_exc)) logger.error("Failed to parse content data {}".format(content)) - try: - while not self.is_fetch_done(lambda x: x.result().status_code): - # hold the process until all request finishes. 
- time.sleep(0.001) - except Exception as _exc: - logger.error("IGNORE: {}".format(_exc)) - # discard process_queue - self.process_queue = [] - self.responses = [] - packages = sorted(set(self.handle_response())) if packages: pkg_string = ', '.join(packages) logger.info("PACKAGES: {}".format(pkg_string)) self.counter.update([pkg_string]) - logger.info("Processed content in time: {} process:{}/{}".format( - (time.monotonic() - start), idx, len(self.big_query_content))) + logger.info("Processed content in time: {} counter:{}".format( + (time.monotonic() - start), idx)) logger.info("Processed All the manifests in time: {}".format( time.monotonic() - start_process_time)) @@ -101,18 +81,3 @@ def process(self): filename=self.filename) logger.info("Succefully Processed the PyPiBigQuery") - - def handle_response(self): - """Process and get the response of async requests.""" - results = list() - for resp in self.responses: - pkg_name, req_obj = resp - if isinstance(req_obj, int): - if req_obj == 200: - results.append(pkg_name) - elif req_obj.status_code == 200: - results.append(pkg_name) - logger.info("Received status:{} for pkg:{}".format(req_obj.status_code, pkg_name)) - else: - logger.info("Received status:{} for pkg:{}".format(req_obj.status_code, pkg_name)) - return results diff --git a/rudra/deployments/emr_scripts/__init__.py b/rudra/deployments/emr_scripts/__init__.py index 1078053..4467be4 100644 --- a/rudra/deployments/emr_scripts/__init__.py +++ b/rudra/deployments/emr_scripts/__init__.py @@ -4,4 +4,4 @@ from rudra.deployments.emr_scripts.npm_emr import NpmEMR from rudra.deployments.emr_scripts.pypi_emr import PyPiEMR -__all__ = [MavenEMR, NpmEMR, PyPiEMR] +__all__ = ['MavenEMR', 'NpmEMR', 'PyPiEMR'] diff --git a/rudra/utils/validation.py b/rudra/utils/validation.py index a5905ea..23dc23a 100644 --- a/rudra/utils/validation.py +++ b/rudra/utils/validation.py @@ -1,7 +1,9 @@ """Validation Utility module.""" import urllib.request as request +import xmlrpc.client as 
xmlrpclib from rudra import logger +from pip._vendor.distlib.util import normalize_name as nn def check_field_exists(input_data, fields): @@ -27,3 +29,33 @@ def check_url_alive(url, accept_codes=[401]): except Exception as exc: logger.debug("Unable to reach url", extra={"exception": str(exc)}) return False + + +class BQValidation: + """Add validation for ecosystems.""" + + def __init__(self): + """Initialize the BQValidation object.""" + pypi_org = xmlrpclib.ServerProxy('https://pypi.python.org/pypi') + self.pypi_org_packages = {nn(p) for p in pypi_org.list_packages()} + + def validate_pypi(self, content): + """Validate python packages. + + Attributes: + content (:obj:`str` or [:obj:`str`] or {:obj:`str`}): + list/set of packages or package str + + Returns: + [:obj:`str`]: list of valid packages. + + Raises: + ValueError: if content is not a type of :obj:`str` or :obj:`list` + + """ + if not isinstance(content, (str, list, set, frozenset)): + raise ValueError("content type should be string or set/list of string") + + content = [content] if isinstance(content, str) else content + + return list(self.pypi_org_packages.intersection(content)) diff --git a/tests/data_store/bigquery/test_base.py b/tests/data_store/bigquery/test_base.py index 13aeb62..72d6c57 100644 --- a/tests/data_store/bigquery/test_base.py +++ b/tests/data_store/bigquery/test_base.py @@ -7,8 +7,6 @@ import mock import pytest -from rudra.data_store.bigquery.base import DataProcessing - os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/path-to-credentials' @@ -150,59 +148,3 @@ def test_iter_(self, _builder_client): assert job_id is not None for d in _builder_client: assert not set(['id', 'name', 'content']).difference(d) - - -class TestDataProcessing: - - def test_async_fetch(self): - dpro = DataProcessing() - pkg = 'flask' - url = 'https://pypi.org/pypi/{p}/json'.format(p=pkg) - dpro.process_queue = list() - dpro.responses = list() - for _ in range(10): - dpro.async_fetch(url, others='flask') - assert 
len(dpro.process_queue) == 10 - assert len(dpro.responses) == 0 - while dpro.process_queue: - _pkg, _url, _obj = dpro.process_queue[-1] - assert _pkg == pkg - assert _url == url - if _obj.done(): - code = _obj.result().status_code - assert code == 200 - dpro.process_queue.pop() - - def test_is_fetch_done(self): - dpro = DataProcessing() - pkg = 'flask' - url = 'https://pypi.org/pypi/{p}/json'.format(p=pkg) - dpro.process_queue = list() - dpro.responses = list() - num = 10 - for _ in range(num): - dpro.async_fetch(url, others='flask') - assert len(dpro.process_queue) == num - assert len(dpro.responses) == 0 - while not dpro.is_fetch_done(lambda x: x.result().status_code): - pq_len = len(dpro.process_queue) - if pq_len < num: - assert len(dpro.responses) == num - pq_len - - def test_caching(self): - dpro = DataProcessing() - pkg = 'flask' - url = 'https://pypi.org/pypi/{p}/json'.format(p=pkg) - dpro.process_queue = list() - dpro.responses = list() - num = 10000 - for _ in range(num): - dpro.async_fetch(url, others='flask') - assert len(dpro.process_queue) == num - assert len(dpro.responses) == 0 - while not dpro.is_fetch_done(lambda x: x.result().status_code): - pq_len = len(dpro.process_queue) - if pq_len < num: - assert len(dpro.responses) == num - pq_len - assert url in dpro.cache - assert dpro.cache[url] == (pkg, 200) diff --git a/tests/data_store/bigquery/test_pypi_bigquery.py b/tests/data_store/bigquery/test_pypi_bigquery.py index b0fecd7..57a4a18 100644 --- a/tests/data_store/bigquery/test_pypi_bigquery.py +++ b/tests/data_store/bigquery/test_pypi_bigquery.py @@ -91,9 +91,9 @@ def test_get_result_async(self, _pypi_bigquery_client): class TestPyPiDataProcessing: - def test_process(self, _data_process_client): + def test_process_with_validation(self, _data_process_client): dp_client, s3_client = _data_process_client - dp_client.process() + dp_client.process(validate=True) data = s3_client.read_json_file(dp_client.filename) assert 'pypi' in data assert 
len(data['pypi']) > 0 @@ -101,12 +101,18 @@ assert 'boto' in k assert 'chardet' in k assert 'flask' in k + assert 'unknown1' not in k assert v == 2 - def test_handle_response(self, _data_process_client): + def test_process_without_validation(self, _data_process_client): dp_client, s3_client = _data_process_client - dp_client.responses = [ - ('flask', 200), ('django', type('Request', (), {"status_code": 200}))] - result = dp_client.handle_response() - assert len(result) == 2 - assert 'flask' in result and 'django' in result + dp_client.process(validate=False) + data = s3_client.read_json_file(dp_client.filename) + assert 'pypi' in data + assert len(data['pypi']) > 0 + for k, v in data['pypi'].items(): + assert 'boto' in k + assert 'chardet' in k + assert 'flask' in k + assert 'unknown1' in k + assert v == 2 diff --git a/tests/utils/test_validation.py index 948215e..729df37 100644 --- a/tests/utils/test_validation.py +++ b/tests/utils/test_validation.py @@ -1,4 +1,4 @@ -from rudra.utils.validation import check_field_exists, check_url_alive +from rudra.utils.validation import check_field_exists, check_url_alive, BQValidation import pytest @@ -20,3 +20,20 @@ def test_check_url_alive(): assert check_url_alive(url) url = 'https://234j23ksadasca.com' assert not check_url_alive(url) + + +class TestBQValidation: + + @staticmethod + def test_validate_pypi_content(): + bq_validation = BQValidation() + content = 'flask' + assert not set(bq_validation.validate_pypi(content)).difference([content]) + content = ['flask', 'django', 'unknownpkg'] + assert not set(['flask', 'django']).difference(bq_validation.validate_pypi(content)) + content = {'flask', 'django'} + assert not content.difference(bq_validation.validate_pypi(content)) + content = frozenset(['flask', 'django']) + assert not content.difference(bq_validation.validate_pypi(content)) + with pytest.raises(ValueError): + 
bq_validation.validate_pypi({"name": "flask"})