
Merge pull request #80 from ravsa/dev-add-validation
Separate improved validation logic for PyPi
Samuzzal Choudhury committed Apr 3, 2019
2 parents dd18000 + d37865f commit 0d7369d
Showing 12 changed files with 224 additions and 158 deletions.
11 changes: 10 additions & 1 deletion Makefile
@@ -15,4 +15,13 @@ run_pytest:

test: venv run_pytest clean

.PHONY: install test venv clean run_pytest
# You can set these variables from the command line.
SPHINXOPTS =
SPHINXBUILD = sphinx-build
SOURCEDIR = docs/source
BUILDDIR = docs/build

%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: install test venv clean run_pytest help Makefile
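The catch-all `%: Makefile` target added above simply forwards any goal (for example `html`) to `sphinx-build` using the `SOURCEDIR`/`BUILDDIR` variables. As a rough illustration only, the same build can be invoked from Python, assuming the Sphinx version pinned in requirements.txt is installed; this is a sketch, not part of the commit:

```python
# Rough equivalent of `make html`, reusing the SOURCEDIR/BUILDDIR values
# declared in the Makefile above.
from sphinx.cmd.build import build_main

# sphinx-build -b html docs/source docs/build/html
exit_code = build_main(["-b", "html", "docs/source", "docs/build/html"])
print("sphinx-build exited with code", exit_code)
```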
13 changes: 10 additions & 3 deletions README.md
@@ -1,10 +1,8 @@
## fabric8-analytics-rudra
This library is the collection of the common utils and tools required by various fabric8-analytics machine learning projects.
This library is a collection of the common utils and tools required by various fabric8-analytics machine learning projects. Documentation is available at [fabric8-analytics-rudra Docs](https://fabric8-analytics-rudra.readthedocs.io). Please follow the [Docs Examples](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html) article when documenting the code.

### Installation:

####

```bash
$ pip install git+https://github.com/fabric8-analytics/fabric8-analytics-rudra
```
@@ -18,6 +16,14 @@ $ make install
$ make test
```

### Documentation:
```bash
$ export PYTHONPATH=/path/to/fabric8-analytics-rudra
```
```bash
$ make html
```

### Footnotes

#### Coding standards
@@ -80,3 +86,4 @@ The script named `check-bashscripts.sh` can be used to check all BASH scripts (i

Please see [the following link](https://github.com/koalaman/shellcheck) for further explanation of how ShellCheck works and which issues it can detect.
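The updated README description points contributors to Google-style docstrings (the napoleon "Docs Examples" article). For reference, a minimal illustrative example of that style; the function itself is hypothetical and not taken from this repository:

```python
def add_numbers(first, second):
    """Add two numbers together.

    Args:
        first (int): The first operand.
        second (int): The second operand.

    Returns:
        int: The sum of ``first`` and ``second``.
    """
    return first + second
```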


13 changes: 10 additions & 3 deletions docs/source/conf.py
@@ -39,10 +39,14 @@
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    'sphinx.ext.autodoc',
    'sphinx.ext.mathjax',
    'sphinx.ext.ifconfig',
    'sphinx.ext.viewcode',
]

# Add any paths that contain templates here, relative to this directory.
templates_path = ['rudratemplates']
templates_path = ['_templates']

# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
@@ -74,7 +78,7 @@
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'alabaster'
html_theme = 'sphinx_rtd_theme'

# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
@@ -85,7 +89,7 @@
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['rudrastatic']
html_static_path = ['_static']

# Custom sidebar templates, must be a dictionary that maps document names
# to template names.
@@ -171,3 +175,6 @@

# A list of files that should not be packed into the epub file.
epub_exclude_files = ['search.html']


# -- Extension configuration -------------------------------------------------
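Because the new `sphinx.ext.autodoc` extension imports the documented modules at build time, the `rudra` package must be importable; the README achieves this by exporting `PYTHONPATH`. A common alternative, shown here only as a hypothetical sketch and not part of this diff, is to extend `sys.path` near the top of `docs/source/conf.py`:

```python
# Hypothetical conf.py addition: make the rudra package importable for autodoc
# without exporting PYTHONPATH manually (assumes the docs/source/conf.py layout).
import os
import sys

sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
```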
105 changes: 104 additions & 1 deletion docs/source/index.rst
@@ -1,5 +1,5 @@
.. fabric8-analytics-rudra documentation master file, created by
   sphinx-quickstart on Wed Mar 20 21:38:11 2019.
   sphinx-quickstart on Wed Mar 20 23:00:30 2019.
   You can adapt this file completely to your liking, but it should at least
   contain the root `toctree` directive.
@@ -18,3 +18,106 @@ Indices and tables
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`

.. toctree::
   :maxdepth: 3

.. automodule:: rudra.data_store
   :members:
   :undoc-members:
   :show-inheritance:

.. automodule:: rudra.data_store.aws
   :members:
   :undoc-members:
   :show-inheritance:

.. automodule:: rudra.data_store.local_data_store
   :members:
   :undoc-members:
   :show-inheritance:

.. automodule:: rudra.data_store.bigquery
   :members:
   :undoc-members:
   :show-inheritance:

.. automodule:: rudra.data_store.bigquery.base
   :members:
   :undoc-members:
   :show-inheritance:

.. automodule:: rudra.data_store.bigquery.maven_bigquery
   :members:
   :undoc-members:
   :show-inheritance:

.. automodule:: rudra.data_store.bigquery.pypi_bigquery
   :members:
   :undoc-members:
   :show-inheritance:

.. automodule:: rudra.data_store.bigquery.npm_bigquery
   :members:
   :undoc-members:
   :show-inheritance:

.. automodule:: rudra.deployments
   :members:
   :undoc-members:
   :show-inheritance:

.. automodule:: rudra.deployments.emr_scripts
   :members:
   :undoc-members:
   :show-inheritance:

.. automodule:: rudra.deployments.emr_scripts.emr_config
   :members:
   :undoc-members:
   :show-inheritance:

.. automodule:: rudra.deployments.emr_scripts.emr_script_builder
   :members:
   :undoc-members:
   :show-inheritance:

.. automodule:: rudra.deployments.emr_scripts.maven_emr
   :members:
   :undoc-members:
   :show-inheritance:

.. automodule:: rudra.deployments.emr_scripts.npm_emr
   :members:
   :undoc-members:
   :show-inheritance:

.. automodule:: rudra.deployments.emr_scripts.pypi_emr
   :members:
   :undoc-members:
   :show-inheritance:

.. automodule:: rudra.utils
   :members:
   :undoc-members:
   :show-inheritance:

.. automodule:: rudra.utils.validation
   :members:
   :undoc-members:
   :show-inheritance:

.. automodule:: rudra.utils.helper
   :members:
   :undoc-members:
   :show-inheritance:

.. automodule:: rudra.utils.mercator
   :members:
   :undoc-members:
   :show-inheritance:

.. automodule:: rudra.utils.pypi_parser
   :members:
   :undoc-members:
   :show-inheritance:
15 changes: 15 additions & 0 deletions requirements.txt
@@ -1,7 +1,9 @@
alabaster==0.7.12
asn1crypto==0.24.0
atomicwrites==1.3.0
attrs==18.2.0
aws-xray-sdk==0.95
Babel==2.6.0
beautifulsoup4==4.7.1
boto==2.49.0
boto3==1.7.84
@@ -26,6 +28,7 @@ google-cloud-core==0.29.1
google-resumable-media==0.3.2
googleapis-common-protos==1.5.8
idna==2.8
imagesize==1.1.0
Jinja2==2.10
jmespath==0.9.3
jsondiff==1.1.1
@@ -36,6 +39,7 @@ mock==2.0.0
more-itertools==6.0.0
moto==1.3.6
numpy==1.16.1
packaging==19.0
pbr==5.1.2
pluggy==0.8.1
protobuf==3.7.0
@@ -46,6 +50,8 @@ pyasn1-modules==0.2.4
pycodestyle==2.5.0
pycparser==2.19
pycryptodome==3.7.3
Pygments==2.3.1
pyparsing==2.3.1
pytest==4.3.0
python-dateutil==2.8.0
python-jose==2.0.2
@@ -59,7 +65,16 @@ ruamel.yaml==0.15.88
s3transfer==0.1.13
scipy==1.2.1
six==1.12.0
snowballstemmer==1.2.1
soupsieve==1.8
Sphinx==2.0.0
sphinx-rtd-theme==0.4.3
sphinxcontrib-applehelp==1.0.1
sphinxcontrib-devhelp==1.0.1
sphinxcontrib-htmlhelp==1.0.1
sphinxcontrib-jsmath==1.0.1
sphinxcontrib-qthelp==1.0.2
sphinxcontrib-serializinghtml==1.1.1
urllib3==1.24.1
websocket-client==0.54.0
Werkzeug==0.14.1
37 changes: 0 additions & 37 deletions rudra/data_store/bigquery/base.py
@@ -1,14 +1,10 @@
"""Implementation Bigquery builder base."""
import os
import time
from collections import Counter

from google.cloud import bigquery
from requests import Session
from requests_futures.sessions import FuturesSession

from rudra import logger
from rudra.utils.helper import CacheDict
from rudra.data_store.aws import AmazonS3


@@ -88,40 +84,7 @@ class DataProcessing:

    def __init__(self, s3_client=None):
        """Initialize DataProcessing object."""
        self.data = None
        self.cache = CacheDict(max_len=50000)
        self.pkg_counter = Counter()
        self.s3_client = s3_client
        self.req_session = FuturesSession(session=Session())

    def async_fetch(self, url,
                    method='GET',
                    others=None):
        """Fetch urls asynchronously."""
        if url in self.cache:
            self.responses.append(self.cache[url])
        else:
            self.process_queue.append(
                (others, url, self.req_session.request(method, url)))

    def is_fetch_done(self, callback=lambda x: x):
        """Check whether all the requests are processed or not."""
        _flag = True
        for resp in self.process_queue:
            _flag = False
            others, url, req_obj = resp
            logger.info("other:{}, url:{}, req_obj:{}".format(others, url, req_obj))

            if url in self.cache:
                req_obj.cancel()
                self.process_queue.remove(resp)
                self.responses.append(self.cache[url])
            elif req_obj.done():
                req_obj.cancel()
                self.process_queue.remove(resp)
                self.cache[url] = (others, callback(req_obj))
                self.responses.append((others, callback(req_obj)))
        return _flag

    def update_s3_bucket(self, data,
                         bucket_name,
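The deleted `async_fetch`/`is_fetch_done` methods implemented a dispatch-and-poll pattern on top of requests-futures: each URL was sent through a `FuturesSession`, queued, and later drained once its future completed. For context only, a minimal sketch of that underlying library pattern (assuming `requests-futures` is installed; this is not the removed rudra code):

```python
# Minimal requests-futures sketch: dispatch requests concurrently, then
# collect the responses as their futures complete.
from requests import Session
from requests_futures.sessions import FuturesSession

session = FuturesSession(session=Session())
urls = ["https://pypi.org/pypi/flask/json", "https://pypi.org/pypi/requests/json"]
futures = [(url, session.get(url)) for url in urls]

for url, future in futures:
    response = future.result()  # blocks until this particular request finishes
    print(url, response.status_code)
```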
55 changes: 10 additions & 45 deletions rudra/data_store/bigquery/pypi_bigquery.py
@@ -6,6 +6,7 @@
from rudra.data_store.bigquery.base import BigqueryBuilder
from rudra.utils.pypi_parser import pip_req
from rudra.data_store.bigquery.base import DataProcessing
from rudra.utils.validation import BQValidation
from rudra import logger


@@ -46,52 +47,31 @@ def __init__(self, big_query_instance=None, s3_client=None):
        self.filename = '{}/big-query-data/collated.json'.format(
            os.getenv('DEPLOYMENT_PREFIX', 'dev'))

    def process(self):
    def process(self, validate=False):
        """Process Pypi Bigquery response data."""
        start = time.monotonic()
        bq_validation = BQValidation()
        logger.info("Running Bigquery for pypi synchronously")
        self.big_query_instance.run_query_sync()

        logger.info("fetching bigquery result.")
        for content in self.big_query_instance.get_result():
            self.big_query_content.append(content)
        logger.info("collected manifests: {}".format(len(self.big_query_content)))
logger.info("Succefully retrieved data from Bigquery, time:{}".format(
time.monotonic() - start))
        base_url_pypi = 'https://pypi.org/pypi/{pkg}/json'
        logger.info("Starting package cleaning")
        start_process_time = time.monotonic()
        for idx, obj in enumerate(self.big_query_content):
        for idx, obj in enumerate(self.big_query_instance.get_result()):
            start = time.monotonic()
            content = obj.get('content')
            self.process_queue = list()
            self.responses = list()
            packages = []
            if content:
                try:
                    for name in pip_req.parse_requirements(content):
                        logger.info("searching pkg:`{}` in Python Package Index \
                                    Repository" .format(name))
                        self.async_fetch(base_url_pypi.format(pkg=name), others=name)
                    packages = sorted({p for p in pip_req.parse_requirements(content)})
                    if validate:
                        packages = sorted(bq_validation.validate_pypi(packages))
                except Exception as _exc:
                    logger.error("IGNORE: {}".format(_exc))
                    logger.error("Failed to parse content data {}".format(content))

            try:
                while not self.is_fetch_done(lambda x: x.result().status_code):
                    # hold the process until all request finishes.
                    time.sleep(0.001)
            except Exception as _exc:
                logger.error("IGNORE: {}".format(_exc))
            # discard process_queue
            self.process_queue = []
            self.responses = []
            packages = sorted(set(self.handle_response()))
            if packages:
                pkg_string = ', '.join(packages)
                logger.info("PACKAGES: {}".format(pkg_string))
                self.counter.update([pkg_string])
            logger.info("Processed content in time: {} process:{}/{}".format(
                (time.monotonic() - start), idx, len(self.big_query_content)))
            logger.info("Processed content in time: {} counter:{}".format(
                (time.monotonic() - start), idx))
        logger.info("Processed All the manifests in time: {}".format(
            time.monotonic() - start_process_time))

Expand All @@ -101,18 +81,3 @@ def process(self):
                              filename=self.filename)

logger.info("Succefully Processed the PyPiBigQuery")

    def handle_response(self):
        """Process and get the response of async requests."""
        results = list()
        for resp in self.responses:
            pkg_name, req_obj = resp
            if isinstance(req_obj, int):
                if req_obj == 200:
                    results.append(pkg_name)
            elif req_obj.status_code == 200:
                results.append(pkg_name)
                logger.info("Received status:{} for pkg:{}".format(req_obj.status_code, pkg_name))
            else:
                logger.info("Received status:{} for pkg:{}".format(req_obj.status_code, pkg_name))
        return results
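With this change, the per-package HTTP lookups above are replaced by the new offline `BQValidation` helper from `rudra/utils/validation.py`, which is among the 12 changed files but is not shown in this excerpt. As a rough illustration of the kind of filtering a `validate_pypi` helper performs, here is a hypothetical sketch; the class name, constructor, and data source are assumptions, not the PR's actual implementation:

```python
# Hypothetical sketch of an offline PyPI-name validator; the real BQValidation
# in rudra/utils/validation.py may be implemented differently.
class BQValidationSketch:
    def __init__(self, known_packages):
        # Normalize once so each lookup is a set membership test rather than
        # one HTTP request per package name.
        self.known = {name.lower() for name in known_packages}

    def validate_pypi(self, packages):
        """Return only the package names present in the known-package set."""
        return [pkg for pkg in packages if pkg.lower() in self.known]


# Usage: filter parsed requirement names before updating the manifest counter.
validator = BQValidationSketch(["flask", "numpy", "requests"])
print(validator.validate_pypi(["flask", "not-a-real-pkg", "requests"]))  # ['flask', 'requests']
```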
