
Merge pull request #80 from ravsa/dev-add-validation
Separate improved validation logic for PyPi
Samuzzal Choudhury committed Apr 3, 2019
2 parents dd18000 + d37865f commit 0d7369d
Showing 12 changed files with 224 additions and 158 deletions.
11 changes: 10 additions & 1 deletion Makefile
@@ -15,4 +15,13 @@ run_pytest:

test: venv run_pytest clean

.PHONY: install test venv clean run_pytest
# You can set these variables from the command line.
SPHINXOPTS =
SPHINXBUILD = sphinx-build
SOURCEDIR = docs/source
BUILDDIR = docs/build

%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: install test venv clean run_pytest help Makefile
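The catch-all `%: Makefile` target added above simply forwards any goal (for example `html`) to `sphinx-build` using the `SOURCEDIR`/`BUILDDIR` variables. As a rough illustration only, the same build can be invoked from Python, assuming the Sphinx version pinned in requirements.txt is installed; this is a sketch, not part of the commit:

```python
# Rough equivalent of `make html`, reusing the SOURCEDIR/BUILDDIR values
# declared in the Makefile above.
from sphinx.cmd.build import build_main

# sphinx-build -b html docs/source docs/build/html
exit_code = build_main(["-b", "html", "docs/source", "docs/build/html"])
print("sphinx-build exited with code", exit_code)
```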
13 changes: 10 additions & 3 deletions README.md
@@ -1,10 +1,8 @@
## fabric8-analytics-rudra
This library is the collection of the common utils and tools required by various fabric8-analytics machine learning projects.
This library is a collection of the common utils and tools required by various fabric8-analytics machine learning projects. Documentation is available at [fabric8-analytics-rudra Docs](https://fabric8-analytics-rudra.readthedocs.io). Please follow the [Docs Examples](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html) article when documenting the code.

### Installation:

####

```bash
$ pip install git+https://github.com/fabric8-analytics/fabric8-analytics-rudra
```
@@ -18,6 +16,14 @@ $ make install
$ make test
```

### Documentation:
```bash
$ export PYTHONPATH=/path/to/fabric8-analytics-rudra
```
```bash
$ make html
```

### Footnotes

#### Coding standards
@@ -80,3 +86,4 @@ The script named `check-bashscripts.sh` can be used to check all BASH scripts (i

Please see [the following link](https://github.com/koalaman/shellcheck) for further explanation of how ShellCheck works and which issues it can detect.
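The updated README description points contributors to Google-style docstrings (the napoleon "Docs Examples" article). For reference, a minimal illustrative example of that style; the function itself is hypothetical and not taken from this repository:

```python
def add_numbers(first, second):
    """Add two numbers together.

    Args:
        first (int): The first operand.
        second (int): The second operand.

    Returns:
        int: The sum of ``first`` and ``second``.
    """
    return first + second
```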


13 changes: 10 additions & 3 deletions docs/source/conf.py
@@ -39,10 +39,14 @@
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    'sphinx.ext.autodoc',
    'sphinx.ext.mathjax',
    'sphinx.ext.ifconfig',
    'sphinx.ext.viewcode',
]

# Add any paths that contain templates here, relative to this directory.
templates_path = ['rudratemplates']
templates_path = ['_templates']

# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
@@ -74,7 +78,7 @@
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'alabaster'
html_theme = 'sphinx_rtd_theme'

# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
@@ -85,7 +89,7 @@
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['rudrastatic']
html_static_path = ['_static']

# Custom sidebar templates, must be a dictionary that maps document names
# to template names.
@@ -171,3 +175,6 @@

# A list of files that should not be packed into the epub file.
epub_exclude_files = ['search.html']


# -- Extension configuration -------------------------------------------------
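Because the new `sphinx.ext.autodoc` extension imports the documented modules at build time, the `rudra` package must be importable; the README achieves this by exporting `PYTHONPATH`. A common alternative, shown here only as a hypothetical sketch and not part of this diff, is to extend `sys.path` near the top of `docs/source/conf.py`:

```python
# Hypothetical conf.py addition: make the rudra package importable for autodoc
# without exporting PYTHONPATH manually (assumes the docs/source/conf.py layout).
import os
import sys

sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
```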
105 changes: 104 additions & 1 deletion docs/source/index.rst
@@ -1,5 +1,5 @@
.. fabric8-analytics-rudra documentation master file, created by
   sphinx-quickstart on Wed Mar 20 21:38:11 2019.
   sphinx-quickstart on Wed Mar 20 23:00:30 2019.
   You can adapt this file completely to your liking, but it should at least
   contain the root `toctree` directive.
@@ -18,3 +18,106 @@ Indices and tables
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`

.. toctree::
   :maxdepth: 3

.. automodule:: rudra.data_store
   :members:
   :undoc-members:
   :show-inheritance:

.. automodule:: rudra.data_store.aws
   :members:
   :undoc-members:
   :show-inheritance:

.. automodule:: rudra.data_store.local_data_store
   :members:
   :undoc-members:
   :show-inheritance:

.. automodule:: rudra.data_store.bigquery
   :members:
   :undoc-members:
   :show-inheritance:

.. automodule:: rudra.data_store.bigquery.base
   :members:
   :undoc-members:
   :show-inheritance:

.. automodule:: rudra.data_store.bigquery.maven_bigquery
   :members:
   :undoc-members:
   :show-inheritance:

.. automodule:: rudra.data_store.bigquery.pypi_bigquery
   :members:
   :undoc-members:
   :show-inheritance:

.. automodule:: rudra.data_store.bigquery.npm_bigquery
   :members:
   :undoc-members:
   :show-inheritance:

.. automodule:: rudra.deployments
   :members:
   :undoc-members:
   :show-inheritance:

.. automodule:: rudra.deployments.emr_scripts
   :members:
   :undoc-members:
   :show-inheritance:

.. automodule:: rudra.deployments.emr_scripts.emr_config
   :members:
   :undoc-members:
   :show-inheritance:

.. automodule:: rudra.deployments.emr_scripts.emr_script_builder
   :members:
   :undoc-members:
   :show-inheritance:

.. automodule:: rudra.deployments.emr_scripts.maven_emr
   :members:
   :undoc-members:
   :show-inheritance:

.. automodule:: rudra.deployments.emr_scripts.npm_emr
   :members:
   :undoc-members:
   :show-inheritance:

.. automodule:: rudra.deployments.emr_scripts.pypi_emr
   :members:
   :undoc-members:
   :show-inheritance:

.. automodule:: rudra.utils
   :members:
   :undoc-members:
   :show-inheritance:

.. automodule:: rudra.utils.validation
   :members:
   :undoc-members:
   :show-inheritance:

.. automodule:: rudra.utils.helper
   :members:
   :undoc-members:
   :show-inheritance:

.. automodule:: rudra.utils.mercator
   :members:
   :undoc-members:
   :show-inheritance:

.. automodule:: rudra.utils.pypi_parser
   :members:
   :undoc-members:
   :show-inheritance:
15 changes: 15 additions & 0 deletions requirements.txt
@@ -1,7 +1,9 @@
alabaster==0.7.12
asn1crypto==0.24.0
atomicwrites==1.3.0
attrs==18.2.0
aws-xray-sdk==0.95
Babel==2.6.0
beautifulsoup4==4.7.1
boto==2.49.0
boto3==1.7.84
@@ -26,6 +28,7 @@ google-cloud-core==0.29.1
google-resumable-media==0.3.2
googleapis-common-protos==1.5.8
idna==2.8
imagesize==1.1.0
Jinja2==2.10
jmespath==0.9.3
jsondiff==1.1.1
@@ -36,6 +39,7 @@ mock==2.0.0
more-itertools==6.0.0
moto==1.3.6
numpy==1.16.1
packaging==19.0
pbr==5.1.2
pluggy==0.8.1
protobuf==3.7.0
@@ -46,6 +50,8 @@ pyasn1-modules==0.2.4
pycodestyle==2.5.0
pycparser==2.19
pycryptodome==3.7.3
Pygments==2.3.1
pyparsing==2.3.1
pytest==4.3.0
python-dateutil==2.8.0
python-jose==2.0.2
@@ -59,7 +65,16 @@ ruamel.yaml==0.15.88
s3transfer==0.1.13
scipy==1.2.1
six==1.12.0
snowballstemmer==1.2.1
soupsieve==1.8
Sphinx==2.0.0
sphinx-rtd-theme==0.4.3
sphinxcontrib-applehelp==1.0.1
sphinxcontrib-devhelp==1.0.1
sphinxcontrib-htmlhelp==1.0.1
sphinxcontrib-jsmath==1.0.1
sphinxcontrib-qthelp==1.0.2
sphinxcontrib-serializinghtml==1.1.1
urllib3==1.24.1
websocket-client==0.54.0
Werkzeug==0.14.1
37 changes: 0 additions & 37 deletions rudra/data_store/bigquery/base.py
@@ -1,14 +1,10 @@
"""Implementation Bigquery builder base."""
import os
import time
from collections import Counter

from google.cloud import bigquery
from requests import Session
from requests_futures.sessions import FuturesSession

from rudra import logger
from rudra.utils.helper import CacheDict
from rudra.data_store.aws import AmazonS3


@@ -88,40 +84,7 @@ class DataProcessing:

    def __init__(self, s3_client=None):
        """Initialize DataProcessing object."""
        self.data = None
        self.cache = CacheDict(max_len=50000)
        self.pkg_counter = Counter()
        self.s3_client = s3_client
        self.req_session = FuturesSession(session=Session())

    def async_fetch(self, url,
                    method='GET',
                    others=None):
        """Fetch urls asynchronously."""
        if url in self.cache:
            self.responses.append(self.cache[url])
        else:
            self.process_queue.append(
                (others, url, self.req_session.request(method, url)))

    def is_fetch_done(self, callback=lambda x: x):
        """Check whether all the requests are processed or not."""
        _flag = True
        for resp in self.process_queue:
            _flag = False
            others, url, req_obj = resp
            logger.info("other:{}, url:{}, req_obj:{}".format(others, url, req_obj))

            if url in self.cache:
                req_obj.cancel()
                self.process_queue.remove(resp)
                self.responses.append(self.cache[url])
            elif req_obj.done():
                req_obj.cancel()
                self.process_queue.remove(resp)
                self.cache[url] = (others, callback(req_obj))
                self.responses.append((others, callback(req_obj)))
        return _flag

    def update_s3_bucket(self, data,
                         bucket_name,
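The deleted `async_fetch`/`is_fetch_done` methods implemented a dispatch-and-poll pattern on top of requests-futures: each URL was sent through a `FuturesSession`, queued, and later drained once its future completed. For context only, a minimal sketch of that underlying library pattern (assuming `requests-futures` is installed; this is not the removed rudra code):

```python
# Minimal requests-futures sketch: dispatch requests concurrently, then
# collect the responses as their futures complete.
from requests import Session
from requests_futures.sessions import FuturesSession

session = FuturesSession(session=Session())
urls = ["https://pypi.org/pypi/flask/json", "https://pypi.org/pypi/requests/json"]
futures = [(url, session.get(url)) for url in urls]

for url, future in futures:
    response = future.result()  # blocks until this particular request finishes
    print(url, response.status_code)
```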
55 changes: 10 additions & 45 deletions rudra/data_store/bigquery/pypi_bigquery.py
@@ -6,6 +6,7 @@
from rudra.data_store.bigquery.base import BigqueryBuilder
from rudra.utils.pypi_parser import pip_req
from rudra.data_store.bigquery.base import DataProcessing
from rudra.utils.validation import BQValidation
from rudra import logger


@@ -46,52 +47,31 @@ def __init__(self, big_query_instance=None, s3_client=None):
        self.filename = '{}/big-query-data/collated.json'.format(
            os.getenv('DEPLOYMENT_PREFIX', 'dev'))

    def process(self):
    def process(self, validate=False):
        """Process Pypi Bigquery response data."""
        start = time.monotonic()
        bq_validation = BQValidation()
        logger.info("Running Bigquery for pypi synchronously")
        self.big_query_instance.run_query_sync()

        logger.info("fetching bigquery result.")
        for content in self.big_query_instance.get_result():
            self.big_query_content.append(content)
        logger.info("collected manifests: {}".format(len(self.big_query_content)))
logger.info("Succefully retrieved data from Bigquery, time:{}".format(
time.monotonic() - start))
        base_url_pypi = 'https://pypi.org/pypi/{pkg}/json'
        logger.info("Starting package cleaning")
        start_process_time = time.monotonic()
        for idx, obj in enumerate(self.big_query_content):
        for idx, obj in enumerate(self.big_query_instance.get_result()):
            start = time.monotonic()
            content = obj.get('content')
            self.process_queue = list()
            self.responses = list()
            packages = []
            if content:
                try:
                    for name in pip_req.parse_requirements(content):
                        logger.info("searching pkg:`{}` in Python Package Index \
                                    Repository" .format(name))
                        self.async_fetch(base_url_pypi.format(pkg=name), others=name)
                    packages = sorted({p for p in pip_req.parse_requirements(content)})
                    if validate:
                        packages = sorted(bq_validation.validate_pypi(packages))
                except Exception as _exc:
                    logger.error("IGNORE: {}".format(_exc))
                    logger.error("Failed to parse content data {}".format(content))

            try:
                while not self.is_fetch_done(lambda x: x.result().status_code):
                    # hold the process until all request finishes.
                    time.sleep(0.001)
            except Exception as _exc:
                logger.error("IGNORE: {}".format(_exc))
            # discard process_queue
            self.process_queue = []
            self.responses = []
            packages = sorted(set(self.handle_response()))
            if packages:
                pkg_string = ', '.join(packages)
                logger.info("PACKAGES: {}".format(pkg_string))
                self.counter.update([pkg_string])
            logger.info("Processed content in time: {} process:{}/{}".format(
                (time.monotonic() - start), idx, len(self.big_query_content)))
            logger.info("Processed content in time: {} counter:{}".format(
                (time.monotonic() - start), idx))
        logger.info("Processed All the manifests in time: {}".format(
            time.monotonic() - start_process_time))

Expand All @@ -101,18 +81,3 @@ def process(self):
                              filename=self.filename)

logger.info("Succefully Processed the PyPiBigQuery")

    def handle_response(self):
        """Process and get the response of async requests."""
        results = list()
        for resp in self.responses:
            pkg_name, req_obj = resp
            if isinstance(req_obj, int):
                if req_obj == 200:
                    results.append(pkg_name)
            elif req_obj.status_code == 200:
                results.append(pkg_name)
                logger.info("Received status:{} for pkg:{}".format(req_obj.status_code, pkg_name))
            else:
                logger.info("Received status:{} for pkg:{}".format(req_obj.status_code, pkg_name))
        return results
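With this change, the per-package HTTP lookups above are replaced by the new offline `BQValidation` helper from `rudra/utils/validation.py`, which is among the 12 changed files but is not shown in this excerpt. As a rough illustration of the kind of filtering a `validate_pypi` helper performs, here is a hypothetical sketch; the class name, constructor, and data source are assumptions, not the PR's actual implementation:

```python
# Hypothetical sketch of an offline PyPI-name validator; the real BQValidation
# in rudra/utils/validation.py may be implemented differently.
class BQValidationSketch:
    def __init__(self, known_packages):
        # Normalize once so each lookup is a set membership test rather than
        # one HTTP request per package name.
        self.known = {name.lower() for name in known_packages}

    def validate_pypi(self, packages):
        """Return only the package names present in the known-package set."""
        return [pkg for pkg in packages if pkg.lower() in self.known]


# Usage: filter parsed requirement names before updating the manifest counter.
validator = BQValidationSketch(["flask", "numpy", "requests"])
print(validator.validate_pypi(["flask", "not-a-real-pkg", "requests"]))  # ['flask', 'requests']
```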
