Skip to content

Commit

Permalink
crawler: dump data for invalid PR and review
Browse files Browse the repository at this point in the history
  • Loading branch information
flepied committed Apr 19, 2020
1 parent 14962ce commit f039ed4
Show file tree
Hide file tree
Showing 5 changed files with 29 additions and 29 deletions.
24 changes: 22 additions & 2 deletions monocle/crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,10 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import os
import json
import logging
import os
import tempfile
from time import sleep
from datetime import datetime
from threading import Thread
Expand Down Expand Up @@ -69,13 +71,31 @@ def get_last_updated_date(self):
return change['updated_at']

def run_step(self):
def dump_data(data, prefix=None):
    """Serialize *data* as JSON into a uniquely named ``*.json`` file
    under ``self.dump_dir`` for post-mortem inspection.

    Returns the path of the written dump file, or ``None`` when dumping
    is disabled (``self.dump_dir`` unset/falsy) or when writing fails.
    Best-effort by design: this helper must never raise into the caller.
    """
    try:
        if self.dump_dir:
            # delete=False: the dump must outlive this function so a
            # human can inspect it later. The context manager guarantees
            # the handle is closed even if json.dump raises mid-write
            # (the original code leaked the open handle on that path).
            with tempfile.NamedTemporaryFile(
                dir=self.dump_dir,
                prefix=prefix,
                suffix='.json',
                mode='w',
                delete=False,
            ) as tmpfile:
                json.dump(data, tmpfile)
            # Lazy %-args: formatting only happens if INFO is enabled.
            log.info('Data dumped to %s', tmpfile.name)
            return tmpfile.name
    except Exception:
        # Deliberately broad: dumping is diagnostic-only and must not
        # break the crawl loop.
        log.exception('Unable to dump data')
    return None

updated_since = self.get_last_updated_date()
try:
prs = self.prf.get(updated_since)
except Exception:
log.exception('Unable to get PR data')
return
objects = self.prf.extract_objects(prs, self.dump_dir)
objects = self.prf.extract_objects(prs, dump_data)
if objects:
log.info("%d objects will be updated in the database" % len(objects))
self.db.update(objects)
Expand Down
3 changes: 2 additions & 1 deletion monocle/gerrit/review.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ def get(self, updated_since, change=None):
break
return reviews

def extract_objects(self, reviewes):
def extract_objects(self, reviewes, dumper):
def timedelta(start, end):
format = "%Y-%m-%dT%H:%M:%SZ"
start = datetime.strptime(start, format)
Expand Down Expand Up @@ -306,6 +306,7 @@ def extract_pr_objects(review):
objects.extend(extract_pr_objects(review))
except Exception:
self.log.exception("Unable to extract Review data: %s" % review)
dumper(review)
return objects


Expand Down
27 changes: 3 additions & 24 deletions monocle/github/pullrequest.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,7 @@
# SOFTWARE.


import json
import logging
import tempfile
import requests
from datetime import datetime
from time import sleep
Expand Down Expand Up @@ -269,7 +267,7 @@ def get_one(self, org, repository, number):
raw = self.gql.query(qdata % kwargs)['data']['repository']['pullRequest']
return (raw, self.extract_objects([raw]))

def extract_objects(self, prs, dump_dir=None):
def extract_objects(self, prs, dumper):
def get_login(data):
if data and 'login' in data and data['login']:
return data['login']
Expand Down Expand Up @@ -410,38 +408,19 @@ def extract_pr_objects(pr):
objects.append(obj)
return objects

def dump_data(data, prefix=None):
try:
if dump_dir:
tmpfile = tempfile.NamedTemporaryFile(
dir=dump_dir,
prefix=prefix,
suffix='.json',
mode='w',
delete=False,
)
json.dump(data, tmpfile)
tmpfile.close()
self.log.info('PR dumped to %s' % tmpfile.name)
return tmpfile.name
except Exception:
self.log.exception('Unable to dump data')
return None

objects = []
idx = 0
for pr in prs:
idx += 1
try:
objects.extend(extract_pr_objects(pr))
except Exception:
self.log.expection('Unable to extract PR')
dump_data(pr)
dumper(pr)
return objects


if __name__ == '__main__':
import os
import json
import argparse
from pprint import pprint
from monocle.github import graphql
Expand Down
2 changes: 1 addition & 1 deletion tests/unit/test_crawler_gerrit.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def extract_and_compare(self, base_url, name):
input_review, xtrd_ref = load_change(name)

rf = review.ReviewesFetcher(base_url, None)
xtrd = rf.extract_objects([input_review])
xtrd = rf.extract_objects([input_review], print)

ddiff = DeepDiff(xtrd_ref, xtrd, ignore_order=True)
if ddiff:
Expand Down
2 changes: 1 addition & 1 deletion tests/unit/test_crawler_github.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def extract_and_compare(self, name):
input_pr, xtrd_ref = load_change(name)

pr_fetcher = pullrequest.PRsFetcher(None, 'https://github.com', None, None)
xtrd = pr_fetcher.extract_objects([input_pr])
xtrd = pr_fetcher.extract_objects([input_pr], print)

ddiff = DeepDiff(xtrd_ref, xtrd, ignore_order=True)
if ddiff:
Expand Down

0 comments on commit f039ed4

Please sign in to comment.