From 7c6df808a6b94defa8615d6d7e21261af89be705 Mon Sep 17 00:00:00 2001 From: John Davi Date: Fri, 30 May 2014 20:59:26 -0700 Subject: [PATCH] adding crawl update, crawl download, other fixes --- README.md | 21 +++++++++++++++++++-- client.py | 33 ++++++++++++++++++++++++++++----- example.py | 2 +- 3 files changed, 48 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 8fd18c4..116809b 100644 --- a/README.md +++ b/README.md @@ -77,16 +77,26 @@ To start a new crawl, specify a crawl name, seed URLs, and the API via which URL token = "SOME_TOKEN" name = "sampleCrawlName" seeds = "http://www.twitter.com/" -apiUrl = "analyze" -sampleCrawl = DiffbotCrawl(token,name,seeds,apiUrl) +api = "analyze" +sampleCrawl = DiffbotCrawl(token,name,seeds=seeds,api=api) ``` +Omit "seeds" and "api" to load an existing crawl, or create a crawl as a placeholder. + To check the status of a crawl: ``` sampleCrawl.status() ``` +To update a crawl: + +``` +maxToCrawl = 100 +upp = "diffbot" +sampleCrawl.update(maxToCrawl=maxToCrawl,urlProcessPattern=upp) +``` + To delete or restart a crawl: ``` @@ -94,6 +104,13 @@ sampleCrawl.delete() sampleCrawl.restart() ``` +To download crawl data: + +``` +sampleCrawl.download() # returns JSON by default +sampleCrawl.download(data_format="csv") +``` + To pass additional arguments to a crawl: ``` diff --git a/client.py b/client.py index 1925a61..3343c35 100644 --- a/client.py +++ b/client.py @@ -32,7 +32,7 @@ def format_version_string(version_number): class DiffbotJob(DiffbotClient): """ - Various calls for managing a Diffbot Crawlbot or Bulk API job. + Various calls for managing a Crawlbot or Bulk API job. 
""" def request(self,params): @@ -51,6 +51,12 @@ def status(self): response = self.request(self.params) return response + def update(self,**kwargs): + temp_params = dict(self.params) # copy: one-off update kwargs must not leak into self.params + temp_params.update(kwargs) + response = self.request(temp_params) + return response + def delete(self): temp_params = self.params temp_params['delete'] = 1 @@ -63,19 +69,36 @@ def restart(self): response = self.request(temp_params) return response + def download(self,data_format="json"): + """ + downloads the output of a crawl or bulk job: parsed JSON by default, the raw response content when data_format is "csv" + """ + + download_url = '{}/v3/{}/download/{}-{}_data.{}'.format( + self.base_url,self.jobType,self.params['token'],self.params['name'],data_format + ) + download = requests.get(download_url) + download.raise_for_status() + if data_format == "csv": + return download.content + else: + return download.json() + class DiffbotCrawl(DiffbotJob): """ - Initializes a new Diffbot crawl. Pass additional arguments as necessary. + Initializes a Diffbot crawl. Pass additional arguments as necessary. """ - def __init__(self,token,name,seeds,api,apiVersion=3,**kwargs): + def __init__(self,token,name,seeds=None,api=None,apiVersion=3,**kwargs): self.params = { "token": token, "name": name, } startParams = dict(self.params) - startParams['seeds'] = seeds - startParams['apiUrl'] = self.compose_url(api,apiVersion) + if seeds: + startParams['seeds'] = seeds + if api: + startParams['apiUrl'] = self.compose_url(api,apiVersion) startParams.update(kwargs) self.jobType = "crawl" self.start(startParams) \ No newline at end of file diff --git a/example.py b/example.py index 3764d43..b7de9e6 100644 --- a/example.py +++ b/example.py @@ -73,7 +73,7 @@ seeds = "http://support.diffbot.com" api = "article" name = "testCrawl" -diffbot = DiffbotCrawl(token,name,seeds,api) +diffbot = DiffbotCrawl(token, name, seeds=seeds, api=api) time.sleep(5) status = diffbot.status() print "\nPrinting status:\n"