Skip to content

Added crawlbot support and support for V3 APIs #3

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
May 31, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 32 additions & 15 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@


##Preface
Identify and extract the important parts of any web page in Python! This client currently supports calls to the automatic APIs.
Identify and extract the important parts of any web page in Python! This client currently supports calls to Diffbot's Automatic APIs and Crawlbot.


Installation
Expand Down Expand Up @@ -34,18 +34,6 @@ api = "article"
response = diffbot.request(url, token, api, version=2)
```

###Frontpage API
An example call to the Frontpage API:

```
diffbot = DiffbotClient()
token = "SOME_TOKEN"
version = 2
url = "http://www.huffingtonpost.com/"
api = "frontpage"
response = diffbot.request(url, token, api, version=version)
```

###Product API
An example call to the Product API:

Expand All @@ -70,8 +58,8 @@ api = "image"
response = diffbot.request(url, token, api, version=version)
```

###Classifier API
An example call to the Classifier API:
###Analyze API
An example call to the Analyze API:

```
diffbot = DiffbotClient()
Expand All @@ -82,6 +70,35 @@ api = "analyze"
response = diffbot.request(url, token, api, version=version)
```

###Crawlbot API
To start a new crawl, specify a crawl name, the seed URLs, and the API that should be used to process each crawled URL. An example call to the Crawlbot API:

```
token = "SOME_TOKEN"
name = "sampleCrawlName"
seeds = "http://www.twitter.com/"
apiUrl = "analyze"
sampleCrawl = DiffbotCrawl(token,name,seeds,apiUrl)
```

To check the status of a crawl:

```
sampleCrawl.status()
```

To delete or restart a crawl:

```
sampleCrawl.delete()
sampleCrawl.restart()
```

To pass additional arguments to a crawl:

```
sampleCrawl = DiffbotCrawl(token,name,seeds,apiUrl,maxToCrawl=100,maxToProcess=50,notifyEmail="support@diffbot.com")
```

##Testing

Expand Down
53 changes: 51 additions & 2 deletions client.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
import requests


class DiffbotClient(object):

base_url = 'http://api.diffbot.com/'

def request(self, url, token, api, fields=None, version=2, **kwargs):
def request(self, url, token, api, fields=None, version=3, **kwargs):
"""
Returns a python object containing the requested resource from the diffbot api
"""
Expand All @@ -30,3 +29,53 @@ def format_version_string(version_number):
Returns a string representation of the API version
"""
return 'v{}'.format(version_number)

class DiffbotJob(DiffbotClient):
    """
    Various calls for managing a Diffbot Crawlbot or Bulk API job.

    The methods here read two attributes that must be set on the instance
    before they are called:

    * ``self.jobType`` -- the API name handed to ``compose_url``.
    * ``self.params``  -- the base query parameters that identify the job.
    """

    def request(self, params):
        """
        Issue a GET request against the job endpoint and return the parsed JSON.

        The endpoint is ``self.compose_url(self.jobType, 3)``; ``params`` is
        sent as the query string.

        Raises ``requests.exceptions.HTTPError`` when the server answers with
        an error status. If the body is not valid JSON, the raw text is
        printed and ``None`` is returned.
        """
        response = requests.get(self.compose_url(self.jobType, 3), params=params)
        # The original referenced raise_for_status without calling it, so
        # HTTP error responses were never surfaced.
        response.raise_for_status()
        try:
            return response.json()
        except ValueError:
            # Only a JSON decoding failure is expected here; a bare except
            # would also hide KeyboardInterrupt and programming errors.
            print(response.text)
            return None

    def start(self, params):
        """Start the job by sending ``params`` and return the API response."""
        return self.request(params)

    def status(self):
        """Return the API response for a request made with the base job parameters."""
        return self.request(self.params)

    def delete(self):
        """Send the base job parameters plus ``delete=1`` and return the response."""
        # Copy so the one-off flag does not leak into self.params and get
        # re-sent by every later status()/restart() call.
        temp_params = dict(self.params)
        temp_params['delete'] = 1
        return self.request(temp_params)

    def restart(self):
        """Send the base job parameters plus ``restart=1`` and return the response."""
        # Copy for the same reason as in delete(): keep self.params pristine.
        temp_params = dict(self.params)
        temp_params['restart'] = 1
        return self.request(temp_params)

class DiffbotCrawl(DiffbotJob):
    """
    A Diffbot crawl job that is started as soon as the object is constructed.
    Any extra keyword arguments are sent along as additional start parameters.
    """

    def __init__(self, token, name, seeds, api, apiVersion=3, **kwargs):
        # Base parameters kept on the instance for later job calls.
        self.params = {"token": token, "name": name}

        # The start request carries the base parameters plus the seeds and
        # the URL composed for the processing API; caller-supplied keyword
        # arguments are applied last, so they win on key collisions.
        launch_params = dict(
            self.params,
            seeds=seeds,
            apiUrl=self.compose_url(api, apiVersion),
        )
        launch_params.update(kwargs)

        self.jobType = "crawl"
        self.start(launch_params)
38 changes: 23 additions & 15 deletions example.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,14 @@
from client import DiffbotClient
from client import DiffbotClient,DiffbotCrawl
from config import API_TOKEN
import pprint

import time

print "Calling article API endpoint on the url: http://shichuan.github.io/javascript-patterns/...\n"
diffbot = DiffbotClient()
token = API_TOKEN
version = 2
url = "http://shichuan.github.io/javascript-patterns/"
api = "article"
response = diffbot.request(url, token, api, version=2)
response = diffbot.request(url, token, api)
print "\nPrinting response:\n"
pp = pprint.PrettyPrinter(indent=4)
print pp.pprint(response)
Expand All @@ -18,10 +17,9 @@
print "Calling article API endpoint with fields specified on the url: http://shichuan.github.io/javascript-patterns/...\n"
diffbot = DiffbotClient()
token = API_TOKEN
version = 2
url = "http://shichuan.github.io/javascript-patterns/"
api = "article"
response = diffbot.request(url, token, api, fields=['title', 'type'], version=2)
response = diffbot.request(url, token, api, fields=['title', 'type'])
print "\nPrinting response:\n"
pp = pprint.PrettyPrinter(indent=4)
print pp.pprint(response)
Expand All @@ -30,10 +28,9 @@
print "Calling frontpage API endpoint on the url: http://www.huffingtonpost.com/...\n"
diffbot = DiffbotClient()
token = API_TOKEN
version = 2
url = "http://www.huffingtonpost.com/"
api = "frontpage"
response = diffbot.request(url, token, api, version=version)
response = diffbot.request(url, token, api)
print "\nPrinting response:\n"
pp = pprint.PrettyPrinter(indent=4)
print pp.pprint(response)
Expand All @@ -42,10 +39,9 @@
print "Calling product API endpoint on the url: http://www.overstock.com/Home-Garden/iRobot-650-Roomba-Vacuuming-Robot/7886009/product.html...\n"
diffbot = DiffbotClient()
token = API_TOKEN
version = 2
url = "http://www.overstock.com/Home-Garden/iRobot-650-Roomba-Vacuuming-Robot/7886009/product.html"
api = "product"
response = diffbot.request(url, token, api, version=version)
response = diffbot.request(url, token, api)
print "\nPrinting response:\n"
pp = pprint.PrettyPrinter(indent=4)
print pp.pprint(response)
Expand All @@ -54,10 +50,9 @@
print "Calling image API endpoint on the url: http://www.google.com/...\n"
diffbot = DiffbotClient()
token = API_TOKEN
version = 2
url = "http://www.google.com/"
api = "image"
response = diffbot.request(url, token, api, version=version)
response = diffbot.request(url, token, api)
print "\nPrinting response:\n"
pp = pprint.PrettyPrinter(indent=4)
print pp.pprint(response)
Expand All @@ -66,10 +61,23 @@
print "Calling classifier API endpoint on the url: http://www.twitter.com/...\n"
diffbot = DiffbotClient()
token = API_TOKEN
version = 2
url = "http://www.twitter.com/"
api = "analyze"
response = diffbot.request(url, token, api, version=version)
response = diffbot.request(url, token, api)
print "\nPrinting response:\n"
pp = pprint.PrettyPrinter(indent=4)
print pp.pprint(response)
print pp.pprint(response)

# Demo: create a crawl, poll its status once, then delete it.
print("Create a new crawl of http://support.diffbot.com/ using the Article API...\n")
token = API_TOKEN
seeds = "http://support.diffbot.com"
api = "article"
name = "testCrawl"
diffbot = DiffbotCrawl(token, name, seeds, api)
# NOTE(review): presumably gives the new crawl time to register before it is
# polled -- confirm whether 5 seconds is actually required.
time.sleep(5)
status = diffbot.status()
print("\nPrinting status:\n")
pp = pprint.PrettyPrinter(indent=4)
# pprint() writes to stdout itself and returns None; wrapping it in a print
# emitted a stray "None" line after the status.
pp.pprint(status)
print("\nDeleting test crawl.\n")
diffbot.delete()