
Commit 6f98b66
Merge pull request #3 from codefortulsa/scrapy-doo
Scrapy doo
groovecoder committed Mar 13, 2016
2 parents a1be21d + d6e3554
Showing 9 changed files with 150 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -1,3 +1,4 @@
*.pyc
citystrugglebus.sqlite3
staticfiles
html
1 change: 1 addition & 0 deletions requirements.txt
@@ -3,3 +3,4 @@ dj-database-url==0.4.0
gunicorn==19.4.5
python-decouple==3.0
whitenoise==2.0.6
scrapy==1.0.5
11 changes: 11 additions & 0 deletions scrapy_city/scrapy.cfg
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = scrapy_city.settings

[deploy]
#url = http://localhost:6800/
project = scrapy_city
0 changes: 0 additions & 0 deletions scrapy_city/scrapy_city/__init__.py
Empty file.
13 changes: 13 additions & 0 deletions scrapy_city/scrapy_city/items.py
@@ -0,0 +1,13 @@
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class ScrapyCityItem(scrapy.Item):
url = scrapy.Field()
html = scrapy.Field()
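
A Scrapy Item is a dict-like container, so each scraped page becomes one ScrapyCityItem holding its URL and raw HTML. A quick sketch of how it is populated (mirroring the spider added later in this commit; the example values are hypothetical):

item = ScrapyCityItem()
item['url'] = 'http://www.nextcity.org/'   # hypothetical example value
item['html'] = '<html>...</html>'          # raw page body
print(item['url'])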
11 changes: 11 additions & 0 deletions scrapy_city/scrapy_city/pipelines.py
@@ -0,0 +1,11 @@
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class ScrapyCityPipeline(object):
def process_item(self, item, spider):
return item
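
The pipeline above is the unmodified Scrapy template and simply passes items through. A minimal sketch, not part of this commit, of how it could instead persist each scraped page into the html/ directory that this commit adds to .gitignore (the SHA-1-derived filenames are an assumption for illustration):

import hashlib
import os


class SaveHtmlPipeline(object):
    def open_spider(self, spider):
        # create the output directory before the crawl starts
        if not os.path.exists('html'):
            os.makedirs('html')

    def process_item(self, item, spider):
        # derive a stable, filesystem-safe filename from the URL
        name = hashlib.sha1(item['url'].encode('utf-8')).hexdigest()
        with open(os.path.join('html', name + '.html'), 'wb') as f:
            f.write(item['html'])
        return item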
85 changes: 85 additions & 0 deletions scrapy_city/scrapy_city/settings.py
@@ -0,0 +1,85 @@
# -*- coding: utf-8 -*-

# Scrapy settings for scrapy_city project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'scrapy_city'

SPIDER_MODULES = ['scrapy_city.spiders']
NEWSPIDER_MODULE = 'scrapy_city.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'scrapy_city (+http://www.yourdomain.com)'

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS=32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY=3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN=16
#CONCURRENT_REQUESTS_PER_IP=16

# Disable cookies (enabled by default)
#COOKIES_ENABLED=False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED=False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'scrapy_city.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'scrapy_city.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'scrapy_city.pipelines.SomePipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# NOTE: AutoThrottle will honour the standard settings for concurrency and delay
#AUTOTHROTTLE_ENABLED=True
# The initial download delay
#AUTOTHROTTLE_START_DELAY=5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY=60
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG=False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED=True
#HTTPCACHE_EXPIRATION_SECS=0
#HTTPCACHE_DIR='httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES=[]
#HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage'
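
To activate the pipeline shipped in this commit, the commented ITEM_PIPELINES block above would be uncommented and pointed at ScrapyCityPipeline; a sketch of what that would look like (the commit itself leaves it disabled):

ITEM_PIPELINES = {
    'scrapy_city.pipelines.ScrapyCityPipeline': 300,
}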
4 changes: 4 additions & 0 deletions scrapy_city/scrapy_city/spiders/__init__.py
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
24 changes: 24 additions & 0 deletions scrapy_city/scrapy_city/spiders/city_spider.py
@@ -0,0 +1,24 @@
import scrapy
from scrapy_city.items import ScrapyCityItem

class CitySpider(scrapy.Spider):
name = "city"
allowed_domains = [
"nextcity.org",
"govtech.com"
]
start_urls = [
"http://www.nextcity.org/",
"http://www.govtech.com/"
]

def parse(self, response):
# send the present page down the item pipeline
item = ScrapyCityItem()
item['url'] = response.url
item['html'] = response.body
yield item
# traverse the links to get the next items
for href in response.css("a::attr('href')"):
url = response.urljoin(href.extract())
yield scrapy.Request(url, callback=self.parse)

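From the scrapy_city/ project directory the spider can be run with "scrapy crawl city". A sketch of driving the same crawl from a plain Python script using Scrapy's standard API (not part of this commit):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from scrapy_city.spiders.city_spider import CitySpider

# load the project settings declared via scrapy.cfg
process = CrawlerProcess(get_project_settings())
process.crawl(CitySpider)
process.start()  # blocks until the crawl finishes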