Commit
commit 98547ea (1 parent: d458680)
Showing 11 changed files with 322 additions and 0 deletions.
.gitignore
@@ -0,0 +1,9 @@
.idea
*.xml
.scrapy
*.pyc
/__pycache__/
*.json
*.txt
/data/
/crawls/
scrapy.cfg
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = stackoverflow.settings

[deploy]
#url = http://localhost:6800/
project = stackoverflow
Empty file.
@@ -0,0 +1,10 @@
from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware


class ProxyMiddleware(HttpProxyMiddleware):
    """Override process_request to route every request through a local proxy."""

    def process_request(self, request, spider):
        # Set the proxy location: the local XX-Net port.
        proxy_ip = "http://127.0.0.1:8087"
        request.meta['proxy'] = proxy_ip
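If this custom middleware is preferred over the stock HttpProxyMiddleware, it must also be registered in the project settings. A minimal sketch, assuming the file lives at the module path used by the commented-out entry in this commit's settings.py (that path is an assumption, since the filename is not shown in the diff):

    # Hypothetical entry for stackoverflow/settings.py; the module path
    # stackoverflow.spiders.proxy_middleware is taken from the commented-out
    # line in DOWNLOADER_MIDDLEWARES below.
    DOWNLOADER_MIDDLEWARES = {
        'stackoverflow.spiders.proxy_middleware.ProxyMiddleware': 750,
    }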
stackoverflow/middleware/useragent.py
@@ -0,0 +1,53 @@
import random
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware


class RandomUserAgentMiddleware(UserAgentMiddleware):
    """Extend UserAgentMiddleware to set a random User-Agent on each request."""

    def __init__(self, user_agent=''):
        self.user_agent = user_agent

    def process_request(self, request, spider):
        """Override process_request to set a randomly chosen User-Agent."""
        ua = random.choice(self.user_agent_list)
        if ua:
            request.headers.setdefault('User-Agent', ua)

    # Pool of desktop Chrome User-Agent strings to rotate through.
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
        "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    ]
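A quick standalone sanity check of this middleware, outside a full crawl, might look like the following sketch; the import path matches the DOWNLOADER_MIDDLEWARES entry in settings.py below, and since the spider argument is unused here, passing None is safe:

    from scrapy import Request
    from stackoverflow.middleware.useragent import RandomUserAgentMiddleware

    mw = RandomUserAgentMiddleware()
    req = Request("https://stackoverflow.com/questions")
    mw.process_request(req, spider=None)  # spider is not used by this middleware
    print(req.headers.get('User-Agent'))  # one of the user_agent_list entries, as bytes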
Empty file.
@@ -0,0 +1,31 @@
import json
import pymysql


conn = pymysql.connect(host="localhost", port=3306, user="root",
                       passwd="0303", db="chenx", charset="utf8")
cur = conn.cursor()

with open(r"e:\python\stackoverflow\data2.json", "r", encoding="utf-8") as f:
    data = json.load(f)


def insert_db(s_links, s_views, s_votes, s_answers, s_tags, s_questions):
    """Insert one scraped record into the stackoverflow table."""
    sql = "insert into stackoverflow(s_links, s_views, s_votes, s_answers, s_tags, s_questions) " \
          "values(%s, %s, %s, %s, %s, %s)"
    value = (s_links, s_views, s_votes, s_answers, s_tags, s_questions)
    cur.execute(sql, value)
    conn.commit()
    print("Inserted s_links: " + s_links)


for i in range(len(data)):
    s_links = data[i]['links']
    s_views = data[i]['views']
    s_votes = data[i]['votes']
    s_answers = data[i]['answers']
    s_tags = " ".join(data[i]['tags'])
    s_questions = data[i]['questions']
    try:
        insert_db(s_links, s_views, s_votes, s_answers, s_tags, s_questions)
    except Exception as e:
        print(e)
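The loader assumes a stackoverflow table already exists in the chenx database. Its DDL is not part of this commit; a plausible one-time setup matching the six inserted columns (all column types are assumptions) could be run beforehand with the same cursor:

    # Hypothetical table setup matching the insert above; column types are
    # assumptions, since the actual DDL is not included in this commit.
    cur.execute("""
        CREATE TABLE IF NOT EXISTS stackoverflow (
            id INT AUTO_INCREMENT PRIMARY KEY,
            s_links VARCHAR(255),
            s_views VARCHAR(64),
            s_votes VARCHAR(64),
            s_answers VARCHAR(64),
            s_tags VARCHAR(255),
            s_questions TEXT
        ) CHARACTER SET utf8
    """)
    conn.commit()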
stackoverflow/settings.py
@@ -0,0 +1,158 @@
# -*- coding: utf-8 -*-

# Scrapy settings for stackoverflow project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

# from stackoverflow.middleware import httpproxy
# from stackoverflow.middleware import useragent

BOT_NAME = 'stackoverflow'

SPIDER_MODULES = ['stackoverflow.spiders']
NEWSPIDER_MODULE = 'stackoverflow.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'stackoverflow (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Feed export encoding
FEED_EXPORT_ENCODING = 'utf-8'

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 100

# Maximum number of concurrent items (per response) to process in parallel
# in the Item Processor (default: 100)
CONCURRENT_ITEMS = 100

# Configure a delay for requests to the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs

# This setting is also affected by the RANDOMIZE_DOWNLOAD_DELAY setting
# (enabled by default). Scrapy doesn't wait a fixed amount of time between
# requests, but uses a random interval between 0.5 * DOWNLOAD_DELAY and
# 1.5 * DOWNLOAD_DELAY.
DOWNLOAD_DELAY = 2.5
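# For example, with DOWNLOAD_DELAY = 2.5 the effective per-request delay is
# drawn uniformly from the interval [1.25, 3.75] seconds.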

# The download delay setting will honor only one of:
# The maximum number of concurrent (i.e. simultaneous) requests that will be
# performed to any single domain (default: 8)
CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# The maximum limit for the Twisted Reactor thread pool size (default: 10)
REACTOR_THREADPOOL_MAXSIZE = 32

# Defines the maximum number of times a request can be redirected (default: 20).
# After this maximum, the request's response is returned as is. We used the
# Firefox default value for the same task.
# REDIRECT_MAX_TIMES = 25

# Retrying failed HTTP requests can slow down a crawl substantially,
# especially when sites are very slow to respond (or fail), causing timeout
# errors that get retried many times, unnecessarily preventing crawler
# capacity from being reused for other domains.
REDIRECT_ENABLED = False

# Disable cookies (enabled by default)
COOKIES_ENABLED = False

# Enables the AutoThrottle extension (default: False)
# AUTOTHROTTLE_ENABLED = False
AUTOTHROTTLE_ENABLED = False

# The amount of time (in secs) the downloader will wait before timing out (default: 180).
# Unless you are crawling from a very slow connection (which shouldn't be the
# case for broad crawls), reduce the download timeout so that stuck requests
# are discarded quickly, freeing up capacity to process the next ones.
# Recommended: DOWNLOAD_TIMEOUT = 15
DOWNLOAD_TIMEOUT = 120

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'stackoverflow.middlewares.StackoverflowSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# from stackoverflow.spiders.stackoverflow_spider import StackoverflowSpider

DOWNLOADER_MIDDLEWARES = {
    'stackoverflow.middleware.useragent.RandomUserAgentMiddleware': 400,
    # 'scrapy_crawlera.CrawleraMiddleware': 610,
    # 'stackoverflow.spiders.proxy_middleware.ProxyMiddleware': 750,
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 750,
    # 'stackoverflow.middlewares.MyCustomDownloaderMiddleware': 543,
}
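# Note: lower order values sit closer to the engine, so process_request()
# runs in ascending order here (RandomUserAgentMiddleware at 400 runs before
# the proxy middleware at 750).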

# Crawlera settings
# Enable the Crawlera extension
# CRAWLERA_ENABLED = True

# Crawlera private API key
# CRAWLERA_APIKEY = 'a67ebee1e6764a6f87585056f155ed1d'

# Preserve the DOWNLOAD_DELAY setting if it was set
# CRAWLERA_PRESERVE_DELAY = True

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'stackoverflow.pipelines.StackoverflowPipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings

# Whether the HTTP cache will be enabled (default: False)
# HTTPCACHE_ENABLED = True

# Expiration time for cached requests, in seconds (default: 0).
# Cached requests older than this will be re-downloaded; if zero, cached
# requests never expire.
# HTTPCACHE_EXPIRATION_SECS = 0

# The directory to use for storing the (low-level) HTTP cache. If empty, the
# HTTP cache is disabled. A relative path is taken relative to the project
# data dir.
# HTTPCACHE_DIR = 'httpcache'

# Don't cache responses with these HTTP codes.
# HTTPCACHE_IGNORE_HTTP_CODES = []

# The class which implements the cache storage backend.
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
stackoverflow/spiders/__init__.py
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
stackoverflow/spiders/items.py
@@ -0,0 +1,17 @@
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class StackoverflowItem(scrapy.Item):
    """Fields scraped for each Stack Overflow question."""

    links = scrapy.Field()
    views = scrapy.Field()
    votes = scrapy.Field()
    answers = scrapy.Field()
    tags = scrapy.Field()
    questions = scrapy.Field()
stackoverflow/spiders/stackoverflow_spider.py
@@ -0,0 +1,29 @@
import scrapy
from stackoverflow.spiders.items import StackoverflowItem


class StackoverflowSpider(scrapy.Spider):

    name = "stackoverflow"

    def start_requests(self):
        # Pages 11-20 of the highest-voted questions, 50 per page.
        urls = ['http://stackoverflow.com/questions?page={page}&sort=votes&pagesize=50'.format(page=page)
                for page in range(11, 21)]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        # Each results page lists 50 question summaries.
        for index in range(1, 51):
            sel = response.xpath('//*[@id="questions"]/div[{index}]'.format(index=index))
            item = StackoverflowItem()
            item['votes'] = sel.xpath('div[1]/div[2]/div[1]/div[1]/span/strong/text()').extract()
            item['answers'] = sel.xpath('div[1]/div[2]/div[2]/strong/text()').extract()
            # View count comes from the title attribute, e.g. "1,234 views".
            item['views'] = "".join(sel.xpath('div[1]/div[3]/@title').extract()).split()[0].replace(",", "")
            item['questions'] = sel.xpath('div[2]/h3/a/text()').extract()
            # The numeric question id, taken from the href "/questions/<id>/...".
            item['links'] = "".join(sel.xpath('div[2]/h3/a/@href').extract()).split("/")[2]
            item['tags'] = sel.xpath('div[2]/div[2]/a/text()').extract()
            yield item
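To produce the data2.json file consumed by the MySQL loader above, the spider can be run with Scrapy's built-in feed export; the output filename here is an assumption chosen to match the path the loader script reads:

    scrapy crawl stackoverflow -o data2.json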