Commit

init repo
chenjiandongx committed Mar 26, 2017
1 parent d458680 commit 98547ea
Showing 11 changed files with 322 additions and 0 deletions.
9 changes: 9 additions & 0 deletions .gitignore
@@ -0,0 +1,9 @@
.idea
*.xml
.scrapy
*.pyc
/__pycache__/
*.json
*.txt
/data/
/crawls/
11 changes: 11 additions & 0 deletions scrapy.cfg
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = stackoverflow.settings

[deploy]
#url = http://localhost:6800/
project = stackoverflow
Empty file.
10 changes: 10 additions & 0 deletions stackoverflow/middleware/httpproxy.py
@@ -0,0 +1,10 @@
from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware

class ProxyMiddleware(HttpProxyMiddleware):
    """Override process_request so every request goes through a local proxy."""

    def process_request(self, request, spider):
        # Route the request through the local XX-Net proxy port.
        proxy_ip = "http://127.0.0.1:8087"
        request.meta['proxy'] = proxy_ip
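
The settings.py in this commit (below) enables Scrapy's stock HttpProxyMiddleware rather than this subclass. If the custom middleware is wanted instead, one way it could be wired in is the following sketch (the module path comes from this commit's layout; the priority 750 mirrors the existing entry):

# Sketch only: enable the custom proxy middleware in settings.py.
DOWNLOADER_MIDDLEWARES = {
    'stackoverflow.middleware.useragent.RandomUserAgentMiddleware': 400,
    'stackoverflow.middleware.httpproxy.ProxyMiddleware': 750,
}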
53 changes: 53 additions & 0 deletions stackoverflow/middleware/useragent.py
@@ -0,0 +1,53 @@
import random
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware

class RandomUserAgentMiddleware(UserAgentMiddleware):
    """Extend UserAgentMiddleware so each request is sent with a randomly chosen User-Agent."""

    def __init__(self, user_agent=''):
        self.user_agent = user_agent

    def process_request(self, request, spider):
        """Override process_request to set a random User-Agent header."""
        ua = random.choice(self.user_agent_list)
        if ua:
            request.headers.setdefault('User-Agent', ua)

    user_agent_list = [
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
"(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
"(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
"(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
"(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
"(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
"(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
"(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
"(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
"(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
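
A quick way to check the middleware outside of a crawl is to feed it a Request directly; a minimal sketch (it assumes the stackoverflow package is importable, e.g. when run from the project root):

from scrapy.http import Request
from stackoverflow.middleware.useragent import RandomUserAgentMiddleware

mw = RandomUserAgentMiddleware()
req = Request("https://stackoverflow.com/questions")
mw.process_request(req, spider=None)
print(req.headers.get('User-Agent'))  # one of the entries in user_agent_list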
Empty file added stackoverflow/mysql/__init__.py
Empty file.
31 changes: 31 additions & 0 deletions stackoverflow/mysql/sql.py
@@ -0,0 +1,31 @@
import json
import pymysql

conn = pymysql.connect(host="localhost", port=3306, user="root", passwd="0303", db="chenx", charset="utf8")
cur = conn.cursor()

with open(r"e:\python\stackoverflow\data2.json","r",encoding="utf-8") as f:
data = json.load(f)

def insert_db(s_links, s_views, s_votes, s_answers, s_tags, s_questions):
    """Insert one scraped question into the stackoverflow table."""
    sql = "insert into stackoverflow(s_links, s_views, s_votes, s_answers, s_tags, s_questions) " \
          "values(%s, %s, %s, %s, %s, %s)"
    value = (s_links, s_views, s_votes, s_answers, s_tags, s_questions)
    cur.execute(sql, value)
    conn.commit()
    print("Inserted s_links: " + s_links)

for row in data:
    s_links = row['links']
    s_views = row['views']
    s_votes = row['votes']
    s_answers = row['answers']
    s_tags = " ".join(row['tags'])
    s_questions = row['questions']
    try:
        insert_db(s_links, s_views, s_votes, s_answers, s_tags, s_questions)
    except Exception as e:
        print(e)

# Release the connection once all rows have been inserted.
cur.close()
conn.close()
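
The stackoverflow table itself is not created anywhere in this commit. A minimal sketch of a schema that matches insert_db (the column names come from the insert statement above; the column types and the id column are assumptions, not part of the repository):

import pymysql

# Assumed schema only: types and the id column are guesses chosen to make insert_db work.
conn = pymysql.connect(host="localhost", port=3306, user="root", passwd="0303", db="chenx", charset="utf8")
with conn.cursor() as cur:
    cur.execute("""
        create table if not exists stackoverflow (
            id int auto_increment primary key,
            s_links varchar(64),
            s_views varchar(32),
            s_votes varchar(32),
            s_answers varchar(32),
            s_tags varchar(255),
            s_questions text
        )
    """)
conn.commit()
conn.close()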
158 changes: 158 additions & 0 deletions stackoverflow/settings.py
@@ -0,0 +1,158 @@
# -*- coding: utf-8 -*-

# Scrapy settings for stackoverflow project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

# from stackoverflow.middleware import httpproxy
# from stackoverflow.middleware import useragent

BOT_NAME = 'stackoverflow'

SPIDER_MODULES = ['stackoverflow.spiders']
NEWSPIDER_MODULE = 'stackoverflow.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'stackoverflow (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# encoding setting
FEED_EXPORT_ENCODING = 'utf-8'

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 100

# Maximum number of concurrent items (per response) to process in parallel in the item pipelines (default: 100)
CONCURRENT_ITEMS = 100

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs

# This setting is also affected by the RANDOMIZE_DOWNLOAD_DELAY setting (which is enabled by default).
# By default, Scrapy doesn’t wait a fixed amount of time between requests,
# but uses a random interval between 0.5 * DOWNLOAD_DELAY and 1.5 * DOWNLOAD_DELAY.
DOWNLOAD_DELAY = 2.5


# The download delay setting will honor only one of:
# The maximum number of concurrent (i.e. simultaneous) requests that will be performed to any single domain (default: 8)
CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# The maximum limit for the Twisted reactor thread pool size. (default: 10)
REACTOR_THREADPOOL_MAXSIZE = 32

# Defines the maximum number of times a request can be redirected. (default: 20)
# After this maximum the request's response is returned as is. We use Firefox's default value for the same task.
# REDIRECT_MAX_TIMES = 25

# Retrying failed HTTP requests can slow down a crawl substantially,
# especially when sites are very slow (or fail) to respond, causing
# timeout errors that get retried many times, unnecessarily, and
# preventing crawler capacity from being reused for other domains.
# Redirects are disabled here for a similar reason.
REDIRECT_ENABLED = False

# Disable cookies (enabled by default)
COOKIES_ENABLED = False

# Enables the AutoThrottle extension. (default: False)
# AUTOTHROTTLE_ENABLED = False
AUTOTHROTTLE_ENABLED = False

# The amount of time (in secs) that the downloader will wait before timing out. (default: 180)
# Unless you are crawling from a very slow connection (which shouldn't be the case for broad crawls),
# reduce the download timeout so that stuck requests are discarded quickly and
# free up capacity to process the next ones.
# Recommended: DOWNLOAD_TIMEOUT = 15
DOWNLOAD_TIMEOUT = 120

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'stackoverflow.middlewares.StackoverflowSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# from stackoverflow.spiders.stackoverflow_spider import StackoverflowSpider

DOWNLOADER_MIDDLEWARES = {
    'stackoverflow.middleware.useragent.RandomUserAgentMiddleware': 400,
# 'scrapy_crawlera.CrawleraMiddleware': 610,
# 'stackoverflow.spiders.proxy_middleware.ProxyMiddleware':750,
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 750,
# 'stackoverflow.middlewares.MyCustomDownloaderMiddleware': 543,
}

# Crawlera settings
# Enable the Crawlera extension
# CRAWLERA_ENABLED = True

# Crawlera private API key
# CRAWLERA_APIKEY = 'a67ebee1e6764a6f87585056f155ed1d'

# Preserve the DOWNLOAD_DELAY setting if it was set
# CRAWLERA_PRESERVE_DELAY = True

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'stackoverflow.pipelines.StackoverflowPipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings

# Whether the HTTP cache will be enabled. (default: False)
# HTTPCACHE_ENABLED = True

# Expiration time for cached requests, in seconds. (default: 0)
# Cached requests older than this time will be re-downloaded. If zero, cached requests will never expire.
# HTTPCACHE_EXPIRATION_SECS = 0

# The directory to use for storing the (low-level) HTTP cache. If empty, the HTTP cache will be disabled.
# If a relative path is given, it is taken relative to the project data dir.
# HTTPCACHE_DIR = 'httpcache'

# Don't cache responses with these HTTP codes.
# HTTPCACHE_IGNORE_HTTP_CODES = []

# The class which implements the cache storage backend.
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
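
With DOWNLOAD_DELAY = 2.5 and RANDOMIZE_DOWNLOAD_DELAY left at its default, Scrapy waits a random interval between 0.5x and 1.5x of the delay between requests to the same domain; a quick sketch of the resulting window (plain arithmetic, not project code):

# Effective per-domain delay window for DOWNLOAD_DELAY = 2.5
delay = 2.5
low, high = 0.5 * delay, 1.5 * delay
print(low, high)  # 1.25 3.75 -> each request waits between 1.25 and 3.75 seconds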
4 changes: 4 additions & 0 deletions stackoverflow/spiders/__init__.py
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
17 changes: 17 additions & 0 deletions stackoverflow/spiders/items.py
@@ -0,0 +1,17 @@
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy

class StackoverflowItem(scrapy.Item):

links = scrapy.Field()
views = scrapy.Field()
votes = scrapy.Field()
answers = scrapy.Field()
tags = scrapy.Field()
questions = scrapy.Field()
29 changes: 29 additions & 0 deletions stackoverflow/spiders/stackoverflow_spider.py
@@ -0,0 +1,29 @@
import scrapy
from stackoverflow.spiders.items import StackoverflowItem


class StackoverflowSpider(scrapy.Spider):

name = "stackoverflow"

    def start_requests(self):
        # Pages 11-20 of the questions list, sorted by votes, 50 questions per page.
        urls = ['http://stackoverflow.com/questions?page={page}&sort=votes&pagesize=50'.format(page=page)
                for page in range(11, 21)]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)


    def parse(self, response):
        # Each results page lists 50 questions as children of div#questions.
        for index in range(1, 51):
            sel = response.xpath('//*[@id="questions"]/div[{index}]'.format(index=index))
            item = StackoverflowItem()
            item['votes'] = sel.xpath('div[1]/div[2]/div[1]/div[1]/span/strong/text()').extract()
            item['answers'] = sel.xpath('div[1]/div[2]/div[2]/strong/text()').extract()
            # The @title attribute holds the raw view count (e.g. "1,234 views"); keep the number and strip commas.
            item['views'] = "".join(sel.xpath('div[1]/div[3]/@title').extract()).split()[0].replace(",", "")
            item['questions'] = sel.xpath('div[2]/h3/a/text()').extract()
            # The question link looks like /questions/<id>/<slug>; keep only the numeric id.
            item['links'] = "".join(sel.xpath('div[2]/h3/a/@href').extract()).split("/")[2]
            item['tags'] = sel.xpath('div[2]/div[2]/a/text()').extract()
            yield item
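
To produce the data2.json file that stackoverflow/mysql/sql.py reads, the spider can be run with scrapy crawl stackoverflow -o data2.json, or from a small script. A minimal sketch (FEED_URI/FEED_FORMAT are the Scrapy 1.x feed-export settings; the output path simply mirrors the one used in sql.py):

# Sketch only: run the spider and export scraped items as JSON.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from stackoverflow.spiders.stackoverflow_spider import StackoverflowSpider

settings = get_project_settings()
settings.set('FEED_URI', 'data2.json')   # where the exported items go
settings.set('FEED_FORMAT', 'json')
process = CrawlerProcess(settings)
process.crawl(StackoverflowSpider)
process.start()  # blocks until the crawl finishes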
