Commit

init repo
chenjiandongx committed Mar 26, 2017
1 parent d458680 commit 98547ea
Showing 11 changed files with 322 additions and 0 deletions.
9 changes: 9 additions & 0 deletions .gitignore
@@ -0,0 +1,9 @@
.idea
*.xml
.scrapy
*.pyc
/__pycache__/
*.json
*.txt
/data/
/crawls/
11 changes: 11 additions & 0 deletions scrapy.cfg
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = stackoverflow.settings

[deploy]
#url = http://localhost:6800/
project = stackoverflow
Empty file.
10 changes: 10 additions & 0 deletions stackoverflow/middleware/httpproxy.py
@@ -0,0 +1,10 @@
from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware

class ProxyMiddleware(HttpProxyMiddleware):
    """Override process_request so every request goes through a local proxy."""

    def process_request(self, request, spider):
        # Route the request through the local XX-Net proxy port.
        proxy_ip = "http://127.0.0.1:8087"
        request.meta['proxy'] = proxy_ip
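
The settings.py in this commit (below) enables Scrapy's stock HttpProxyMiddleware rather than this subclass. If the custom middleware is wanted instead, one way it could be wired in is the following sketch (the module path comes from this commit's layout; the priority 750 mirrors the existing entry):

# Sketch only: enable the custom proxy middleware in settings.py.
DOWNLOADER_MIDDLEWARES = {
    'stackoverflow.middleware.useragent.RandomUserAgentMiddleware': 400,
    'stackoverflow.middleware.httpproxy.ProxyMiddleware': 750,
}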
53 changes: 53 additions & 0 deletions stackoverflow/middleware/useragent.py
@@ -0,0 +1,53 @@
import random
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware

class RandomUserAgentMiddleware(UserAgentMiddleware):
    """Extend UserAgentMiddleware so each request is sent with a randomly chosen User-Agent."""

    def __init__(self, user_agent=''):
        self.user_agent = user_agent

    def process_request(self, request, spider):
        """Override process_request to set a random User-Agent header."""
        ua = random.choice(self.user_agent_list)
        if ua:
            request.headers.setdefault('User-Agent', ua)

    user_agent_list = [
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
"(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
"(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
"(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
"(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
"(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
"(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
"(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
"(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
"(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
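
A quick way to check the middleware outside of a crawl is to feed it a Request directly; a minimal sketch (it assumes the stackoverflow package is importable, e.g. when run from the project root):

from scrapy.http import Request
from stackoverflow.middleware.useragent import RandomUserAgentMiddleware

mw = RandomUserAgentMiddleware()
req = Request("https://stackoverflow.com/questions")
mw.process_request(req, spider=None)
print(req.headers.get('User-Agent'))  # one of the entries in user_agent_list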
Empty file added stackoverflow/mysql/__init__.py
Empty file.
31 changes: 31 additions & 0 deletions stackoverflow/mysql/sql.py
@@ -0,0 +1,31 @@
import json
import pymysql

conn = pymysql.connect(host="localhost", port=3306, user="root", passwd="0303", db="chenx", charset="utf8")
cur = conn.cursor()

with open(r"e:\python\stackoverflow\data2.json","r",encoding="utf-8") as f:
data = json.load(f)

def insert_db(s_links, s_views, s_votes, s_answers, s_tags, s_questions):
    """Insert one scraped question into the stackoverflow table."""
    sql = "insert into stackoverflow(s_links, s_views, s_votes, s_answers, s_tags, s_questions) " \
          "values(%s, %s, %s, %s, %s, %s)"
    value = (s_links, s_views, s_votes, s_answers, s_tags, s_questions)
    cur.execute(sql, value)
    conn.commit()
    print("Inserted s_links: " + s_links)

for row in data:
    s_links = row['links']
    s_views = row['views']
    s_votes = row['votes']
    s_answers = row['answers']
    s_tags = " ".join(row['tags'])
    s_questions = row['questions']
    try:
        insert_db(s_links, s_views, s_votes, s_answers, s_tags, s_questions)
    except Exception as e:
        print(e)

# Release the connection once all rows have been inserted.
cur.close()
conn.close()
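
The stackoverflow table itself is not created anywhere in this commit. A minimal sketch of a schema that matches insert_db (the column names come from the insert statement above; the column types and the id column are assumptions, not part of the repository):

import pymysql

# Assumed schema only: types and the id column are guesses chosen to make insert_db work.
conn = pymysql.connect(host="localhost", port=3306, user="root", passwd="0303", db="chenx", charset="utf8")
with conn.cursor() as cur:
    cur.execute("""
        create table if not exists stackoverflow (
            id int auto_increment primary key,
            s_links varchar(64),
            s_views varchar(32),
            s_votes varchar(32),
            s_answers varchar(32),
            s_tags varchar(255),
            s_questions text
        )
    """)
conn.commit()
conn.close()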
158 changes: 158 additions & 0 deletions stackoverflow/settings.py
@@ -0,0 +1,158 @@
# -*- coding: utf-8 -*-

# Scrapy settings for stackoverflow project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

# from stackoverflow.middleware import httpproxy
# from stackoverflow.middleware import useragent

BOT_NAME = 'stackoverflow'

SPIDER_MODULES = ['stackoverflow.spiders']
NEWSPIDER_MODULE = 'stackoverflow.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'stackoverflow (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# encoding setting
FEED_EXPORT_ENCODING = 'utf-8'

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 100

# Maximum number of concurrent items (per response) to process in parallel in the item pipelines (default: 100)
CONCURRENT_ITEMS = 100

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs

# This setting is also affected by the RANDOMIZE_DOWNLOAD_DELAY setting (which is enabled by default).
# By default, Scrapy doesn’t wait a fixed amount of time between requests,
# but uses a random interval between 0.5 * DOWNLOAD_DELAY and 1.5 * DOWNLOAD_DELAY.
DOWNLOAD_DELAY = 2.5


# The download delay setting will honor only one of:
# The maximum number of concurrent (i.e. simultaneous) requests that will be performed to any single domain (default: 8)
CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# The maximum limit for the Twisted reactor thread pool size. (default: 10)
REACTOR_THREADPOOL_MAXSIZE = 32

# Defines the maximum number of times a request can be redirected. (default: 20)
# After this maximum the request's response is returned as is. We use Firefox's default value for the same task.
# REDIRECT_MAX_TIMES = 25

# Retrying failed HTTP requests can slow down a crawl substantially,
# especially when sites are very slow (or fail) to respond, causing
# timeout errors that get retried many times, unnecessarily, and
# preventing crawler capacity from being reused for other domains.
# Redirects are disabled here for a similar reason.
REDIRECT_ENABLED = False

# Disable cookies (enabled by default)
COOKIES_ENABLED = False

# Enables the AutoThrottle extension. (default: False)
# AUTOTHROTTLE_ENABLED = False
AUTOTHROTTLE_ENABLED = False

# The amount of time (in secs) that the downloader will wait before timing out. (default: 180)
# Unless you are crawling from a very slow connection (which shouldn't be the case for broad crawls),
# reduce the download timeout so that stuck requests are discarded quickly and
# free up capacity to process the next ones.
# Recommended: DOWNLOAD_TIMEOUT = 15
DOWNLOAD_TIMEOUT = 120

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'stackoverflow.middlewares.StackoverflowSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# from stackoverflow.spiders.stackoverflow_spider import StackoverflowSpider

DOWNLOADER_MIDDLEWARES = {
    'stackoverflow.middleware.useragent.RandomUserAgentMiddleware': 400,
# 'scrapy_crawlera.CrawleraMiddleware': 610,
# 'stackoverflow.spiders.proxy_middleware.ProxyMiddleware':750,
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 750,
# 'stackoverflow.middlewares.MyCustomDownloaderMiddleware': 543,
}

# Crawlera settings
# Enable the Crawlera extension
# CRAWLERA_ENABLED = True

# Crawlera private API key
# CRAWLERA_APIKEY = 'a67ebee1e6764a6f87585056f155ed1d'

# Preserve the DOWNLOAD_DELAY setting if it was set
# CRAWLERA_PRESERVE_DELAY = True

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'stackoverflow.pipelines.StackoverflowPipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings

# Whether the HTTP cache will be enabled. (default: False)
# HTTPCACHE_ENABLED = True

# Expiration time for cached requests, in seconds. (default: 0)
# Cached requests older than this time will be re-downloaded. If zero, cached requests will never expire.
# HTTPCACHE_EXPIRATION_SECS = 0

# The directory to use for storing the (low-level) HTTP cache. If empty, the HTTP cache will be disabled.
# If a relative path is given, it is taken relative to the project data dir.
# HTTPCACHE_DIR = 'httpcache'

# Don't cache responses with these HTTP codes.
# HTTPCACHE_IGNORE_HTTP_CODES = []

# The class which implements the cache storage backend.
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
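
With DOWNLOAD_DELAY = 2.5 and RANDOMIZE_DOWNLOAD_DELAY left at its default, Scrapy waits a random interval between 0.5x and 1.5x of the delay between requests to the same domain; a quick sketch of the resulting window (plain arithmetic, not project code):

# Effective per-domain delay window for DOWNLOAD_DELAY = 2.5
delay = 2.5
low, high = 0.5 * delay, 1.5 * delay
print(low, high)  # 1.25 3.75 -> each request waits between 1.25 and 3.75 seconds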
4 changes: 4 additions & 0 deletions stackoverflow/spiders/__init__.py
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
17 changes: 17 additions & 0 deletions stackoverflow/spiders/items.py
@@ -0,0 +1,17 @@
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy

class StackoverflowItem(scrapy.Item):

links = scrapy.Field()
views = scrapy.Field()
votes = scrapy.Field()
answers = scrapy.Field()
tags = scrapy.Field()
questions = scrapy.Field()
29 changes: 29 additions & 0 deletions stackoverflow/spiders/stackoverflow_spider.py
@@ -0,0 +1,29 @@
import scrapy
from stackoverflow.spiders.items import StackoverflowItem


class StackoverflowSpider(scrapy.Spider):

name = "stackoverflow"

    def start_requests(self):
        # Pages 11-20 of the questions list, sorted by votes, 50 questions per page.
        urls = ['http://stackoverflow.com/questions?page={page}&sort=votes&pagesize=50'.format(page=page)
                for page in range(11, 21)]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)


    def parse(self, response):
        # Each results page lists 50 questions as children of div#questions.
        for index in range(1, 51):
            sel = response.xpath('//*[@id="questions"]/div[{index}]'.format(index=index))
            item = StackoverflowItem()
            item['votes'] = sel.xpath('div[1]/div[2]/div[1]/div[1]/span/strong/text()').extract()
            item['answers'] = sel.xpath('div[1]/div[2]/div[2]/strong/text()').extract()
            # The @title attribute holds the raw view count (e.g. "1,234 views"); keep the number and strip commas.
            item['views'] = "".join(sel.xpath('div[1]/div[3]/@title').extract()).split()[0].replace(",", "")
            item['questions'] = sel.xpath('div[2]/h3/a/text()').extract()
            # The question link looks like /questions/<id>/<slug>; keep only the numeric id.
            item['links'] = "".join(sel.xpath('div[2]/h3/a/@href').extract()).split("/")[2]
            item['tags'] = sel.xpath('div[2]/div[2]/a/text()').extract()
            yield item
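
To produce the data2.json file that stackoverflow/mysql/sql.py reads, the spider can be run with scrapy crawl stackoverflow -o data2.json, or from a small script. A minimal sketch (FEED_URI/FEED_FORMAT are the Scrapy 1.x feed-export settings; the output path simply mirrors the one used in sql.py):

# Sketch only: run the spider and export scraped items as JSON.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from stackoverflow.spiders.stackoverflow_spider import StackoverflowSpider

settings = get_project_settings()
settings.set('FEED_URI', 'data2.json')   # where the exported items go
settings.set('FEED_FORMAT', 'json')
process = CrawlerProcess(settings)
process.crawl(StackoverflowSpider)
process.start()  # blocks until the crawl finishes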
