Commit f34bef3: initial commit (0 parents), showing 11 changed files with 349 additions and 0 deletions.
.gitignore (new file, +117 lines)

```gitignore
# Created by https://www.gitignore.io/api/python

### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

# End of https://www.gitignore.io/api/python

# Project-specific: keep the scraper's SQLite output out of version control
/*.sqlite
/*.sqlite-journal
/*.db
/*.db-journal
```
Empty file (presumably freedns_scraper/__init__.py, the package marker).
freedns_scraper/settings.py (new file, +97 lines)

```python
# -*- coding: utf-8 -*-

# Scrapy settings for freedns_scraper project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'freedns_scraper'

SPIDER_MODULES = ['freedns_scraper.spiders']
NEWSPIDER_MODULE = 'freedns_scraper.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'freedns_scraper (+http://www.yourdomain.com)'
USER_AGENT = 'Mozilla/5.0'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'freedns_scraper.middlewares.FreeDNSScraperSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'freedns_scraper.middlewares.FreeDNSScraperDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    # 'freedns_scraper.pipelines.FreeDNSScraperPipeline': 300,
    'scrapy_sqlite_pipeline.SQLitePipeline': 310,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

LOG_LEVEL = 'INFO'

SQLITE_DATABASE = 'data.sqlite'
SQLITE_TABLE = 'data'
```
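The two SQLITE_* keys at the bottom are project-specific, not standard Scrapy settings. A minimal sanity check, assuming it runs from the project root where scrapy.cfg sits, is to load the project settings the same way the run script below does and confirm the keys are visible:

```python
# Quick check (not part of the commit): load the project settings and
# print the custom SQLite keys alongside a stock Scrapy setting.
from scrapy.utils.project import get_project_settings

settings = get_project_settings()
print(settings.get('SQLITE_DATABASE'))     # 'data.sqlite'
print(settings.get('SQLITE_TABLE'))        # 'data'
print(settings.getbool('ROBOTSTXT_OBEY'))  # True
```

Note that, as committed, the SQLitePipeline never actually reads these two keys; it relies on constructor defaults that happen to match them (see the note after the pipeline).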
Empty file (presumably freedns_scraper/spiders/__init__.py).
Spider module under freedns_scraper/spiders/ (filename not shown; new file, +62 lines)

```python
# -*- coding: utf-8 -*-

from datetime import datetime  # used by the commented-out created_on processor

from scrapy import Spider, Request, Item, Field
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, MapCompose


class RegistryItem(Item):
    name = 'data'  # used as db table name by pipeline
    # TODO: sqlite_keys
    # see https://github.com/RockyZ/Scrapy-sqlite-item-exporter/blob/master/exporters.py
    domain = Field()
    host_count = Field()
    website = Field()
    status = Field()
    owner = Field()
    created_on = Field()


class RegistryItemLoader(ItemLoader):
    default_item_class = RegistryItem
    default_output_processor = TakeFirst()
    # host_count_in = MapCompose(int)
    # created_on_in = MapCompose(
    #     lambda s: datetime.strptime(s, '%m/%d/%Y').date())


class RegistrySpider(Spider):
    name = 'domain-registry'
    allowed_domains = ['freedns.afraid.org']
    base_url = 'http://freedns.afraid.org/domain/registry/'
    start_urls = [base_url]
    # One-shot guard so the pagination requests are scheduled only once.
    # (The commit initialized this to True, which made the pagination branch
    # below unreachable; it has to start as False to ever run.)
    requests_made = False

    def parse(self, response):
        """
        @url http://freedns.afraid.org/domain/registry/
        @returns items 1 100
        @scrapes domain host_count website status owner created_on
        """
        table = response.xpath('//table[form[@action="/domain/registry/"]]')
        for row in table.xpath('./tr[@class="trl" or @class="trd"]'):
            loader = RegistryItemLoader(selector=row)
            loader.add_xpath('domain', './td[1]/a/text()')
            loader.add_xpath('host_count', './td[1]/span/text()', re=r'\d+')
            loader.add_xpath('website', './td[1]/span/a/@href')
            loader.add_xpath('status', './td[2]/text()')
            loader.add_xpath('owner', './td[3]/a/text()')
            loader.add_xpath('created_on', './td[4]/text()', re=r'\(([^)]+)\)')
            yield loader.load_item()

        if not self.requests_made:
            self.requests_made = True
            # The footer row renders "Page X of N"; the " of N" text node
            # yields N, the total page count.
            last_page = int(
                table.xpath(
                    './tr[last()]//font[./text()[starts-with(., "Page ")]]//text()[contains(., " of ")]'
                ).re_first(r'\d+'))
            for page in range(2, last_page + 1):
                yield Request(self.base_url + '?page=%d' % page)
```
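The commented-out input processors hint at the intended typing: host_count as an int, created_on as a date parsed from the site's MM/DD/YYYY format. A sketch of what enabling them could look like; the loader name here is hypothetical, and RegistryItem is assumed importable from the spider module above:

```python
# Hypothetical variant of RegistryItemLoader with the typed input
# processors switched on. The SQLite pipeline declares every column as
# TEXT, so these values end up stored as text anyway.
from datetime import datetime

from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, MapCompose
# from freedns_scraper.spiders.<module> import RegistryItem  # module name not shown in the diff


class TypedRegistryItemLoader(ItemLoader):
    default_item_class = RegistryItem
    default_output_processor = TakeFirst()
    host_count_in = MapCompose(int)
    created_on_in = MapCompose(
        lambda s: datetime.strptime(s, '%m/%d/%Y').date())
```

The docstring annotations (@url, @returns, @scrapes) are Scrapy spider contracts: `scrapy check domain-registry` fetches the given URL, runs parse(), and verifies that between 1 and 100 items come back with all six fields populated.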
requirements.txt (new file, +1 line)

```
Scrapy==1.5.0
```
Likely runtime.txt, a Heroku-style Python version pin (new file, +1 line; the filename itself is not shown in the diff)

```
python-3.6.5
```
Standalone run script (filename not shown; new file, +9 lines)

```python
# -*- coding: utf-8 -*-

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())

# The commit passed 'registry' here, but the spider registers itself as
# 'domain-registry'; crawl() looks spiders up by that name attribute, so
# the original call would raise "Spider not found".
process.crawl('domain-registry')
process.start()  # blocks until the crawl finishes
```
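Once a crawl completes, the rows land in data.sqlite under the table name 'data'; both values come straight from settings.py and RegistryItem.name. A minimal way to inspect the output, assuming a scrape has been run:

```python
# Inspect the scraped registry; column names match RegistryItem's fields.
import sqlite3

conn = sqlite3.connect('data.sqlite')
for domain, owner, created_on in conn.execute(
        'SELECT domain, owner, created_on FROM "data" LIMIT 5'):
    print(domain, owner, created_on)
conn.close()
```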
scrapy.cfg (new file, +6 lines)

```ini
[settings]
default = freedns_scraper.settings

[deploy]
#url = http://localhost:6800/
project = freedns_scraper
```
scrapy_sqlite_pipeline/__init__.py (new file, +1 line)

```python
from .sqlite_pipeline import SQLitePipeline
```
scrapy_sqlite_pipeline/sqlite_pipeline.py (new file, +55 lines)

```python
# -*- coding: utf-8 -*-

# TODO: convert to exporter
# see https://github.com/RockyZ/Scrapy-sqlite-item-exporter/blob/master/exporters.py

import sqlite3


class SQLitePipeline(object):
    def __init__(self, database='data.sqlite', table='data'):
        self.database = database
        self.table = table
        self.created_tables = []

    def open_spider(self, spider):
        self.connection = sqlite3.connect(self.database)
        self.cursor = self.connection.cursor()

    def close_spider(self, spider):
        self.connection.close()

    def process_item(self, item, spider):
        # Items carry their table name in a `name` class attribute
        # (RegistryItem sets name = 'data').
        table = item.name
        if table not in self.created_tables:
            columns = item.fields.keys()
            self._create_table(table, columns)
            self.created_tables.append(table)
        self._upsert(table, item)
        return item

    def _upsert(self, table, item):
        # TODO
        # https://stackoverflow.com/questions/15277373/sqlite-upsert-update-or-insert
        columns = item.keys()
        values = list(item.values())
        sql = 'INSERT OR REPLACE INTO "%s" (%s) VALUES (%s)' % (
            table,
            ', '.join('`%s`' % x for x in columns),
            ', '.join('?' for x in values),
        )
        self.cursor.execute(sql, values)
        self.connection.commit()

    def _create_table(self, table, columns, keys=None):
        # Every column is created as TEXT. `keys`, when given, is a list of
        # column-name tuples: keys[0] becomes the PRIMARY KEY, the rest
        # become UNIQUE constraints.
        sql = 'CREATE TABLE IF NOT EXISTS "%s" ' % table
        column_define = ['`%s` TEXT' % column for column in columns]
        if keys:
            column_define.append('PRIMARY KEY (%s)' % ', '.join(keys[0]))
            for key in keys[1:]:
                column_define.append('UNIQUE (%s)' % ', '.join(key))
        sql += '(%s)' % ', '.join(column_define)
        self.cursor.execute(sql)
        self.connection.commit()
```
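Two caveats are worth noting. First, INSERT OR REPLACE only replaces a row when a PRIMARY KEY or UNIQUE constraint conflicts; since _create_table is always called with keys=None here, the table has no such constraint and the "upsert" degrades to a plain INSERT, which is what the TODO alludes to. Second, the SQLITE_DATABASE and SQLITE_TABLE settings are never consumed: the pipeline is instantiated with its constructor defaults, and process_item uses item.name rather than self.table anyway. A from_crawler hook in the usual Scrapy style would wire the settings up; this is a sketch under that assumption, not part of the commit:

```python
# Sketch: add to SQLitePipeline so the values in settings.py are honored.
@classmethod
def from_crawler(cls, crawler):
    # Fall back on the same defaults the constructor uses.
    return cls(
        database=crawler.settings.get('SQLITE_DATABASE', 'data.sqlite'),
        table=crawler.settings.get('SQLITE_TABLE', 'data'),
    )
```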