From f34bef38ab816f8e18e78b61a333517dc8cdbe75 Mon Sep 17 00:00:00 2001
From: Michel Ace
Date: Tue, 26 Jun 2018 16:30:56 +0200
Subject: [PATCH] Initial commit

---
 .gitignore                                 | 117 +++++++++++++++++++++
 freedns_scraper/__init__.py                |   0
 freedns_scraper/settings.py                |  97 +++++++++++++++++
 freedns_scraper/spiders/__init__.py        |   0
 freedns_scraper/spiders/domain_registry.py |  62 +++++++++++
 requirements.txt                           |   1 +
 runtime.txt                                |   1 +
 scraper.py                                 |   9 ++
 scrapy.cfg                                 |   6 ++
 scrapy_sqlite_pipeline/__init__.py         |   1 +
 scrapy_sqlite_pipeline/sqlite_pipeline.py  |  55 ++++++++++
 11 files changed, 349 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 freedns_scraper/__init__.py
 create mode 100644 freedns_scraper/settings.py
 create mode 100644 freedns_scraper/spiders/__init__.py
 create mode 100644 freedns_scraper/spiders/domain_registry.py
 create mode 100644 requirements.txt
 create mode 100644 runtime.txt
 create mode 100644 scraper.py
 create mode 100644 scrapy.cfg
 create mode 100644 scrapy_sqlite_pipeline/__init__.py
 create mode 100644 scrapy_sqlite_pipeline/sqlite_pipeline.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..df176f5
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,117 @@
+
+# Created by https://www.gitignore.io/api/python
+
+### Python ###
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+
+
+# End of https://www.gitignore.io/api/python
+
+
+/*.sqlite
+/*.sqlite-journal
+/*.db
+/*.db-journal
diff --git a/freedns_scraper/__init__.py b/freedns_scraper/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/freedns_scraper/settings.py b/freedns_scraper/settings.py
new file mode 100644
index 0000000..b1b7e8f
--- /dev/null
+++ b/freedns_scraper/settings.py
@@ -0,0 +1,97 @@
+# -*- coding: utf-8 -*-
+
+# Scrapy settings for freedns_scraper project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+#     https://doc.scrapy.org/en/latest/topics/settings.html
+#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
+#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+
+BOT_NAME = 'freedns_scraper'
+
+SPIDER_MODULES = ['freedns_scraper.spiders']
+NEWSPIDER_MODULE = 'freedns_scraper.spiders'
+
+
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+#USER_AGENT = 'freedns_scraper (+http://www.yourdomain.com)'
+USER_AGENT = 'Mozilla/5.0'
+
+# Obey robots.txt rules
+ROBOTSTXT_OBEY = True
+
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+#CONCURRENT_REQUESTS = 32
+
+# Configure a delay for requests for the same website (default: 0)
+# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+#DOWNLOAD_DELAY = 3
+# The download delay setting will honor only one of:
+#CONCURRENT_REQUESTS_PER_DOMAIN = 16
+#CONCURRENT_REQUESTS_PER_IP = 16
+
+# Disable cookies (enabled by default)
+COOKIES_ENABLED = False
+
+# Disable Telnet Console (enabled by default)
+TELNETCONSOLE_ENABLED = False
+
+# Override the default request headers:
+#DEFAULT_REQUEST_HEADERS = {
+#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+#   'Accept-Language': 'en',
+#}
+
+# Enable or disable spider middlewares
+# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+#SPIDER_MIDDLEWARES = {
+#    'freedns_scraper.middlewares.FreeDNSScraperSpiderMiddleware': 543,
+#}
+
+# Enable or disable downloader middlewares
+# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
+#DOWNLOADER_MIDDLEWARES = {
+#    'freedns_scraper.middlewares.FreeDNSScraperDownloaderMiddleware': 543,
+#}
+
+# Enable or disable extensions
+# See https://doc.scrapy.org/en/latest/topics/extensions.html
+#EXTENSIONS = {
+#    'scrapy.extensions.telnet.TelnetConsole': None,
+#}
+
+# Configure item pipelines
+# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
+ITEM_PIPELINES = {
+#    'freedns_scraper.pipelines.FreeDNSScraperPipeline': 300,
+    'scrapy_sqlite_pipeline.SQLitePipeline': 310,
+}
+
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
+#AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+#AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+#AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+#AUTOTHROTTLE_DEBUG = False
+
+# Enable and configure HTTP caching (disabled by default)
+# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+#HTTPCACHE_ENABLED = True
+#HTTPCACHE_EXPIRATION_SECS = 0
+#HTTPCACHE_DIR = 'httpcache'
+#HTTPCACHE_IGNORE_HTTP_CODES = []
+#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
+
+LOG_LEVEL = 'INFO'
+
+SQLITE_DATABASE = 'data.sqlite'
+SQLITE_TABLE = 'data'
diff --git a/freedns_scraper/spiders/__init__.py b/freedns_scraper/spiders/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/freedns_scraper/spiders/domain_registry.py b/freedns_scraper/spiders/domain_registry.py
new file mode 100644
index 0000000..6d5e952
--- /dev/null
+++ b/freedns_scraper/spiders/domain_registry.py
@@ -0,0 +1,62 @@
+# -*- coding: utf-8 -*-
+
+from datetime import datetime
+
+import scrapy
+from scrapy import Spider, Request, Item, Field
+from scrapy.loader import ItemLoader
+from scrapy.loader.processors import TakeFirst, MapCompose
+
+
+class RegistryItem(Item):
+    name = 'data'  # used as db table name by pipeline
+    # TODO: sqlite_keys
+    # see https://github.com/RockyZ/Scrapy-sqlite-item-exporter/blob/master/exporters.py
+    domain = Field()
+    host_count = Field()
+    website = Field()
+    status = Field()
+    owner = Field()
+    created_on = Field()
+
+
+class RegistryItemLoader(ItemLoader):
+    default_item_class = RegistryItem
+    default_output_processor = TakeFirst()
+    # host_count_in = MapCompose(int)
+    # created_on_in = MapCompose(
+    #     lambda s: datetime.strptime(s, '%m/%d/%Y').date())
+
+
+class RegistrySpider(Spider):
+    name = 'domain-registry'
+    allowed_domains = ['freedns.afraid.org']
+    base_url = 'http://freedns.afraid.org/domain/registry/'
+    start_urls = [base_url]
+    requests_made = False  # set to True once the paginated requests have been queued
+
+    def parse(self, response):
+        """
+        @url http://freedns.afraid.org/domain/registry/
+        @returns items 1 100
+        @scrapes domain host_count website status owner created_on
+        """
+        table = response.xpath('//table[form[@action="/domain/registry/"]]')
+        for row in table.xpath('./tr[@class="trl" or @class="trd"]'):
+            loader = RegistryItemLoader(selector=row)
+            loader.add_xpath('domain', './td[1]/a/text()')
+            loader.add_xpath('host_count', './td[1]/span/text()', re=r'\d+')
+            loader.add_xpath('website', './td[1]/span/a/@href')
+            loader.add_xpath('status', './td[2]/text()')
+            loader.add_xpath('owner', './td[3]/a/text()')
+            loader.add_xpath('created_on', './td[4]/text()', re=r'\(([^)]+)\)')
+            yield loader.load_item()
+
+        if not self.requests_made:
+            self.requests_made = True
+            last_page = int(
+                table.xpath(
+                    './tr[last()]//font[./text()[starts-with(., "Page ")]]//text()[contains(., " of ")]'
+                ).re_first(r'\d+'))
+            for page in range(2, last_page + 1):
+                yield Request(self.base_url + '?page=%d' % page)
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..b83fc38
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1 @@
+Scrapy==1.5.0
\ No newline at end of file
diff --git a/runtime.txt b/runtime.txt
new file mode 100644
index 0000000..4b5675d
--- /dev/null
+++ b/runtime.txt
@@ -0,0 +1 @@
+python-3.6.5
\ No newline at end of file
diff --git a/scraper.py b/scraper.py
new file mode 100644
index 0000000..3c1378a
--- /dev/null
+++ b/scraper.py
@@ -0,0 +1,9 @@
+# -*- coding: utf-8 -*-
+
+from scrapy.crawler import CrawlerProcess
+from scrapy.utils.project import get_project_settings
+
+process = CrawlerProcess(get_project_settings())
+
+process.crawl('domain-registry')
+process.start()
diff --git a/scrapy.cfg b/scrapy.cfg
new file mode 100644
index 0000000..af7da94
--- /dev/null
+++ b/scrapy.cfg
@@ -0,0 +1,6 @@
+[settings]
+default = freedns_scraper.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = freedns_scraper
diff --git a/scrapy_sqlite_pipeline/__init__.py b/scrapy_sqlite_pipeline/__init__.py
new file mode 100644
index 0000000..d7cdb88
--- /dev/null
+++ b/scrapy_sqlite_pipeline/__init__.py
@@ -0,0 +1 @@
+from .sqlite_pipeline import SQLitePipeline
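
Note: the crawl can be started either with the scraper.py helper above or with the
stock Scrapy CLI from the directory containing scrapy.cfg. The docstring of
RegistrySpider.parse() is a Scrapy spider contract, so scrapy check will exercise
it against the live page (this assumes Scrapy 1.5 is installed and network access
is available):

    $ python scraper.py
    $ scrapy crawl domain-registry
    $ scrapy check domain-registry
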
diff --git a/scrapy_sqlite_pipeline/sqlite_pipeline.py b/scrapy_sqlite_pipeline/sqlite_pipeline.py
new file mode 100644
index 0000000..d1b673f
--- /dev/null
+++ b/scrapy_sqlite_pipeline/sqlite_pipeline.py
@@ -0,0 +1,55 @@
+# -*- coding: utf-8 -*-
+
+# TODO: convert to exporter
+# see https://github.com/RockyZ/Scrapy-sqlite-item-exporter/blob/master/exporters.py
+
+import sqlite3
+
+
+class SQLitePipeline(object):
+    def __init__(self, database='data.sqlite', table='data'):
+        self.database = database
+        self.table = table
+        self.created_tables = []
+
+    def open_spider(self, spider):
+        self.connection = sqlite3.connect(self.database)
+        self.cursor = self.connection.cursor()
+
+    def close_spider(self, spider):
+        self.connection.close()
+
+    def process_item(self, item, spider):
+        table = item.name
+        if table not in self.created_tables:
+            columns = item.fields.keys()
+            self._create_table(table, columns)
+            self.created_tables.append(table)
+        self._upsert(table, item)
+        return item
+
+    def _upsert(self, table, item):
+        # TODO
+        # https://stackoverflow.com/questions/15277373/sqlite-upsert-update-or-insert
+        columns = item.keys()
+        values = list(item.values())
+        sql = 'INSERT OR REPLACE INTO "%s" (%s) VALUES (%s)' % (
+            table,
+            ', '.join('`%s`' % x for x in columns),
+            ', '.join('?' for x in values),
+        )
+        self.cursor.execute(sql, values)
+        self.connection.commit()
+
+    def _create_table(self, table, columns, keys=None):
+        sql = 'CREATE TABLE IF NOT EXISTS "%s" ' % table
+        column_define = ['`%s` TEXT' % column for column in columns]
+        if keys:
+            if len(keys) > 0:
+                primary_key = 'PRIMARY KEY (%s)' % ', '.join(keys[0])
+                column_define.append(primary_key)
+            for key in keys[1:]:
+                column_define.append('UNIQUE (%s)' % ', '.join(key))
+        sql += '(%s)' % ', '.join(column_define)
+        self.cursor.execute(sql)
+        self.connection.commit()
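
Note: settings.py defines SQLITE_DATABASE and SQLITE_TABLE, but SQLitePipeline is
instantiated with its hard-coded defaults because it declares no
from_crawler/from_settings hook, and process_item() takes the table name from
item.name rather than from the table argument. Also, since the table is created
without a PRIMARY KEY or UNIQUE constraint (keys=None), INSERT OR REPLACE behaves
like a plain INSERT, so repeated runs accumulate duplicate rows; the sqlite_keys
TODO in the spider points at the same gap. A minimal sketch of how the settings
could be wired in, not part of this patch and using Scrapy's standard from_crawler
hook for pipelines:

    @classmethod
    def from_crawler(cls, crawler):
        # Read the database path and default table name from the project
        # settings, falling back to the same defaults as __init__.
        return cls(
            database=crawler.settings.get('SQLITE_DATABASE', 'data.sqlite'),
            table=crawler.settings.get('SQLITE_TABLE', 'data'),
        )

After a crawl, the scraped rows land in the "data" table of data.sqlite (the table
name comes from RegistryItem.name) and can be inspected directly, for example:

    import sqlite3

    connection = sqlite3.connect('data.sqlite')
    print(connection.execute('SELECT COUNT(*) FROM "data"').fetchone()[0])
    connection.close()
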