Commit f34bef3: Initial commit
divtiply committed Jun 26, 2018 (0 parents)
Showing 11 changed files with 349 additions and 0 deletions.
117 changes: 117 additions & 0 deletions .gitignore
@@ -0,0 +1,117 @@

# Created by https://www.gitignore.io/api/python

### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/


# End of https://www.gitignore.io/api/python


/*.sqlite
/*.sqlite-journal
/*.db
/*.db-journal
Empty file added freedns_scraper/__init__.py
Empty file.
97 changes: 97 additions & 0 deletions freedns_scraper/settings.py
@@ -0,0 +1,97 @@
# -*- coding: utf-8 -*-

# Scrapy settings for freedns_scraper project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'freedns_scraper'

SPIDER_MODULES = ['freedns_scraper.spiders']
NEWSPIDER_MODULE = 'freedns_scraper.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'freedns_scraper (+http://www.yourdomain.com)'
USER_AGENT = 'Mozilla/5.0'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'freedns_scraper.middlewares.FreeDNSScraperSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'freedns_scraper.middlewares.FreeDNSScraperDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    # 'freedns_scraper.pipelines.FreeDNSScraperPipeline': 300,
    'scrapy_sqlite_pipeline.SQLitePipeline': 310,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

LOG_LEVEL = 'INFO'

SQLITE_DATABASE = 'data.sqlite'
SQLITE_TABLE = 'data'
Empty file added freedns_scraper/spiders/__init__.py
Empty file.
62 changes: 62 additions & 0 deletions freedns_scraper/spiders/domain_registry.py
@@ -0,0 +1,62 @@
# -*- coding: utf-8 -*-

from datetime import datetime

import scrapy
from scrapy import Spider, Request, Item, Field
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, MapCompose


class RegistryItem(Item):
    name = 'data'  # used as db table name by pipeline
    # TODO: sqlite_keys
    # see https://github.com/RockyZ/Scrapy-sqlite-item-exporter/blob/master/exporters.py
    domain = Field()
    host_count = Field()
    website = Field()
    status = Field()
    owner = Field()
    created_on = Field()


class RegistryItemLoader(ItemLoader):
    default_item_class = RegistryItem
    default_output_processor = TakeFirst()
    # host_count_in = MapCompose(int)
    # created_on_in = MapCompose(
    #     lambda s: datetime.strptime(s, '%m/%d/%Y').date())


class RegistrySpider(Spider):
    name = 'domain-registry'
    allowed_domains = ['freedns.afraid.org']
    base_url = 'http://freedns.afraid.org/domain/registry/'
    start_urls = [base_url]
    # Becomes True once the paginated follow-up requests have been scheduled.
    requests_made = False

    def parse(self, response):
        """
        @url http://freedns.afraid.org/domain/registry/
        @returns items 1 100
        @scrapes domain host_count website status owner created_on
        """
        table = response.xpath('//table[form[@action="/domain/registry/"]]')
        for row in table.xpath('./tr[@class="trl" or @class="trd"]'):
            loader = RegistryItemLoader(selector=row)
            loader.add_xpath('domain', './td[1]/a/text()')
            loader.add_xpath('host_count', './td[1]/span/text()', re=r'\d+')
            loader.add_xpath('website', './td[1]/span/a/@href')
            loader.add_xpath('status', './td[2]/text()')
            loader.add_xpath('owner', './td[3]/a/text()')
            loader.add_xpath('created_on', './td[4]/text()', re=r'\(([^)]+)\)')
            yield loader.load_item()

        if not self.requests_made:
            self.requests_made = True
            # The footer reads "Page X of Y"; grab Y and queue the remaining pages.
            last_page = int(
                table.xpath(
                    './tr[last()]//font[./text()[starts-with(., "Page ")]]//text()[contains(., " of ")]'
                ).re_first(r'\d+'))
            for page in range(2, last_page + 1):
                yield Request(self.base_url + '?page=%d' % page)
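
The commented-out input processors in RegistryItemLoader hint at converting host_count to an int and created_on to a date. A minimal sketch of what they would do if enabled, using hypothetical sample strings shaped like the regex matches above:

from datetime import datetime
from scrapy.loader.processors import MapCompose

host_count_in = MapCompose(int)
created_on_in = MapCompose(lambda s: datetime.strptime(s, '%m/%d/%Y').date())

# Hypothetical sample values; MapCompose applies the functions to each value and returns a list.
host_count_in(['12345'])        # -> [12345]
created_on_in(['06/26/2018'])   # -> [datetime.date(2018, 6, 26)]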
1 change: 1 addition & 0 deletions requirements.txt
@@ -0,0 +1 @@
Scrapy==1.5.0
1 change: 1 addition & 0 deletions runtime.txt
@@ -0,0 +1 @@
python-3.6.5
9 changes: 9 additions & 0 deletions scraper.py
@@ -0,0 +1,9 @@
# -*- coding: utf-8 -*-

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())

process.crawl('domain-registry')  # must match RegistrySpider.name
process.start()
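
process.crawl() also accepts the spider class itself, which avoids depending on the registered name string; a minimal alternative sketch, assuming the import path matches the layout above:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Assumed import path, matching freedns_scraper/spiders/domain_registry.py above.
from freedns_scraper.spiders.domain_registry import RegistrySpider

process = CrawlerProcess(get_project_settings())
process.crawl(RegistrySpider)  # pass the class instead of its name
process.start()

Either way, get_project_settings() locates scrapy.cfg relative to the working directory, so the script is typically run from the project root.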
6 changes: 6 additions & 0 deletions scrapy.cfg
@@ -0,0 +1,6 @@
[settings]
default = freedns_scraper.settings

[deploy]
#url = http://localhost:6800/
project = freedns_scraper
1 change: 1 addition & 0 deletions scrapy_sqlite_pipeline/__init__.py
@@ -0,0 +1 @@
from .sqlite_pipeline import SQLitePipeline
55 changes: 55 additions & 0 deletions scrapy_sqlite_pipeline/sqlite_pipeline.py
@@ -0,0 +1,55 @@
# -*- coding: utf-8 -*-

# TODO: convert to exporter
# see https://github.com/RockyZ/Scrapy-sqlite-item-exporter/blob/master/exporters.py

import sqlite3


class SQLitePipeline(object):
    def __init__(self, database='data.sqlite', table='data'):
        self.database = database
        self.table = table
        self.created_tables = []

    def open_spider(self, spider):
        self.connection = sqlite3.connect(self.database)
        self.cursor = self.connection.cursor()

    def close_spider(self, spider):
        self.connection.close()

    def process_item(self, item, spider):
        # The item's `name` attribute doubles as the target table name.
        table = item.name
        if table not in self.created_tables:
            columns = item.fields.keys()
            self._create_table(table, columns)
            self.created_tables.append(table)
        self._upsert(table, item)
        return item

    def _upsert(self, table, item):
        # TODO: a real upsert needs a PRIMARY KEY/UNIQUE constraint to act on;
        # see https://stackoverflow.com/questions/15277373/sqlite-upsert-update-or-insert
        columns = item.keys()
        values = list(item.values())
        sql = 'INSERT OR REPLACE INTO "%s" (%s) VALUES (%s)' % (
            table,
            ', '.join('`%s`' % x for x in columns),
            ', '.join('?' for x in values),
        )
        self.cursor.execute(sql, values)
        self.connection.commit()

    def _create_table(self, table, columns, keys=None):
        sql = 'CREATE TABLE IF NOT EXISTS "%s" ' % table
        column_define = ['`%s` TEXT' % column for column in columns]
        if keys:
            # First key tuple becomes the PRIMARY KEY, the rest become UNIQUE constraints.
            column_define.append('PRIMARY KEY (%s)' % ', '.join(keys[0]))
            for key in keys[1:]:
                column_define.append('UNIQUE (%s)' % ', '.join(key))
        sql += '(%s)' % ', '.join(column_define)
        self.cursor.execute(sql)
        self.connection.commit()
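
settings.py defines SQLITE_DATABASE and SQLITE_TABLE, but the pipeline above is always constructed with its defaults. One way to wire those settings through would be a from_crawler classmethod on SQLitePipeline; a minimal sketch under that assumption:

    # Sketch only: assumes this classmethod is added to SQLitePipeline;
    # defaults mirror the __init__ signature above.
    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            database=crawler.settings.get('SQLITE_DATABASE', 'data.sqlite'),
            table=crawler.settings.get('SQLITE_TABLE', 'data'),
        )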
