Skip to content

Commit

Permalink
Base structure
Browse files Browse the repository at this point in the history
  • Loading branch information
elliotgao2 committed May 31, 2017
1 parent d319037 commit f69f814
Show file tree
Hide file tree
Showing 10 changed files with 808 additions and 0 deletions.
674 changes: 674 additions & 0 deletions LICENSE

Large diffs are not rendered by default.

Empty file added README.md
Empty file.
32 changes: 32 additions & 0 deletions example/hacknews_spider.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from gain import Css, Item, Parser, Regex, Spider, Xpath


class Post(Item):
    """Example item describing a post.

    Every selector rule below is still an empty placeholder string.
    """

    id = Regex('')
    title = Css('')
    username = Css('')
    url = Css('')
    points = Xpath('')

    def save(self):
        """Print the parsed field values stored in ``self.results``."""
        parsed = self.results
        print(parsed)


class User(Item):
    """Example item describing a user: numeric id, username and karma."""

    # Raw string: ``\d`` is not a valid escape in a plain string literal and
    # triggers a warning on recent Python versions.
    id = Regex(r'\d+')
    username = Css('.username')
    # XPath needs a node test after ``//`` and a quoted string literal.
    karma = Xpath('//*[@class="karma"]')

    def save(self):
        """Print the parsed field values stored in ``self.results``."""
        print(self.results)


class MySpider(Spider):
    """Example spider pairing the Post and User items with placeholder URLs."""

    start_url = ''
    follow_urls = ['', '']
    parsers = [
        Parser('', Post),
        Parser('', User),
    ]


# Start the example spider. There is no ``__main__`` guard, so this runs
# whenever the module is executed or imported.
MySpider.run()
6 changes: 6 additions & 0 deletions gain/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from .item import Item
from .parser import Parser
from .selector import Css, Regex, Xpath
from .spider import Spider

# Public names of the package, i.e. what ``from gain import *`` exposes.
__all__ = ('Css', 'Regex', 'Xpath', 'Item', 'Spider', 'Parser')
32 changes: 32 additions & 0 deletions gain/item.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from .selector import Selector


class ItemType(type):
    """Metaclass that gathers the ``Selector`` attributes of a class body.

    Every attribute whose value is a ``Selector`` instance is removed from
    the class namespace and stored in a ``selectors`` dict on the new class,
    keyed by attribute name.
    """

    def __new__(mcs, name, bases, namespace):
        """Create the class with its selectors moved into ``selectors``.

        Args:
            name: Name of the class being created.
            bases: Tuple of base classes.
            namespace: Mapping of the attributes defined in the class body.

        Returns:
            The newly created class, carrying a ``selectors`` dict.
        """
        # The loop variable must not be called ``name``: that would overwrite
        # the class name passed in and the class would end up named after
        # one of its attributes.
        selectors = {
            attr: value
            for attr, value in namespace.items()
            if isinstance(value, Selector)
        }
        for attr in selectors:
            del namespace[attr]
        # Assigned after the deletions so a selector that happens to be
        # called ``selectors`` cannot wipe out the collected dict.
        namespace['selectors'] = selectors
        return type.__new__(mcs, name, bases, namespace)


class Item(metaclass=ItemType):
    """Base class for a record whose fields are declared as selectors.

    The metaclass stores the declared selectors in ``self.selectors``.
    Creating an instance runs every selector against ``html``, keeps the
    outcomes in ``self.results`` and then calls ``save()``.
    """

    def __init__(self, html):
        """Parse ``html`` with every declared selector, then save.

        Args:
            html: Value handed unchanged to each selector's ``parse()``.
        """
        self.results = {
            name: selector.parse(html)
            for name, selector in self.selectors.items()
        }
        self.save()

    def __getattr__(self, item):
        """Expose parsed fields as attributes.

        Only called when normal attribute lookup fails.

        Raises:
            AttributeError: If ``item`` is not a parsed field.
        """
        # Read ``results`` straight from the instance dict: going through
        # ``self.results`` would re-enter __getattr__ and recurse forever
        # when ``results`` has not been assigned yet.
        results = self.__dict__.get('results', {})
        try:
            return results[item]
        except KeyError:
            raise AttributeError(
                '{!r} object has no attribute {!r}'.format(
                    type(self).__name__, item)
            ) from None

    def save(self):
        """Persist the parsed results; subclasses must override this.

        Raises:
            NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError('save() should be implemented')
7 changes: 7 additions & 0 deletions gain/parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
class Parser:
    """Pair a URL rule with the item class used to process it."""

    def __init__(self, url, item):
        """Remember the URL rule and the item class.

        Args:
            url: Value passed to the item class when parsing.
            item: Callable (normally an ``Item`` subclass) that builds and
                saves an item from that value.
        """
        self.url = url
        self.item = item

    def parse(self):
        """Build the item for ``self.url``.

        ``Item.__init__`` already calls ``save()`` once the fields are
        parsed, so it is not called again here; doing so would save every
        item twice.
        """
        # NOTE(review): the URL itself is passed where Item expects ``html``;
        # presumably fetching is not implemented yet - confirm.
        self.item(self.url)
24 changes: 24 additions & 0 deletions gain/selector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
class Selector:
    """Base selector: holds the extraction rule for one item field."""

    def __init__(self, rule):
        """Store ``rule`` unchanged on the instance."""
        self.rule = rule

    def __repr__(self):
        """Return ``ClassName(rule)``."""
        return '{name}({rule})'.format(
            name=type(self).__name__,
            rule=self.rule,
        )

    # str() shows exactly the same text as repr().
    __str__ = __repr__

    def parse(self, html):
        """Placeholder implementation: ignore ``html`` and return 'Good'."""
        return 'Good'


class Css(Selector):
    """Selector variant used for CSS rules; adds no behaviour of its own."""


class Xpath(Selector):
    """Selector variant used for XPath rules; adds no behaviour of its own."""


class Regex(Selector):
    """Selector variant used for regex rules; adds no behaviour of its own."""
9 changes: 9 additions & 0 deletions gain/spider.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
class Spider:
    """Base spider: subclasses override the class attributes below.

    Only ``parsers`` is used by ``run()``; ``start_url`` and ``follow_urls``
    are declared here but not read by this class.
    """

    start_url = ''
    follow_urls = []
    parsers = []

    @classmethod
    def run(cls):
        """Call ``parse()`` on every entry of ``cls.parsers``, in order."""
        for registered in cls.parsers:
            registered.parse()
2 changes: 2 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Package metadata: README.md is named as the description file.
[metadata]
description-file = README.md
22 changes: 22 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from setuptools import find_packages, setup

# Package metadata and build configuration for the ``gain`` distribution.
setup(
    name="gain",
    version="0.1.0",
    description="Web crawling framework for everyone.",
    author="Gaojiuli",
    author_email="gaojiuli@gmail.com",
    url='https://github.com/gaojiuli/gain',
    classifiers=[
        'Development Status :: 5 - Production/Stable',
        'Intended Audience :: Developers',
        # Version-specific classifier so it agrees with ``license`` below.
        'License :: OSI Approved :: GNU General Public License v3 (GPLv3)',
        'Programming Language :: Python :: 3.5',
        'Programming Language :: Python :: 3.6',
    ],
    license='GNU GPL 3',
    # ``gain`` is a package (gain/__init__.py), so find_packages() already
    # covers it; listing it under py_modules as well would make the build
    # look for a non-existent gain.py.
    packages=find_packages(),
    include_package_data=True,
    zip_safe=False,
)

0 comments on commit f69f814

Please sign in to comment.