23 improve manager #34

Merged · 9 commits · Apr 30, 2021
13 changes: 12 additions & 1 deletion cli/cli.py
@@ -1,11 +1,22 @@
import os
from cli import entry
from scraper import Settings
from scraper.components import fetchers, parsers, manager, writers
from scraper.settings.constants import SOURCES, OUTPUT_TYPE

# Settings object for the whole app

def start():

# register default components for the app

manager.register_handler(SOURCES.FINRA_SHORTS, fetchers.Finra, parsers.Finra)
manager.register_handler(SOURCES.SEC_FTD, fetchers.SecFtd, parsers.SecFtd)
manager.register_writer(OUTPUT_TYPE.SINGLE_FILE, writers.SingleFile)
manager.register_writer(OUTPUT_TYPE.SINGLE_TICKER, writers.MultiFile)

settings = Settings()

os.system('clear')

if settings.init():
3 changes: 1 addition & 2 deletions codecov.yml
@@ -6,5 +6,4 @@ coverage:
precision: 0
range: 80...100

comment: false
7 changes: 0 additions & 7 deletions scraper/__init__.py
@@ -5,13 +5,6 @@
from scraper import components
from scraper import utils


from scraper.components import fetchers, parsers, manager
from scraper.settings.constants import SOURCES

manager.register(SOURCES.FINRA_SHORTS, fetchers.Finra, parsers.Finra)
manager.register(SOURCES.SEC_FTD, fetchers.SecFtd, parsers.SecFtd)

# Root dir for scraper
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))

17 changes: 17 additions & 0 deletions scraper/components/component_base.py
@@ -0,0 +1,17 @@
import abc

class ComponentBase(abc.ABC):
@staticmethod
@abc.abstractmethod
def is_for(): # pragma: no cover
"""Should return a string/identifier for what this component is meant to.
In the case of a Writer for example this should be the output type it can
handle. (i.e. constants.OUTPUT_TYPE)
For Components that handle data (fetching, parsing...) the source
that links them together. (i.e. constants.SOURCES)

The identifier should be a unique way to identiy the istance of that type
of componant against others of the same type, but can be shared against
different types of components."""
return NotImplemented
# raise NotImplementedError
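
To make the contract concrete, a minimal sketch of a hypothetical subclass (the class name and identifier are illustrative, not part of this PR):

from scraper.components.component_base import ComponentBase

class ExampleComponent(ComponentBase):
    # Hypothetical component: is_for() returns the identifier the manager
    # uses to match this class against what was registered for it.
    @staticmethod
    def is_for():
        return "example_source"  # e.g. a constants.SOURCES value

assert ExampleComponent.is_for() == "example_source"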
6 changes: 4 additions & 2 deletions scraper/components/fetchers/base_fetcher.py
@@ -4,10 +4,12 @@
from contextlib import closing
import requests
import click

from scraper.components.component_base import ComponentBase
from scraper import utils


class Fetcher(ComponentBase):
def __init__(self, settings, debug=False):
self.settings = settings
self.tickers = settings.tickers or []
@@ -44,7 +46,7 @@ def done(self):

@abc.abstractmethod
def make_url(self, *args, **kwargs): # pragma: no cover
return NotImplemented

# if the provided url is in the processed list return None, otherwise
# add it and return it.
5 changes: 5 additions & 0 deletions scraper/components/fetchers/finra_fetcher.py
@@ -1,3 +1,4 @@
from scraper.settings.constants import SOURCES
from scraper.components.fetchers.base_fetcher import Fetcher
# import scraper.utils as utils

@@ -6,6 +7,10 @@ class FinraFetcher(Fetcher):
URL_BASE = "http://regsho.finra.org/CNMSshvol"
URL_END = ".txt"

@staticmethod
def is_for():
return SOURCES.FINRA_SHORTS

def make_url(self, date, *args, **kwargs):
""" Get the url for the specified date for the given source"""
date = date.strftime("%Y%m%d")
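For illustration, a hedged sketch of the URL this produces, assuming make_url concatenates URL_BASE, the formatted date, and URL_END (the full method body is collapsed above):

from datetime import date

# Assumption: the FINRA short-volume URL is URL_BASE + YYYYMMDD + URL_END.
day = date(2021, 4, 30).strftime("%Y%m%d")
url = "http://regsho.finra.org/CNMSshvol" + day + ".txt"
print(url)  # http://regsho.finra.org/CNMSshvol20210430.txt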
6 changes: 6 additions & 0 deletions scraper/components/fetchers/nasdaq_fetcher.py
@@ -1,3 +1,4 @@
from scraper.settings.constants import SOURCES
from scraper.components.fetchers.base_fetcher import Fetcher
# import scraper.utils as utils

@@ -9,6 +10,11 @@ def __init__(self, settings, debug=False):
super().__init__(settings, debug)
self.loop_tickers_not_dates = True

@staticmethod
def is_for():
# TODO: Actually write its component and constants
return "NASDAQ"

def make_url(self, ticker, *args, **kwargs):
""" Get the url for the specified date for the given source """

5 changes: 5 additions & 0 deletions scraper/components/fetchers/secftd_fetcher.py
@@ -1,3 +1,4 @@
from scraper.settings.constants import SOURCES
from scraper.components.fetchers.base_fetcher import Fetcher
# from dateutil.relativedelta import relativedelta

@@ -15,6 +16,10 @@ class SecFtdFetcher(Fetcher):
URL_VARIANTS = ["a", "b"]
URL_END = ".zip"

@staticmethod
def is_for():
return SOURCES.SEC_FTD

def make_url(self, date, *args, **kwargs):
date = date.strftime("%Y%m")

132 changes: 102 additions & 30 deletions scraper/components/manager.py
@@ -1,36 +1,112 @@

from scraper.settings.constants import SOURCES, OUTPUT_TYPE
from scraper.components.fetchers.base_fetcher import Fetcher as FetcherBase
from scraper.components.parsers.base_parser import Parser as ParserBase
from scraper.components.writers.base_writer import Writer as WriterBase

"""
Module to be used as singleton to store components coupled with a source.
"""

registered_handlers = []
available_sources = []

registered_writers = []
available_outputs = []

def register_handler(source, fetcher_cls, parser_cls):
if source in available_sources:
return None

available_sources.append(source)

handler = ProcessHandler(source, fetcher_cls, parser_cls)
registered_handlers.append(handler)
return handler

def register_writer(output_type, writer_cls):
if output_type in available_outputs: # pragma: no cover
return None

available_outputs.append(output_type)
handler = WriterHandler(output_type, writer_cls)
registered_writers.append(handler)
return handler

def get_handlers(for_source=None):
# Avoid going further if the requested source was never registered
if for_source not in available_sources:
raise Exception("Handlers for '{}' were not registered. Please complain.".format(for_source))

handler = next((h for h in registered_handlers if h == for_source), None)
if not handler: # pragma: no cover
raise Exception("Handlers for '{}' were not registered. Please complain.".format(for_source))

return (handler.fetcher, handler.parser)

def get_writer(out_type):
if out_type not in available_outputs:
raise Exception("Writer for '{}' was not registered. Please complain.".format(out_type))

handler = next((h for h in registered_writers if h == out_type), None)
if not handler: # pragma: no cover
raise Exception("Writer for '{}' was not registered. Please complain.".format(out_type))

return handler.writer

def get_sources(): # pragma: no cover
return available_sources

def get_outputs(): # pragma: no cover
return available_outputs

def reset():
registered_handlers.clear()
available_sources.clear()
registered_writers.clear()
available_outputs.clear()

class _HandlerBase:
@staticmethod
def validate_register(register, group, prefix="Handler Registrar name"):
if not group: return True # pragma: no cover
if register not in group:
raise TypeError(f'{prefix} should be one of {group}')
return True
@staticmethod
def validate_component_class(cls, parent_cls, cls_ref="class"):
if not issubclass(cls, parent_cls):
raise TypeError("fetcher_cls should be a subclass of Writer")
return True
@staticmethod
def validate_component_target(target, component_cls, cls_ref="Component"):
if not target == component_cls.is_for():
raise TypeError(f"Provided {cls_ref} is not a match for {target}")

class WriterHandler(_HandlerBase):
def __init__(self, type_, writer_cls):
WriterHandler.validate_register(type_, OUTPUT_TYPE.VALID, "Output Type")
WriterHandler.validate_component_class(writer_cls, WriterBase, "Writers")
WriterHandler.validate_component_target(type_, writer_cls, "Writers")

self.output_type = type_
self.writer = writer_cls

def __eq__(self, output_type):
return self.output_type == output_type
def __del__(self):
if self.writer: del self.writer
del self


class ProcessHandler(_HandlerBase):
def __init__(self, source, fetcher_cls, parser_cls):
ProcessHandler.validate_register(source, SOURCES.VALID, "Source")

ProcessHandler.validate_component_class(fetcher_cls, FetcherBase, "Fetchers")
ProcessHandler.validate_component_target(source, fetcher_cls, "Fetchers")

ProcessHandler.validate_component_class(parser_cls, ParserBase, "Parsers")
ProcessHandler.validate_component_target(source, parser_cls, "Parsers")

# TODO: Add some way to match fetchers and parsers with the source.
# realistically a method/property to get the source that the class is for
Expand All @@ -41,13 +117,9 @@ def __init__(self, source, fetcher_cls, parser_cls):

def __eq__(self, source_name):
return self.source == source_name
def __str__(self): # pragma: no cover
return "'Handler for {}'".format(self.source)
def __repr__(self): # pragma: no cover
return "'<Handler for '{}' fetcher: {}, parser {} @ {}>".format(
self.source,
self.fetcher.__name__,
self.parser.__name__,
hex(id(self))
)

def __del__(self):
if self.fetcher: del self.fetcher
if self.parser: del self.parser
del self

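Taken together, a hedged sketch of how the reworked manager API is meant to be used, mirroring the registrations in cli/cli.py above (names taken from that file):

from scraper.components import fetchers, parsers, manager, writers
from scraper.settings.constants import SOURCES, OUTPUT_TYPE

# Register components once at startup, as cli.start() does...
manager.register_handler(SOURCES.FINRA_SHORTS, fetchers.Finra, parsers.Finra)
manager.register_writer(OUTPUT_TYPE.SINGLE_FILE, writers.SingleFile)

# ...then resolve them by source/output type wherever they are needed.
fetcher_cls, parser_cls = manager.get_handlers(SOURCES.FINRA_SHORTS)
writer_cls = manager.get_writer(OUTPUT_TYPE.SINGLE_FILE)
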
12 changes: 7 additions & 5 deletions scraper/components/parsers/base_parser.py
@@ -1,4 +1,5 @@
import abc
from scraper.components.component_base import ComponentBase
from datetime import datetime as dt

# To be compatible with writers, the cache structure should be a dictionary shaped as
@@ -9,9 +10,10 @@
# And, if available, to parse the date in an interpretable format from other
# applications (for example Excel) using the provided method

class Parser(ComponentBase):
def __init__(self, settings, debug=False):
# def __init__(self, parse_rows, debug=False):
ComponentBase.__init__(self)

self._cache = {}
self._header = []
self.settings = settings
@@ -36,19 +38,19 @@ def process_response_to_csv(self, response): # pragma: no cover
Takes the response object from a performed requests call and should
return a csv.reader object
"""
return NotImplemented

@abc.abstractmethod
def extract_ticker_from_row(self, row_data): # pragma: no cover
"""
Get the ticker from the right column of the row.
Mainly used to filter out rows
"""
return NotImplemented

@abc.abstractmethod
def parse_row(self, row): # pragma: no cover
return NotImplemented

def parse(self, response, separator='|'):
reader = self.process_response_to_csv(response)
7 changes: 5 additions & 2 deletions scraper/components/parsers/finra_parser.py
@@ -1,9 +1,12 @@
import codecs, csv
from scraper.settings.constants import SOURCES
from scraper.components.parsers.base_parser import Parser

class FinraParser(Parser):
def __init__(self, settings, debug=False):
super().__init__(settings, debug)

@staticmethod
def is_for():
return SOURCES.FINRA_SHORTS

def process_response_to_csv(self, response):
return csv.reader(codecs.iterdecode(response.iter_lines(), 'utf-8', errors="replace"))
5 changes: 5 additions & 0 deletions scraper/components/parsers/secftd_parser.py
@@ -2,12 +2,17 @@
import codecs, csv
from zipfile import ZipFile

from scraper.settings.constants import SOURCES
from scraper.components.parsers.base_parser import Parser

class SecFtdParser(Parser):
def __init__(self, settings, debug=False):
super().__init__(settings, debug)

@staticmethod
def is_for():
return SOURCES.SEC_FTD

def process_response_to_csv(self, response):
zf = ZipFile(BytesIO(response.content), 'r')
# Zip should contain only one file
5 changes: 3 additions & 2 deletions scraper/components/writers/base_writer.py
@@ -3,17 +3,18 @@
from pathlib import Path

from scraper import utils
from scraper.components.component_base import ComponentBase
from scraper.components.writers.filename import FilenameGenerator

class Writer(ComponentBase):
def __init__(self, settings, debug=False):
self.settings = settings
self.base_path = settings.output_path
self.fname_gen = FilenameGenerator(settings)

@abc.abstractmethod
def write(self, header, data, source): # pragma: no cover
return NotImplemented

def write_to_file(self, path, filename, data):
# ensure path exists and create it if missing
5 changes: 5 additions & 0 deletions scraper/components/writers/single_file_writer.py
@@ -1,3 +1,4 @@
from scraper.settings.constants import OUTPUT_TYPE
from scraper.components.writers.base_writer import Writer


@@ -6,6 +7,10 @@ class SingleFileWriter(Writer):
data into a single file. Data rows SHOULD then contain a reference to a
unique identifier for that symbol, otherwise the data would be unusable.
"""
@staticmethod
def is_for():
return OUTPUT_TYPE.SINGLE_FILE

def write(self, header, data, source):
# NOTE: This can happen when the fetcher could not find the data, the
# parser had issues parsing existing data or mixed conditions.
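To illustrate why that identifier matters, a hypothetical shape for rows written into the single shared file (column names are assumptions, not taken from this PR):

# Without the ticker column, rows from different symbols written into the
# same file would be indistinguishable.
header = ["date", "ticker", "short_volume"]
rows = [
    ["2021-04-30", "AAPL", "1234567"],
    ["2021-04-30", "GME", "7654321"],
]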