Skip to content

Commit

Permalink
Merge pull request #13 from fhightower/dev
Browse files Browse the repository at this point in the history
Separating list updating functionality from the domain check function
  • Loading branch information
fhightower committed Apr 25, 2017
2 parents 0177f60 + ed0782d commit 7700d27
Showing 1 changed file with 21 additions and 18 deletions.
39 changes: 21 additions & 18 deletions onemillion/onemillion.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,13 +64,7 @@ def __init__(self, cache=True, cache_location=DEFAULT_CACHE_LOCATION,
open(os.path.join(self.cache_location, 'metadata.json'), 'a').close()
self.first_time = True

# if instructions given to onemillion are contrary, raise error message
if self.update and not self.cache:
raise ValueError("It is not possible to update the top one " +
"million domain lists without caching them. " +
"This script will use the most updated version " +
"of the domain lists by default if cache is " +
"set to True.")
self.update_lists()

def _get_metadata(self):
"""Read the metadata from the metadata file."""
Expand Down Expand Up @@ -120,8 +114,8 @@ def _update_etag(self, domain_list_name, etag):
with open(os.path.join(self.cache_location, 'metadata.json'), 'w') as f:
f.write(json.dumps(self.metadata))

def _update_lists(self):
"""Update the top one million lists."""
def _check_for_updates(self):
"""Check to see if lists need updated and update if needed."""
# get the metadata
self.metadata = self._get_metadata()

Expand Down Expand Up @@ -152,15 +146,6 @@ def _update_lists(self):
def domain_in_million(self, domain):
"""Check if the given domain is in a top on million list."""
# TODO: parse the registered domain out of the given domain using tldextract
# if we are caching and updating...
if self.cache and self.update:
# cache/update the lists
self._update_lists()
# if we are caching but not updating and this is the first pass...
elif self.cache and not self.update and self.first_time:
# cache the contents of the lists as this is the first pass
self._update_lists()

# see if the given domain is in the up-to-date domain lists
for domain_list in CONFIG['domain_lists']:
# open the domain list as a CSV
Expand All @@ -173,3 +158,21 @@ def domain_in_million(self, domain):

# if the domain was not found in the list, return false
return False

def update_lists(self):
    """Check the domain lists for updates when the settings call for it.

    With caching enabled, the lists are checked for updates if updating is
    enabled or if this is the first pass. With caching disabled, nothing is
    done unless updating was also requested, which is a contradictory
    configuration.

    Raises:
        ValueError: if updating is requested while caching is disabled.
    """
    if not self.cache:
        # updating without caching is contradictory; otherwise there is
        # simply nothing to do when caching is turned off
        if self.update:
            raise ValueError(
                "It is not possible to update the top one "
                "million domain lists without caching them. "
                "This script will use the most updated version "
                "of the domain lists by default if cache is "
                "set to True."
            )
        return

    # caching is on: refresh when updating is enabled, or populate the
    # cache when this is the first pass
    if self.update or self.first_time:
        self._check_for_updates()

0 comments on commit 7700d27

Please sign in to comment.