Merge pull request #286 from codefordc/combined-apiconn-prs

Combined ApiConn pull requests
focusconsulting · May 23, 2017 · c1426d1 · c1426d1
2 parents ba42294 + 4bd1b55
commit c1426d1
Show file tree

Hide file tree

Showing 10 changed files with 664 additions and 199 deletions.
diff --git a/python/README.md b/python/README.md
@@ -0,0 +1,18 @@
+
+
+Configuring the Amazon Web Services Command Line Interface (awscli)
+-------------------------------------------------------------------
+
+Looking for instructions on downloading data from S3? These have moved to [our website](http://housinginsights.org/resources/aws-sync.html/).
+
+
+Getting data from APIs
+======================
+
+Using get_api_data.py
+-------------
+
+get_api_data.py is a command line script to pull data from external data sources. 
+It uses modules in the housinginsights/sources directory to pull information from external data sources.
+See get_api_data.py for detailed instructions on how to run it and which parameters it accepts.
+
diff --git a/python/README.rst b/python/README.rst
diff --git a/python/housinginsights/sources/DCHousing.py b/python/housinginsights/sources/DCHousing.py
@@ -4,7 +4,7 @@
 """
 
 from pprint import pprint
-
+import logging
 
 from housinginsights.sources.base import BaseApiConn
 from housinginsights.sources.models.DCHousing import FIELDS,\
@@ -26,30 +26,31 @@ class DCHousingApiConn(BaseApiConn):
     def __init__(self):
         super().__init__(DCHousingApiConn.BASEURL)
 
-    def get_json(self, output_type=None, output_file=None):
+        self._available_unique_data_ids = ['dchousing']
+
+    def get_data(self, unique_data_ids=None, sample=False, output_type = 'csv', **kwargs):
         """
         Returns JSON object of the entire data set.
 
-        :param output_type: Output type specified by user.
-        :type  output_type: String.
-
-        :param output_file: Output file specified by user.
-        :type  output_file: String
-
-        :returns: Json output from the api.
-        :rtype: String
         """
-        result = self.get(DCHousingApiConn.QUERY)
-        if result.status_code != 200:
-            err = "An error occurred during request: status {0}"
-            raise Exception(err.format(result.status_code))
-
-        if output_type == 'stdout':
-            pprint(result.json())
-        elif output_type == 'csv':
-            data = result.json()['features']
-            results = [DCHousingResult(address['attributes']) for address in
-                       data]
-            self.result_to_csv(FIELDS, results, output_file)
-
-        return result.json()
+        if unique_data_ids == None:
+            unique_data_ids = self._available_unique_data_ids
+
+        for u in unique_data_ids:
+            if (u not in self._available_unique_data_ids):
+                #TODO this will always be raised when passing a list to get_multiple_api_sources method for those not in this class. 
+                logging.info("  The unique_data_id '{}' is not supported by the DCHousingApiConn".format(u))
+
+            else:
+                result = self.get(DCHousingApiConn.QUERY)
+                if result.status_code != 200:
+                    err = "An error occurred during request: status {0}"
+                    raise Exception(err.format(result.status_code))
+
+                if output_type == 'stdout':
+                    pprint(result.json())
+                elif output_type == 'csv':
+                    data = result.json()['features']
+                    results = [DCHousingResult(address['attributes']) for address in
+                               data]
+                    self.result_to_csv(FIELDS, results, self.output_paths[u])
diff --git a/python/housinginsights/sources/README.md b/python/housinginsights/sources/README.md
@@ -2,28 +2,8 @@ SOURCES DIRECTORY
 -----------------
 
 This directory is to be used to store modules to access various external APIs.
-To be compatible with get_api_data.py, use the following conventions.
-
-HOW DATA.SH WORKS
------------------
-
-get_api_data.py expects the user to supplies these
-things when it is run:
-1. Output Type [--outtype]  
-2. Output File (optional) [--output, -o]
-3. Api Module name
-4. Api Method name
-
-It then tries to import housinginsights.sources + whatever module you specified. 
-For example if the api module name supplied by the user is "mar", then it tries to import
-"housinginsights.sources.mar". It then looks for a class with the module name + "ApiConn"
-as a suffix. In this case it would be "MarApiConn". It then calls whatever method the user specied
-from that ApiConn class. Whatever parameters specified by the user with the --params argument
-are split and passed as keyword arguments (**kwargs) to the function. The --outtype argument is added
-as output_type, and --o or --output is added as output_file. Thus, each public function compatible with
-get_api_data.py needs to have as a minimum those two parameters (output_type and output_file). See the mar.py
-file for an example.
-
+To be compatible with get_api_data.py, use the following conventions. See get_api_data.py for detailed
+instructions on how to run it and which parameters it accepts.
 
 STRUCTURE:
 ---------

diff --git a/python/housinginsights/sources/base.py b/python/housinginsights/sources/base.py
@@ -3,18 +3,32 @@
 """
 
 from urllib.parse import urljoin
+from datetime import datetime
 
 from housinginsights.config.base import HousingInsightsConfig
 
 import requests
 import csv
+import os
 
 
 class BaseApiConn(object):
     """
-    Base API Connection to inherit from. Proxy support is built in,
-    because if you do enough scraping, I promise you you're gonna get
-    your IP blocked at some point.
+    Base API Connection to inherit from. Proxy support built in.
+
+    Every XXXApiConn class that inherits from this class should have a few key features:
+
+    If the class downloads whole data files for ingestion into our database:
+    - get_data() method should be a one-call method that downloads all the files that class
+      is capable of downloading. It should have 0 mandatory arguments, and at a minimum 
+      the following optional arguments:
+        - unique_data_ids
+        - sample (Boolean). If possible, return just a few rows of data instead of the whole thing
+        - output_type ('csv' or 'stdout'). 'csv' should write the file to disk, 'stdout' prints to console
+    - __init__ should set self._available_unique_data_ids, a list of ids that the class can output.
+
+
+
     """
     def __init__(self, baseurl, proxies=None):
         """
@@ -30,25 +44,55 @@ def __init__(self, baseurl, proxies=None):
         self.baseurl = baseurl
         self.proxies = proxies
 
+
+        #A list of strings; this should be defined in the child class 
+        self._available_unique_data_ids = None
+
+    @property
+    def output_paths(self):
+        if self._available_unique_data_ids is None:
+            raise NotImplementedError("You need to add self._available_unique_data_ids to your class before using it")
+
+        paths = {}
+        for u in self._available_unique_data_ids:
+            base = os.path.abspath(os.path.join(os.path.dirname(__file__),
+                                               os.pardir,os.pardir,os.pardir))
+            api_location = 'data/raw/apis'
+            filename = u + ".csv"
+            d = datetime.now().strftime('%Y%m%d')
+            path = os.path.join(base,api_location,d,filename)
+            paths[u] = path
+
+        return paths
+
+
     def get(self, urlpath, params=None, **kwargs):
         """
         Thin wrapper around requests.get() that adds in proxy value
         and relative url joining.
 
         :param urlpath: URL path to be joined to the baseurl.
+
                         Example: if baseurl is https://www.somesite.com/api,
                                  and urlpath is /v2, the end result
                                  is https://www.somesite.com/api/v2.
+
+                        If baseurl == None, the urlpath is used directly. 
+
         :type  urlpath: String.
 
         :param params: Dictionary of request parameters.
         :type  params: dict of String.
         """
-        if self.baseurl[-1] == '/':
-            self.baseurl = self.baseurl[:-1]
-        if urlpath[0] != '/':
-            urlpath = '/' + urlpath
-        url = self.baseurl + urlpath
+        if self.baseurl != None:
+            if self.baseurl[-1] == '/':
+                self.baseurl = self.baseurl[:-1]
+            if urlpath[0] != '/':
+                urlpath = '/' + urlpath
+            url = self.baseurl + urlpath
+        else:
+            url = urlpath
+
         return self.session.get(url, params=params, proxies=self.proxies, **kwargs)
 
     def post(self, urlpath, data=None, **kwargs):
@@ -68,7 +112,20 @@ def post(self, urlpath, data=None, **kwargs):
         url = urljoin(self.baseurl, urlpath)
         return self.session.post(url, data=data, proxies=self.proxies, **kwargs)
 
-    def result_to_csv(self, fields, results, csvfile):
+    def create_directory_if_missing(self, filepath):
+        """
+        Ensure the directory for the given filepath exists, creating it if it is missing.
+
+        :param filepath: file path for where to write and save csv file
+        :type filepath: string
+
+        :return: None
+        """
+        directory=os.path.dirname(filepath)
+        os.makedirs(directory, exist_ok=True)
+
+
+    def result_to_csv(self, fields, results, filepath):
         """
         Write the data to a csv file.
 
@@ -78,18 +135,37 @@ def result_to_csv(self, fields, results, csvfile):
         :param results: field values for each row
         :type results: list
 
-        :param csvfile: file path for where to write and save csv file
-        :type csvfile: string
+        :param filepath: file path for where to write and save csv file
+        :type filepath: string
 
         :return: None
         """
-        with open(csvfile, 'w', encoding='utf-8') as f:
+        self.create_directory_if_missing(filepath)
+        with open(filepath, 'w', encoding='utf-8') as f:
             writer = csv.writer(f, delimiter=',')
             writer.writerow(fields)
             for result in results:
                 writer.writerow(result.data)
 
 
+    def directly_to_file(self, data, filepath):
+        """
+        Write the data to a file
+
+        :param data: raw data to write to file
+        :type data: string
+
+        :param filepath: file path for where to write and save the file
+        :type filepath: string
+
+        :return: None
+        """
+        self.create_directory_if_missing(filepath)
+        with open(filepath, 'w', encoding='utf-8') as f:
+            f.write(data)
+
+
+
 class BaseApiManager(object):
 
     classname = "base"

diff --git a/python/housinginsights/sources/census.py b/python/housinginsights/sources/census.py
@@ -0,0 +1,61 @@
+'''
+
+
+INCOMPLETE APPROACH
+
+
+
+See this issue: https://github.com/codefordc/housing-insights/issues/152
+for some comments on latest status and next steps. 
+
+
+The core method will need to be renamed to `get_data` and should have the same
+method signature as the others. The current opendata.py file is a good model to
+look at for a consistent approach.
+
+
+'''
+
+
+
+
+
+
+
+
+
+import os
+import sys
+
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),
+                                             os.pardir,os.pardir)))
+
+from housinginsights.sources.base import BaseApiConn
+
+# secrets.json, yo
+census_key = os.environ.get("CENSUS_KEY")
+
+
+class CensusApiConn(BaseApiConn):
+    """
+    Census API connector, confined to ACS5 2015 for now.
+
+    """
+    BASEURL = 'http://api.census.gov/data'
+
+    def __init__(self, arg):
+        super(CensusApiConn, self).__init__(CensusApiConn.BASEURL)
+        self.arg = arg
+
+    def getacs5(self):
+        params = {'key': census_key, 'get': 'B01003_001E,B25057_001E,B25058_001E,B25059_001E', 'for': 'tract:*', 'in': 'state:11'}
+        result = self.get('/2015/acs5', params=params)
+        if result.status_code != 200:
+            err = "An error occurred during request: status {0}"
+            raise Exception(err.format(result.status_code))
+        else:
+            data = result
+            print(data.text)
+
+
+CensusApiConn('fakearg').getacs5()