Skip to content

Commit

Permalink
Merge pull request #286 from codefordc/combined-apiconn-prs
Browse files Browse the repository at this point in the history
Combined ApiConn pull requests
  • Loading branch information
NealHumphrey committed May 23, 2017
2 parents ba42294 + 4bd1b55 commit c1426d1
Show file tree
Hide file tree
Showing 10 changed files with 664 additions and 199 deletions.
18 changes: 18 additions & 0 deletions python/README.md
@@ -0,0 +1,18 @@


Configuring the Amazon Web Services Command Line Interface (awscli)
-------------------------------------------------------------------

Looking for instructions on downloading data from S3? These have moved to [our website](http://housinginsights.org/resources/aws-sync.html/).


Getting data from APIs
======================

Using get_api_data.py
-------------

get_api_data.py is a command line script to pull data from external data sources.
It uses modules in the housinginsights/sources directory to pull information from external data sources.
See get_api_data.py for detailed instructions on how to run and parameters.

53 changes: 0 additions & 53 deletions python/README.rst

This file was deleted.

49 changes: 25 additions & 24 deletions python/housinginsights/sources/DCHousing.py
Expand Up @@ -4,7 +4,7 @@
"""

from pprint import pprint

import logging

from housinginsights.sources.base import BaseApiConn
from housinginsights.sources.models.DCHousing import FIELDS,\
Expand All @@ -26,30 +26,31 @@ class DCHousingApiConn(BaseApiConn):
def __init__(self):
    """Initialize the connection against the DC Housing base URL."""
    super().__init__(DCHousingApiConn.BASEURL)

    # Ids of the data sets this class can download; get_data() validates
    # requested ids against this list and uses it as the default.
    self._available_unique_data_ids = ['dchousing']

def get_data(self, unique_data_ids=None, sample=False, output_type='csv', **kwargs):
    """
    Download the requested data set(s) and either print them or save them as csv.

    :param unique_data_ids: ids of the data sets to download. If None, every
                            id in self._available_unique_data_ids is used.
    :type unique_data_ids: list of String.
    :param sample: accepted for a consistent get_data() signature; not used
                   by this method.
    :type sample: Boolean.
    :param output_type: 'stdout' pretty-prints the JSON response; 'csv'
                        writes the records to self.output_paths[<id>].
                        Any other value downloads the data and discards it.
    :type output_type: String.
    :returns: None
    :raises Exception: if the API responds with a status other than 200.
    """
    if unique_data_ids is None:
        unique_data_ids = self._available_unique_data_ids

    for u in unique_data_ids:
        if u not in self._available_unique_data_ids:
            #TODO this will always be raised when passing a list to get_multiple_api_sources method for those not in this class.
            logging.info(" The unique_data_id '{}' is not supported by the DCHousingApiConn".format(u))
            continue

        result = self.get(DCHousingApiConn.QUERY)
        if result.status_code != 200:
            err = "An error occurred during request: status {0}"
            raise Exception(err.format(result.status_code))

        if output_type == 'stdout':
            pprint(result.json())
        elif output_type == 'csv':
            # Each feature carries its record under the 'attributes' key.
            data = result.json()['features']
            results = [DCHousingResult(address['attributes'])
                       for address in data]
            self.result_to_csv(FIELDS, results, self.output_paths[u])
24 changes: 2 additions & 22 deletions python/housinginsights/sources/README.md
Expand Up @@ -2,28 +2,8 @@ SOURCES DIRECTORY
-----------------

This directory is to be used to store modules to access various external APIs.
To be compatible with get_api_data.py, use the following conventions.

HOW DATA.SH WORKS
-----------------

get_api_data.py expects the user to supply these
things when it is run:
1. Output Type [--outtype]
2. Output File (optional) [--output, -o]
3. Api Module name
4. Api Method name

It then tries to import housinginsights.sources + whatever module you specified.
For example if the api module name supplied by the user is "mar", then it tries to import
"housinginsights.sources.mar". It then looks for a class with the module name + "ApiConn"
as a suffix. In this case it would be "MarApiConn". It then calls whatever method the user specified
from that ApiConn class. Whatever parameters specified by the user with the --params argument
are split and passed as keyword arguments (**kwargs) to the function. The --outtype argument is added
as output_type, and -o or --output is added as output_file. Thus, each public function compatible with
get_api_data.py needs to have as a minimum those two parameters (output_type and output_file). See the mar.py
file for an example.

To be compatible with get_api_data.py, use the following conventions. See get_api_data.py for detailed
instructions on how to run and parameters.

STRUCTURE:
---------
Expand Down
100 changes: 88 additions & 12 deletions python/housinginsights/sources/base.py
Expand Up @@ -3,18 +3,32 @@
"""

from urllib.parse import urljoin
from datetime import datetime

from housinginsights.config.base import HousingInsightsConfig

import requests
import csv
import os


class BaseApiConn(object):
"""
Base API Connection to inherit from. Proxy support is built in,
because if you do enough scraping, I promise you you're gonna get
your IP blocked at some point.
Base API Connection to inherit from. Proxy support built in.
Every XXXApiConn class that inherits from this class should have a few key features:
If the class downloads whole data files for ingestion into our database:
- get_data() method should be a one-call method that downloads all the files that class
is capable of downloading. It should have 0 mandatory arguments, and at a minimum
the following optional arguments:
- unique_data_ids
- sample (Boolean). If possible, return just a few rows of data instead of the whole thing
- output_type ('csv' or 'stdout'). 'csv' should write the file to disk, 'stdout' prints to console
- __init__ should have _available_unique_data_ids, a list of ids that the class can output.
"""
def __init__(self, baseurl, proxies=None):
"""
Expand All @@ -30,25 +44,55 @@ def __init__(self, baseurl, proxies=None):
self.baseurl = baseurl
self.proxies = proxies


#A list of strings; this should be defined in the child class
self._available_unique_data_ids = None

@property
def output_paths(self):
if self._available_unique_data_ids is None:
raise NotImplementedError("You need to add self._available_unique_data_ids to your class before using it")

paths = {}
for u in self._available_unique_data_ids:
base = os.path.abspath(os.path.join(os.path.dirname(__file__),
os.pardir,os.pardir,os.pardir))
api_location = 'data/raw/apis'
filename = u + ".csv"
d = datetime.now().strftime('%Y%m%d')
path = os.path.join(base,api_location,d,filename)
paths[u] = path

return paths


def get(self, urlpath, params=None, **kwargs):
"""
Thin wrapper around requests.get() that adds in proxy value
and relative url joining.
:param urlpath: URL path to be joined to the baseurl.
Example: if baseurl is https://www.somesite.com/api,
and urlpath is /v2, the end result
is https://www.somesite.com/api/v2.
If baseurl == None, the urlpath is used directly.
:type urlpath: String.
:param params: Dictionary of request parameters.
:type params: dict of String.
"""
if self.baseurl[-1] == '/':
self.baseurl = self.baseurl[:-1]
if urlpath[0] != '/':
urlpath = '/' + urlpath
url = self.baseurl + urlpath
if self.baseurl != None:
if self.baseurl[-1] == '/':
self.baseurl = self.baseurl[:-1]
if urlpath[0] != '/':
urlpath = '/' + urlpath
url = self.baseurl + urlpath
else:
url = urlpath

return self.session.get(url, params=params, proxies=self.proxies, **kwargs)

def post(self, urlpath, data=None, **kwargs):
Expand All @@ -68,7 +112,20 @@ def post(self, urlpath, data=None, **kwargs):
url = urljoin(self.baseurl, urlpath)
return self.session.post(url, data=data, proxies=self.proxies, **kwargs)

def result_to_csv(self, fields, results, csvfile):
def create_directory_if_missing(self, filepath):
    """
    Ensure the directory for the given filepath exists, creating it
    (and any missing parent directories) if it does not.

    :param filepath: file path for where to write and save a file
    :type filepath: string
    :return: None
    """
    directory = os.path.dirname(filepath)
    # A bare filename has no directory component; os.makedirs('') would
    # raise FileNotFoundError, and the current directory already exists.
    if directory:
        os.makedirs(directory, exist_ok=True)


def result_to_csv(self, fields, results, filepath):
"""
Write the data to a csv file.
Expand All @@ -78,18 +135,37 @@ def result_to_csv(self, fields, results, csvfile):
:param results: field values for each row
:type results: list
:param csvfile: file path for where to write and save csv file
:type csvfile: string
:param filepath: file path for where to write and save csv file
:type filepath: string
:return: None
"""
with open(csvfile, 'w', encoding='utf-8') as f:
self.create_directory_if_missing(filepath)
with open(filepath, 'w', encoding='utf-8') as f:
writer = csv.writer(f, delimiter=',')
writer.writerow(fields)
for result in results:
writer.writerow(result.data)


def directly_to_file(self, data, filepath):
"""
Write the data to a file
:param data: raw data to write to file
:type data: string
:param filepath: file path for where to write and save csv file
:type filepath: string
:return: None
"""
self.create_directory_if_missing(filepath)
with open(filepath, 'w', encoding='utf-8') as f:
f.write(data)



class BaseApiManager(object):

classname = "base"
Expand Down
61 changes: 61 additions & 0 deletions python/housinginsights/sources/census.py
@@ -0,0 +1,61 @@
'''
INCOMPLETE APPROACH
See this issue: https://github.com/codefordc/housing-insights/issues/152
for some comments on latest status and next steps.
The core method will need to be renamed to `get_data` and should have the same
method signature as the other ApiConn classes. The current opendata.py file is a good model to
look at for a consistent approach.
'''









import os
import sys

sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),
os.pardir,os.pardir)))

from housinginsights.sources.base import BaseApiConn

# secrets.json, yo
census_key = os.environ.get("CENSUS_KEY")


class CensusApiConn(BaseApiConn):
    """
    Connector for the Census API; currently limited to the 2015 ACS5 data set.
    """
    BASEURL = 'http://api.census.gov/data'

    def __init__(self, arg):
        """
        Set up the connection against BASEURL and keep ``arg`` on the instance.

        :param arg: stored as self.arg; not otherwise used by this class.
        """
        super(CensusApiConn, self).__init__(CensusApiConn.BASEURL)
        self.arg = arg

    def getacs5(self):
        """
        Request four ACS5 variables for every tract in state 11 and print
        the raw response text.

        :return: None
        :raises Exception: if the API responds with a status other than 200.
        """
        query = {
            'key': census_key,
            'get': 'B01003_001E,B25057_001E,B25058_001E,B25059_001E',
            'for': 'tract:*',
            'in': 'state:11',
        }
        response = self.get('/2015/acs5', params=query)
        if response.status_code != 200:
            message = "An error occurred during request: status {0}"
            raise Exception(message.format(response.status_code))
        print(response.text)


# NOTE(review): this statement is at module level, so it runs whenever the
# module is imported as well as when it is executed as a script, and
# getacs5() issues a request via self.get(). Consider guarding it with
# `if __name__ == "__main__":` once this module is wired into get_api_data.py.
CensusApiConn('fakearg').getacs5()

0 comments on commit c1426d1

Please sign in to comment.