diff --git a/datacommons/BUILD.bazel b/datacommons/BUILD.bazel index 0f0a11f9..476db1be 100644 --- a/datacommons/BUILD.bazel +++ b/datacommons/BUILD.bazel @@ -1,10 +1,6 @@ package(default_visibility = ["//visibility:public"]) -load("@requirements//:requirements.bzl", "requirement") py_library( name = "datacommons", srcs = glob(["*.py"]), - deps = [ - requirement("pandas"), - ] ) diff --git a/datacommons/__init__.py b/datacommons/__init__.py index 72a819d5..bc427f0d 100644 --- a/datacommons/__init__.py +++ b/datacommons/__init__.py @@ -21,4 +21,4 @@ from datacommons.populations import get_populations, get_observations, get_pop_obs, get_place_obs # Other utilities -from .utils import set_api_key, clean_frame, flatten_frame +from .utils import set_api_key diff --git a/datacommons/core.py b/datacommons/core.py index 85f912fa..d82cc0b3 100644 --- a/datacommons/core.py +++ b/datacommons/core.py @@ -28,8 +28,6 @@ from collections import defaultdict -import pandas as pd - import datacommons.utils as utils import requests @@ -40,7 +38,7 @@ def get_property_labels(dcids, out=True): """ Returns the labels of properties defined for the given :code:`dcids`. Args: - dcids (:obj:`list` of :obj:`str`): A list of nodes identified by their + dcids (:obj:`iterable` of :obj:`str`): A list of nodes identified by their dcids. out (:obj:`bool`, optional): Whether or not the property points away from the given list of nodes. @@ -99,6 +97,7 @@ def get_property_labels(dcids, out=True): } """ # Generate the GetProperty query and send the request + dcids = list(dcids) url = utils._API_ROOT + utils._API_ENDPOINTS['get_property_labels'] payload = utils._send_request(url, req_json={'dcids': dcids}) @@ -120,8 +119,7 @@ def get_property_values(dcids, """ Returns property values of given :code:`dcids` along the given property. Args: - dcids (Union[:obj:`list` of :obj:`str`, :obj:`pandas.Series`]): dcids to get - property values for. 
+ dcids (:obj:`iterable` of :obj:`str`): dcids to get property values for. prop (:obj:`str`): The property to get property values for. out (:obj:`bool`, optional): A flag that indicates the property is directed away from the given nodes when set to true. @@ -131,15 +129,8 @@ def get_property_values(dcids, aggregated over all given nodes. Returns: - When :code:`dcids` is an instance of :obj:`list`, the returned property - values are formatted as a :obj:`dict` from a given dcid to a list of its - property values. - - When :code:`dcids` is an instance of :obj:`pandas.Series`, the returned - property values are formatted as a :obj:`pandas.Series` where the `i`-th - entry corresponds to property values associated with the `i`-th given dcid. - The cells of the returned series will always contain a :obj:`list` of - property values. + Returned property values are formatted as a :obj:`dict` from a given dcid + to a list of its property values. Raises: ValueError: If the payload returned by the Data Commons REST API is @@ -160,21 +151,11 @@ def get_property_values(dcids, "geoId/21": ["Kentucky"], "geoId/24": ["Maryland"], } - - Next, we specify :code:`dcids` as a :obj:`pandas.Series` - - >>> import pandas as pd - >>> dcids = pd.Series(["geoId/06", "geoId/21", "geoId/24"]) - >>> get_property_values(dcids, "name") - 0 [California] - 1 [Kentucky] - 2 [Maryland] - dtype: object """ # Convert the dcids field and format the request to GetPropertyValue - dcids, req_dcids = utils._convert_dcids_type(dcids) + dcids = list(dcids) req_json = { - 'dcids': req_dcids, + 'dcids': dcids, 'property': prop, 'limit': limit } @@ -205,9 +186,6 @@ def get_property_values(dcids, # Make sure each dcid is in the results dict, and convert all sets to lists. results = {dcid: sorted(list(unique_results[dcid])) for dcid in dcids} - # Format the results as a Series if a Pandas Series is provided. 
- if isinstance(dcids, pd.Series): - return pd.Series([results[dcid] for dcid in dcids], index=dcids.index) return results @@ -221,7 +199,7 @@ def get_triples(dcids, limit=utils._MAX_LIMIT): *predicate*). Args: - dcids (:obj:`list` of :obj:`str`): A list of dcids to get triples for. + dcids (:obj:`iterable` of :obj:`str`): The dcids to get triples for. limit (:obj:`int`, optional): The maximum total number of triples to get. Returns: @@ -249,6 +227,7 @@ def get_triples(dcids, limit=utils._MAX_LIMIT): } """ # Generate the GetTriple query and send the request. + dcids = list(dcids) url = utils._API_ROOT + utils._API_ENDPOINTS['get_triples'] payload = utils._send_request(url, req_json={'dcids': dcids, 'limit': limit}) diff --git a/datacommons/examples/core.py b/datacommons/examples/core.py index d531b37d..b79eaaf7 100644 --- a/datacommons/examples/core.py +++ b/datacommons/examples/core.py @@ -77,7 +77,7 @@ def main(): # To expand on a column with get_property_values, the data frame has to be - # flattened first. Clients can use flatten_frame to do this. + # flattened first. Clients can use DataFrame.explode to do this. utils._print_header('Flatten the Frame') - pd_frame = dc.flatten_frame(pd_frame) + pd_frame = pd_frame.explode('county') print(pd_frame) # Get the names for each city. @@ -87,7 +87,7 @@ def main(): # Format the final frame. utils._print_header('The Final Frame') - pd_frame = dc.flatten_frame(pd_frame) + pd_frame = pd_frame.explode('city') print(pd_frame) diff --git a/datacommons/examples/places.py b/datacommons/examples/places.py index 352b5805..2a9a4309 100644 --- a/datacommons/examples/places.py +++ b/datacommons/examples/places.py @@ -53,7 +53,7 @@ def main(): # Get all CensusTracts in these two counties.
utils._print_header('Get Census Tracts') pd_frame['tracts'] = dc.get_places_in(pd_frame['county'], 'CensusTract') - pd_frame = dc.flatten_frame(pd_frame) + pd_frame = pd_frame.explode('tracts') print(pd_frame) diff --git a/datacommons/examples/populations.py b/datacommons/examples/populations.py index 3a751282..1a47c8b1 100644 --- a/datacommons/examples/populations.py +++ b/datacommons/examples/populations.py @@ -57,16 +57,16 @@ def main(): # DataFrame with Santa Clara and Montgomery County. utils._print_header('Initialize the DataFrame') pd_frame = pd.DataFrame({'state': ['geoId/06', 'geoId/21', 'geoId/24']}) - pd_frame['state_name'] = dc.get_property_values(pd_frame['state'], 'name') - pd_frame = dc.flatten_frame(pd_frame) - print(pd_frame) + pd_frame['state_name'] = pd_frame['state'].map( + dc.get_property_values(pd_frame['state'], 'name')) + pd_frame = pd_frame.explode('state_name').reset_index(drop=True) # Get populations for employed individuals utils._print_header('Add Population and Observation to DataFrame') - pd_frame['employed_pop'] = dc.get_populations( + pd_frame['employed_pop'] = pd_frame['state'].map(dc.get_populations( pd_frame['state'], 'Person', - constraining_properties={'employment': 'BLS_Employed'}) + constraining_properties={'employment': 'BLS_Employed'})) # Add the observation for employed individuals pd_frame['employed_count'] = dc.get_observations( @@ -81,7 +81,7 @@ def main(): - # Final dataframe. Use the convenience function "clean_frame" to convert - # columns to numerical types. + # Final dataframe. Drop every row that contains a missing (NaN) value and + # renumber the index.
utils._print_header('Final Data Frame') - pd_frame = dc.clean_frame(pd_frame) + pd_frame = pd_frame.dropna().reset_index(drop=True) print(pd_frame) diff --git a/datacommons/examples/query.py b/datacommons/examples/query.py index e484a909..bd7d634d 100644 --- a/datacommons/examples/query.py +++ b/datacommons/examples/query.py @@ -21,7 +21,6 @@ from __future__ import print_function import datacommons as dc -import pandas as pd def main(): @@ -37,12 +36,9 @@ def main(): ''') print('> Issuing query.\n{}'.format(query)) - # Initialize the Query instance. - dc_query = dc.Query(sparql=query) - # Iterate through all the rows in the results. print('> Printing results.\n') - for row in dc_query.rows(): + for row in dc.query(query_string=query): print(' {}'.format(row)) diff --git a/datacommons/places.py b/datacommons/places.py index 19ac7ae7..0a951836 100644 --- a/datacommons/places.py +++ b/datacommons/places.py @@ -23,7 +23,6 @@ from __future__ import print_function import datacommons.utils as utils -import pandas as pd import requests @@ -33,21 +32,13 @@ def get_places_in(dcids, place_type): :code:`place_type`. Args: - dcids (Union[:obj:`list` of :obj:`str`, :obj:`pandas.Series`]): Dcids to get - contained in places. + dcids (:obj:`iterable` of :obj:`str`): Dcids to get contained in places. place_type (:obj:`str`): The type of places contained in the given dcids to filter by. Returns: - When :code:`dcids` is an instance of :obj:`list`, the returned - :obj:`Place`'s are formatted as a :obj:`dict` from a given dcid to a list of - places identified by dcids of the given `place_type`. - - When :code:`dcids` is an instance of :obj:`pandas.Series`, the returned - :obj:`Place`'s are formatted as a :obj:`pandas.Series` where the `i`-th - entry corresponds to places contained in the place identified by the dcid - in `i`-th cell if :code:`dcids`. The cells of the returned series will always - contain a :obj:`list` of place dcids of the given `place_type`. 
+ The returned :obj:`Place`'s are formatted as a :obj:`dict` from a given + dcid to a list of places identified by dcids of the given `place_type`. Raises: ValueError: If the payload returned by the Data Commons REST API is @@ -70,26 +61,14 @@ def get_places_in(dcids, place_type): # and 53 more ] } - - We can also specify the :code:`dcids` as a :obj:`pandas.Series` like so. - - >>> import pandas as pd - >>> dcids = pd.Series(["geoId/06"]) - >>> get_places_in(dcids, "County") - 0 [geoId/06041, geoId/06089, geoId/06015, geoId/... - dtype: object - """ - # Convert the dcids field and format the request to GetPlacesIn - dcids, req_dcids = utils._convert_dcids_type(dcids) + dcids = list(dcids) url = utils._API_ROOT + utils._API_ENDPOINTS['get_places_in'] payload = utils._send_request(url, req_json={ - 'dcids': req_dcids, + 'dcids': dcids, 'place_type': place_type, }) # Create the results and format it appropriately result = utils._format_expand_payload(payload, 'place', must_exist=dcids) - if isinstance(dcids, pd.Series): - return pd.Series([result[dcid] for dcid in dcids], index=dcids.index) return result diff --git a/datacommons/populations.py b/datacommons/populations.py index 7f57a548..a58626f5 100644 --- a/datacommons/populations.py +++ b/datacommons/populations.py @@ -27,7 +27,6 @@ from __future__ import print_function import datacommons.utils as utils -import pandas as pd import requests @@ -36,7 +35,7 @@ def get_populations(dcids, population_type, constraining_properties={}): """ Returns :obj:`StatisticalPopulation`'s located at the given :code:`dcids`. Args: - dcids (Union[:obj:`list` of :obj:`str`, :obj:`pandas.Series`]): Dcids + dcids (:obj:`iterable` of :obj:`str`): Dcids identifying :obj:`Place`'s of populations to query for. 
These dcids are treated as the property value associated with returned :obj:`Population`'s by the property @@ -48,19 +47,12 @@ def get_populations(dcids, population_type, constraining_properties={}): :obj:`StatisticalPopulation` should be constrained by. Returns: - When :code:`dcids` is an instance of :obj:`list`, the returned - :obj:`StatisticalPopulation` are formatted as a :obj:`dict` from a given + The returned :obj:`StatisticalPopulation` are formatted as a :obj:`dict` from a given dcid to the unique :obj:`StatisticalPopulation` located at the dcid as specified by the `population_type` and `constraining_properties` *if such exists*. A given dcid will *NOT* be a member of the :obj:`dict` if such a population does not exist. - When :code:`dcids` is an instance of :obj:`pandas.Series`, the returned - :obj:`StatisticalPopulation` are formatted as a :obj:`pandas.Series` where - the `i`-th entry corresponds to populations located at the given dcid - specified by the `population_type` and `constraining_properties` *if such - exists*. Otherwise, the cell is empty. - Raises: ValueError: If the payload returned by the Data Commons REST API is malformed. @@ -88,24 +80,13 @@ def get_populations(dcids, population_type, constraining_properties={}): "geoId/21": "dc/p/fs929fynprzs", "geoId/24": "dc/p/lr52m1yr46r44" } - - We can also specify the :code:`dcids` as a :obj:`pandas.Series` like so. 
- - >>> import pandas as pd - >>> dcids = pd.Series(["geoId/06", "geoId/21", "geoId/24"]) - >>> pvs = {'employment': 'BLS_Employed'} - >>> dc.get_populations(dcids, 'Person', constraining_properties=pvs) - 0 dc/p/x6t44d8jd95rd - 1 dc/p/fs929fynprzs - 2 dc/p/lr52m1yr46r44 - dtype: object """ # Convert the dcids field and format the request to GetPopulations - dcids, req_dcids = utils._convert_dcids_type(dcids) + dcids = list(dcids) pv = [{'property': k, 'value': v} for k, v in constraining_properties.items()] url = utils._API_ROOT + utils._API_ENDPOINTS['get_populations'] payload = utils._send_request(url, req_json={ - 'dcids': req_dcids, + 'dcids': dcids, 'population_type': population_type, 'pvs': pv, }) @@ -113,9 +94,6 @@ def get_populations(dcids, population_type, constraining_properties={}): # Create the results and format it appropriately result = utils._format_expand_payload( payload, 'population', must_exist=dcids) - if isinstance(dcids, pd.Series): - flattened = utils._flatten_results(result, default_value="") - return pd.Series([flattened[dcid] for dcid in dcids], index=dcids.index) # Drop empty results while flattening return utils._flatten_results(result) @@ -130,7 +108,7 @@ def get_observations(dcids, """ Returns values of :obj:`Observation`'s observing the given :code:`dcids`. Args: - dcids (Union[:obj:`list` of :obj:`str`, :obj:`pandas.Series`]): Dcids + dcids (:obj:`iterable` of :obj:`str`): Dcids identifying nodes that returning :obj:`Observation`'s observe. These dcids are treated as the property value associated with returned :obj:`Observation`'s by the property @@ -156,11 +134,6 @@ def get_observations(dcids, dcid will *NOT* be a member of the :obj:`dict` if such an observation does not exist. 
- When :code:`dcids` is an instance of :obj:`pandas.Series`, the returned - :obj:`Observation`'s are formatted as a :obj:`pandas.Series` where the - `i`-th entry corresponds to observation observing the given dcid as specified - by the other parameters *if such exists*. Otherwise, the cell holds NaN. - Examples: We would like to get the following for December, 2018: @@ -187,24 +160,10 @@ def get_observations(dcids, "dc/p/fs929fynprzs": 1973955.0, "dc/p/lr52m1yr46r44": 3075662.0 } - - We can also specify the :code:`dcids` as a :obj:`pandas.Series` like so. - - >>> import pandas as pd - >>> dcids = pd.Series(["dc/p/x6t44d8jd95rd", "dc/p/fs929fynprzs", "dc/p/lr52m1yr46r44"]) - >>> get_observations(dcids, 'count', 'measuredValue', '2018-12', - ... observation_period='P1M', - ... measurement_method='BLSSeasonallyAdjusted' - ... ) - 0 18704962.0 - 1 1973955.0 - 2 3075662.0 - dtype: float64 """ - # Convert the dcids field and format the request to GetObservation - dcids, req_dcids = utils._convert_dcids_type(dcids) + dcids = list(dcids) req_json = { - 'dcids': req_dcids, + 'dcids': dcids, 'measured_property': measured_property, 'stats_type': stats_type, 'observation_date': observation_date, @@ -221,10 +180,6 @@ def get_observations(dcids, # Create the results and format it appropriately result = utils._format_expand_payload( payload, 'observation', must_exist=dcids) - if isinstance(dcids, pd.Series): - flattened = utils._flatten_results(result, default_value="") - series = pd.Series([flattened[dcid] for dcid in dcids], index=dcids.index) - return series.apply(pd.to_numeric, errors='coerce') # Drop empty results by calling _flatten_results without default_value, then # coerce the type to float if possible. 
@@ -352,7 +307,8 @@ def get_pop_obs(dcid): url = utils._API_ROOT + utils._API_ENDPOINTS['get_pop_obs'] + '?dcid={}'.format(dcid) return utils._send_request(url, compress=True, post=False) -def get_place_obs(place_type, observation_date, population_type, constraining_properties={}): +def get_place_obs( + place_type, observation_date, population_type, constraining_properties={}): """ Returns all :obj:`Observation`'s for all places given the place type, observation date and the :obj:`StatisticalPopulation` constraints. diff --git a/datacommons/test/BUILD.bazel b/datacommons/test/BUILD.bazel index cb339913..39f8734d 100644 --- a/datacommons/test/BUILD.bazel +++ b/datacommons/test/BUILD.bazel @@ -7,7 +7,6 @@ py_test( deps = [ "//datacommons:datacommons", requirement("mock"), - requirement("pandas"), ], python_version = "PY3" ) @@ -18,7 +17,6 @@ py_test( deps = [ "//datacommons:datacommons", requirement("mock"), - requirement("pandas"), ], python_version = "PY3" ) @@ -29,7 +27,6 @@ py_test( deps = [ "//datacommons:datacommons", requirement("mock"), - requirement("pandas"), ], python_version = "PY3" ) @@ -40,7 +37,6 @@ py_test( deps = [ "//datacommons:datacommons", requirement("mock"), - requirement("pandas"), ], python_version = "PY3" ) diff --git a/datacommons/test/core_test.py b/datacommons/test/core_test.py index ce73d898..eca74b81 100644 --- a/datacommons/test/core_test.py +++ b/datacommons/test/core_test.py @@ -20,13 +20,10 @@ from __future__ import division from __future__ import print_function -from pandas.util.testing import assert_series_equal, assert_frame_equal from unittest import mock import datacommons as dc import datacommons.utils as utils -import pandas as pd - import json import unittest @@ -472,96 +469,6 @@ def test_no_dcids(self, post_mock): prop_vals = dc.get_property_values([], 'containedInPlace') self.assertDictEqual(prop_vals, {}) - # ---------------------------- PANDAS UNIT TESTS ---------------------------- - - @mock.patch('requests.post', 
side_effect=post_request_mock) - def test_series(self, post_mock): - """ Calling get_property_values with a Pandas Series returns the correct - results. - """ - # Set the API key - dc.set_api_key('TEST-API-KEY') - - # The given and expected series. - dcids = pd.Series(['geoId/06085', 'geoId/24031']) - expected = pd.Series([ - ['geoId/0643294', 'geoId/0644112'], - ['geoId/2462850'] - ]) - - # Call get_property_values with the series as input - actual = dc.get_property_values( - dcids, 'containedInPlace', out=False, value_type='Town') - assert_series_equal(actual, expected) - - @mock.patch('requests.post', side_effect=post_request_mock) - def test_series_bad_dcids(self, post_mock): - """ Calling get_property_values with a Pandas Series and dcids that does not - exist resturns an empty result. - """ - # Set the API key - dc.set_api_key('TEST-API-KEY') - - # The given and expected series - bad_dcids_1 = pd.Series(['geoId/06085', 'dc/MadDcid']) - bad_dcids_2 = pd.Series(['dc/MadDcid', 'dc/MadderDcid']) - expected_1 = pd.Series([['geoId/0644112'], []]) - expected_2 = pd.Series([[], []]) - - # Call get_property_values with series as input - actual_1 = dc.get_property_values(bad_dcids_1, 'containedInPlace', out=False) - actual_2 = dc.get_property_values(bad_dcids_2, 'containedInPlace', out=False) - - # Assert the results are correct - assert_series_equal(actual_1, expected_1) - assert_series_equal(actual_2, expected_2) - - @mock.patch('requests.post', side_effect=post_request_mock) - def test_series_bad_property(self, post_mock): - """ Calling get_property_values with a Pandas Series and a property that - does not exist returns an empty result. - """ - # Set the API key - dc.set_api_key('TEST-API-KEY') - - # The input and expected series - dcids = pd.Series(['geoId/06085', 'geoId/24031']) - expected = pd.Series([[], []]) - - # Call get_property_values and assert the results are correct. 
- actual = dc.get_property_values(dcids, 'madProperty') - assert_series_equal(actual, expected) - - @mock.patch('requests.post', side_effect=post_request_mock) - def test_series_no_dcid(self, post_mock): - # The input and expected series - dcids = pd.Series([]) - expected = pd.Series([]) - - # Call get_property_values and assert the results are correct. - actual = dc.get_property_values(dcids, 'containedInPlace') - assert_series_equal(actual, expected) - - @mock.patch('requests.post', side_effect=post_request_mock) - def test_dataframe(self, post_mock): - """ Calling get_property_values with a Pandas DataFrame returns the correct - results. - """ - # Set the API key - dc.set_api_key('TEST-API-KEY') - - # The given and expected series. - dcids = pd.DataFrame({'dcids': ['geoId/06085', 'geoId/24031']}) - expected = pd.Series([ - ['geoId/0643294', 'geoId/0644112'], - ['geoId/2462850'] - ]) - - # Call get_property_values with the series as input - actual = dc.get_property_values( - dcids, 'containedInPlace', out=False, value_type='Town') - assert_series_equal(actual, expected) - class TestGetTriples(unittest.TestCase): """ Unit tests for get_triples. 
""" diff --git a/datacommons/test/places_test.py b/datacommons/test/places_test.py index 963bfdb1..3d883e77 100644 --- a/datacommons/test/places_test.py +++ b/datacommons/test/places_test.py @@ -20,13 +20,10 @@ from __future__ import division from __future__ import print_function -from pandas.util.testing import assert_series_equal from unittest import mock import datacommons as dc import datacommons.utils as utils -import pandas as pd - import json import unittest @@ -144,61 +141,6 @@ def test_no_dcids(self, post_mock): 'dc/MadderDcid': [] }) - # ---------------------------- PANDAS UNIT TESTS ---------------------------- - - @mock.patch('requests.post', side_effect=post_request_mock) - def test_series_multiple_dcids(self, post_mock): - """ Calling get_places_in with a Pandas Series and proper dcids returns - a Pandas Series with valid results. - """ - # Set the API key - dc.set_api_key('TEST-API-KEY') - - # Get the input dcids and expected output - dcids = pd.Series(['geoId/06085', 'geoId/24031']) - expected = pd.Series( - [['geoId/0649670'], ['geoId/2467675', 'geoId/2476650']]) - - # Call get_places_in - actual = dc.get_places_in(dcids, 'City') - assert_series_equal(actual, expected) - - @mock.patch('requests.post', side_effect=post_request_mock) - def test_series_bad_dcids(self, post_mock): - """ Calling get_places_in with a Pandas Series and dcids that do not exist - returns empty results. 
- """ - # Set the API key - dc.set_api_key('TEST-API-KEY') - - # Get the input dcids and expected output - bad_dcids_1 = pd.Series(['geoId/06085', 'dc/MadDcid']) - bad_dcids_2 = pd.Series(['dc/MadDcid', 'dc/MadderDcid']) - expected_1 = pd.Series([['geoId/0649670'], []]) - expected_2 = pd.Series([[], []]) - - # Call get_places_in - actual_1 = dc.get_places_in(bad_dcids_1, 'City') - actual_2 = dc.get_places_in(bad_dcids_2, 'City') - - # Assert that the answers are correct - assert_series_equal(actual_1, expected_1) - assert_series_equal(actual_2, expected_2) - - @mock.patch('requests.post', side_effect=post_request_mock) - def test_series_no_dcids(self, post_mock): - """ Calling get_places_in with no dcids returns empty results. """ - # Set the API key - dc.set_api_key('TEST-API-KEY') - - # Get the input and expected output - bad_dcids = pd.Series([]) - expected = pd.Series([]) - - # Test get_places_in - actual = dc.get_places_in(bad_dcids, 'City') - assert_series_equal(actual, expected) - if __name__ == '__main__': unittest.main() diff --git a/datacommons/test/populations_test.py b/datacommons/test/populations_test.py index a0c7f0bd..5b63728e 100644 --- a/datacommons/test/populations_test.py +++ b/datacommons/test/populations_test.py @@ -22,13 +22,10 @@ from __future__ import print_function import base64 -from pandas.util.testing import assert_series_equal from unittest import mock import datacommons as dc import datacommons.utils as utils -import pandas as pd - import json import unittest import zlib @@ -278,63 +275,6 @@ def test_no_dcids(self, post_mock): [], 'Person', constraining_properties=self._constraints) self.assertDictEqual(pops, {}) - # ---------------------------- PANDAS UNIT TESTS ---------------------------- - - @mock.patch('requests.post', side_effect=post_request_mock) - def test_series_multiple_dcids(self, post_mock): - """ Calling get_populations with a Pandas Series and proper dcids returns - a Pandas Series with valid results. 
- """ - # Set the API key - dc.set_api_key('TEST-API-KEY') - - # Get the input and expected output - dcids = pd.Series(['geoId/06085', 'geoId/4805000']) - expected = pd.Series(['dc/p/crgfn8blpvl35', 'dc/p/f3q9whmjwbf36']) - - # Call get_populations - actual = dc.get_populations( - dcids, 'Person', constraining_properties=self._constraints) - assert_series_equal(actual, expected) - - @mock.patch('requests.post', side_effect=post_request_mock) - def test_series_bad_dcids(self, post_mock): - """ Calling get_populations with a Pandas Series and dcids that do not exist - returns empty results. - """ - # Set the API key - dc.set_api_key('TEST-API-KEY') - - # Get input and expected output - dcids_1 = pd.Series(['geoId/06085', 'dc/MadDcid']) - dcids_2 = pd.Series(['dc/MadDcid', 'dc/MadderDcid']) - expected_1 = pd.Series(['dc/p/crgfn8blpvl35', '']) - expected_2 = pd.Series(['', '']) - - # Call get_populations - actual_1 = dc.get_populations( - dcids_1, 'Person', constraining_properties=self._constraints) - actual_2 = dc.get_populations( - dcids_2, 'Person', constraining_properties=self._constraints) - - # Assert that the results are correct - assert_series_equal(actual_1, expected_1) - assert_series_equal(actual_2, expected_2) - - @mock.patch('requests.post', side_effect=post_request_mock) - def test_series_no_dcids(self, post_mock): - """ Calling get_populations with no dcids returns empty results. """ - # Set the API key - dc.set_api_key('TEST-API-KEY') - - dcids = pd.Series([]) - expected = pd.Series([]) - - # Call get_populations - actual = dc.get_populations( - dcids, 'Person', constraining_properties=self._constraints) - assert_series_equal(actual, expected) - class TestGetObservations(unittest.TestCase): """ Unit tests for get_observations. 
""" @@ -391,62 +331,6 @@ def test_no_dcids(self, post_mock): measurement_method='BLSSeasonallyAdjusted') self.assertDictEqual(actual, {}) - # ---------------------------- PANDAS UNIT TESTS ---------------------------- - - @mock.patch('requests.post', side_effect=post_request_mock) - def test_series_multiple_dcids(self, post_mock): - """ Calling get_observations with a Pandas Series and proper dcids returns - a Pandas Series with valid results. - """ - # Set the API key - dc.set_api_key('TEST-API-KEY') - - dcids = pd.Series( - ['dc/p/x6t44d8jd95rd', 'dc/p/lr52m1yr46r44', 'dc/p/fs929fynprzs']) - expected = pd.Series([18704962.0, 3075662.0, 1973955.0]) - actual = dc.get_observations(dcids, 'count', 'measuredValue', '2018-12', - observation_period='P1M', - measurement_method='BLSSeasonallyAdjusted') - assert_series_equal(actual, expected) - - @mock.patch('requests.post', side_effect=post_request_mock) - def test_series_bad_dcids(self, post_mock): - """ Calling get_observations with a Pandas Series and dcids that do not - exist returns empty results. 
- """ - # Set the API key - dc.set_api_key('TEST-API-KEY') - - # Get the input and expected output - dcids_1 = pd.Series(['dc/p/x6t44d8jd95rd', 'dc/MadDcid']) - dcids_2 = pd.Series(['dc/MadDcid', 'dc/MadderDcid']) - expected_1 = pd.Series([18704962.0, float('NaN')]) - expected_2 = pd.Series([float('NaN'), float('NaN')]) - - # Call get_observations - actual_1 = dc.get_observations(dcids_1, 'count', 'measuredValue', '2018-12', - observation_period='P1M', - measurement_method='BLSSeasonallyAdjusted') - actual_2 = dc.get_observations(dcids_2, 'count', 'measuredValue', '2018-12', - observation_period='P1M', - measurement_method='BLSSeasonallyAdjusted') - - # Verify the results - assert_series_equal(actual_1, expected_1) - assert_series_equal(actual_2, expected_2) - - @mock.patch('requests.post', side_effect=post_request_mock) - def test_series_no_dcids(self, post_mock): - """ Calling get_observations with no dcids returns empty results. """ - # Set the API key - dc.set_api_key('TEST-API-KEY') - - dcids = pd.Series([]) - expected = pd.Series([]) - actual = dc.get_observations(dcids, 'count', 'measuredValue', '2018-12', - observation_period='P1M', - measurement_method='BLSSeasonallyAdjusted') - assert_series_equal(actual, expected) class TestGetPopObs(unittest.TestCase): """ Unit tests for get_pop_obs. """ diff --git a/datacommons/utils.py b/datacommons/utils.py index cc8acd28..e8a83ca3 100644 --- a/datacommons/utils.py +++ b/datacommons/utils.py @@ -21,7 +21,6 @@ from __future__ import print_function from collections import defaultdict -import pandas as pd import base64 import json @@ -75,67 +74,6 @@ def set_api_key(api_key): os.environ[_ENV_VAR_API_KEY] = api_key -# ------------------------- PANDAS UTILITY FUNCTIONS -------------------------- - - -def flatten_frame(pd_frame, cols=[]): - """ Expands each cell in a Pandas DataFrame containing a list of values. - - Args: - pd_frame (:obj:`pandas.DataFrame`): The Pandas DataFrame. 
- cols (:obj:`list` of `str`, optional): A list of columns to flatten. If none - are provided, then all columns are flattened. - - Returns: - A :obj:`pandas.DataFrame` with all columns containing lists flattened. - - Raises: - ValueError: If a given column is not in the data frame. - - Examples: - We can flatten a data frame with a column of lists like so. - - >>> frame = pd.DataFrame({"state": ["geoId/06"]}) - >>> frame['county'] = dc.get_places_in(dcids, "County") - >>> frame - state county - 0 geoId/06 [geoId/06041, geoId/06089, geoId/06015, geoId/... - >>> dc.flatten_frame(frame) - state county - 0 geoId/06 geoId/06041 - 1 geoId/06 geoId/06089 - 2 geoId/06 geoId/06015 - .. ... ... - 55 geoId/06 geoId/06019 - 56 geoId/06 geoId/06031 - 57 geoId/06 geoId/06099 - """ - if not cols: - cols = list(pd_frame.columns) - for col in cols: - if col not in pd_frame: - raise ValueError('Column {} is not in data frame.'.format(col)) - if any(isinstance(v, list) for v in pd_frame[col]): - # TODO: Uncomment after colab supports pandas 0.25 - # pd_frame = pd_frame._explode(col) - pd_frame = _explode(pd_frame, col) - pd_frame = pd_frame.reset_index(drop=True) - return pd_frame - - -def clean_frame(pd_frame): - """ A convenience function that cleans a pandas DataFrame. - - Args: - pd_frame (:obj:`pandas.DataFrame`): The Pandas DataFrame. - - Returns: - A :obj:`pandas.DataFrame` with all rows containing empty or NaN elements - removed. - """ - return pd_frame.dropna().reset_index(drop=True) - - # ------------------------- INTERNAL HELPER FUNCTIONS ------------------------- @@ -208,49 +146,6 @@ def _flatten_results(result, default_value=None): return flattened -def _convert_dcids_type(dcids): - """ Amends dcids list type and creates the approprate request dcids list. """ - # Create the requests dcids list. 
- if isinstance(dcids, list): - req_dcids = dcids - elif isinstance(dcids, pd.Series): - req_dcids = list(dcids) - elif isinstance(dcids, pd.DataFrame): - # Assume user did df[[col]] instead of df[col] - # Or user had to use single-col dataframe for Reticulate - # Take the first column as a series - dcids = dcids.iloc[:,0] - req_dcids = list(dcids) - else: - raise ValueError( - 'dcids parameter must either be of type list or pandas.Series.') - return dcids, req_dcids - - -def _explode(pd_frame, column): - """ Expands a list inside a Pandas cell. """ - matches = [i for i, n in enumerate(pd_frame.columns) if n == column] - col_idx = matches[0] - - def helper(d): - row = list(d.values[0]) - bef = row[:col_idx] - aft = row[col_idx + 1:] - col = row[col_idx] - z = [bef + [c] + aft for c in col] - return pd.DataFrame(z) - - col_idx += len(pd_frame.index.shape) - index_names = list(pd_frame.index.names) - column_names = list(index_names) + list(pd_frame.columns) - return (pd_frame - .reset_index() - .groupby(level=0, as_index=0) - .apply(helper) - .rename(columns=lambda i: column_names[i]) - .set_index(index_names)) - - def _print_header(label): """ Prints a pretty header with the given label. """ print('\n' + '-' * 80)