From 69a1a26d7a3019896f47d7f468b15df40f81a20d Mon Sep 17 00:00:00 2001 From: tjann <18621425+tjann@users.noreply.github.com> Date: Wed, 16 Sep 2020 10:28:24 -0700 Subject: [PATCH 1/2] Make the batch size and number of batches also reflective of number of stat_vars queried. Increase number of stat_vars in stress test. --- datacommons/examples/stat_vars.py | 93 ++++++++++++++++++++++++++----- datacommons/stat_vars.py | 19 +++++-- 2 files changed, 91 insertions(+), 21 deletions(-) diff --git a/datacommons/examples/stat_vars.py b/datacommons/examples/stat_vars.py index 984208f8..96a05fe9 100644 --- a/datacommons/examples/stat_vars.py +++ b/datacommons/examples/stat_vars.py @@ -130,27 +130,90 @@ def call_str(pvs): try: dc.get_stat_all( dc.get_places_in(['country/USA'], 'County')['country/USA'], [ - 'Count_Person_Unemployed', 'Median_Age_Person', 'Count_Death', - 'Count_Death_CertainInfectiousParasiticDiseases', - 'Count_UnemploymentInsuranceClaim_StateUnemploymentInsurance', - 'Count_Worker_NAICSAccommodationFoodServices', - 'Count_Household_With0AvailableVehicles', - 'Count_Person_WhiteAlone', 'Count_Person_WorkedFullTime', - 'Count_Person_Employed', 'Count_Person_EnrolledInSchool', + 'Count_Person', 'LandAreaSqMeter', + 'PopulationDensityPerSqMeter', + 'Count_Person_BlackOrAfricanAmericanAlone', + 'PercentBlackOrAfricanAmericanAlone', 'Count_Person_Female', + 'Count_Person_Male', + 'Count_Person_AmericanIndianAndAlaskaNativeAlone', + 'Count_Person_AmericanIndianAndAlaskaNativeAloneOrInCombinationWithOneOrMoreOtherRaces', + 'Count_Person_AmericanIndianOrAlaskaNativeAlone', + 'Count_Person_AsianAlone', + 'Count_Person_AsianAloneOrInCombinationWithOneOrMoreOtherRaces', + 'Count_Person_BlackOrAfricanAmericanAloneOrInCombinationWithOneOrMoreOtherRaces', + 'Count_Person_HispanicOrLatino', + 'Count_Person_NativeHawaiianAndOtherPacificIslanderAlone', + 'Count_Person_NativeHawaiianAndOtherPacificIslanderAloneOrInCombinationWithOneOrMoreOtherRaces', + 
'Count_Person_NativeHawaiianOrOtherPacificIslanderAlone', + 'Count_Person_SomeOtherRaceAlone', + 'Count_Person_SomeOtherRaceAloneOrInCombinationWithOneOrMoreOtherRaces', + 'Count_Person_TwoOrMoreRaces', 'Count_Person_WhiteAlone', + 'Count_Person_WhiteAloneNotHispanicOrLatino', + 'Count_Person_WhiteAloneOrInCombinationWithOneOrMoreOtherRaces', + 'Count_Person_Upto5Years', 'Count_Person_Upto18Years', + 'Count_Person_65OrMoreYears', 'Count_Person_75OrMoreYears', + 'Count_Person_ForeignBorn', + 'Count_Person_USCitizenByNaturalization', + 'Count_Person_NotAUSCitizen', 'Count_Person_Nonveteran', + 'Count_Person_Veteran', 'Count_Person_NotWorkedFullTime', + 'Count_Person_WorkedFullTime', 'Count_Person_Employed', + 'Count_Person_Unemployed', 'Count_Person_InLaborForce', + 'Count_Person_IncomeOf10000To14999USDollar', + 'Count_Person_IncomeOf15000To24999USDollar', + 'Count_Person_IncomeOf25000To34999USDollar', + 'Count_Person_IncomeOf35000To49999USDollar', + 'Count_Person_IncomeOf50000To64999USDollar', + 'Count_Person_IncomeOf65000To74999USDollar', + 'Count_Person_IncomeOf75000OrMoreUSDollar', + 'Count_Person_IncomeOfUpto9999USDollar', + 'Count_Person_EnrolledInSchool', + 'Count_Person_NotEnrolledInSchool', + 'Count_Person_EnrolledInCollegeUndergraduateYears', + 'Count_Person_EnrolledInGrade1ToGrade4', + 'Count_Person_EnrolledInGrade5ToGrade8', 'Count_Person_EnrolledInGrade9ToGrade12', - 'Count_Person_ResidesInGroupQuarters', - 'Count_Person_NowMarried', - 'RetailDrugDistribution_DrugDistribution_DMethamphetamine', - 'Count_Household_With2Person', + 'Count_Person_EnrolledInKindergarten', + 'Count_Person_EnrolledInNurserySchoolPreschool', + 'Count_Person_GraduateOrProfessionalSchool', + 'Count_Person_EducationalAttainment10ThGrade', + 'Count_Person_EducationalAttainment11ThGrade', + 'Count_Person_EducationalAttainment12ThGradeNoDiploma', + 'Count_Person_EducationalAttainment1StGrade', 'Count_Person_EducationalAttainment2NdGrade', + 
'Count_Person_EducationalAttainment3RdGrade', + 'Count_Person_EducationalAttainment4ThGrade', + 'Count_Person_EducationalAttainment5ThGrade', + 'Count_Person_EducationalAttainment6ThGrade', + 'Count_Person_EducationalAttainment7ThGrade', + 'Count_Person_EducationalAttainment8ThGrade', + 'Count_Person_EducationalAttainment9ThGrade', + 'Count_Person_EducationalAttainmentAssociatesDegree', + 'Count_Person_EducationalAttainmentBachelorsDegree', + 'Count_Person_EducationalAttainmentBachelorsDegreeOrHigher', + 'Count_Person_EducationalAttainmentDoctorateDegree', 'Count_Person_EducationalAttainmentGedOrAlternativeCredential', 'Count_Person_EducationalAttainmentKindergarten', + 'Count_Person_EducationalAttainmentMastersDegree', + 'Count_Person_EducationalAttainmentNoSchoolingCompleted', + 'Count_Person_EducationalAttainmentNurserySchool', + 'Count_Person_EducationalAttainmentPrimarySchool', + 'Count_Person_EducationalAttainmentProfessionalSchoolDegree', 'Count_Person_EducationalAttainmentRegularHighSchoolDiploma', + 'Count_Person_EducationalAttainmentSomeCollege1OrMoreYearsNoDegree', + 'Count_Person_EducationalAttainmentSomeCollegeLessThan1Year', + 'Count_Person_Divorced', 'Count_Person_MarriedAndNotSeparated', + 'Count_Person_NeverMarried', 'Count_Person_Separated', + 'Count_Person_Widowed', 'Count_Person_NowMarried', + 'Count_Person_AbovePovertyLevelInThePast12Months', + 'Count_Person_BelowPovertyLevelInThePast12Months', + 'Percent_Person_20OrMoreYears_WithDiabetes', + 'Percent_Person_20OrMoreYears_Obesity', + 'Percent_Person_20OrMoreYears_PhysicalInactivity', + 'Percent_Person_Upto64Years_NoHealthInsurance', + 'Median_Age_Person', 'Median_Income_Person', 'Count_Death', + 'Count_Death_CertainInfectiousParasiticDiseases', 'Count_Death_DiseasesOfBloodAndBloodFormingOrgansAndImmuneDisorders', - 'Count_Household_NoComputer', - 'Median_Income_Household_HouseholderRaceHispanicOrLatino,', - 'Count_HousingUnit_RenterOccupied', - 'Count_Worker_NAICSInformation' + 
'Count_Death_DiseasesOfTheRespiratorySystem' ]) except ValueError: print('Stress test for get_stat_all FAILED!') diff --git a/datacommons/stat_vars.py b/datacommons/stat_vars.py index d8f4a4f8..116a995d 100644 --- a/datacommons/stat_vars.py +++ b/datacommons/stat_vars.py @@ -25,6 +25,9 @@ import datacommons.utils as utils +# stat_var specific batch size. +_STAT_BATCH_SIZE = 2000 + def get_stat_value(place, stat_var, @@ -203,20 +206,24 @@ def get_stat_all(places, stat_vars): } """ url = utils._API_ROOT + utils._API_ENDPOINTS['get_stat_all'] + # Cast iterable-like to list. places = list(places) + stat_vars = list(stat_vars) + + # Aiming for 2000 entries total. + # 2000 = num places x num stat_vars, so aim for + # 2000/len(stat_vars) places per batch. + places_per_batch = _STAT_BATCH_SIZE // len(stat_vars) # Get number of batches via an arithmetic ceiling trick: # 11//10 rounds down to 1. # -11//10 rounds down to -2. # We can divide with, then remove the negative to get the ceiling. - batches = -(-len(places) // utils._QUERY_BATCH_SIZE) + batches = -(-len(places) // places_per_batch) res = {} for i in range(batches): req_json = { - 'stat_vars': - stat_vars, - 'places': - places[i * utils._QUERY_BATCH_SIZE:(i + 1) * - utils._QUERY_BATCH_SIZE] + 'stat_vars': stat_vars, + 'places': places[i * places_per_batch:(i + 1) * places_per_batch] } # Send the request res_json = utils._send_request(url, From a3bf7eb23e3f98e883f77631917a542ee069b1b6 Mon Sep 17 00:00:00 2001 From: tjann <18621425+tjann@users.noreply.github.com> Date: Wed, 16 Sep 2020 11:01:20 -0700 Subject: [PATCH 2/2] Update comment to use constant instead of number. 
--- datacommons/stat_vars.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/datacommons/stat_vars.py b/datacommons/stat_vars.py index 116a995d..3f7f16c2 100644 --- a/datacommons/stat_vars.py +++ b/datacommons/stat_vars.py @@ -210,9 +210,9 @@ def get_stat_all(places, stat_vars): places = list(places) stat_vars = list(stat_vars) - # Aiming for 2000 entries total. - # 2000 = num places x num stat_vars, so aim for - # 2000/len(stat_vars) places per batch. + # Aim for at most _STAT_BATCH_SIZE entries per request. + # Entries per request = num places x num stat_vars, so send + # _STAT_BATCH_SIZE // len(stat_vars) places per batch. places_per_batch = _STAT_BATCH_SIZE // len(stat_vars) # Get number of batches via an arithmetic ceiling trick: # 11//10 rounds down to 1.