In [2]:
#Dave's code

# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import requests
import time #Included to use in pausing between API requests
import re  #Enable regular expressions

# Import API key
from api_keys import api_key


In [47]:
#Dave's code

#Make API calls to US Census Bureau - American Community Survey 1-Year Data
#https://www.census.gov/data/developers/data-sets/acs-1year.2017.html
#Data will be returned on a county & state basis

#Set the range of years to pull from the api
years = ["2012","2013","2014","2015","2016","2017"]

#List of fields to request from the API
#Would like to build this into a file, with columns for the ID required for the API call and a human readable name for output
fields = ["NAME","S0101_C01_001E","S1701_C01_001E","S1701_C02_001E","S1702_C01_001E","S1702_C02_001E",
          "S1703_C01_001E","S1703_C02_001E"]

#Initiate an empty list to hold census records
census_data_raw = []

#Turn the list of fields into a string to use in the api url
field_list = ','.join(fields)

#Base url for the API request. YEAR and FIELDS are placeholders and will be substituted with values
#Using * for county and state will pull all available records
census_base_url = "https://api.census.gov/data/YEAR/acs/acs1/subject?get=FIELDS&for=county:*&in=state:*&key="

#Since API url has a year component, loop through the list of years to pull back the relevant data
for year in years:
    census_url = re.sub("YEAR", year, census_base_url) #Use regular expression to substitute the year into the base url
    census_url = re.sub("FIELDS", field_list, census_url) #Use regular expression to substitute in list of fields
    #print(census_url) #for debugging
    response = requests.get(census_url + api_key).json()
    #print(response) #for debugging
    for census_record in response[1:]: #Ignore first record in the loop
        census_record.append(year)  #Append the year to the end of the returned record
        census_data_raw.append(census_record)  #Append the full record to the list of records
    time.sleep(1) #Sleep for a second, because APIs

#census_data_raw #for debugging


In [2]:
#Dave's code

#Set path for output file
output_data_file = "census_data\census_data.tsv"

#Use the list of fields as the headers for the data frame
headers = fields 
#Add titles for fields returned in API that aren't part of requested fields
headers.extend(["STATE","COUNTY","YEAR"])

#Convert list of census data to a dataframe
census_data_raw_df = pd.DataFrame(census_data_raw, columns=headers)

#Export dataframe to a tab-delimited file, since one of the columns includes a comma
census_data_raw_df.to_csv(output_data_file, sep="\t", index=False, encoding='utf-8')

#show the dataframe
census_data_raw_df


NameError: name 'fields' is not defined

In [None]:
#Doug's code

In [None]:
#Scott's code

In [None]:
#Megan's code

In [None]:
#Justin's code

In [3]:
#Justin's code
#Call this function again so I don't have to run that giant API call in Dave's code
output_data_file = "census_data\census_data.tsv"

#read in the file and save it to a df
justin_df = pd.read_csv(output_data_file, sep="\t", encoding='utf-8')

#rename columns to the best of my ability
justin_df.columns = ['Name', 'Total Population', 'Poverty Determinable', 'Below Poverty Level (PL)', 'Poverty Status of Families',
                     '% of Families Below PL', 'Selected Characteristics', '<50% Poverty Level', 'State Code', 'County Code', "Year"]

#calculate a new value to eventually add to our dataframe as a column
percent_below_PL = justin_df['Below Poverty Level (PL)'] / justin_df['Poverty Determinable']

#format this value to be the same as the other percentage fetched from the API
percent_below_PL_formatted = []
for value in percent_below_PL:
    value = value * 100
    value = "{:.1f}".format(value)
    percent_below_PL_formatted.append(value)
    
#add this new value as a column to our df
justin_df['% Below PL'] = percent_below_PL_formatted

#move this column to a more sensible position
justin_cols = list(justin_df)
justin_cols.insert(4, justin_cols.pop(justin_cols.index('% Below PL')))
justin_df = justin_df.loc[:, justin_cols]

#show new and sexy dataframe
justin_df

Unnamed: 0,Name,Total Population,Poverty Determinable,Below Poverty Level (PL),% Below PL,Poverty Status of Families,% of Families Below PL,Selected Characteristics,<50% Poverty Level,State Code,County Code,Year
0,"Ouachita Parish, Louisiana",155363,146095,35443,24.3,37910,20.0,,,22,73,2012
1,"Rapides Parish, Louisiana",132373,128128,28867,22.5,33286,17.2,,,22,79,2012
2,"St. Landry Parish, Louisiana",83662,82325,21149,25.7,21943,21.0,,,22,97,2012
3,"St. Tammany Parish, Louisiana",239453,237342,32212,13.6,65316,10.2,,,22,103,2012
4,"Tangipahoa Parish, Louisiana",123441,119480,29110,24.4,27469,18.9,,,22,105,2012
5,"Terrebonne Parish, Louisiana",111893,110088,20330,18.5,27860,12.7,,,22,109,2012
6,"Androscoggin County, Maine",107609,104067,16593,15.9,27924,12.1,,,23,1,2012
7,"Aroostook County, Maine",70868,68715,11358,16.5,20883,11.9,,,23,3,2012
8,"Cumberland County, Maine",283921,275849,32297,11.7,71533,8.2,,,23,5,2012
9,"Kennebec County, Maine",121853,118408,17804,15.0,31164,9.5,,,23,11,2012


In [47]:
# Loading EPA data

import os

def read_all_epa_years ():
    
    # Base file name
    base_name = "annual_aqi_by_county_"
    # epa_data_directory
    data_directory = "epa_data"
    
    years = [2012, 2013, 2014, 2015, 2016, 2017]
    df_list = []
    
    # Loop over the years, read in .csv file and append to list of dataframes
    for year in years:
        file_name = f"{base_name}{year}.csv"
        file_path = os.path.join("", data_directory, file_name)
        
        df_list.append(pd.read_csv(file_path))
    
    # concatenate the years
    all_years_df = pd.concat(df_list, ignore_index=True)
    
    # write the combined dataframe to a new .csv file
    output_path = os.path.join("", data_directory, "annual_aqi_by_county_all.csv")
    all_years_df.to_csv(output_path)
    
    # return the dataframe
    return all_years_df

all_years_df = read_all_epa_years()


In [48]:
all_years_df.shape

(6363, 19)

In [49]:
all_years_df.columns

Index(['State', 'County', 'Year', 'Days with AQI', 'Good Days',
       'Moderate Days', 'Unhealthy for Sensitive Groups Days',
       'Unhealthy Days', 'Very Unhealthy Days', 'Hazardous Days', 'Max AQI',
       '90th Percentile AQI', 'Median AQI', 'Days CO', 'Days NO2',
       'Days Ozone', 'Days SO2', 'Days PM2.5', 'Days PM10'],
      dtype='object')

In [50]:
all_years_df['State'].unique()

array(['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California',
       'Colorado', 'Connecticut', 'Country Of Mexico', 'Delaware',
       'District Of Columbia', 'Florida', 'Georgia', 'Hawaii', 'Idaho',
       'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana',
       'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota',
       'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada',
       'New Hampshire', 'New Jersey', 'New Mexico', 'New York',
       'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon',
       'Pennsylvania', 'Puerto Rico', 'Rhode Island', 'South Carolina',
       'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont',
       'Virgin Islands', 'Virginia', 'Washington', 'West Virginia',
       'Wisconsin', 'Wyoming'], dtype=object)

In [51]:
justin_df.columns

Index(['Name', 'Total Population', 'Poverty Determinable',
       'Below Poverty Level (PL)', '% Below PL', 'Poverty Status of Families',
       '% of Families Below PL', 'Selected Characteristics',
       '<50% Poverty Level', 'State Code', 'County Code', 'Year'],
      dtype='object')

In [57]:
sum(justin_df.Name.str.contains("Puerto Rico"))

66

In [58]:
sum(justin_df.Name.str.contains("District of Columbia"))

6

In [55]:
(justin_df.Name.str.contains("Virgin Islands"))

4979

In [56]:
len(justin_df.Name.str.contains("Country of Mexico"))

4979