# Importing request

In [1]:
import requests
import json
from pprint import pprint
import pandas as pd
import random
from config import api_key
from config import api_id
from config import api_key2

# Creating random list for MSAs
Keith wrote the code to generate the random list from the top polluting MSAs

In [2]:
# The data to load
f = "../data/msa.csv"

# Count the lines (header included); the with-block closes the file handle
# instead of leaving it open until garbage collection
with open(f) as msa_file:
    num_lines = sum(1 for _ in msa_file)

# Sample size - number of lines kept INCLUDING the header, so size - 1 MSAs
size = 10

# The row indices to skip - the range starts at 1 so 0 is never included and
# the header is kept. max(..., 0) keeps every row when the file has fewer
# than `size` lines instead of raising ValueError on a negative sample size.
# NOTE: no random seed is set, so a different sample is drawn on every run.
skip_idx = random.sample(range(1, num_lines), max(num_lines - size, 0))

# Read the data
msa = pd.read_csv(f, skiprows=skip_idx)

# Display the sample
msa


Unnamed: 0,MSA
0,"Atlanta-Athens-Clarke County-Sandy Springs, G..."
1,"Brownsville-Harlingen-Raymondville, TX"
2,"Chicago-Naperville, IL-IN-WI"
3,"Cleveland-Akron-Canton, OH"
4,"Detroit-Warren-Ann Arbor, MI"
5,"Los Angeles-Long Beach, CA"
6,"McAllen-Edinburg, TX"
7,"Philadelphia-Reading-Camden, PA-NJ-DE-MD"
8,"Phoenix-Mesa, AZ"


# Stripping Leading/Trailing Spaces for Merge

In [3]:
# Whitespace-trimmed copy of the MSA name, used as the merge key
msa['MSA1'] = msa.MSA.str.strip()

# Spot-check: show the row(s) whose trimmed name is exactly this MSA
msa[msa['MSA1'].eq('Detroit-Warren-Ann Arbor, MI')]

Unnamed: 0,MSA,MSA1
4,"Detroit-Warren-Ann Arbor, MI","Detroit-Warren-Ann Arbor, MI"


# Reading in MSA Crosswalk info for MSA codes

In [4]:
file = "../data/msa_crosswalk.csv"
crosswalk = pd.read_csv(file)

# Keep only the CBSA code and CSA title, order by title (descending), rename
# the title to the merge key MSA1, then discard incomplete and repeated rows
crosswalk1 = (
    crosswalk[['CBSA Code', 'CSA Title']]
    .sort_values('CSA Title', ascending=False)
    .rename(columns={'CSA Title': 'MSA1'})
    .dropna()
    .drop_duplicates()
)
crosswalk1

Unnamed: 0,CBSA Code,MSA1
1503,41400,"Youngstown-Warren, OH-PA"
1893,49660,"Youngstown-Warren, OH-PA"
1865,48700,"Williamsport-Lock Haven, PA"
974,30820,"Williamsport-Lock Haven, PA"
1860,48620,"Wichita-Arkansas City-Winfield, KS"
...,...,...
660,24100,"Albany-Schenectady, NY"
31,10580,"Albany-Schenectady, NY"
656,24020,"Albany-Schenectady, NY"
60,11220,"Albany-Schenectady, NY"


# Joining MSA Crosswalk to Top MSAs to get the CBSA Codes needed for API Pulls

In [5]:
# Inner join on the trimmed MSA name: keeps only crosswalk rows whose title
# matches one of the sampled MSAs, attaching the CBSA codes to them
msa_codes = crosswalk1.merge(msa, how='inner', on='MSA1')

msa_codes.head()

Unnamed: 0,CBSA Code,MSA1,MSA
0,37980,"Philadelphia-Reading-Camden, PA-NJ-DE-MD","Philadelphia-Reading-Camden, PA-NJ-DE-MD"
1,36140,"Philadelphia-Reading-Camden, PA-NJ-DE-MD","Philadelphia-Reading-Camden, PA-NJ-DE-MD"
2,12100,"Philadelphia-Reading-Camden, PA-NJ-DE-MD","Philadelphia-Reading-Camden, PA-NJ-DE-MD"
3,47220,"Philadelphia-Reading-Camden, PA-NJ-DE-MD","Philadelphia-Reading-Camden, PA-NJ-DE-MD"
4,39740,"Philadelphia-Reading-Camden, PA-NJ-DE-MD","Philadelphia-Reading-Camden, PA-NJ-DE-MD"


# API Call - Pollution Part 1
Using a for loop over the date ranges in `start`/`end` to pull up to three years' worth of PM2.5 data (only 2016 is currently listed).

In [6]:
# NOTE: state_code and county are not used by the byCBSA request below; they
# are kept so any other cell that reads them keeps working
state_code = '06'
county = '075'

response_sample = []
# Parallel lists: start[i] and end[i] together form one requested date range
start = ["20160101"]
end = ["20161231"]
codes = msa_codes['CBSA Code']

# Seconds to wait on a single request before giving up, so one stalled
# connection cannot hang the whole cell
request_timeout = 120

# One sampleData request (param=88101) per date range and CBSA code
for bdate, edate in zip(start, end):
    for each_msa in codes:
        url = f"https://aqs.epa.gov/data/api/sampleData/byCBSA?email={api_id}&key={api_key}&param=88101&bdate={bdate}&edate={edate}&cbsa={each_msa}"
        response = requests.get(url, timeout=request_timeout)
        # Fail loudly on an HTTP error instead of silently storing an error body
        response.raise_for_status()
        response_sample.append(response.json())

# Pulling Data and putting in list
There are two loops. The first loop goes through the 33 different MSAs. Unfortunately, the EPA does not always sample every 3 days like their website says, which leads to sampling sites having different numbers of samples. The second loop goes from 0 to the number of samples each response actually has.

In [31]:
# One accumulator list per output column; all seven are filled in lockstep
time, date, cbsa_code, lat, lon, site, sample = [], [], [], [], [], [], []

for payload in response_sample:
    # The response header reports how many data rows this payload carries
    n_rows = payload['Header'][0]['rows']
    for row_idx in range(n_rows):
        record = payload['Data'][row_idx]
        time.append(record['time_local'])
        date.append(record['date_local'])
        cbsa_code.append(record['cbsa_code'])
        lat.append(record['latitude'])
        lon.append(record['longitude'])
        site.append(record['site_number'])
        sample.append(record['sample_measurement'])

# Stitch the column lists back into rows and label them
columns = ['time','date','cbsa_code','lat','lon','site','sample']
sample_rows = list(zip(time, date, cbsa_code, lat, lon, site, sample))
df_sample = pd.DataFrame(data=sample_rows, columns=columns)
df_sample.head()

Unnamed: 0,time,date,cbsa_code,lat,lon,site,sample
0,00:00,2016-01-01,12100,39.36326,-74.431,1006,6.0
1,00:00,2016-01-07,12100,39.36326,-74.431,1006,8.4
2,00:00,2016-01-10,12100,39.36326,-74.431,1006,8.0
3,00:00,2016-01-13,12100,39.36326,-74.431,1006,6.9
4,00:00,2016-01-16,12100,39.36326,-74.431,1006,5.4


# Converting date to DateTime
Needed so we can group by month, which is the format the oil data is in.

In [74]:
# date_local holds full dates such as "2016-01-01", so the format has to name
# the day as well; "%Y-%m" leaves the trailing "-01" unparsed, which strict
# format parsing rejects
df_sample['date'] = pd.to_datetime(df_sample['date'], format='%Y-%m-%d')
df_sample.head(1)

Unnamed: 0,time,date,cbsa_code,lat,lon,site,sample,test,month_year
0,00:00,2016-01-01,12100,39.36326,-74.431,1006,6.0,2016-01-01,01-2016


In [86]:
# Text label "MM-YYYY" for each row's date, used later as the monthly group key
month_year_labels = df_sample.date.dt.strftime('%m-%Y')
df_sample['month_year'] = month_year_labels
df_sample.head(1)

Unnamed: 0,time,date,cbsa_code,lat,lon,site,sample,test,month_year
0,00:00,2016-01-01,12100,39.36326,-74.431,1006,6.0,2016-01-01,01-2016


checking number of distinct sites

In [32]:
# Distinct site numbers, in order of first appearance
df_sample['site'].unique()

array(['1006', '0006', '0007', '0011', '0002', '0003', '0043', '1046',
       '2002', '8001', '0306', '9004', '8005', '1016', '5001', '0009',
       '0027', '2022', '1103', '1201', '2005', '4002', '9033', '1602',
       '1302', '4004', '4008', '1004', '3001', '0021', '0001', '0015',
       '0016', '0019', '0025', '0033', '0036', '0005', '0039', '0093',
       '0095', '0008', '0017', '0020', '0023', '3002', '0034', '0038',
       '0045', '0060', '0065', '1002', '0004', '0022', '0052', '0057',
       '0076', '3103', '3301', '0026', '2004', '4201', '6005', '0024',
       '1011', '4007', '0031'], dtype=object)

# Grouping Hourly Data by day & Merging
PM2.5 data is organized by hour, but we need it on a daily level so it can map in with the AQI levels. First we do a groupby to get daily levels, then we need to remerge with the original data to get the categorical data back in (county, site, lat, lon)

In [78]:
# Mean measurement for each month / CBSA / site combination
group_keys = ['month_year', 'cbsa_code', 'site']
df_sample1 = (
    df_sample[['site', 'sample', 'month_year', 'cbsa_code']]
    .groupby(group_keys)
    .mean()
    .reset_index()
    # month_year is an "MM-YYYY" string, so this is a descending text sort
    .sort_values('month_year', ascending=False)
)

# Only the last expression of a cell is displayed, so show the frame directly
df_sample1

Unnamed: 0,month_year,cbsa_code,site,sample
1051,12-2016,47220,0007,8.327887
1007,12-2016,19820,0009,9.300000
987,12-2016,16980,1011,8.574171
988,12-2016,16980,1016,8.142857
989,12-2016,16980,2004,7.600000
...,...,...,...,...
61,01-2016,31080,2005,9.945455
62,01-2016,31080,2022,7.118182
63,01-2016,31080,4002,12.533333
64,01-2016,31080,4004,12.116667


# Same process for PM2.5 now for AQI
AQI = Air Quality Index
This is not done every day but every 1 - 3 days depending on how the EPA decided to track it.

In [34]:
response_daily = []
# Parallel lists: start[i] and end[i] together form one requested date range
start = ["20160101"]
end = ["20161231"]
codes = msa_codes['CBSA Code']

# Seconds to wait on a single request before giving up, so one stalled
# connection cannot hang the whole cell
request_timeout = 120

# One dailyData request (param=88101) per date range and CBSA code
for bdate, edate in zip(start, end):
    for each_msa in codes:
        url = f"https://aqs.epa.gov/data/api/dailyData/byCBSA?email={api_id}&key={api_key}&param=88101&bdate={bdate}&edate={edate}&cbsa={each_msa}"
        response = requests.get(url, timeout=request_timeout)
        # Fail loudly on an HTTP error instead of silently storing an error body
        response.raise_for_status()
        response_daily.append(response.json())

In [79]:
# Number of daily-data responses collected (one is appended per request made)
len(response_daily)

29

In [81]:
# One accumulator list per output column; all six are filled in lockstep
date, cbsa_code, lat, lon, site, aqi = [], [], [], [], [], []

for payload in response_daily:
    # The response header reports how many data rows this payload carries
    n_rows = payload['Header'][0]['rows']
    for row_idx in range(n_rows):
        record = payload['Data'][row_idx]
        date.append(record['date_local'])
        aqi.append(record['aqi'])
        cbsa_code.append(record['cbsa_code'])
        lat.append(record['latitude'])
        lon.append(record['longitude'])
        site.append(record['site_number'])

# Stitch the column lists back into rows, then drop repeated rows and rows
# with any missing value
columns = ['date','aqi','cbsa_code','lat','lon','site']
daily_rows = list(zip(date, aqi, cbsa_code, lat, lon, site))
df_daily = (
    pd.DataFrame(data=daily_rows, columns=columns)
    .drop_duplicates()
    .dropna()
)
df_daily.head()

Unnamed: 0,date,aqi,cbsa_code,lat,lon,site
0,2016-12-31,29.0,37980,39.988842,-75.207205,76
5,2016-12-30,27.0,37980,39.988842,-75.207205,76
10,2016-12-29,45.0,37980,39.988842,-75.207205,76
15,2016-12-28,42.0,37980,39.988842,-75.207205,76
20,2016-12-27,20.0,37980,39.988842,-75.207205,76


In [83]:
# date_local holds full dates such as "2016-12-31", so the format has to name
# the day as well; "%Y-%m" leaves the trailing "-31" unparsed, which strict
# format parsing rejects
df_daily['date'] = pd.to_datetime(df_daily['date'], format='%Y-%m-%d')
df_daily.head(1)

Unnamed: 0,date,aqi,cbsa_code,lat,lon,site
0,2016-12-31,29.0,37980,39.988842,-75.207205,76


In [85]:
# Text label "MM-YYYY" for each row's date, used later as the monthly group key
month_year_labels = df_daily.date.dt.strftime('%m-%Y')
df_daily['month_year'] = month_year_labels
df_daily.head(1)

Unnamed: 0,date,aqi,cbsa_code,lat,lon,site,month_year
0,2016-12-31,29.0,37980,39.988842,-75.207205,76,12-2016


In [87]:
# Mean AQI for each month / CBSA / site combination
group_keys = ['month_year', 'cbsa_code', 'site']
df_daily1 = (
    df_daily[['site', 'aqi', 'month_year', 'cbsa_code']]
    .groupby(group_keys)
    .mean()
    .reset_index()
    # month_year is an "MM-YYYY" string, so this is a descending text sort
    .sort_values('month_year', ascending=False)
)
df_daily1.head()

Unnamed: 0,month_year,cbsa_code,site,aqi
1221,12-2016,47220,7,33.482759
1170,12-2016,19820,93,51.310345
1146,12-2016,16980,3301,35.333333
1147,12-2016,16980,4002,20.833333
1148,12-2016,16980,4007,42.761905


# Exporting AQI and PM2.5
exporting each as CSV

In [90]:
# Write each monthly summary to its own CSV in the working directory
for frame, csv_name in ((df_daily1, 'aqi_2016.csv'), (df_sample1, 'sample_2016.csv')):
    frame.to_csv(csv_name)

# API Call Census Data
pulling on 5 polluted state/counties from random selection of the top 20 highest polluted counties in the US

Pulling sectors for mining/quarrying, utilities, manufacturing, wholesale trade, & transportation/warehousing

https://classcodes.com/naics-2-digit-sector-codes/

In [140]:
year = '2016'
# Single comma-separated string of the variables to request
variables_interest = ['NAICS2012_TTL,EMP,ESTAB']
sectors = ["31-33","21","22","42","48-49"]
codes = msa_codes['CBSA Code']
response_census = []

# One "&NAICS2012=<code>" filter per sector, built from the list so that
# adding or removing a sector needs no change to the URL template
naics_query = "".join(f"&NAICS2012={sector}" for sector in sectors)

# Seconds to wait on a single request before giving up, so one stalled
# connection cannot hang the whole cell
request_timeout = 120

# One County Business Patterns request per CBSA code
for each_msa in codes:
    base_url = f"https://api.census.gov/data/{year}/cbp?get={variables_interest[0]}{naics_query}&for=metropolitan%20statistical%20area/micropolitan%20statistical%20area:{each_msa}&key={api_key2}"
    response = requests.get(base_url, timeout=request_timeout)
    # Fail loudly on an HTTP error instead of silently storing an error body
    response.raise_for_status()
    response_census.append(response.json())

In [107]:
# Number of census responses collected (one is appended per CBSA code requested)
len(response_census)

29

In [112]:
# Inspect the second response to see the shape of the returned data
response_census[1]

[['NAICS2012_TTL',
  'EMP',
  'ESTAB',
  'NAICS2012',
  'metropolitan statistical area/micropolitan statistical area'],
 ['Mining, quarrying, and oil and gas extraction', '39', '4', '21', '36140'],
 ['Utilities', '137', '3', '22', '36140'],
 ['Manufacturing', '577', '73', '31-33', '36140'],
 ['Wholesale trade', '557', '64', '42', '36140'],
 ['Transportation and warehousing', '193', '45', '48-49', '36140']]

# Creating Dataset from Pulled Data
dropping first index as it is the column heading

In [136]:
columns = ['NAICS2012_TTL','EMP','ESTAB','NAICS2012',"metropolitan area"]

# Build a frame from the first response only, labelled with our column names
df = pd.DataFrame(response_census[0], columns=columns)

# The first row repeats the API's own column headings, so remove it
df1 = df.drop(index=df.index[0])
df1

Unnamed: 0,NAICS2012_TTL,EMP,ESTAB,NAICS2012,metropolitan area
1,"Mining, quarrying, and oil and gas extraction",805,70,21,37980
2,Utilities,13488,194,22,37980
3,Manufacturing,168986,4895,31-33,37980
4,Wholesale trade,131748,7799,42,37980
5,Transportation and warehousing,86338,3472,48-49,37980


In [143]:
# Flip rows and columns so each sector becomes a column
df_transposed = df1.transpose()
df_transposed

Unnamed: 0,1,2,3,4,5
NAICS2012_TTL,"Mining, quarrying, and oil and gas extraction",Utilities,Manufacturing,Wholesale trade,Transportation and warehousing
EMP,805,13488,168986,131748,86338
ESTAB,70,194,4895,7799,3472
NAICS2012,21,22,31-33,42,48-49
metropolitan area,37980,37980,37980,37980,37980
