# Importing request

In [3]:
import requests
import json
from pprint import pprint
import pandas as pd
import random
import statsmodels.api as sm
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

import numpy as np
from config import api_key
from config import api_id
from config import api_key2

# Reading in MSA Crosswalk info for MSA codes

In [10]:
file = "../data/msa_crosswalk.csv"
crosswalk = pd.read_csv(file)
crosswalk1 = crosswalk[['CBSA Code']].rename(columns = {'CBSA Code':'CBSA'}).dropna().drop_duplicates()
crosswalk1.head()

Unnamed: 0,CBSA
0,10100
2,10140
3,10180
6,10220
7,10260


# API Call - Pollution Part 1
using a for loop to loop through three years work of PM2.5 data

In [12]:
response_sample = []
start = ["20140101"]
end = ["20141231"]
codes = crosswalk1['CBSA']


for index in range(len(start)):
    for each_msa in codes:
        url = f"https://aqs.epa.gov/data/api/sampleData/byCBSA?email={api_id}&key={api_key}&param=88101&bdate={start[index]}&edate={end[index]}&cbsa={each_msa}"
        response_sample.append(requests.get(url).json())

In [22]:
len(response_sample)

948

In [25]:
response_sample[947]['Header'][0]

{'status': 'Failed',
 'request_time': '2020-05-19T20:22:21.542-04:00',
 'url': 'https://aqs.epa.gov/data/api/sampleData/byCBSA?email=candice.sessa10@gmail.com&key=dunfox31&param=88101&bdate=20140101&edate=20141231&cbsa=Internet%20Release%20Date:%20April%202016',
 'error': ['cbsa code: Internet Release Date: April 2016, requires 5 digit numeric value.']}

# Pulling Data and putting in list
There are two loops, the first loop is through the 33 different MSAs. Unfortunately the EPA does not always sample every 3 dyas like their website says. This leads to certain sampling sites having different lengths of samples. The second loop goes from 0 to the length of the number of samples they do have

In [28]:
time = []
date = []
cbsa_code = []
site = []
sample = []


for x in range(len(response_sample)):
# for x in range(0,32):
    try:
        for y in range(0,response_sample[x]['Header'][0]['rows']):
            time.append(response_sample[x]['Data'][y]['time_local'])
            date.append(response_sample[x]['Data'][y]['date_local'])
            cbsa_code.append(response_sample[x]['Data'][y]['cbsa_code'])
            site.append(response_sample[x]['Data'][y]['site_number'])
            sample.append(response_sample[x]['Data'][y]['sample_measurement'])
    except:
        print(f'CBSA {cbsa_code[y]} not found')

columns = ['time','date','cbsa_code','site','sample']
df_sample = pd.DataFrame(data = list(zip(time,date,cbsa_code,site,sample)), columns = columns)

df_sample.head()

CBSA 10500 not found
CBSA 10500 not found
CBSA 10500 not found


Unnamed: 0,time,date,cbsa_code,site,sample
0,00:01,2014-01-02,10100,3,13.7
1,00:01,2014-01-08,10100,3,6.1
2,00:01,2014-01-14,10100,3,3.6
3,00:01,2014-01-20,10100,3,3.1
4,00:01,2014-01-26,10100,3,2.4


# Converting date to DateTime
needed to group by month which is format that Oil Data is in

In [29]:
df_sample['date'] = pd.to_datetime(df_sample.date,format = '%Y-%m')
print(df_sample.head(1))
print('---------------------------------------------------------')
print("There are "+ str(len(df_sample)) +" rows of PM2.5 data")

    time       date cbsa_code  site  sample
0  00:01 2014-01-02     10100  0003    13.7
---------------------------------------------------------
There are 2868125 rows of PM2.5 data


In [30]:
df_sample['month_year'] = df_sample['date'].dt.strftime('%m-%Y')
df_sample.head(1)

Unnamed: 0,time,date,cbsa_code,site,sample,month_year
0,00:01,2014-01-02,10100,3,13.7,01-2014


converting sample to float to be used for regression/analysis later

In [31]:
df_sample= df_sample.astype({'sample': float})

# Grouping Hourly Data by day & Merging
PM2.5 data is organized by hour, but we need it on a daily level so it can map in with the AQI levels. First we do a groupby to get daily levels, then we need to remerge with the original data to get the categorical data back in (county, site, lat, lon)

In [32]:
df_sample1 = df_sample[['site','sample','month_year','cbsa_code']].groupby(['month_year','cbsa_code','site']).mean().reset_index().sort_values('month_year',ascending = False)
df_sample1.head()
len(df_sample1)
print(df_sample1.head(3))
print('-----------------------------------------')
print('There are '+ str(len(df_sample1)) + ' rows after grouping')

      month_year cbsa_code  site     sample
10010    12-2014     49740  8011   5.511440
9384     12-2014     19300  0010   7.188889
9461     12-2014     22800  0008  13.645455
-----------------------------------------
There are 10011 rows after grouping


# Exporting all CBSA EPA locations

In [37]:
df_sample1.to_csv('../data/pm25_all_cbsa_2014')