In [1]:
# Import Dependencies
import pandas as pd
import numpy as np
import psycopg2

In [2]:
# Relative path to the annual_conc_by_monitor_2019.csv input file
csv_path = "Data/annual_conc_by_monitor_2019.csv"

# Read that CSV (decoded as UTF-8) into a DataFrame and display it
air_df = pd.read_csv(filepath_or_buffer=csv_path, encoding="utf-8")
air_df

FileNotFoundError: [Errno 2] No such file or directory: 'Data/annual_conc_by_monitor_2019.csv'

In [None]:
# Drop the unnecessary columns: keep only the ones listed below and store the result as base_df
base_df = air_df.loc[:, [
    "State Code", "County Code", "Site Num",
    "State Name", "City Name", "POC",
    "Latitude", "Longitude",
    "Parameter Name", "Arithmetic Mean", "Pollutant Standard",
]]
base_df.head()

In [None]:
# Give six of the columns shorter, lower-case names.
# "State Code", "County Code", "Site Num", "POC" and "Pollutant Standard" keep their original names.
base_df = base_df.rename(
    {
        "State Name": "state",
        "City Name": "city",
        "Latitude": "latitude",
        "Longitude": "longitude",
        "Parameter Name": "pollutant",
        "Arithmetic Mean": "average",
    },
    axis="columns",
)

base_df.head()

## OZONE DATA:

In [None]:
# Ozone subset: rows of base_df whose pollutant is exactly 'Ozone'
ozone = base_df[base_df['pollutant'].eq('Ozone')]
ozone.head()

In [None]:
# Keep a single pollutant standard: 'Ozone 8-hour 2015' (chosen as the most recent one).
# The goal is for each city (or set of coordinates) to have one of each test data.
ozone = ozone[ozone['Pollutant Standard'].eq('Ozone 8-hour 2015')]
ozone

In [None]:
# Search for duplicates: count the rows that exactly repeat an earlier row.
# A bare ozone.duplicated() displays one True/False per row, and pandas truncates
# long Series on display, so duplicates could go unnoticed; a single count cannot.
ozone.duplicated().sum()

In [None]:
# Drop fully identical rows anyway (first occurrence is kept), in case any
# duplicates were missed or hidden in the check above
ozone = ozone.drop_duplicates(keep="first")
ozone

In [None]:
# Per column: True if the column contains at least one missing (NaN) value
ozone.isnull().any()

In [None]:
# Copy of ozone without the POC and Pollutant Standard columns.
# The remaining columns keep their existing order:
# State Code, County Code, Site Num, state, city, latitude, longitude, pollutant, average.
ozoneFinal = ozone.drop(columns=["POC", "Pollutant Standard"])
ozoneFinal.head()

## PM2.5 DATA

In [None]:
# Starting again from base_df: preview its first five rows
base_df.head(n=5)

In [None]:
# PM2.5 subset: rows of base_df whose pollutant is exactly 'PM2.5 - Local Conditions'
pm = base_df[base_df['pollutant'].eq('PM2.5 - Local Conditions')]
pm.head()

In [None]:
# Keep a single pollutant standard: 'PM25 Annual 2012' (chosen as the most recent one).
# The goal is for each city (or set of coordinates) to have one of each test data.
pm = pm[pm['Pollutant Standard'].eq('PM25 Annual 2012')]
pm.head(20)

In [None]:
# Remove repeated measurements by keeping only the rows with POC equal to 1
# (for example, index 323 & 327 and 512 & 516 are the same)
pm = pm[pm['POC'].eq(1)]
pm.head(20)

In [None]:
# Fairbanks, Alaska has duplicates (index 855 & 856);
# drop rows that are fully identical, keeping the first occurrence
pm = pm.drop_duplicates(keep="first")
pm.head(30)

In [None]:
# TODO: Fairbanks, Alaska still has duplicated rows. I am not sure how to get rid of them, so I will leave them in for now.

In [None]:
# Keep only the columns listed below, which leaves out POC and Pollutant Standard
pm = pm.loc[:, [
    "State Code", "County Code", "Site Num",
    "state", "city",
    "latitude", "longitude",
    "pollutant", "average",
]]
pm.head()

In [None]:
# Search for duplicates: count the rows that exactly repeat an earlier row.
# A bare pm.duplicated() displays one True/False per row, and pandas truncates
# long Series on display, so duplicates could go unnoticed; a single count cannot.
pm.duplicated().sum()

In [None]:
# Per column: True if the column contains at least one missing (NaN) value
pm.isnull().any()

## Merge the ozone and pm DataFrames for making graphs

In [None]:
# Outer-join ozone with pm on the seven site/location columns listed in `on`.
# Both frames also have "pollutant" and "average" columns; pandas suffixes those
# with _x (from ozone) and _y (from pm) in the result.
merge_df = ozone.merge(
    pm,
    how="outer",
    on=[
        "State Code", "County Code", "Site Num",
        "state", "city",
        "latitude", "longitude",
    ],
)
merge_df

In [None]:
# Replace the _x/_y suffixed names from the merge with pollutant-specific names
# (_x columns came from ozone, _y columns from pm)
merge_df = merge_df.rename(
    {
        "pollutant_x": "ozone",
        "average_x": "mean_ozone",
        "pollutant_y": "pm25",
        "average_y": "mean_pm25",
    },
    axis="columns",
)
merge_df

In [None]:
# Drop the columns not needed for the graphs: keep only location and pollutant columns
merge_df = merge_df.loc[:, [
    "state", "city",
    "latitude", "longitude",
    "ozone", "mean_ozone",
    "pm25", "mean_pm25",
]]
merge_df.head()

## Append method to put together two data frames

In [None]:
# Stack the ozone rows on top of the pm rows.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, where it
# raises AttributeError. pd.concat produces the same result: original index labels are
# kept and columns are aligned by name.
# Note: ozone still has the POC and Pollutant Standard columns while pm does not,
# so those two columns are NaN for the pm rows.
append_df = pd.concat([ozone, pm])
append_df.head(30)

## Make final version of ozone

In [None]:
# Final ozone table: location, pollutant name and average only
ozone_final = ozone.loc[:, [
    "state", "city",
    "latitude", "longitude",
    "pollutant", "average",
]]
ozone_final.head()

## Make final version of pm

In [None]:
# Final pm table: location, pollutant name and average only
pm_final = pm.loc[:, [
    "state", "city",
    "latitude", "longitude",
    "pollutant", "average",
]]
pm_final.head()

## CSV COPIES

In [None]:
# Write ozone_final to ozone.csv with a header row and without the index column
ozone_final.to_csv(path_or_buf='ozone.csv', header=True, index=False)

In [None]:
# Write pm_final to pm.csv with a header row and without the index column
pm_final.to_csv(path_or_buf='pm.csv', header=True, index=False)

In [None]:
# Write merge_df to merge_final.csv with a header row and without the index column
merge_df.to_csv(path_or_buf='merge_final.csv', header=True, index=False)

In [None]:
# Write append_df to append_final.csv with a header row and without the index column
append_df.to_csv(path_or_buf='append_final.csv', header=True, index=False)

## Make JSON files

In [None]:
#merge_df.to_json('merge_final.json')

In [None]:
#append_df.to_json('append_final.json')