In [None]:
import pandas as pd
import os

In [None]:
# Data folders are resolved relative to the directory the notebook is launched from.
# os.path.join inserts the separator of the host OS instead of a hard-coded Windows '\'.
current_dir = os.getcwd()
NOAA_dir = os.path.join(current_dir, 'NOAA Weather Data')
EPA_dir = os.path.join(current_dir, 'EPA Ozone Data')

In [None]:
# Show the contents of the NOAA data folder to confirm which files are available
os.listdir(NOAA_dir)

In [None]:
# Show the raw EPA downloads; os.path.join avoids the hard-coded Windows '\' separator
os.listdir(os.path.join(EPA_dir, 'Raw EPA Data'))

# Getting Station Codes

In [None]:
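# One-time preprocessing, kept for reference: reduces the full GHCN station list to US stations.
# NOTE: the result is written to the current working directory, while the next cell reads
# us_station_codes.csv from NOAA_dir, so the file has to be moved there after running this.
# headers2 = ["StationId", "lat", "long", "elev", "name", "a", "b"]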

# station_codes = pd.read_csv(NOAA_dir + r'\ghcnd-stations.csv', names = headers2)

# US_only_station = station_codes[station_codes["StationId"].apply(lambda x: x[0:2]) == "US"]

# US_only_station.to_csv("us_station_codes.csv")

In [None]:
# Load the US-only station table from the NOAA folder. The first CSV column is the saved
# row index, and the columns "a" and "b" are discarded because nothing below uses them.
# os.path.join replaces the hard-coded Windows '\' separator.
us_station_codes = pd.read_csv(
    os.path.join(NOAA_dir, "us_station_codes.csv"), index_col=0
).drop(columns=["a", "b"])
us_station_codes.head(20)

# Cleaning Weather Data and Joining Station Coords

In [None]:
# One gzipped yearly NOAA file per year, 1989 through 2003 inclusive
NOAA_files = [str(year) + ".csv.gz" for year in range(1989, 2004)]

In [None]:
# Column names assigned to the header-less yearly NOAA files
headers = ["StationId", "Date", "Measurement", "Value", "Flag1", "Flag2", "Flag3", "Flag4"]
# Only these measurement types are kept
desired_measurements = ["TMAX", "TMIN", "PRCP", "SNOW", "SNWD"]

for file in NOAA_files:
    # The filtered file is written next to the raw yearly file. Previously the skip check
    # looked in NOAA_dir while the export went to the working directory, so an already
    # converted year was never detected unless the file was moved by hand.
    filtered_path = os.path.join(NOAA_dir, file[0:4] + "_filtered.csv")
    if os.path.exists(filtered_path):
        print("skipping "+ file + ", already converted")
        continue

    print("working on " + file)
    temp = pd.read_csv(os.path.join(NOAA_dir, file), names = headers)
    print("filtering stationid")
    # Keep stations whose id starts with "US"; na=False drops rows with a missing id
    temp = temp[temp["StationId"].str.startswith("US", na = False)]
    print("filtering measurements")
    temp = temp[temp["Measurement"].isin(desired_measurements)]
    print("dropping columns")
    temp = temp.drop(columns = ["Flag1", "Flag2", "Flag3", "Flag4"])
    print("joining coords")
    # Default inner merge: rows whose StationId is absent from us_station_codes are dropped
    temp = pd.merge(temp, us_station_codes, on = "StationId")
    print("exporting")
    temp.to_csv(filtered_path)

print("Done!")
    

# EPA Data

In [None]:
# os.listdir returns entries in arbitrary order, but a later cell loads EPA_Zip_Files[0]
# as the 1989 file. Sorting makes the order deterministic.
# NOTE(review): this assumes the file names sort chronologically — confirm against the folder.
EPA_Zip_Files = sorted(os.listdir(os.path.join(EPA_dir, "Raw EPA Data")))
EPA_Zip_Files

### Testing first with Ozone 1989

In [None]:
# Load the first raw EPA file. The previous raw string contained doubled backslashes,
# which put literal "\\" into the path and only resolved on Windows.
ozone_1989 = pd.read_csv(os.path.join(EPA_dir, "Raw EPA Data", EPA_Zip_Files[0]))

In [None]:
# Preview the first rows of the loaded ozone table
ozone_1989.head()

In [None]:
# An air quality station is uniquely identified by its (State Code, County Code, Site Num) triple
def create_ozone_id(statelist, countylist, sitelist):
    """
    Helper that builds ozoneID values from three parallel sequences of an EPA dataset.

    statelist, countylist, sitelist: iterables read in lockstep, one entry per row

    returns: list of (state, county, site) tuples, ready to be stored as a column;
    the list is as long as the shortest of the three inputs
    """
    return list(zip(statelist, countylist, sitelist))

def append_ozone_id(ozone_df):
    """
    Returns a new dataframe with an added "ozoneID" column, leaving the input untouched.

    Each ozoneID is the (State Code, County Code, Site Num) tuple of its row, giving every
    ozone reporting location in that year its own identifier.

    returns: dataframe with ozoneID
    """
    station_ids = create_ozone_id(
        ozone_df["State Code"], ozone_df["County Code"], ozone_df["Site Num"]
    )
    # assign builds the column on a copy, so the caller's frame is not modified
    return ozone_df.assign(ozoneID=station_ids)

In [None]:
# Add the ozoneID column; append_ozone_id works on a copy, so the name is rebound to the returned frame
ozone_1989 = append_ozone_id(ozone_1989)

# Joining EPA data with NOAA data

- First, find the 10 closest weather stations for each EPA ozone location. This has to be done per year, because the set of ozone stations can change over time.

- Save these closest weather stations in a dictionary keyed by ozoneID so that we can look them up in the future.

Run a for loop over each year of the EPA data and build one such dictionary per year, with an entry for every ozoneID found in that year's data.

In [None]:
def get_ozoneID_coords(ozone_df):
    """
    Looks up one (Latitude, Longitude) pair per ozoneID.

    Groups the dataframe by ozoneID and keeps the Latitude and Longitude of the first row
    in each group.

    ozone_df: dataframe containing the columns 'ozoneID', 'Latitude' and 'Longitude'

    returns: a dataframe indexed by ozoneID with the columns 'Latitude' and 'Longitude'

    raises: KeyError (a subclass of Exception, so existing `except Exception` handlers still
    catch it) naming whichever required columns are missing

    NOTE: WE CAN GET ALL UNIQUE OZONE ID FROM THIS OUTPUT'S INDEX using output.index
    """
    required_columns = ["ozoneID", "Latitude", "Longitude"]
    missing = [column for column in required_columns if column not in ozone_df.columns]
    if missing:
        raise KeyError("ozone_df is missing required column(s): " + ", ".join(missing))
    # Select the coordinate columns before applying so the grouping column is never handed to the lambda
    return ozone_df.groupby("ozoneID")[["Latitude", "Longitude"]].apply(lambda gr: gr.iloc[0, :])

In [None]:
#make a dictionary after finding the closest weather station and then figure out from there
# Reuse get_ozoneID_coords rather than repeating its groupby, so the required-column check also runs here
lat_long_ozone_1989 = get_ozoneID_coords(ozone_1989)
lat_long_ozone_1989

For every ozone observation location, we take its latitude and longitude, calculate the Vincenty distance to every weather station, sort the stations by distance, and return the 10 closest.

In [None]:
from vincenty import vincenty_inverse

def get_closest_stations(lat_long_pair, n_closest=10, stations=None):
    """
    Finds the weather stations nearest to a single ozone monitoring location.

    lat_long_pair: Series/mapping with 'Latitude' and 'Longitude' entries for the ozone location
    n_closest: number of stations to return (defaults to 10, the value that used to be hard-coded)
    stations: dataframe with 'StationId', 'lat' and 'long' columns; defaults to the
        notebook-level us_station_codes table

    returns: list of [StationId, vincenty_dist] pairs ordered from nearest to farthest
    """
    print("working on " + str(lat_long_pair))
    if stations is None:
        stations = us_station_codes
    ozone_point = (lat_long_pair["Latitude"], lat_long_pair["Longitude"])
    # NOTE(review): distances are in whatever unit vincenty_inverse returns
    # (believed to be kilometres by default in the vincenty package) — confirm
    distances = [
        vincenty_inverse((station_lat, station_long), ozone_point)
        for station_lat, station_long in zip(stations["lat"], stations["long"])
    ]
    # Only the id and the distance are needed, so the full station table is no longer copied per call
    nearest = (
        stations[["StationId"]]
        .assign(vincenty_dist=distances)
        .sort_values("vincenty_dist")
        .head(n_closest)
    )
    return nearest.values.tolist()

In [None]:
# Map every ozoneID to its nearest weather stations.
# lat_long_ozone_1989 is a dataframe with one row per ozoneID, so `frame[ozoneID]` is treated
# as a column lookup and raises KeyError. iterrows yields each row label (the ozoneID) together
# with that row's Latitude/Longitude Series, which is what get_closest_stations expects.
closest_stations = {}
for ozone_station, ozone_station_coord in lat_long_ozone_1989.iterrows():
    closest_stations[ozone_station] = get_closest_stations(ozone_station_coord)
    

In [None]:
# Display the ozoneID -> nearest-stations mapping built by the loop above
closest_stations

In [None]:
# Turn the mapping into a dataframe: one column per ozoneID key, one row per ranked station,
# each cell holding the [StationId, vincenty_dist] pair returned by get_closest_stations
test = pd.DataFrame(closest_stations)

In [None]:
# Save the table to the current working directory so it can be reloaded later
test.to_csv("closest_stations_1989.csv")

In [None]:
# closest stations maps each ozone code to a specific weather station based on longitude and latitude
# header=[0,1,2] reads the first three rows as a three-level column header (presumably the
# state / county / site parts of each ozoneID — confirm); index_col=0 restores the saved row index
# NOTE(review): each cell was a [StationId, distance] list when saved — check whether it comes
# back from the CSV as a string that still needs parsing
closest_1989 = pd.read_csv("closest_stations_1989.csv", header = [0,1,2], index_col = 0)

In [None]:
# Convert the reloaded table back to a dictionary (default orientation: {column -> {index -> value}})
closest_1989.to_dict()