In [None]:
import os
import pandas as pd
import numpy as np
import regex as re
from datetime import datetime

import ast

# Source code downloaded
from vincenty import vincenty_inverse
from tqdm import tqdm
pd.set_option('display.max_columns', 100)

In [None]:
# Resolve every data directory relative to the notebook's working directory.
# os.path.join is used instead of hand-written '\' or '/' separators so the
# same cell works on Windows, macOS and Linux.
current_dir = os.getcwd()
NOAA_dir = os.path.join(current_dir, 'NOAA Weather Data')
EPA_dir = os.path.join(current_dir, 'EPA Ozone Data')

# The empty last component keeps a trailing separator, because later cells
# build file paths as `filtered_EPA_dir + file`.
filtered_EPA_dir = os.path.join(EPA_dir, 'Filtered EPA Data', 'second_cleaning', '')
# The compressed NOAA files sit in the same folder as the other NOAA data.
compressed_NOAA_dir = NOAA_dir

In [None]:
# os.listdir returns entries in arbitrary order; sort them so positional lookups
# in later cells (e.g. filtered_EPA_files[1]) pick the same file on every run.
filtered_EPA_files = sorted(
    file for file in os.listdir(filtered_EPA_dir) if file.endswith(".csv")
)
filtered_EPA_files

In [None]:
# os.listdir returns entries in arbitrary order; sort them so positional lookups
# in later cells (e.g. noaa_files[1]) pick the same file on every run.
noaa_files = sorted(
    file for file in os.listdir(compressed_NOAA_dir) if file.endswith(".csv.gz")
)
noaa_files

In [None]:
# Station table used for the nearest-station search below.
station_codes_path = NOAA_dir + '/us_station_codes.csv'
us_station_codes = pd.read_csv(station_codes_path)

# Find the 10 closest weather stations by Vincenty distance, then look up the weather at those stations

- Don't extract the NOAA data because it is too large; the compressed `.csv.gz` files are read directly.

In [None]:
def get_ozoneID_coords(ozone_df):
    """
    Look up the coordinates of every ozone monitor in `ozone_df`.

    For each distinct ozoneID, the Latitude/Longitude of its first row (in the
    frame's current row order) is used.

    ozone_df: DataFrame containing the columns "ozoneID", "Latitude" and "Longitude"

    returns: a DataFrame indexed by ozoneID (sorted) with the columns 'Latitude' and
    'Longitude'; `output.loc[some_id]` gives a row that can be indexed with those keys

    raises: ValueError (a subclass of Exception) naming the missing column(s)

    NOTE: WE CAN GET ALL UNIQUE OZONE ID FROM THIS OUTPUT'S INDEX using output.index
    """
    # First check that the required columns are there, otherwise report which are not.
    required = ["ozoneID", "Latitude", "Longitude"]
    missing = [column for column in required if column not in ozone_df.columns]
    if missing:
        raise ValueError(
            "one of the columns needed in ozoneID, Latitude, Longitude was missing: "
            + ", ".join(missing)
        )

    # Vectorised "first row per ozoneID": rows without an ozoneID are dropped
    # (groupby skips missing keys as well), the first occurrence of each ID is
    # kept, and the result is sorted by ID like a groupby result.
    return (
        ozone_df.dropna(subset=["ozoneID"])
        .drop_duplicates(subset="ozoneID", keep="first")
        .set_index("ozoneID")[["Latitude", "Longitude"]]
        .sort_index()
    )

In [None]:
from vincenty import vincenty_inverse

def get_closest_stations(lat_long_pair, n_closest=10):
    """
    Rank the stations in `us_station_codes` by Vincenty distance to one ozone monitor.

    lat_long_pair: mapping/Series with the keys "Latitude" and "Longitude"
    n_closest: how many of the nearest stations to return (default 10)

    returns: list of [StationId, vincenty_dist] pairs, nearest station first

    Reads the notebook-level `us_station_codes` frame (columns "StationId", "lat",
    "long"); the frame itself is not modified.
    """
    ozone_point = (lat_long_pair["Latitude"], lat_long_pair["Longitude"])

    # Compute the distances straight from the station coordinates; there is no
    # need to copy the whole station table and broadcast the monitor's
    # coordinates into two extra columns first.
    distances = [
        vincenty_inverse((station_lat, station_long), ozone_point)
        for station_lat, station_long in zip(us_station_codes["lat"], us_station_codes["long"])
    ]

    nearest = (
        us_station_codes[["StationId"]]
        .assign(vincenty_dist=distances)
        .sort_values("vincenty_dist")
        .iloc[:n_closest]
    )
    return nearest.values.tolist()

In [None]:
# For every cleaned EPA file, find the nearest NOAA stations of each ozone monitor
# and cache the result as a CSV, so the slow distance search runs only once per file.
closest_stations_dir = NOAA_dir + "/closest_stations"
os.makedirs(closest_stations_dir, exist_ok=True)  # on a first run the folder may not exist yet

for file in filtered_EPA_files:
    output_path = closest_stations_dir + "/closest_stations_" + file
    if os.path.exists(output_path):
        print("closest_stations for " + file + " already found, skipping")
        continue

    print("working on " + file)
    temp = pd.read_csv(filtered_EPA_dir + file, low_memory = False, index_col = 0)
    temp_lat_long = get_ozoneID_coords(temp)

    # One entry (later: one CSV column) per ozone monitor, holding the list of
    # [StationId, distance] pairs returned by get_closest_stations.
    closest_stations = {
        ozone_station: get_closest_stations(temp_lat_long.loc[ozone_station])
        for ozone_station in tqdm(temp_lat_long.index)
    }

    output = pd.DataFrame(closest_stations)
    output.to_csv(output_path)

    
    

    

# Getting the closest weather station data

In [None]:
# os.listdir returns entries in arbitrary order; sort them so positional lookups
# in later cells (e.g. closest_noaa[1]) pick the same file on every run.
closest_noaa = sorted(
    file for file in os.listdir(NOAA_dir + "/closest_stations") if file.endswith(".csv")
)
closest_noaa

In [None]:
# Maps the positional column labels 0..9 to the names "station_0".."station_9".
column_name = {rank: "station_" + str(rank) for rank in range(10)}

In [None]:
# Load one cached nearest-station table.
# NOTE(review): index 1 is presumably the 1989 file -- confirm against `closest_noaa`.
closest_1989 = pd.read_csv(NOAA_dir + "/closest_stations/" + closest_noaa[1], index_col = 0)
closest_1989 = (
    # Transpose so that each row is one ozone monitor and each column one rank.
    closest_1989.T
    # Every cell is the text of a [StationId, distance] list: parse it and keep
    # only the StationId. A column-wise Series.map does this in a single pass and
    # avoids DataFrame.applymap, which is deprecated since pandas 2.1.
    .apply(lambda column: column.map(lambda cell: ast.literal_eval(cell)[0]))
    .rename(columns = column_name)
)
closest_1989

# Cleaning NOAA Data

In [None]:
# Column names are supplied explicitly via `names=`, so pandas treats the first
# line of the file as data rather than as a header.
header_list = ["Code", "Date", "Measurement", "Value", "V1", "V2", "V3", "V4"]
noaa_1989 = pd.read_csv(NOAA_dir + "/" + noaa_files[1], names = header_list)
# errors='coerce' replaces the deprecated errors='ignore': 'ignore' returned the raw,
# unparsed column whenever a single value failed to parse, which would silently break
# the later joins on "Datetime". With 'coerce' the column is always datetime64 and
# unparseable dates become NaT.
noaa_1989["Datetime"] = pd.to_datetime(noaa_1989["Date"], format='%Y%m%d', errors='coerce')

In [None]:
# Preview the first rows of the parsed NOAA table.
noaa_1989.head()

In [None]:
# Keep only the rows whose station code begins with "US".
is_us_station = noaa_1989["Code"].str.startswith("US")
us_noaa_1989 = noaa_1989[is_us_station]

In [None]:
# PRCP rows only: one row per (Code, Datetime) with the reading in a "RAIN" column.
us_noaa_1989_prcp = (
    us_noaa_1989.loc[us_noaa_1989["Measurement"] == "PRCP"]
    .rename(columns = {"Value": "RAIN"})
    .drop(columns = ["Measurement", "V1", "V2", "V3", "V4", "Date"])
)


In [None]:
# SNOW rows only, with the reading kept in a "SNOW" column.
us_noaa_1989_snow = (
    us_noaa_1989.loc[us_noaa_1989["Measurement"] == "SNOW"]
    .rename(columns = {"Value": "SNOW"})
    .drop(columns = ["Measurement", "V1", "V2", "V3", "V4", "Date"])
)


In [None]:
# TMAX rows only, with the reading kept in a "TMAX" column.
us_noaa_1989_tmax = (
    us_noaa_1989.loc[us_noaa_1989["Measurement"] == "TMAX"]
    .rename(columns = {"Value": "TMAX"})
    .drop(columns = ["Measurement", "V1", "V2", "V3", "V4", "Date"])
)


In [None]:
# TMIN rows only, with the reading kept in a "TMIN" column.
us_noaa_1989_tmin = (
    us_noaa_1989.loc[us_noaa_1989["Measurement"] == "TMIN"]
    .rename(columns = {"Value": "TMIN"})
    .drop(columns = ["Measurement", "V1", "V2", "V3", "V4", "Date"])
)


# Testing the join with the 1989 data

In [None]:
# Load one cleaned EPA file and drop the leftover "Unnamed: 0.1" column.
# NOTE(review): index 1 is presumably the 1989 file -- confirm against `filtered_EPA_files`.
epa_1989 = pd.read_csv(filtered_EPA_dir + filtered_EPA_files[1], low_memory = False, index_col = 0).drop(columns = ["Unnamed: 0.1"])
# errors='coerce' replaces the deprecated errors='ignore': 'ignore' returned the raw
# strings whenever a single value failed to parse, which would silently break the
# later joins on "Datetime". With 'coerce' the column is always datetime64 and
# unparseable dates become NaT.
epa_1989["Datetime"] = pd.to_datetime(epa_1989["Date Local"], format='%Y-%m-%d', errors='coerce')
epa_1989

In [None]:
# Attach each monitor's nearest-station IDs (the columns of closest_1989) to its EPA rows.
# Station columns left over from a previous run of this cell are dropped first, so
# re-running it does not produce suffixed duplicates (station_0_x / station_0_y).
epa_1989 = (
    epa_1989.drop(columns = closest_1989.columns, errors = "ignore")
    .merge(closest_1989, left_on = "ozoneID", right_index = True)
)

In [None]:
# Inspect the EPA rows after the nearest-station columns were merged in.
epa_1989

In [None]:
# Pick the join inputs by name instead of by position (previously iloc[:, 14:25]):
# the date plus the ten nearest-station columns. Name-based selection keeps working
# if the EPA file gains, loses or reorders columns.
station_columns = ["station_" + str(rank) for rank in range(10)]
need_weather = epa_1989[["Datetime"] + station_columns]

In [None]:
# Inspect the columns that will be joined against the weather table.
need_weather

In [None]:
# Inspect the full NOAA table for comparison with the join inputs.
noaa_1989

In [None]:
# Look up TMAX on the same date at each of the ten nearest stations, adding one
# TMAX_<rank> column per station rank.
#
# Fix: the hand-written chain dropped "Datetime" after every merge, so from the
# second merge on the "Datetime" join key no longer existed. Only the helper
# "Code" column is dropped now; "Datetime" stays, so the result has 11 leading
# columns (Datetime + station_0..9) followed by TMAX_0..TMAX_9.
ugly = need_weather
for station_rank in range(10):
    ugly = (
        ugly.merge(
            us_noaa_1989_tmax,
            left_on = ["Datetime", "station_" + str(station_rank)],
            right_on = ["Datetime", "Code"],
            how = "left",  # keep every EPA row even when the station has no reading
        )
        .drop(columns = ["Code"])
        .rename(columns = {"TMAX": "TMAX_" + str(station_rank)})
    )



We only have weather data for 139787 of 208268 records, far fewer than expected. TODO: investigate why so many records have no matching weather data.

In [None]:
# Inspect the result of the per-station TMAX joins.
ugly

In [None]:
# Number of rows in which exactly 10 values are missing.
missing_per_row = ugly.isna().sum(axis = 1)
(missing_per_row == 10).sum()

In [None]:
# Collect the ten per-station readings of each row into one list-valued "TMAX" column.
# The TMAX_0 ... TMAX_9 columns are selected by name (previously iloc[:, 11:]), so the
# result does not depend on how many columns precede them in `ugly`.
tmax_columns = ["TMAX_" + str(rank) for rank in range(10)]
epa_1989["TMAX"] = ugly[tmax_columns].values.tolist()

In [None]:
# Inspect the EPA rows with the new list-valued "TMAX" column.
epa_1989

In [None]:
def merge_epa_NOAA(noaa_df_dir, epa_df_dir, closest_noaa_dir):
    """
    Load and clean the inputs needed to join one EPA file with its NOAA weather data.

    noaa_df_dir: file name of the NOAA file inside NOAA_dir
    epa_df_dir: file name of the EPA file (not used by the steps visible here)
    closest_noaa_dir: file name of the cached nearest-station CSV inside
        NOAA_dir/closest_stations

    NOTE(review): the visible body only prepares the station table and the weather
    table; it does not read the EPA file or return anything -- confirm whether the
    function is unfinished.
    """
    # clean closest_noaa: one row per ozone monitor, columns station_0 .. station_9,
    # each cell reduced from the text of a [StationId, distance] list to the StationId
    column_name = {rank: "station_" + str(rank) for rank in range(10)}
    closest_noaa = pd.read_csv(NOAA_dir + "/closest_stations/" + closest_noaa_dir, index_col = 0)
    closest_noaa = (
        closest_noaa.T
        # column-wise Series.map avoids DataFrame.applymap (deprecated since pandas 2.1)
        .apply(lambda column: column.map(lambda cell: ast.literal_eval(cell)[0]))
        .rename(columns = column_name)
    )

    # clean weather data
    header_list = ["Code", "Date", "Measurement", "Value", "V1", "V2", "V3", "V4"]
    noaa_df = pd.read_csv(NOAA_dir + "/" + noaa_df_dir, names = header_list)
    # Fix: this line used to read from and write to the notebook-level `noaa_1989`
    # frame instead of the `noaa_df` that was just loaded.
    # errors='coerce' replaces the deprecated errors='ignore'; unparseable dates become NaT.
    noaa_df["Datetime"] = pd.to_datetime(noaa_df["Date"], format='%Y%m%d', errors='coerce')
    
