In [None]:
import os
import math
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns
from scipy.stats import zscore
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import export_graphviz
from sklearn.tree import DecisionTreeRegressor
import graphviz

In [None]:
#DEFINITIONS
def rmse(predicted, actual):
    """Return the root-mean-square error between two sets of values.

    The element-wise difference ``predicted - actual`` is squared, averaged
    with the result's own ``.mean()`` method, and the square root of that
    average is returned.  Both arguments must therefore support subtraction
    with each other and yield an object that has ``.mean()``.
    """
    residuals = predicted - actual
    mean_squared_error_value = (residuals ** 2).mean()
    return np.sqrt(mean_squared_error_value)

In [None]:
##READ IN WEATHER STATION DATA FROM ALL CSVs
# Every regular file in folder_path is read twice: once for the single row
# directly under the first header line (station fields) and once for the
# table that starts after the first three lines (weather fields).
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
folder_path = r'C:\Users\bryan.zanoli\OneDrive\Documents\School\CSUMB\CST383-30\project\station_data'

# Collect one frame per file and concatenate once at the end; calling
# pd.concat inside the loop re-copies every previously read row each time.
station_frames = []

for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)
    if os.path.isfile(file_path):
        df_station = pd.read_csv(file_path, nrows = 1, header = 0)
        df_weather = pd.read_csv(file_path, skiprows = 3)
        # A constant join key turns the merge into a cross join, so the single
        # station row is repeated on every weather row of the same file.
        df_weather['key'] = 1
        df_station['key'] = 1
        df_new = pd.merge(df_station, df_weather, on='key').drop('key', axis = 1)
        station_frames.append(df_new)

if not station_frames:
    # Fail with a clear message instead of a KeyError on the missing 'time' column below.
    raise FileNotFoundError('No station CSV files found in ' + folder_path)

# Last-listed file goes first; the row order determines the order of
# df_location derived below.
df_weatherdata = pd.concat(station_frames[::-1], axis = 0)

df_weatherdata['year'] = pd.to_datetime(df_weatherdata['time']).dt.year

print('Weather Data Read In: ',df_weatherdata.head(10))
df_weatherdata.info()

##CREATE coordinates DataFrame from df_weatherdata
# One row per distinct (latitude, longitude) pair found in the weather data.
df_location = df_weatherdata[['latitude', 'longitude']].drop_duplicates()
print('Location Data Read In: \n',df_location)

In [None]:
##CREATE coordinates DataFrame from df_weatherdata
# Keeps the first row of every distinct (latitude, longitude) pair and then
# narrows the frame to just those two columns.
# NOTE(review): the previous cell ends with the same derivation; one of the
# two copies can probably be removed.
df_location = df_weatherdata.drop_duplicates(subset = ['latitude', 'longitude'])[['latitude', 'longitude']]
print('Location Data Read In: \n', df_location)

In [None]:
##READ IN TEST DATA - Weather Station Data not reflected in rain stations
# Same file layout handling as the station data: the single row under the
# first header line holds the station fields, the table after the first
# three lines holds the weather fields.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
folder_path = r'C:\Users\bryan.zanoli\OneDrive\Documents\School\CSUMB\CST383-30\project\test_data_with_prec'

# Collect one frame per file and concatenate once; pd.concat inside the loop
# would re-copy all previously read rows on every iteration.
test_frames = []

for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)
    if os.path.isfile(file_path):
        df_station = pd.read_csv(file_path, nrows = 1, header = 0)
        df_weather = pd.read_csv(file_path, skiprows = 3)
        # Constant key -> cross join: the station row is attached to every weather row.
        df_weather['key'] = 1
        df_station['key'] = 1
        df_new = pd.merge(df_station, df_weather, on='key').drop('key', axis = 1)
        test_frames.append(df_new)

if not test_frames:
    # Fail with a clear message instead of a KeyError on the missing 'time' column below.
    raise FileNotFoundError('No test-data CSV files found in ' + folder_path)

# Last-listed file goes first, so the row that drop_duplicates keeps per
# (location, year) further down does not depend on this refactoring.
df_testdata = pd.concat(test_frames[::-1], axis = 0)

df_testdata['year'] = pd.to_datetime(df_testdata['time']).dt.year
df_testdata['location'] = df_testdata['latitude'].astype('str')+' '+df_testdata['longitude'].astype('str')

print('Test Data Read In: ',df_testdata.head(10))
df_testdata.info()

# (new column, source column, aggregation) - evaluated per (location, year).
# Aggregations are given by name: passing NumPy callables such as np.mean to
# transform() is deprecated in recent pandas, and the string names keep the
# current (groupby-native) behaviour.
annual_features = [
    ('temp_max_mean', 'temperature_2m_max (°F)', 'mean'),
    ('temp_min_mean', 'temperature_2m_min (°F)', 'mean'),
    ('temp_min_ann', 'temperature_2m_min (°F)', 'min'),
    ('temp_max_ann', 'temperature_2m_max (°F)', 'max'),
    ('temp_mean_ann', 'temperature_2m_mean (°F)', 'mean'),
    ('temp_mean_max', 'temperature_2m_mean (°F)', 'max'),
    ('temp_mean_min', 'temperature_2m_mean (°F)', 'min'),
    ('wind_speed_max_min', 'wind_speed_10m_max (mp/h)', 'min'),
    ('wind_speed_max_max', 'wind_speed_10m_max (mp/h)', 'max'),
    ('wind_speed_max_mean', 'wind_speed_10m_max (mp/h)', 'mean'),
    ('wind_gusts_max_min', 'wind_gusts_10m_max (mp/h)', 'min'),
    ('wind_gusts_max_max', 'wind_gusts_10m_max (mp/h)', 'max'),
    ('wind_gusts_max_mean', 'wind_gusts_10m_max (mp/h)', 'mean'),
    ('precipitation_hours_sum', 'precipitation_hours (h)', 'sum'),
    ('precipitation_hours_mean', 'precipitation_hours (h)', 'mean'),
    ('evapotranspiration_min', 'et0_fao_evapotranspiration (inch)', 'min'),
    ('evapotranspiration_max', 'et0_fao_evapotranspiration (inch)', 'max'),
    ('evapotranspiration_mean', 'et0_fao_evapotranspiration (inch)', 'mean'),
    ('precipitation_sum', 'precipitation_sum (inch)', 'sum'),
]

# Group once, compute every broadcast aggregate, then attach them in the
# listed order so the resulting column order is unchanged.
per_station_year = df_testdata.groupby(['location', 'year'])
annual_columns = {
    new_column: per_station_year[source_column].transform(how)
    for new_column, source_column, how in annual_features
}
df_testdata = df_testdata.assign(**annual_columns)

# The daily measurements are no longer needed once the annual aggregates exist.
df_testdata = df_testdata.drop(
    columns = ['time', 'temperature_2m_max (°F)', 'temperature_2m_min (°F)', 'temperature_2m_mean (°F)',
               'precipitation_hours (h)', 'wind_speed_10m_max (mp/h)', 'wind_gusts_10m_max (mp/h)',
               'et0_fao_evapotranspiration (inch)', 'rain_sum (inch)', 'precipitation_sum (inch)'])

# Spot check: every 500th row (by position) of the first 8000.
print(df_testdata['location'].iloc[0:8000:500])

# Every row of a (location, year) group now carries identical aggregates,
# so keeping the first row per group yields one record per station-year.
df_testdata = df_testdata.drop_duplicates(subset = ['location', 'year'])
df_testdata = df_testdata.reset_index(drop=True)

df_testdata.info()

print('Test Data Read In: ', df_testdata)

In [None]:
##READ IN RAIN STATION DATA
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
df_rain = pd.read_csv(r"C:\Users\bryan.zanoli\OneDrive\Documents\School\CSUMB\CST383-30\project\lwu-precip-data-to-2023_basic_flatfile_withcoordinates.csv", index_col=None)
df_rain.columns = df_rain.columns.str.strip()
df_rain.info()

# Coerce precipitation to float64. Placeholder entries such as '.' (and any
# other unparseable text) become NaN and are dropped together with rows that
# lack a precipitation value or a water year.
df_rain['TotalPrecipitation_inches'] = pd.to_numeric(
    df_rain['TotalPrecipitation_inches'], errors = 'coerce').astype('float64')
df_rain = df_rain.dropna(subset = ['TotalPrecipitation_inches', 'WaterYear'])
df_rain = df_rain.drop(columns = ['BeginGageServiceDate', 'EndGageServiceDate', 'Notes_FlaggedResults'])

df_rain.info()


##Find nearest WEATHER STATION to RAIN STATION
# x_coord is compared with longitude and y_coord with latitude using a plain
# squared Euclidean difference.
# NOTE(review): assumes x_coord / y_coord are in the same units as
# longitude / latitude - TODO confirm.
station_xy = df_location[['longitude', 'latitude']].to_numpy(dtype = 'float64')
# Stations without usable coordinates can never be the nearest one.
station_xy = station_xy[~np.isnan(station_xy).any(axis = 1)]
if len(station_xy) == 0:
    raise ValueError('df_location holds no usable coordinates to match rain stations against')

rain_xy = df_rain[['x_coord', 'y_coord']].to_numpy(dtype = 'float64')
has_coords = ~np.isnan(rain_xy).any(axis = 1)

# Squared distance from every rain record (rows) to every weather station (columns).
squared_distance = ((rain_xy[:, np.newaxis, :] - station_xy[np.newaxis, :, :]) ** 2).sum(axis = 2)
# argmin returns the first minimum, i.e. ties go to the station listed first.
nearest = squared_distance.argmin(axis = 1)
row_positions = np.arange(len(rain_xy))

# min_distance holds the squared distance in coordinate units (no square root
# is taken). Rain records without coordinates get NaN in all three columns
# rather than a made-up match.
df_rain['min_distance'] = squared_distance[row_positions, nearest]
df_rain['longitude'] = np.where(has_coords, station_xy[nearest, 0], np.nan)
df_rain['latitude'] = np.where(has_coords, station_xy[nearest, 1], np.nan)

print(df_rain[['StationName', 'x_coord', 'y_coord', 'longitude', 'latitude']])
print(df_weatherdata[['longitude', 'latitude']])