In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from keras.models import Sequential
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import plotly.graph_objects as go
import joblib
import arcpy
import os

ModuleNotFoundError: No module named 'arcpy'

In [15]:
# process temporal features
df = pd.DataFrame(pd.read_csv('/Users/emilyzhao/li-sound/data/processed/processed-surface.csv'))
weather = pd.DataFrame(pd.read_csv('/Users/emilyzhao/li-sound/data/used/meteorological.csv'))   
df['datetime'] = pd.to_datetime(df['DATE'])
df['year'] = df['datetime'].dt.year
df['month'] = df['datetime'].dt.month
df['day'] = df['datetime'].dt.day
df['precipitation'] = weather['Precipitation_in']
df['temperature'] = weather['Air_Temp_C']
df['wind_speed'] = weather['Wind_Speed_MPH']


Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.



In [16]:
# create lag features
df['lag_1'] = df['DISSOLVED_OXYGEN_MG_L'].shift(1)
df['lag_2'] = df['DISSOLVED_OXYGEN_MG_L'].shift(2)
df.dropna(inplace=True)  # Drop rows with NaN values resulting from lag
df

Unnamed: 0.1,Unnamed: 0,DATE,STATION_ID,CLASS,DEPTH_M,DISSOLVED_OXYGEN_MG_L,TEMPERATURE_C,SALINITY,LON,LAT,datetime,year,month,day,precipitation,temperature,wind_speed,lag_1,lag_2
2,4,07/25/91 00:00:00,A3,Surface,1.0,5.0,18.5,24.0,-73.755000,40.841667,1991-07-25,1991,7,25,0.120000,27.777778,9.366667,3.2,7.0
3,6,07/25/91 00:00:00,A4,Surface,1.0,6.1,19.0,24.5,-73.735000,40.876389,1991-07-25,1991,7,25,0.120000,27.777778,9.366667,5.0,3.2
4,8,07/25/91 00:00:00,H-D,Surface,1.0,5.2,23.0,31.0,-73.660000,40.845000,1991-07-25,1991,7,25,0.120000,27.777778,9.366667,6.1,5.0
5,10,07/25/91 00:00:00,H-C,Surface,1.0,8.5,22.5,31.0,-73.675000,40.865000,1991-07-25,1991,7,25,0.120000,27.777778,9.366667,5.2,6.1
6,12,07/25/91 00:00:00,H-C1,Surface,1.0,6.8,21.5,26.0,-73.695000,40.886667,1991-07-25,1991,7,25,0.120000,27.777778,9.366667,8.5,5.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4493,15234,09/13/10 00:00:00,DI1,Surface,1.0,4.2,21.5,28.2,-73.773333,40.892500,2010-09-13,2010,9,13,0.143333,18.004082,3.038776,4.3,4.7
4494,18915,02/22/18 00:00:00,A1,Surface,1.0,7.0,22.0,30.0,-73.826667,40.803333,2018-02-22,2018,2,22,0.143333,18.004082,3.038776,4.2,4.3
4495,18917,02/22/18 00:00:00,A2M,Surface,1.0,3.2,18.5,20.0,-73.783333,40.801667,2018-02-22,2018,2,22,0.000000,22.882500,3.880000,7.0,4.2
4496,18919,06/01/18 00:00:00,A1,Surface,1.0,7.0,22.0,30.0,-73.826667,40.803333,2018-06-01,2018,6,1,0.000000,22.882500,3.880000,3.2,7.0


In [17]:
features = ['year', 'month', 'day', 'LAT', 'LON', 'lag_1', 'lag_2', 'precipitation', 'temperature', 'wind_speed']
X = df[features]
y = df['DISSOLVED_OXYGEN_MG_L']


In [18]:
# Split the data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, shuffle=False)

In [19]:
# Train Random Forest model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [20]:
y_pred = model.predict(X_valid)
rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
print(f'Validation RMSE: {rmse}')

Validation RMSE: 1.7274604774953672


In [21]:
joblib.dump(model, 'random_forest_model.pkl')

['random_forest_model.pkl']

In [22]:
# Create a trace for historical data
trace1 = go.Scatter(
    x=X.index,
    y=y,
    mode='lines',
    name='Historical Data'
)

# Create a trace for predicted data
trace2 = go.Scatter(
    x=X_valid.index,
    y=y_pred,
    mode='lines',
    name='Predicted Data'
)

# Create a trace for future predictions
future_dates = pd.date_range(start=X_valid.index[-1], periods=3, freq='M')
future_predictions = pd.DataFrame({'datetime': future_dates, 'predicted_value': model.predict(X_valid.tail(3))})
trace3 = go.Scatter(
    x=future_predictions['datetime'],
    y=future_predictions['predicted_value'],
    mode='lines',
    name='Future Predictions'
)

# Create the layout for the graph
layout = go.Layout(
    title='Historical and Predicted Data',
    xaxis=dict(title='Date'),
    yaxis=dict(title='Dissolved Oxygen (mg/L)')
)

# Create the figure and add the traces
fig = go.Figure(data=[trace1, trace2, trace3], layout=layout)



'M' is deprecated and will be removed in a future version, please use 'ME' instead.



In [23]:
fig.show()