In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from ipywidgets import interact
import ipywidgets as widgets
from ipywidgets import interact, FloatSlider
from IPython.display import display, clear_output

In [2]:
# Read the three sheets in one pass over the workbook instead of re-opening the
# file once per sheet. With a list for `sheet_name`, read_excel returns a dict of
# DataFrames keyed by the requested sheet positions.
WORKBOOK_PATH = 'hurricane_irene.xlsx'
DO_SHEET, TURBIDITY_SHEET, RAINFALL_SHEET = 5, 2, 1
sheets = pd.read_excel(WORKBOOK_PATH, sheet_name=[DO_SHEET, TURBIDITY_SHEET, RAINFALL_SHEET])

# Drop the Piermont column from each sheet, leaving the Port of Albany and
# Norrie Point readings. The column labels must match the workbook headers
# exactly, including the double space in the rainfall header.
do_data = sheets[DO_SHEET].drop(columns=['Piermont D.O. (ppm)'])
turbidity_data = sheets[TURBIDITY_SHEET].drop(columns=['Piermont Turbidity in NTU'])
rainfall_data = sheets[RAINFALL_SHEET].drop(columns=['Piermont  Rainfall Daily Accumulation (Inches)'])

In [3]:
# Preview the first five dissolved-oxygen rows
do_data.head(n=5)

Unnamed: 0,Date Time (ET),Port of Albany D.O. (ppm),Norrie Point D.O. (ppm)
0,2011-08-25 00:00:00,7.68,7.81
1,2011-08-25 00:15:00,7.6,7.73
2,2011-08-25 00:30:00,7.57,7.63
3,2011-08-25 00:45:00,7.72,7.67
4,2011-08-25 01:00:00,7.74,7.63


In [4]:
# Preview the first five turbidity rows
turbidity_data.head(n=5)

Unnamed: 0,Date Time (ET),Port of Albany Turbidity in NTU,Norrie Point Turbidity in NTU
0,2011-08-25 00:00:00,4.0,9.3
1,2011-08-25 00:15:00,3.9,8.4
2,2011-08-25 00:30:00,4.3,7.9
3,2011-08-25 00:45:00,4.7,8.1
4,2011-08-25 01:00:00,4.4,8.4


In [5]:
# Preview the first five rainfall rows
rainfall_data.head(n=5)

Unnamed: 0,Date Time (ET),Port of Albany Rainfall Daily Accumulation (Inches),Norrie Point Rainfall Daily Accumulation (Inches)
0,2011-08-25 00:00:00,0.0,0.0
1,2011-08-25 00:15:00,0.0,0.0
2,2011-08-25 00:30:00,0.0,0.0
3,2011-08-25 00:45:00,0.0,0.0
4,2011-08-25 01:00:00,0.0,0.0


In [6]:
# Join dissolved oxygen and turbidity on the shared timestamp column. Naming the
# key explicitly (rather than relying on merge's default of "every column name
# the two frames have in common") keeps the join from silently changing if the
# sheets ever gain another identically named column. Inner join: only
# timestamps present in both frames are kept.
df = do_data.merge(turbidity_data, on='Date Time (ET)', how='inner')

In [7]:
# Add the rainfall readings. No key is given, so pandas joins on the column
# names the two frames share and keeps only the matching rows (inner join).
df = pd.merge(df, rainfall_data)

In [8]:
# Preview the first five rows of the merged frame
df.head(n=5)

Unnamed: 0,Date Time (ET),Port of Albany D.O. (ppm),Norrie Point D.O. (ppm),Port of Albany Turbidity in NTU,Norrie Point Turbidity in NTU,Port of Albany Rainfall Daily Accumulation (Inches),Norrie Point Rainfall Daily Accumulation (Inches)
0,2011-08-25 00:00:00,7.68,7.81,4.0,9.3,0.0,0.0
1,2011-08-25 00:15:00,7.6,7.73,3.9,8.4,0.0,0.0
2,2011-08-25 00:30:00,7.57,7.63,4.3,7.9,0.0,0.0
3,2011-08-25 00:45:00,7.72,7.67,4.7,8.1,0.0,0.0
4,2011-08-25 01:00:00,7.74,7.63,4.4,8.4,0.0,0.0


In [15]:
# Short snake_case aliases for the workbook's verbose column headers.
# NOTE(review): the leading spaces and the double space in some keys look
# deliberate (presumably they mirror the spreadsheet headers character for
# character) -- do not "tidy" them, rename silently skips keys that do not match.
column_aliases = {
    'Date Time (ET)': 'date',
    ' Port of Albany D.O. (ppm)': 'albany_DO',
    'Norrie Point D.O. (ppm)': 'norrie_DO',
    ' Port of Albany Turbidity in NTU': 'albany_turbidity',
    'Norrie Point Turbidity in NTU': 'norrie_turbidity',
    ' Port of Albany Rainfall Daily Accumulation (Inches)': 'albany_rainfall',
    'Norrie Point  Rainfall Daily Accumulation (Inches)': 'norrie_rainfall',
}
df = df.rename(columns=column_aliases)

In [16]:
# Preview the first five rows with the shortened column names
df.head(n=5)

Unnamed: 0,date,albany_DO,norrie_DO,albany_turbidity,norrie_turbidity,albany_rainfall,norrie_rainfall
0,2011-08-25 00:00:00,7.68,7.81,4.0,9.3,0.0,0.0
1,2011-08-25 00:15:00,7.6,7.73,3.9,8.4,0.0,0.0
2,2011-08-25 00:30:00,7.57,7.63,4.3,7.9,0.0,0.0
3,2011-08-25 00:45:00,7.72,7.67,4.7,8.1,0.0,0.0
4,2011-08-25 01:00:00,7.74,7.63,4.4,8.4,0.0,0.0


In [23]:
# Predictor and target columns for the baseline model
predictor_columns = ['albany_rainfall', 'albany_DO']
target_column = 'albany_turbidity'

# X has to be 2-D for scikit-learn's .fit/.predict. y is selected with a list
# as well, so it is a one-column DataFrame rather than a Series.
X = df[predictor_columns]
y = df[[target_column]]

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Create the model, then fit it on the training rows
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on the held-out rows
y_pred = model.predict(X_test)

In [28]:
# Root-mean-squared error and R^2 of the predictions on the held-out rows
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
print(rmse, r2)

221.9143474905527 0.490738951845751


In [35]:
# Column names offered by both selection widgets
column_choices = df.columns

# Multi-select list for the predictor columns; starts with the first column chosen
predictor_selector = widgets.SelectMultiple(
    description='Predictors',
    options=column_choices,
    value=[column_choices[0]],
)

# Dropdown for the single target column; starts on the second column
target_selector = widgets.Dropdown(
    description='Target',
    options=column_choices,
    value=column_choices[1],
)

# Button that triggers a model evaluation
evaluate_button = widgets.Button(description='Evaluate Model')

# Output area that receives the printed results
output = widgets.Output()

# Define the function to handle button clicks
def evaluate_model(b):
    """Fit a linear regression on the widget-selected columns and report its fit.

    Click handler for ``evaluate_button``. Reads the current selections from
    ``predictor_selector`` and ``target_selector``, fits ``LinearRegression`` on
    a 70/30 train/test split of ``df`` (fixed ``random_state``), and prints the
    test-set R^2 and RMSE into ``output``. Invalid selections produce a short
    message in ``output`` instead of a traceback.

    Parameters
    ----------
    b
        The object passed in by ``on_click``; not used.
    """
    with output:
        clear_output(wait=True)

        selected_predictors = list(predictor_selector.value)
        target = target_selector.value

        # An empty selection would hand scikit-learn a zero-column frame and
        # surface as a traceback inside the output area.
        if not selected_predictors:
            print('Select at least one predictor')
            return

        # Make sure the target is not in the predictors
        if target in selected_predictors:
            print('Target variable must not be in the predictors')
            return

        # Prepare the data. LinearRegression rejects missing values, so keep
        # only rows that are complete across every chosen column. When nothing
        # is missing this is the same set of rows, in the same order, as df.
        model_data = df[selected_predictors + [target]].dropna()
        n_dropped = len(df) - len(model_data)
        if n_dropped:
            print(f"Dropped {n_dropped} rows with missing values")
        if model_data.empty:
            print('No complete rows available for the selected columns')
            return

        X = model_data[selected_predictors]
        y = model_data[target]

        # Split data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

        # Create and fit the model
        model = LinearRegression().fit(X_train, y_train)

        # Predict on the test set and calculate R^2 and RMSE
        y_pred = model.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        r2 = r2_score(y_test, y_pred)

        # Display the R^2 score and RMSE
        print(f"R^2: {r2: .4f}")
        print(f"RMSE: {rmse: .4f}")
        
# Hook the click handler up to the button, then render the controls and the output area
evaluate_button.on_click(evaluate_model)
display(predictor_selector, target_selector, evaluate_button, output)

SelectMultiple(description='Predictors', index=(0,), options=('date', 'albany_DO', 'norrie_DO', 'albany_turbid…

Dropdown(description='Target', index=1, options=('date', 'albany_DO', 'norrie_DO', 'albany_turbidity', 'norrie…

Button(description='Evaluate Model', style=ButtonStyle())

Output()