# Base

In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px

Download the dataset [here](https://www.kaggle.com/datasets/d4rklucif3r/full-scale-waste-water-treatment-plant-data)

In [17]:
df = pd.read_csv('/content/Data-Melbourne_F_fixed.csv', index_col = 'Unnamed: 0')
df.head()

Unnamed: 0,Average Outflow,Average Inflow,Energy Consumption,Ammonia,Biological Oxygen Demand,Chemical Oxygen Demand,Total Nitrogen,Average Temperature,Maximum temperature,Minimum temperature,Atmospheric pressure,Average humidity,Total rainfall,Average visibility,Average wind speed,Maximum wind speed,Year,Month,Day
0,2.941,2.589,175856.0,27.0,365.0,730.0,60.378,19.3,25.1,12.6,0.0,56.0,1.52,10.0,26.9,53.5,2014.0,1.0,1.0
1,2.936,2.961,181624.0,25.0,370.0,740.0,60.026,17.1,23.6,12.3,0.0,63.0,0.0,10.0,14.4,27.8,2014.0,1.0,2.0
2,2.928,3.225,202016.0,42.0,418.0,836.0,64.522,16.8,27.2,8.8,0.0,47.0,0.25,10.0,31.9,61.1,2014.0,1.0,5.0
3,2.928,3.354,207547.0,36.0,430.0,850.0,63.0,14.6,19.9,11.1,0.0,49.0,0.0,10.0,27.0,38.9,2014.0,1.0,6.0
4,2.917,3.794,202824.0,46.0,508.0,1016.0,65.59,13.4,19.1,8.0,0.0,65.0,0.0,10.0,20.6,35.2,2014.0,1.0,7.0


# Response 1

In [30]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import plotly.express as px
import plotly.graph_objects as go
from sklearn.metrics import precision_score

# Split data into features and target variables
X = df.drop(['Chemical Oxygen Demand', 'Biological Oxygen Demand', 'Ammonia', 'Energy Consumption', 'Total Nitrogen'], axis=1)
y = df[['Chemical Oxygen Demand', 'Biological Oxygen Demand', 'Ammonia', 'Energy Consumption', 'Total Nitrogen']]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define and train the MultiOutputRegressor model
multi_output_regressor = MultiOutputRegressor(RandomForestRegressor())
multi_output_regressor.fit(X_train, y_train)

# Define and train the dense neural network model
dense_model = Sequential()
dense_model.add(Dense(64, activation='relu', input_shape=(14,)))
dense_model.add(Dense(32, activation='relu'))
dense_model.add(Dense(5))
dense_model.compile(loss='mean_squared_error', optimizer='adam')
dense_model.fit(X_train, y_train, epochs=100, verbose=0)

# Make predictions with both models
y_pred_multi_output = multi_output_regressor.predict(X_test)
y_pred_dense = dense_model.predict(X_test)

# Calculate RMSE for both models
rmse_multi_output = np.sqrt(mean_squared_error(y_test, y_pred_multi_output))
rmse_dense = np.sqrt(mean_squared_error(y_test, y_pred_dense))

print(f"RMSE for MultiOutputRegressor: {rmse_multi_output}")
print(f"RMSE for Dense Model: {rmse_dense}")

# Calculate precision matrix for both models
precision_multi_output = precision_score(np.where(y_test > np.mean(y_test, axis=0), 1, 0), np.where(y_pred_multi_output > np.mean(y_test, axis=0), 1, 0), average=None)
precision_dense = precision_score(np.where(y_test > np.mean(y_test, axis=0), 1, 0), np.where(y_pred_dense > np.mean(y_test, axis=0), 1, 0), average=None)

print(f"Precision Matrix for MultiOutputRegressor: {precision_multi_output}")
print(f"Precision Matrix for Dense Model: {precision_dense}")

# Plot the predicted values for both models
fig = go.Figure()
for i, col in enumerate(y_test.columns):
    fig.add_trace(go.Scatter(x=y_test.index, y=y_test[col], name=f"Actual {col}"))
    fig.add_trace(go.Scatter(x=y_test.index, y=y_pred_multi_output[:, i], name=f"Predicted {col} by MultiOutputRegressor"))
    fig.add_trace(go.Scatter(x=y_test.index, y=y_pred_dense[:, i], name=f"Predicted {col} by Dense Model"))
fig.update_layout(title="Predicted Values for Both Models", xaxis_title="Index", yaxis_title="Value")
fig.show()

RMSE for MultiOutputRegressor: 68.79306846978507
RMSE for Dense Model: 74.1919259787198


ValueError: ('Lengths must match to compare', (5,), (277, 5))

# Response 2

In [32]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import precision_score
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import numpy as np
import plotly.graph_objects as go
from sklearn.metrics import precision_score

# Assuming df is your DataFrame and the last 5 columns are your target variables
X = df.drop(['Chemical Oxygen Demand', 'Biological Oxygen Demand', 'Ammonia', 'Energy Consumption', 'Total Nitrogen'], axis=1)
y = df[['Chemical Oxygen Demand', 'Biological Oxygen Demand', 'Ammonia', 'Energy Consumption', 'Total Nitrogen']]
# X = df.iloc[:, :-5]
# y = df.iloc[:, -5:]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train MultiOutputRegressor model
multi_output_regressor = MultiOutputRegressor(RandomForestRegressor())
multi_output_regressor.fit(X_train_scaled, y_train)

# Train dense neural network model
dense_model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    Dense(32, activation='relu'),
    Dense(5)
])
dense_model.compile(optimizer='adam', loss='mean_squared_error')
dense_model.fit(X_train_scaled, y_train, epochs=100, verbose=0)

# Make predictions
y_pred_multi_output_regressor = multi_output_regressor.predict(X_test_scaled)
y_pred_dense_model = dense_model.predict(X_test_scaled)

# Evaluate models
rmse_multi_output_regressor = np.sqrt(mean_squared_error(y_test, y_pred_multi_output_regressor, multioutput='raw_values'))
rmse_dense_model = np.sqrt(mean_squared_error(y_test, y_pred_dense_model, multioutput='raw_values'))

precision_multi_output_regressor = np.array([precision_score(np.sign(y_test.iloc[:, i]), np.sign(y_pred_multi_output_regressor[:, i]), average='macro') for i in range(5)])
precision_dense_model = np.array([precision_score(np.sign(y_test.iloc[:, i]), np.sign(y_pred_dense_model[:, i]), average='macro') for i in range(5)])

print("RMSE Multi Output Regressor: ", rmse_multi_output_regressor)
print("RMSE Dense Model: ", rmse_dense_model)

print("Precision Multi Output Regressor: ", precision_multi_output_regressor)
print("Precision Dense Model: ", precision_dense_model)

# Plot results
fig = go.Figure(data=[
    go.Bar(name='Multi Output Regressor', x=['COD', 'BOD', 'Ammonia', 'Energy Consumption', 'Total Nitrogen'], y=rmse_multi_output_regressor),
    go.Bar(name='Dense Model', x=['COD', 'BOD', 'Ammonia', 'Energy Consumption', 'Total Nitrogen'], y=rmse_dense_model)
])
fig.update_layout(title='RMSE Comparison', xaxis_title='Target Variable', yaxis_title='RMSE')
fig.show()

fig = go.Figure(data=[
    go.Bar(name='Multi Output Regressor', x=['COD', 'BOD', 'Ammonia', 'Energy Consumption', 'Total Nitrogen'], y=precision_multi_output_regressor),
    go.Bar(name='Dense Model', x=['COD', 'BOD', 'Ammonia', 'Energy Consumption', 'Total Nitrogen'], y=precision_dense_model)
])
fig.update_layout(title='Precision Comparison', xaxis_title='Target Variable', yaxis_title='Precision')
fig.show()

RMSE Multi Output Regressor:  [123.29025666  84.14862611   7.82625642  37.7161033    3.39720547]
RMSE Dense Model:  [135.6792619   91.4015365    8.60676449  44.55949938   5.29254521]
Precision Multi Output Regressor:  [1. 1. 1. 1. 1.]
Precision Dense Model:  [1. 1. 1. 1. 1.]
