<a href="https://colab.research.google.com/github/empyreanlee/Classification_ML/blob/main/SeoulBikeRegression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import StandardScaler
import copy
import seaborn as sns
import tensorflow as tf
from sklearn.linear_model import LinearRegression

In [None]:
# Short column names for the frame; they are applied positionally in a later
# cell, so this assumes the remaining spreadsheet columns arrive in this order.
dataset_cols = [
    "bike_count", "hour", "temp", "humidity", "wind", "visibility",
    "dew_pt_temp", "radiation", "rain", "snow", "functional",
]

# Load the spreadsheet and discard the columns this analysis does not use.
unused_cols = ["Date", "Holiday", "Seasons"]
df = pd.read_excel("SeoulBikeData.xlsx").drop(columns=unused_cols)

In [None]:
# Apply the short column names, encode "functional" as 0/1, then keep only
# the rows recorded at hour 12 and drop the now-constant hour column.
df.columns = dataset_cols
df = (
    df.assign(functional=(df["functional"] == "Yes").astype(int))
      .loc[lambda frame: frame["hour"] == 12]
      .drop(columns=["hour"])
)

In [None]:
# One scatter plot per feature column (every column after the first)
# against the bike count.
for label in df.columns[1:]:
  fig, ax = plt.subplots()
  ax.scatter(df[label], df["bike_count"])
  ax.set_title(label)
  ax.set_ylabel("Bike Count at Noon")
  ax.set_xlabel(label)
  plt.show()

In [None]:
# Remove the features that are not used by any of the models below.
df = df.drop(columns=["wind", "visibility", "functional"])

In [None]:
# Preview the first five rows of the reduced frame.
df.head(n=5)

## Train, Validation, Test Split

In [None]:
# Shuffle once with a fixed seed so that Restart & Run All reproduces the same
# 60% / 20% / 20% train / validation / test split. Positional slicing with
# .iloc replaces np.split on a DataFrame, which goes through the deprecated
# DataFrame.swapaxes.
shuffled = df.sample(frac=1, random_state=42)
train_end = int(0.6 * len(shuffled))
valid_end = int(0.8 * len(shuffled))
train = shuffled.iloc[:train_end]
valid = shuffled.iloc[train_end:valid_end]
test = shuffled.iloc[valid_end:]

In [None]:
def get_xy(dataframe, y_label, x_labels=None):
  """Split a DataFrame into a feature matrix and a target column.

  Args:
    dataframe: source pandas DataFrame; it is not modified.
    y_label: name of the target column.
    x_labels: feature column name(s). May be a single column name (str) or
      any sized sequence of names (list, tuple, pandas Index, ndarray).
      When None or empty, every column except `y_label` is used.

  Returns:
    A tuple `(data, X, y)` where `X` has shape (n_rows, n_features), `y` has
    shape (n_rows, 1) and `data` is `X` and `y` stacked horizontally.
  """
  # `x_labels is None or len(...) == 0` instead of `not x_labels`: the truth
  # value of a pandas Index / ndarray is ambiguous and raises ValueError.
  if x_labels is None or len(x_labels) == 0:
    x_labels = [c for c in dataframe.columns if c != y_label]
  elif isinstance(x_labels, str):
    # A bare column name: wrap it so the selection below stays 2-D.
    x_labels = [x_labels]
  else:
    x_labels = list(x_labels)

  # Selecting with a list always yields a 2-D array, so the single-feature
  # case needs no special reshape. copy=True keeps the returned arrays
  # independent of the caller's frame without deep-copying the whole frame.
  X = dataframe[x_labels].to_numpy(copy=True)
  y = dataframe[y_label].to_numpy(copy=True).reshape(-1, 1)
  data = np.hstack((X, y))

  return data, X, y

In [None]:
# Temperature-only feature matrix and target for each split; the stacked
# array that get_xy returns first is not needed here.
X_train_temp, y_train_temp = get_xy(train, "bike_count", x_labels=["temp"])[1:]
X_valid_temp, y_valid_temp = get_xy(valid, "bike_count", x_labels=["temp"])[1:]
X_test_temp, y_test_temp = get_xy(test, "bike_count", x_labels=["temp"])[1:]

In [None]:
# Simple linear regression: bike count as a function of temperature only.
# fit() returns the estimator itself, so construction and fitting are chained.
temp_reg = LinearRegression().fit(X_train_temp, y_train_temp)
temp_reg

In [None]:
# Score of the temperature-only fit on the test split.
r2_temp = temp_reg.score(X_test_temp, y_test_temp)
r2_temp

In [None]:
# Training points with the fitted regression line drawn over a fixed
# temperature range of -20 to 40.
fig, ax = plt.subplots()
ax.scatter(X_train_temp, y_train_temp, label="Data", color="red")
x = tf.linspace(-20, 40, 100)
fit_line = temp_reg.predict(np.array(x).reshape(-1, 1))
ax.plot(x, fit_line, label="Fit", color="green", linewidth=3)
ax.legend()
ax.set_title("Bikes vs Temp")
ax.set_ylabel("No. of Bikes")
ax.set_xlabel("Temperature")
plt.show()

## Multiple Linear Regression

In [None]:
# Shuffle with a fixed seed and split 60% / 20% / 20% so the run is
# reproducible. Positional .iloc slicing replaces np.split on a DataFrame,
# which goes through the deprecated DataFrame.swapaxes.
shuffled = df.sample(frac=1, random_state=42)
train_end = int(0.6 * len(shuffled))
valid_end = int(0.8 * len(shuffled))
train = shuffled.iloc[:train_end]
valid = shuffled.iloc[train_end:valid_end]
test = shuffled.iloc[valid_end:]

# Every column after the first is used as a feature; build the list once
# instead of once per split.
feature_cols = df.columns[1:].tolist()
_, X_train_all, y_train_all = get_xy(train, "bike_count", x_labels=feature_cols)
_, X_valid_all, y_valid_all = get_xy(valid, "bike_count", x_labels=feature_cols)
_, X_test_all, y_test_all = get_xy(test, "bike_count", x_labels=feature_cols)

In [None]:
# Multiple linear regression on all remaining features.
# fit() returns the estimator itself, so construction and fitting are chained.
all_reg = LinearRegression().fit(X_train_all, y_train_all)
all_reg

In [None]:
# Score of the all-feature fit on the test split.
r2_all = all_reg.score(X_test_all, y_test_all)
r2_all

In [None]:
# Test-split predictions of the all-feature linear regression.
y_pred_lr = all_reg.predict(X=X_test_all)

## Regression with Neural Network

In [None]:
def plot_history(history):
  """Plot the per-epoch training loss and, when present, validation loss.

  Args:
    history: object whose `history` attribute is a dict containing a 'loss'
      series and optionally a 'val_loss' series (the value returned by a
      model's `fit` call in this notebook).
  """
  plt.plot(history.history['loss'], label='loss')
  # 'val_loss' is absent when the model was fitted without validation data.
  if 'val_loss' in history.history:
    plt.plot(history.history['val_loss'], label='val_loss')
  plt.xlabel('Epoch')
  plt.ylabel('MSE') # Mean Squared Error
  # Without a legend the curve labels set above are never displayed.
  plt.legend()
  plt.grid(True)
  plt.show()

In [None]:
# Normalization layer for the single temperature input; its statistics are
# computed from the flattened training temperatures only.
temp_normalizer = tf.keras.layers.Normalization(axis=None, input_shape=(1,))
train_temps_flat = X_train_temp.ravel()
temp_normalizer.adapt(train_temps_flat)

In [None]:
# Normalizer followed by a single one-unit dense layer with no activation.
temp_output_layer = tf.keras.layers.Dense(units=1)
temp_nn_model = tf.keras.Sequential([temp_normalizer, temp_output_layer])

In [None]:
# Train with Adam (learning rate 0.1) against mean squared error.
temp_optimizer = tf.keras.optimizers.Adam(learning_rate=0.1)
temp_nn_model.compile(loss="mean_squared_error", optimizer=temp_optimizer)

In [None]:
# Fit silently for 1000 epochs, tracking loss on the validation split.
# The training temperatures are passed flattened, as a 1-D array.
history = temp_nn_model.fit(
    x=X_train_temp.reshape(-1),
    y=y_train_temp,
    epochs=1000,
    verbose=0,
    validation_data=(X_valid_temp, y_valid_temp),
)

In [None]:
# Loss curves of the temperature-only network.
plot_history(history=history)

In [None]:
# Training points with the network's predictions drawn over a fixed
# temperature range of -20 to 40.
fig, ax = plt.subplots()
ax.scatter(X_train_temp, y_train_temp, label="Data", color="red")
x = tf.linspace(-20, 40, 100)
fit_line = temp_nn_model.predict(np.array(x).reshape(-1, 1))
ax.plot(x, fit_line, label="Fit", color="green", linewidth=3)
ax.legend()
ax.set_title("Bikes vs Temp")
ax.set_ylabel("No. of Bikes")
ax.set_xlabel("Temperature")
plt.show()

## Neural Net with All Features

In [None]:
# Per-feature normalization (axis=-1) for the all-feature network. The input
# width is taken from the training matrix instead of being hard-coded, so it
# stays correct if the set of feature columns changes.
n_features = X_train_all.shape[1]
all_normalizer = tf.keras.layers.Normalization(input_shape=(n_features,), axis=-1)
# Statistics are computed from the training split only.
all_normalizer.adapt(X_train_all)

In [None]:
# Normalizer, two 32-unit ReLU hidden layers, and a one-unit output layer.
nn_layers = [
    all_normalizer,
    tf.keras.layers.Dense(units=32, activation='relu'),
    tf.keras.layers.Dense(units=32, activation='relu'),
    tf.keras.layers.Dense(units=1),
]
nn_model = tf.keras.Sequential(nn_layers)

# Train with Adam (learning rate 0.001) against mean squared error.
nn_optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
nn_model.compile(loss='mean_squared_error', optimizer=nn_optimizer)

In [None]:
# Fit silently for 100 epochs, tracking loss on the validation split.
history = nn_model.fit(
    x=X_train_all,
    y=y_train_all,
    epochs=100,
    verbose=0,
    validation_data=(X_valid_all, y_valid_all),
)

In [None]:
# Loss curves of the all-feature network.
plot_history(history=history)

In [None]:
# Test-split predictions from the linear regression and the neural network,
# used for the MSE comparison below.
y_pred_lr, y_pred_nn = (
    model.predict(X_test_all) for model in (all_reg, nn_model)
)

In [None]:
def MSE(y_pred, y_real):
  """Return the mean squared error between predictions and true values.

  Args:
    y_pred: array-like of predicted values.
    y_real: array-like of true values.

  Returns:
    The mean of the element-wise squared differences.
  """
  y_pred = np.asarray(y_pred)
  y_real = np.asarray(y_real)
  # Same element count but different shapes, e.g. (n,) vs (n, 1): plain
  # subtraction would broadcast to an (n, n) matrix and silently return a
  # wrong error, so drop the length-1 axes first. Inputs with differing
  # element counts keep the ordinary broadcasting behavior.
  if y_pred.shape != y_real.shape and y_pred.size == y_real.size:
    y_pred, y_real = np.squeeze(y_pred), np.squeeze(y_real)
  return np.square(y_pred - y_real).mean()

In [None]:
# Test-split MSE of the linear regression.
MSE(y_pred=y_pred_lr, y_real=y_test_all)

In [None]:
# Test-split MSE of the neural network.
MSE(y_pred=y_pred_nn, y_real=y_test_all)

In [None]:
# Predicted vs. true bike counts for both models on equal-aspect axes; the
# red diagonal marks where prediction equals the true value.
ax = plt.axes(aspect="equal")
ax.scatter(y_test_all, y_pred_lr, label="Lin Reg Preds")
ax.scatter(y_test_all, y_pred_nn, label="NN Preds")
ax.set_xlabel("True Values")
ax.set_ylabel("Predictions")
lims = [0, 1800]
ax.set_xlim(lims)
ax.set_ylim(lims)
ax.legend()
_ = ax.plot(lims, lims, c="red")