In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.metrics import mean_squared_error, median_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, scale

In [2]:
# Location of the input CSV (a relative path, resolved against the working directory).
DATA_PATH = "dataset"
DATA_FILE = "data.csv"

# Load the raw table and record its row count before any cleaning is applied.
data = pd.read_csv(os.path.join(DATA_PATH, DATA_FILE))
N = len(data)

In [3]:
# Derived column: row-wise difference duration1 - duration2.
data["delta"] = data["duration1"].sub(data["duration2"])

In [4]:
# Discard every row that contains at least one missing value.
data = data.dropna(axis=0, how="any")

In [5]:
# Show (rows, columns) of the frame after rows with missing values were dropped.
data.shape

(75000, 7)

In [6]:
# Preview the first two rows to eyeball the columns, including the derived "delta".
data.head(2)

Unnamed: 0,duration1,duration2,temperature,Humidity,Direction,Velocity,delta
0,573,563,26.9,64.8,0,19.1,10
1,596,561,26.9,64.8,0,19.1,35


In [7]:
# Features: every column except the regression target.
X = data.drop(columns=["Velocity"])
# Target: the "Velocity" column as a NumPy array.
y = data["Velocity"].to_numpy()

In [8]:
X = scale(X)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

In [10]:
def performance_metrics(y_true, y_pred):
    """Score a set of predictions against the observed targets.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, aligned with ``y_true``.

    Returns
    -------
    tuple
        ``(r2, mse, mae)``. Note that ``mae`` here is the *median* absolute
        error (``median_absolute_error``), not the mean absolute error.
    """
    return (
        r2_score(y_true, y_pred),
        mean_squared_error(y_true, y_pred),
        median_absolute_error(y_true, y_pred),
    )

In [11]:
# Display name -> estimator class (not instance); each class is instantiated
# when the models are compared.
model_classes = dict(
    [
        ("Linear Regression", LinearRegression),
        ("Lasso Regressor", Lasso),
        ("Random Forest Regressor", RandomForestRegressor),
    ]
)

In [13]:
# Fit each candidate with default hyper-parameters and print its train / test scores.
for key, regressor in model_classes.items():
    model = regressor()
    # Seed any estimator that exposes `random_state` so that a
    # Restart & Run All reproduces the same numbers.
    if "random_state" in model.get_params():
        model.set_params(random_state=42)
    model.fit(X_train, y_train)
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    # performance_metrics returns (r2, mse, mae); round each to 4 decimals for display.
    r2_train, mse_train, mae_train = (
        np.round(score, 4) for score in performance_metrics(y_train, y_pred_train)
    )
    r2_test, mse_test, mae_test = (
        np.round(score, 4) for score in performance_metrics(y_test, y_pred_test)
    )
    print("{} model has Train r2 score:{}, mse:{} and mae {}".format(key, r2_train, mse_train, mae_train))
    print("{} model has Test r2 score:{}, mse:{} and mae {}".format(key, r2_test, mse_test, mae_test))

Linear Regression model has Train r2 score:0.466, mse:4.517 and mae 1.3598
Linear Regression model has Test r2 score:0.4635, mse:4.4928 and mae 1.361
Lasso Regressor model has Train r2 score:0.2462, mse:6.3772 and mae 2.4153
Lasso Regressor model has Test r2 score:0.2442, mse:6.3299 and mae 2.4153
Random Forest Regressor model has Train r2 score:0.9999, mse:0.0009 and mae 0.0
Random Forest Regressor model has Test r2 score:0.9996, mse:0.0029 and mae 0.0


In [15]:
# Random forest restricted to two trees; seeded so the reported scores are
# reproducible across kernel restarts.
rf = RandomForestRegressor(n_estimators=2, random_state=42)
rf.fit(X_train, y_train)
y_pred_train = rf.predict(X_train)
y_pred_test = rf.predict(X_test)
# performance_metrics returns (r2, mse, mae); round each to 4 decimals for display.
r2_train, mse_train, mae_train = (
    np.round(score, 4) for score in performance_metrics(y_train, y_pred_train)
)
r2_test, mse_test, mae_test = (
    np.round(score, 4) for score in performance_metrics(y_test, y_pred_test)
)
print("RF model has Train r2 score:{}, mse:{} and mae {}".format(r2_train, mse_train, mae_train))
print("RF model has Test r2 score:{}, mse:{} and mae {}".format(r2_test, mse_test, mae_test))

RF model has Train r2 score:0.9995, mse:0.004 and mae 0.0
RF model has Test r2 score:0.9987, mse:0.0111 and mae 0.0
