In [None]:
!pip3 install pandas==1.3.3
!pip3 install numpy==1.19.5
!pip3 install xgboost==1.4.2
!pip3 install scikit-learn==0.24.2
!pip3 install matplotlib

In [None]:
from sklearn import metrics
import xgboost as xgb
import joblib

In [None]:
# Gradient-boosted regressor: squared-error objective, 400 estimators, and a
# fixed random_state so repeated runs build the same model.
xgbModel = xgb.XGBRegressor(
    n_estimators=400,
    objective="reg:squarederror",
    random_state=42,
)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
import pytz
from datetime import timedelta, datetime
# Local timezone intended for InfluxDB-based time calculations (it is not
# referenced again in the cells visible here).
local_tz = pytz.timezone('America/Toronto')
# Date stamp (YYYY-MM-DD, system local time) used to build the name of the
# results file to load.
today = f"{datetime.today():%Y-%m-%d}"

#### Note: if you did not generate new data for model training, a sample data set is provided in this repository.
#### To use it, make sure the next cell runs the `read_csv` line that loads `lt_results_2022-10-01.csv` instead of the line that builds the file name from today's date (comment out one and uncomment the other).

In [None]:
# Load the load-test results, indexed by the parsed 'DateTime' column.
# The active call reads the file stamped with today's date; to train on the
# sample file shipped with the repository instead, swap which of the two
# read_csv calls is commented out.
data = pd.read_csv(
    'lt_results_' + today + '.csv',
    index_col='DateTime',
    parse_dates=True,
    infer_datetime_format=True,
)
#data = pd.read_csv('lt_results_2022-10-01.csv', index_col='DateTime', parse_dates=True, infer_datetime_format=True)

In [None]:
# Preview the first rows of the loaded results to sanity-check the columns.
data.head()

In [None]:
# Remove 'req2xx' and 'testDurationSeconds' so they are not passed to the
# model as features.
# errors='ignore' keeps this cell re-runnable: on a second execution the
# columns are already gone and a plain drop() would raise KeyError.
data = data.drop(columns=['req2xx', 'testDurationSeconds'], errors='ignore')

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for testing; random_state pins the shuffle so the
# split is reproducible. X_train / X_test still contain the 'mean_tps' column
# at this point.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    data['mean_tps'],
    test_size=0.1,
    random_state=0,
)
# Show the (rows, columns) of each split.
X_train.shape, X_test.shape

In [None]:
# Name of the column the model predicts.
target_var = 'mean_tps'
# Remove the target from both feature frames so the model never sees the
# value it is asked to predict.
# errors='ignore' makes the cell safe to re-run: once the column has been
# dropped, a plain drop() would raise KeyError on the next execution.
X_train = X_train.drop(columns=[target_var], errors='ignore')
X_test = X_test.drop(columns=[target_var], errors='ignore')

In [None]:
# Datasets to score during training, in order: the training split first, the
# held-out split second.
evalset = [
    (X_train, y_train),
    (X_test, y_test),
]

In [None]:
# Train the regressor, recording MAE and RMSE for every entry of `evalset`
# and suppressing the per-round log output.
xgbModel.fit(
    X_train,
    y_train,
    eval_set=evalset,
    eval_metric=['mae', 'rmse'],
    verbose=False,
)

In [None]:
# Predict the target (mean TPS) for the held-out test rows.
y_pred = xgbModel.predict(X_test)

In [None]:
# Evaluation history recorded during fit(), keyed first by eval-set name
# ('validation_0', 'validation_1') and then by metric name.
results = xgbModel.evals_result()

In [None]:
# Learning curves: RMSE history for the training split ('validation_0') and
# the held-out split ('validation_1').
plt.rcParams['font.size'] = 20  # global setting: also affects later figures
fig, ax = plt.subplots(figsize=(20, 10))
ax.plot(results['validation_0']['rmse'], label='train')
ax.plot(results['validation_1']['rmse'], label='validation')
ax.set_title('Validation vs Train loss')
ax.set_ylabel('Loss')
ax.set_xlabel('Estimators')
ax.legend(loc='upper right')
plt.show()

In [None]:
# Score the held-out predictions with standard regression metrics.
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = metrics.mean_absolute_error(y_test, y_pred)
R2 = metrics.r2_score(y_test, y_pred)
mape = metrics.mean_absolute_percentage_error(y_test, y_pred)

# One metric per line; MAPE is shown as a percentage with two decimals.
report_lines = [
    f'MAE = {mae} ',
    f'RMSE = {rmse} ',
    f'R2 = {R2} ',
    f'MAPE = {mape*100:.2f} %',
]
print('\n'.join(report_lines))

In [None]:
# Overlay actual and predicted mean TPS for every held-out record.
fig, ax = plt.subplots(figsize=(20, 10))
ax.scatter(range(len(y_test)), y_test, label="Original Data", alpha=0.6, c='black')
ax.scatter(range(len(y_pred)), y_pred, label="Predicted Data", alpha=0.6, c='red')
ax.set_ylabel('Mean TPS')
ax.set_xlabel('Test Records')
ax.set_title('XGBoost prediction vs original')
ax.legend()
plt.show()

In [None]:
# Plot the feature importances of the trained model; the trailing ';'
# suppresses the text repr of the returned object in the cell output.
xgb.plot_importance(xgbModel);

In [None]:
# Show the feature column names and their order; the hand-written record
# further down supplies its values positionally.
X_test.columns

In [None]:
# One hand-written sample row. Values are positional and are paired with
# X_train.columns when the DataFrame is built; the first feature is given as
# a bool here.
record = [[True, 21, 277, 1712, 262, 7, 31, 5]]

In [None]:
# Wrap the sample in a one-row DataFrame that carries the training feature
# names.
test_rec = pd.DataFrame(record, columns=X_train.columns)

In [None]:
# Predict the target (mean TPS) for the hand-written sample.
xgbModel.predict(test_rec)

In [None]:
# Persist the trained model to 'xgbModel.pkl' (relative path, so it lands in
# the current working directory).
# NOTE(review): joblib pickles the estimator; presumably it has to be loaded
# with compatible xgboost / scikit-learn versions - confirm for the consumer.
joblib.dump(xgbModel, 'xgbModel.pkl')

In [None]:
# Same sample values as the earlier record, but with the first feature
# written as the integer 1 instead of True.
record = [[1, 21, 277, 1712, 262, 7, 31, 5]]

In [None]:
# Rebuild the one-row DataFrame from the integer-encoded sample, again using
# the training feature names.
test_rec = pd.DataFrame(record, columns=X_train.columns)

In [None]:
# Predict for the integer-encoded sample - presumably to check that it yields
# the same result as the bool-encoded one (TODO confirm intent).
xgbModel.predict(test_rec)