## Imports

In [5]:
# Importing machine learning algorithms
import lightgbm as lgb
import xgboost as xgb
import catboost as cb

# Importing other packages
import timeit
import pandas as pd
import numpy as np
import neptune
import tempfile
from scipy import stats
import time


import seaborn as sns
import matplotlib.pyplot as plt
from catboost import CatBoostRegressor
import my_functions as mf

import mlflow
import mlflow.catboost
import os


# Importing packages for machine learning operations
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score


import plotly.graph_objects as go
import plotly.io as pio

pio.renderers.default = "browser"

## Start Experiment

In [6]:


# Set the experiment name in MLflow
instance_Family = 'GPU_instance'
# General_purpose
# Memory_optimized
# Storage_optimized
# Compute_optimized
# GPU_instance

outlier = 'zscore'

# zscore
# iqr
# lof
# isolation


mlflow.set_experiment(instance_Family)
mlflow.set_tracking_uri("file:///media/gfragi/data/BarraCuda/mlruns/mlruns/")

### Add some experiment tags

In [7]:

# Set tags for the run
tags = {
    "experiment": "catboost_regression",
    "model_type": "CatBoostRegressor",
    # "task_type": "CPU",
    "dataset": "Amazon",
    "year to predict": "2022",
    "all years for prediction": "no",
    "outlier": outlier
}

# Set the tags for the current run
mlflow.set_tags(tags)

In [8]:
# mlflow.end_run()

## Import data 

In [9]:

data_all = pd.read_csv(
    f'./data/amazon_22_quarters.csv', parse_dates=['EffectiveDate'])

# data = data.drop(['OfferTermCode', 'instanceType', 'instanceFamily', 'OfferingClass'], axis=1)

data_all = data_all.drop(['SKU', 'RateCode', 'OfferTermCode', 'Location', 'instanceFamily',
                          'License Model', 'TermType', 'year', 'Tenancy', 'OfferingClass',
                          'instanceType', 'Product Family', 'Current Generation',
                          'License Model'], axis=1)

In [10]:
data_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 554079 entries, 0 to 554078
Data columns (total 12 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   PricePerUnit         554079 non-null  float64       
 1   LeaseContractLength  554079 non-null  int64         
 2   PurchaseOption       554079 non-null  object        
 3   vCPU                 554079 non-null  int64         
 4   Memory               554079 non-null  int64         
 5   operatingSystem      554079 non-null  object        
 6   networkPerformance   554079 non-null  float64       
 7   EffectiveDate        554079 non-null  datetime64[ns]
 8   DiskType             554079 non-null  object        
 9   StorageSize          554079 non-null  int64         
 10  Quarter              554079 non-null  int64         
 11  YearQuarter          554079 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(5), object(4)
memory usage: 50.7

In [11]:
# Select regions

In [12]:
# # # replace USEast and USWest to US and EU to Europe
# data_all = data_all.replace({'USEast': 'US', 'USWest': 'US', 'EU': 'Europe'})
# data_all = data_all[(data_all['Location'] != "Africa") & (data_all['Location'] != "MiddleEast") & (data_all['Location']
#                                                                                                    != "Canada") & (data_all['Location'] != "SouthAmerica") & (data_all['Location'] != "AWSGovCloud")]

In [13]:
# data_all = data_all.drop(['Location'], axis=1)

In [14]:
# # create several new date time features
data_all['year'] = data_all['EffectiveDate'].dt.year
# data['day_of_year'] = data['EffectiveDate'].dt.dayofyear
# data['weekday'] = data['EffectiveDate'].dt.weekday
# data['week_of_year'] = data['EffectiveDate'].dt.week
# data['day_of_month'] = data['EffectiveDate'].dt.day
# data['quarter'] = data['EffectiveDate'].dt.quarter

# data.drop('EffectiveDate', axis=1, inplace=True)

### Sort 

In [15]:
data_all.sort_values(by='EffectiveDate')
data_all.reset_index(drop=True)

Unnamed: 0,PricePerUnit,LeaseContractLength,PurchaseOption,vCPU,Memory,operatingSystem,networkPerformance,EffectiveDate,DiskType,StorageSize,Quarter,YearQuarter,year
0,0.40800,3,Partial Upfront,8,61,Linux,1.0,2016-09-30,SSD,1600,3,2016Q3,2016
1,0.11800,3,Partial Upfront,4,7,Linux,0.3,2016-09-30,SSD,80,3,2016Q3,2016
2,0.12400,1,Partial Upfront,4,7,Linux,0.3,2016-09-30,SSD,80,3,2016Q3,2016
3,0.22600,1,No Upfront,4,7,Linux,0.3,2016-09-30,SSD,80,3,2016Q3,2016
4,0.11600,3,Partial Upfront,4,7,Linux,0.3,2016-09-30,SSD,80,3,2016Q3,2016
...,...,...,...,...,...,...,...,...,...,...,...,...,...
554074,3.83952,1,No Upfront,8,32,Windows,12500.0,2022-10-01,SSD,474,4,2022Q4,2022
554075,3.74373,1,No Upfront,8,32,Windows,12500.0,2022-10-01,SSD,474,4,2022Q4,2022
554076,1.80328,3,Partial Upfront,8,32,Windows,12500.0,2022-10-01,SSD,474,4,2022Q4,2022
554077,0.71500,1,No Upfront,2,16,Windows,10.0,2022-12-01,NVMe SSD,468,4,2022Q4,2022


In [16]:
data_all['year'].value_counts()

2022    184168
2020    112164
2021    103740
2019     72480
2018     43773
2017     19952
2016     17802
Name: year, dtype: int64

# Outlier Methods

## IsolationForest

In [17]:
from sklearn.ensemble import IsolationForest

if outlier == 'isolation':

    # Select the column(s) you want to detect outliers in
    columns_to_check = ['PricePerUnit', 'vCPU', 'Memory']

    # Create an Isolation Forest object
    # Adjust the contamination parameter as needed
    isolation_forest = IsolationForest(contamination=0.1)

    # Fit the Isolation Forest model and predict outlier labels
    outlier_labels = isolation_forest.fit_predict(data_all[columns_to_check])

    # Identify outliers based on the predicted labels
    outliers = data_all[outlier_labels == -1]

    # Exclude outliers from the dataset
    data_all = data_all[outlier_labels != -1]

    # Print the outliers
    print("Outliers:")
    print(outliers)

    # Print the cleaned dataset
    print("Cleaned Data:")
    print(data_all)

## LOF

In [18]:
from sklearn.neighbors import LocalOutlierFactor


if outlier == 'lof':

    # Select the column(s) you want to detect outliers in
    columns_to_check = ['PricePerUnit']

    # Create a LOF object
    # Adjust parameters as needed
    lof = LocalOutlierFactor(n_neighbors=20, contamination=0.1)

    # Fit the LOF model and predict outlier scores
    outlier_scores = lof.fit_predict(data_all[columns_to_check])

    # Identify outliers based on the predicted scores
    outliers = data_all[outlier_scores == -1]

    # Exclude outliers from the dataset
    data_all = data_all[outlier_scores != -1]

    # Print the outliers
    print("Outliers:")
    print(outliers)

    # Print the cleaned dataset
    print("Cleaned Data:")
    print(data_all)
else:
    pass

## Interquartile Range (IQR)



In [19]:

if outlier == 'iqr':
    # Select the column(s) you want to detect outliers in
    columns_to_check = ['PricePerUnit']

    # Calculate the IQR for each column
    Q1 = data_all[columns_to_check].quantile(0.25)
    Q3 = data_all[columns_to_check].quantile(0.75)
    IQR = Q3 - Q1

    # Define a threshold for identifying outliers
    threshold = 2

    # Determine the lower and upper bounds for outlier detection
    lower_bound = Q1 - threshold * IQR
    upper_bound = Q3 + threshold * IQR

    # Identify outliers by filtering the dataset
    outliers = data_all[~((data_all[columns_to_check] >= lower_bound) & (
        data_all[columns_to_check] <= upper_bound)).all(axis=1)]

    # Exclude outliers from the dataset
    data_all = data_all[((data_all[columns_to_check] >= lower_bound) & (
        data_all[columns_to_check] <= upper_bound)).all(axis=1)]

    # Print the outliers
    print("Outliers:")
    print(outliers)

    # Print the cleaned dataset
    print("Cleaned Data:")
    print(data_all)
else:
    pass

## Z-score

In [20]:
if outlier == 'zscore':

    # Select the column(s) you want to detect outliers in
    columns_to_check = ['PricePerUnit']

    # Compute the Z-scores for each data point
    z_scores = np.abs(stats.zscore(data_all[columns_to_check]))

    # Define a threshold for identifying outliers
    threshold = 0.75

    # Find the indices of outliers
    outlier_indices = np.where(z_scores > threshold)

    # Get the rows containing outliers
    outliers = data_all.iloc[outlier_indices[0]]

    # Exclude outliers from the dataset
    data_all = data_all.drop(data_all.index[outlier_indices[0]])

    # Print the outliers
    print(outliers)

    mlflow.log_param("z-threshold", threshold)

else:
    pass

        PricePerUnit  LeaseContractLength   PurchaseOption  vCPU  Memory  \
7409        17.87500                    3       No Upfront    32     244   
7418        30.85800                    1  Partial Upfront   128     192   
7433        28.92900                    3  Partial Upfront   128     192   
7528        18.36400                    1       No Upfront    32     244   
7554        15.52800                    1       No Upfront    32      60   
...              ...                  ...              ...   ...     ...   
554008      14.03152                    1  Partial Upfront    64      12   
554009      13.74384                    1  Partial Upfront    64      12   
554010      13.41242                    3  Partial Upfront    64      12   
554011      27.05083                    3       No Upfront    64      12   
554078      15.56115                    1  Partial Upfront    64      12   

       operatingSystem  networkPerformance EffectiveDate DiskType  \
7409           Win

##  Create a quarterly seasonality column


In [21]:
# # Convert EffectiveDate column to datetime format
# data_all['EffectiveDate'] = pd.to_datetime(data_all['EffectiveDate'])

# # Extract the quarter component from the EffectiveDate column
# data_all['Quarter'] = data_all['EffectiveDate'].dt.quarter


# # Combine year and quarter into a single column
# data_all['YearQuarter'] = data_all['year'].astype(
#     str) + '-Q' + data_all['Quarter'].astype(str)

In [22]:
# data_all

# data_all = data_all.drop(['Seasonality'], axis=1)

In [23]:
# categorical = ['PurchaseOption','OfferingClass', 'Location', 'Tenancy', 'operatingSystem', 'DiskType']

# Prepare the Data

In [24]:
# # %%  =========== Prepare the Data for regression============

# # Map binary categorical columns to numerical

# categorical_binary = ['PurchaseOption']
# data_all[categorical_binary] = data_all[categorical_binary].apply(
#     mf.binary_map)

In [25]:
# # Write the categorical values as a list
# # categorical = ['operatingSystem', 'DiskType', 'Location', 'instanceFamily']
# categorical = ['operatingSystem', 'DiskType']
# categorical2numeric = pd.get_dummies(data_all[categorical], drop_first=False)

# categorical2numeric_pred = pd.get_dummies(
#     data_all[categorical], drop_first=False)

# # Add the above results to the original dataframe df
# data_all = pd.concat([data_all, categorical2numeric], axis=1)
# data_all.drop(columns=categorical, axis=1, inplace=True)

# Years

In [26]:
data_all.year.unique()

array([2016, 2017, 2018, 2019, 2020, 2021, 2022])

In [27]:
data_pred = data_all[data_all['year'] > 2021]
data_pred = data_pred.sort_values(by='EffectiveDate')
data_pred = data_pred.reset_index(drop=True)
data_pred.year.unique()

array([2022])

In [28]:
data = data_all[(data_all['year'] > 2020) & (data_all['year'] < 2022)]
data = data.sort_values(by='EffectiveDate')
data = data.reset_index(drop=True)
data.year.unique()


mlflow.log_param("Years used to predict", data.year.unique())

array([2021])

## Assign target value

In [29]:
X = data.drop(['PricePerUnit', 'EffectiveDate', 'YearQuarter'], axis=1)
y = data.PricePerUnit

In [30]:
# # identify categorical features indices
# def column_index(data, query_cols):
#     cols = data.columns.values
#     sidx = np.argsort(cols)
#     return sidx[np.searchsorted(cols, query_cols, sorter=sidx)]

# categorical_features_indices = column_index(X, categorical)

# CatBoost Model Training

In [31]:
# CatBoost Model Training
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.25, random_state=42)

In [32]:
model = CatBoostRegressor(iterations=50, depth=3, cat_features=['PurchaseOption', 'operatingSystem', 'DiskType', 'Quarter'],
                          learning_rate=0.1, loss_function='RMSE')

# cat_features=categorical_features_indices
model.fit(X_train, y_train,
          eval_set=(X_valid, y_valid), plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 2.0534168	test: 2.0798053	best: 2.0798053 (0)	total: 49.3ms	remaining: 2.41s
1:	learn: 1.9803815	test: 2.0065484	best: 2.0065484 (1)	total: 51.7ms	remaining: 1.24s
2:	learn: 1.9207877	test: 1.9471039	best: 1.9471039 (2)	total: 53.6ms	remaining: 840ms
3:	learn: 1.8633417	test: 1.8894489	best: 1.8894489 (3)	total: 55ms	remaining: 633ms
4:	learn: 1.8162071	test: 1.8418636	best: 1.8418636 (4)	total: 56.7ms	remaining: 510ms
5:	learn: 1.7714793	test: 1.7977333	best: 1.7977333 (5)	total: 58.1ms	remaining: 426ms
6:	learn: 1.7343713	test: 1.7599761	best: 1.7599761 (6)	total: 59.6ms	remaining: 366ms
7:	learn: 1.7051624	test: 1.7309033	best: 1.7309033 (7)	total: 61.1ms	remaining: 321ms
8:	learn: 1.6773737	test: 1.7025822	best: 1.7025822 (8)	total: 62.6ms	remaining: 285ms
9:	learn: 1.6330351	test: 1.6573143	best: 1.6573143 (9)	total: 64.9ms	remaining: 260ms
10:	learn: 1.5952405	test: 1.6188323	best: 1.6188323 (10)	total: 66.7ms	remaining: 236ms
11:	learn: 1.5635358	test: 1.5858902	best: 

<catboost.core.CatBoostRegressor at 0x7fa3855b46a0>

# Catboost Model

### Catboost Regressor parameters

In [33]:
start = time.time()

model = CatBoostRegressor(iterations=15000,
                          loss_function='RMSE',
                          # 0.001 - 0.1 A lower learning rate results in slower but more precise convergence
                          learning_rate=0.001,
                          # 4 - 10  Higher values allow the model to capture more complex interactions but can increase the risk of overfitting.
                          depth=16,
                          custom_metric=['MAPE', 'RMSE', 'MAE', 'R2'],
                          random_seed=42,
                          bagging_temperature=0.2,  # 0 - 1
                          # Can be 'Iter' or 'IncToDec'. 'Iter' stops training when the evaluation metric stops improving, and 'IncToDec' stops when the evaluation metric starts to worsen.
                          od_type='Iter',
                          metric_period=75,  # how frequently the evaluation metric is calculated during training
                          task_type='GPU',  # Enable GPU training
                          # number of iterations to wait for the evaluation metric to improve before stopping training.
                          od_wait=100,
                          cat_features=['PurchaseOption',
                                        'operatingSystem', 'DiskType']
                          )

# Log specific parameters of the CatBoost model
params_to_track = ['iterations', 'learning_rate', 'depth', 'loss_function', 'bagging_temperature',
                   'random_seed', 'metric_period', 'od_wait', 'task_type']
for param in params_to_track:
    param_value = model.get_params().get(param)
    mlflow.log_param(param, str(param_value))

In [34]:
# mlflow.end_run()

In [35]:
model.fit(X_train, y_train,
          eval_set=(X_valid, y_valid),
          use_best_model=True, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Metric R2 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	learn: 2.1377268	test: 2.1644172	best: 2.1644172 (0)	total: 503ms	remaining: 2h 5m 48s
75:	learn: 2.0368145	test: 2.0627090	best: 2.0627090 (75)	total: 27.8s	remaining: 1h 30m 58s
150:	learn: 1.9449415	test: 1.9698949	best: 1.9698949 (150)	total: 54.5s	remaining: 1h 29m 21s
225:	learn: 1.8614748	test: 1.8857090	best: 1.8857090 (225)	total: 1m 21s	remaining: 1h 29m 4s
300:	learn: 1.7859312	test: 1.8095392	best: 1.8095392 (300)	total: 1m 47s	remaining: 1h 27m 36s
375:	learn: 1.7173934	test: 1.7405184	best: 1.7405184 (375)	total: 2m 15s	remaining: 1h 27m 41s
450:	learn: 1.6554316	test: 1.6780893	best: 1.6780893 (450)	total: 2m 43s	remaining: 1h 28m 6s
525:	learn: 1.5994885	test: 1.6217179	best: 1.6217179 (525)	total: 3m 10s	remaining: 1h 27m 29s
600:	learn: 1.5493877	test: 1.5712328	best: 1.5712328 (600)	total: 3m 37s	remaining: 1h 26m 50s
675:	learn: 1.5044390	test: 1.5260366	best: 1.5260366 (675)	total: 4m 3s	remaining: 1h 26m 8s
750:	learn: 1.4641326	test: 1.4855626	best: 1.4855626 

<catboost.core.CatBoostRegressor at 0x7fa387c50370>

In [36]:
mlflow.catboost.log_model(model, "catboost_model")

best_iteration = model.get_best_iteration()

mlflow.log_metric('best_iteration', best_iteration)



## Plot the Feature Importance

In [37]:

fea_imp = pd.DataFrame({'imp': model.feature_importances_, 'col': X.columns})
fea_imp = fea_imp.sort_values(['imp', 'col'], ascending=[
                              True, False]).iloc[-30:]

fig = go.Figure(data=[go.Bar(
    x=fea_imp['imp'],
    y=fea_imp['col'],
    orientation='h',
    marker=dict(color=fea_imp['imp'], colorbar=dict(title='Importance'))

)])

fig.update_layout(
    title='CatBoost - Feature Importance - All',
    yaxis=dict(title='Features'),
    xaxis=dict(title='Importance'),
    height=600,
    width=800
)

# fig.show()


# Save the Plotly figure as an HTML file
html_path = "temp/feature_importance_all.html"
pio.write_html(fig, html_path)

# Log the HTML file as an artifact in MLflow
mlflow.log_artifact(html_path)

### Log feature importance - mlflow

In [38]:
# Get the feature importance values and feature names from your CatBoost model
feature_importance = model.get_feature_importance()
feature_names = X.columns

# Create a DataFrame to store the feature importance values and feature names
feature_importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': feature_importance})

# Create a temporary file to save the feature importance DataFrame
with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.csv') as temp_file:
    # Save the feature importance DataFrame as a CSV file
    feature_importance_df.to_csv(temp_file, index=False)

# Log the feature importance CSV file as an artifact in MLflow
mlflow.log_artifact(temp_file.name, "feature_importance.csv")

In [39]:
# import pandas as pd
# import tempfile

# # Define the years sets
# years_sets = [
#     [2020, 2021],
#     [2019, 2020, 2021],
#     [2018, 2019, 2020, 2021],
#     [2021]
# ]

# # Create a DataFrame to store the feature importance values and feature names
# feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})

# # Insert the year sets as the first column in the DataFrame
# feature_importance_df.insert(0, 'Year Sets', years_sets)

# # Create a temporary file to save the feature importance DataFrame
# with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.csv') as temp_file:
#     # Save the feature importance DataFrame as a CSV file
#     feature_importance_df.to_csv(temp_file, index=False)

# # Log the feature importance CSV file as an artifact in MLflow
# mlflow.log_artifact(temp_file.name, "feature_importance.csv")

### Evalute Model

In [40]:
# model evaluation
from sklearn.metrics import mean_squared_error
print('Model evaluation:')
print(model.get_params())
print('RMSE:', np.sqrt(mean_squared_error(y_valid, model.predict(X_valid))))

Model evaluation:
{'iterations': 15000, 'learning_rate': 0.001, 'depth': 16, 'loss_function': 'RMSE', 'od_wait': 100, 'od_type': 'Iter', 'random_seed': 42, 'metric_period': 75, 'custom_metric': ['MAPE', 'RMSE', 'MAE', 'R2'], 'bagging_temperature': 0.2, 'task_type': 'GPU', 'cat_features': ['PurchaseOption', 'operatingSystem', 'DiskType']}
RMSE: 1.1692944184974097


In [41]:
print(model.get_best_score())

# Flatten and log the best scores as parameters in MLflow
best_scores = model.get_best_score()
for stage, metrics in best_scores.items():
    for metric, value in metrics.items():
        mlflow.log_metric(f'{stage}_{metric}', value)

{'learn': {'MAE': 0.5923814971470504, 'MAPE': 0.26801998714972497, 'R2': 0.7239675368347193, 'RMSE': 1.12388382219725}, 'validation': {'MAE': 0.619836912495371, 'MAPE': 0.27529402555754434, 'R2': 0.7085045200099078, 'RMSE': 1.1692943947126397}}


In [42]:
# End the MLflow run
# mlflow.end_run()

### Finish without shap

## SHAP

In [43]:
# import shap

# pool = cb.Pool(X_valid, y_valid)


# # Compute SHAP values
# shap_values = model.get_feature_importance(pool, type='ShapValues')

# # Convert SHAP values to a DataFrame
# shap_df = pd.DataFrame(shap_values[:, :-1], columns=X_valid.columns)

# # Log the SHAP values as an artifact in MLflow
# shap_df.to_csv("shap_values.csv", index=False)
# mlflow.log_artifact("shap_values.csv")

In [44]:
# # Exclude the constant offset column from shap_values
# shap_values = shap_values[:, :-1]

# # Create a SHAP summary plot
# shap.summary_plot(shap_values, X_valid)

# # Save the plot as an artifact in MLflow
# shap_plot_path = 'shap_summary_plot.png'
# shap.summary_plot(shap_values, X_valid, show=False)
# plt.savefig(shap_plot_path)
# mlflow.log_artifact(shap_plot_path)

# Evaluation


## Predictions vs Actual Prices

In [46]:
predict(data,
        prediction_type=None,
        ntree_start=0,
        ntree_end=0,
        thread_count=-1,
        verbose=None)

NameError: name 'predict' is not defined

In [45]:
data_pred = data_pred.drop(['YearQuarter'], axis=1)

# data_pred_tmp = data_pred.drop(['PricePerUnit'], axis=1)
predictions = model.predict(data_pred)

# Create a DataFrame with the predictions
predictions_df = pd.DataFrame(predictions, columns=['Predictions'])

# Save the predictions to a CSV file
predictions_df.to_csv('temp/predictions.csv', index=False)

# Log the predictions CSV file as an artifact in MLflow
mlflow.log_artifact('temp/predictions.csv', 'predictions')

CatBoostError: Bad value for num_feature[non_default_doc_idx=0,feature_idx=2]="No Upfront": Cannot convert 'b'No Upfront'' to float

In [None]:
predictions

In [None]:
actual_prices = data_pred.PricePerUnit

In [None]:

import plotly.graph_objects as go

# Create a line plot using Plotly
fig = go.Figure()
fig.add_trace(go.Scatter(x=list(range(len(actual_prices))),
              y=actual_prices, mode='lines', name='Actual'))
fig.add_trace(go.Scatter(x=list(range(len(predictions))),
              y=predictions, mode='lines', name='Predicted'))

fig.update_layout(
    title='Actual vs. Predicted Prices for 2022',
    xaxis=dict(title='Time'),
    yaxis=dict(title='Price')
)

# fig.show()

# Save the Plotly figure as an HTML file
html_path = "temp/actual_predicted.html"
pio.write_html(fig, html_path)

# Log the HTML file as an artifact in MLflow
mlflow.log_artifact(html_path)

In [None]:
rmse = mean_squared_error(actual_prices, predictions, squared=False)

In [None]:
rmse

In [None]:
error = actual_prices - predictions

In [None]:
plt.scatter(actual_prices, predictions, c=error, cmap='coolwarm')
plt.plot(np.linspace(min(actual_prices), max(actual_prices), 100), np.linspace(
    min(actual_prices), max(actual_prices), 100), color='black', linestyle='--')
plt.xlabel('Actual Prices')
plt.ylabel('Predicted Prices')
plt.colorbar()
plt.show()

In [None]:
# Create a scatter plot using Plotly
fig = go.Figure(data=go.Scatter(
    x=actual_prices,
    y=predictions,
    mode='markers',
    marker=dict(
        size=8,
        color=error,
        colorscale='Viridis',
        showscale=True,
        colorbar=dict(title='Error')
    )
))

fig.update_layout(
    title='Actual vs. Predicted Prices',
    xaxis=dict(title='Actual Prices'),
    yaxis=dict(title='Predicted Prices')
)

# fig.show()

# Save the Plotly figure as an HTML file
html_path = "temp/actual_predicted_errors.html"
pio.write_html(fig, html_path)

# Log the HTML file as an artifact in MLflow
mlflow.log_artifact(html_path)

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error


mse = mean_squared_error(actual_prices, predictions)
rmse = mean_squared_error(actual_prices, predictions, squared=False)
mae = mean_absolute_error(actual_prices, predictions)
r2 = r2_score(actual_prices, predictions)
mape = mean_absolute_percentage_error(actual_prices, predictions)

In [None]:
mlflow.log_metric("MSE", mse)
mlflow.log_metric("RMSE", rmse)
mlflow.log_metric("MAE", mae)
mlflow.log_metric("MAPE", mape)
mlflow.log_metric("R-squared", r2)

# mlflow.log_metric()

In [None]:
print('MSE:', mse)
print('RMSE:', rmse)
print('MAE:', mae)
print('MAPE:', mape)
print('R-squared:', r2)

https://towardsdatascience.com/ad-demand-forecast-with-catboost-lightgbm-819e5073cd3e

https://towardsdatascience.com/understanding-feature-importance-and-how-to-implement-it-in-python-ff0287b20285

In [47]:
# End the MLflow run
mlflow.end_run()

: 