## Imports

In [5]:
# Importing machine learning algorithms
import lightgbm as lgb
import xgboost as xgb
import catboost as cb

# Importing other packages
import timeit
import pandas as pd
import numpy as np
import neptune
import tempfile
from scipy import stats
import time


import seaborn as sns
import matplotlib.pyplot as plt
from catboost import CatBoostRegressor
import my_functions as mf

import mlflow
import mlflow.catboost
import os


# Importing packages for machine learning operations
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score


import plotly.graph_objects as go
import plotly.io as pio

pio.renderers.default = "browser"

## Start Experiment

In [6]:


# Set the experiment name in MLflow
instance_Family = 'ALL'
# General_purpose
# Memory_optimized
# Storage_optimized
# Compute_optimized
# GPU instance

outlier = 'zscore'

# zscore
# iqr
# lof
# isolation


mlflow.set_experiment(instance_Family)
mlflow.set_tracking_uri("file:///media/gfragi/data/BarraCuda/mlruns/mlruns/")

### Add some experiment tags

In [7]:

# Set tags for the run
tags = {
    "experiment": "catboost_regression",
    "model_type": "CatBoostRegressor",
    # "task_type": "CPU",
    "dataset": "Amazon",
    "year to predict": "2022",
    "all years for prediction": "no",
    "outlier": outlier
}

# Set the tags for the current run
mlflow.set_tags(tags)

In [8]:
# mlflow.end_run()

## Import data 

In [9]:

data_all = pd.read_csv(
    f'./data/amazon_22.csv', parse_dates=['EffectiveDate'])

# data = data.drop(['OfferTermCode', 'instanceType', 'instanceFamily', 'OfferingClass'], axis=1)

data_all = data_all.drop(['SKU', 'RateCode', 'OfferTermCode', 'Location',
                          'License Model', 'TermType','year', 'Tenancy', 'OfferingClass',
                          'instanceType', 'Product Family','Current Generation',
                          'License Model'], axis=1)

In [10]:
data_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 570343 entries, 0 to 570342
Data columns (total 11 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   PricePerUnit         570343 non-null  float64       
 1   instanceFamily       570343 non-null  object        
 2   LeaseContractLength  570343 non-null  int64         
 3   PurchaseOption       570343 non-null  object        
 4   vCPU                 570343 non-null  int64         
 5   Memory               570343 non-null  int64         
 6   operatingSystem      570343 non-null  object        
 7   Network Performance  570343 non-null  float64       
 8   EffectiveDate        570343 non-null  datetime64[ns]
 9   DiskType             570343 non-null  object        
 10  StorageSize          570343 non-null  int64         
dtypes: datetime64[ns](1), float64(2), int64(4), object(4)
memory usage: 47.9+ MB


In [11]:
# Select regions

In [12]:
# # # replace USEast and USWest to US and EU to Europe
# data_all = data_all.replace({'USEast': 'US', 'USWest': 'US', 'EU': 'Europe'})
# data_all = data_all[(data_all['Location'] != "Africa") & (data_all['Location'] != "MiddleEast") & (data_all['Location']
#                                                                                                    != "Canada") & (data_all['Location'] != "SouthAmerica") & (data_all['Location'] != "AWSGovCloud")]

In [13]:
# data_all = data_all.drop(['Location'], axis=1)

In [14]:
# # create several new date time features
data_all['year'] = data_all['EffectiveDate'].dt.year
# data['day_of_year'] = data['EffectiveDate'].dt.dayofyear
# data['weekday'] = data['EffectiveDate'].dt.weekday
# data['week_of_year'] = data['EffectiveDate'].dt.week
# data['day_of_month'] = data['EffectiveDate'].dt.day
# data['quarter'] = data['EffectiveDate'].dt.quarter

# data.drop('EffectiveDate', axis=1, inplace=True)

### Sort 

In [15]:
data_all.sort_values(by='EffectiveDate')
data_all.reset_index(drop=True)

Unnamed: 0,PricePerUnit,instanceFamily,LeaseContractLength,PurchaseOption,vCPU,Memory,operatingSystem,Network Performance,EffectiveDate,DiskType,StorageSize,year
0,0.0605,Memory optimized,3,Partial Upfront,1,8,Red Hat Enterprise Linux with HA,10.0,2021-03-01,NVMe SSD,59,2021
1,0.1403,Memory optimized,1,No Upfront,1,8,Red Hat Enterprise Linux with HA,10.0,2021-03-01,NVMe SSD,59,2021
2,0.1466,Memory optimized,1,No Upfront,1,8,Red Hat Enterprise Linux with HA,10.0,2021-03-01,NVMe SSD,59,2021
3,0.1231,Memory optimized,3,No Upfront,1,8,Red Hat Enterprise Linux with HA,10.0,2021-03-01,NVMe SSD,59,2021
4,0.0721,Memory optimized,1,Partial Upfront,1,8,Red Hat Enterprise Linux with HA,10.0,2021-03-01,NVMe SSD,59,2021
...,...,...,...,...,...,...,...,...,...,...,...,...
570338,0.1470,Compute optimized,3,No Upfront,4,8,RHEL,10.0,2020-06-01,NVMe SSD,150,2020
570339,0.1340,Compute optimized,1,Partial Upfront,4,8,RHEL,10.0,2020-06-01,NVMe SSD,150,2020
570340,0.1210,Compute optimized,1,Partial Upfront,4,8,RHEL,10.0,2020-06-01,NVMe SSD,150,2020
570341,0.1080,Compute optimized,3,Partial Upfront,4,8,RHEL,10.0,2020-06-01,NVMe SSD,150,2020


In [16]:
data_all['year'].value_counts()

2022    184264
2020    112164
2021    104188
2019     72528
2018     43965
2017     20336
2016     17802
2023     15096
Name: year, dtype: int64

# Outlier Methods

## IsolationForest

In [17]:
from sklearn.ensemble import IsolationForest

if outlier == 'isolation':

    # Select the column(s) you want to detect outliers in
    columns_to_check = ['PricePerUnit', 'vCPU', 'Memory']

    # Create an Isolation Forest object
    # Adjust the contamination parameter as needed
    isolation_forest = IsolationForest(contamination=0.1)

    # Fit the Isolation Forest model and predict outlier labels
    outlier_labels = isolation_forest.fit_predict(data_all[columns_to_check])

    # Identify outliers based on the predicted labels
    outliers = data_all[outlier_labels == -1]

    # Exclude outliers from the dataset
    data_all = data_all[outlier_labels != -1]

    # Print the outliers
    print("Outliers:")
    print(outliers)

    # Print the cleaned dataset
    print("Cleaned Data:")
    print(data_all)

## LOF

In [18]:
from sklearn.neighbors import LocalOutlierFactor


if outlier == 'lof':

    # Select the column(s) you want to detect outliers in
    columns_to_check = ['PricePerUnit']

    # Create a LOF object
    # Adjust parameters as needed
    lof = LocalOutlierFactor(n_neighbors=20, contamination=0.1)

    # Fit the LOF model and predict outlier scores
    outlier_scores = lof.fit_predict(data_all[columns_to_check])

    # Identify outliers based on the predicted scores
    outliers = data_all[outlier_scores == -1]

    # Exclude outliers from the dataset
    data_all = data_all[outlier_scores != -1]

    # Print the outliers
    print("Outliers:")
    print(outliers)

    # Print the cleaned dataset
    print("Cleaned Data:")
    print(data_all)
else:
    pass

## Interquartile Range (IQR)



In [19]:

if outlier == 'iqr':
    # Select the column(s) you want to detect outliers in
    columns_to_check = ['PricePerUnit']

    # Calculate the IQR for each column
    Q1 = data_all[columns_to_check].quantile(0.25)
    Q3 = data_all[columns_to_check].quantile(0.75)
    IQR = Q3 - Q1

    # Define a threshold for identifying outliers
    threshold = 2

    # Determine the lower and upper bounds for outlier detection
    lower_bound = Q1 - threshold * IQR
    upper_bound = Q3 + threshold * IQR

    # Identify outliers by filtering the dataset
    outliers = data_all[~((data_all[columns_to_check] >= lower_bound) & (
        data_all[columns_to_check] <= upper_bound)).all(axis=1)]

    # Exclude outliers from the dataset
    data_all = data_all[((data_all[columns_to_check] >= lower_bound) & (
        data_all[columns_to_check] <= upper_bound)).all(axis=1)]

    # Print the outliers
    print("Outliers:")
    print(outliers)

    # Print the cleaned dataset
    print("Cleaned Data:")
    print(data_all)
else:
    pass

## Z-score

In [20]:
if outlier == 'zscore':

    # Select the column(s) you want to detect outliers in
    columns_to_check = ['PricePerUnit']

    # Compute the Z-scores for each data point
    z_scores = np.abs(stats.zscore(data_all[columns_to_check]))

    # Define a threshold for identifying outliers
    threshold = 0.75

    # Find the indices of outliers
    outlier_indices = np.where(z_scores > threshold)

    # Get the rows containing outliers
    outliers = data_all.iloc[outlier_indices[0]]

    # Exclude outliers from the dataset
    data_all = data_all.drop(data_all.index[outlier_indices[0]])

    # Print the outliers
    print(outliers)

    mlflow.log_param("z-threshold", threshold)

else:
    pass

        PricePerUnit     instanceFamily  LeaseContractLength   PurchaseOption  \
32            13.487  Storage optimized                    3  Partial Upfront   
33            28.485  Storage optimized                    1       No Upfront   
34            29.158  Storage optimized                    1       No Upfront   
35            27.213  Storage optimized                    3       No Upfront   
36            14.456  Storage optimized                    1  Partial Upfront   
...              ...                ...                  ...              ...   
570293        25.886  Storage optimized                    1       No Upfront   
570294        21.422  Storage optimized                    3       No Upfront   
570295        12.706  Storage optimized                    1  Partial Upfront   
570296        12.078  Storage optimized                    1  Partial Upfront   
570298        22.294  Storage optimized                    3       No Upfront   

        vCPU  Memory operat

##  Create a quarterly seasonality column


In [21]:
# Convert EffectiveDate column to datetime format
data_all['EffectiveDate'] = pd.to_datetime(data_all['EffectiveDate'])

# Extract the quarter component from the EffectiveDate column
data_all['Quarter'] = data_all['EffectiveDate'].dt.quarter

# Create a dictionary mapping quarter numbers to season labels
seasons = {1: 'Q1', 2: 'Q2', 3: 'Q3', 4: 'Q4'}

# Map the quarter numbers to season labels using the dictionary
data_all['Seasonality'] = data_all['Quarter'].map(seasons)

In [22]:
data_all

data_all = data_all.drop(['Seasonality'], axis=1)

In [23]:
data_all.head()

Unnamed: 0,PricePerUnit,instanceFamily,LeaseContractLength,PurchaseOption,vCPU,Memory,operatingSystem,Network Performance,EffectiveDate,DiskType,StorageSize,year,Quarter
0,0.0605,Memory optimized,3,Partial Upfront,1,8,Red Hat Enterprise Linux with HA,10.0,2021-03-01,NVMe SSD,59,2021,1
1,0.1403,Memory optimized,1,No Upfront,1,8,Red Hat Enterprise Linux with HA,10.0,2021-03-01,NVMe SSD,59,2021,1
2,0.1466,Memory optimized,1,No Upfront,1,8,Red Hat Enterprise Linux with HA,10.0,2021-03-01,NVMe SSD,59,2021,1
3,0.1231,Memory optimized,3,No Upfront,1,8,Red Hat Enterprise Linux with HA,10.0,2021-03-01,NVMe SSD,59,2021,1
4,0.0721,Memory optimized,1,Partial Upfront,1,8,Red Hat Enterprise Linux with HA,10.0,2021-03-01,NVMe SSD,59,2021,1


In [24]:
# categorical = ['PurchaseOption','OfferingClass', 'Location', 'Tenancy', 'operatingSystem', 'DiskType']

# Prepare the Data

In [25]:
# %%  =========== Prepare the Data for regression============

# Map binary categorical columns to numerical

categorical_binary = ['PurchaseOption']
data_all[categorical_binary] = data_all[categorical_binary].apply(mf.binary_map)

In [26]:
# Write the categorical values as a list
# categorical = ['operatingSystem', 'DiskType', 'Location']
categorical = ['operatingSystem', 'DiskType', 'instanceFamily']
categorical2numeric = pd.get_dummies(data_all[categorical], drop_first=False)

categorical2numeric_pred = pd.get_dummies(
    data_all[categorical], drop_first=False)

# Add the above results to the original dataframe df
data_all= pd.concat([data_all, categorical2numeric], axis=1)
data_all.drop(columns=categorical, axis=1, inplace=True)


# Years

In [27]:
data_all.year.unique()

array([2021, 2020, 2018, 2022, 2019, 2016, 2017, 2023])

In [28]:
data_pred = data_all[data_all['year'] > 2021]
data_pred = data_pred.sort_values(by='EffectiveDate')
data_pred = data_pred.reset_index(drop=True)
data_pred.year.unique()

array([2022, 2023])

In [29]:
data = data_all[(data_all['year'] > 2017) & (data_all['year'] < 2022)]
data = data.sort_values(by='EffectiveDate')
data = data.reset_index(drop=True)
data.year.unique()


mlflow.log_param("Years used to predict", data.year.unique())

array([2018, 2019, 2020, 2021])

## Assign target value

In [30]:
X = data.drop(['PricePerUnit', 'EffectiveDate', 'year'], axis=1)
y = data.PricePerUnit

In [31]:
# # identify categorical features indices
# def column_index(data, query_cols):
#     cols = data.columns.values
#     sidx = np.argsort(cols)
#     return sidx[np.searchsorted(cols, query_cols, sorter=sidx)]

# categorical_features_indices = column_index(X, categorical)

# CatBoost Model Training

In [32]:
# CatBoost Model Training
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.25, random_state=42)

In [33]:
model = CatBoostRegressor(iterations=50, depth=3,
                          learning_rate=0.1, loss_function='RMSE')

# cat_features=categorical_features_indices
model.fit(X_train, y_train,
          eval_set=(X_valid, y_valid), plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 2.3945498	test: 2.3874993	best: 2.3874993 (0)	total: 50.8ms	remaining: 2.49s
1:	learn: 2.3033683	test: 2.2967288	best: 2.2967288 (1)	total: 54.6ms	remaining: 1.31s
2:	learn: 2.2247566	test: 2.2185418	best: 2.2185418 (2)	total: 58.1ms	remaining: 910ms
3:	learn: 2.1532908	test: 2.1473406	best: 2.1473406 (3)	total: 61.3ms	remaining: 705ms
4:	learn: 2.0916831	test: 2.0859080	best: 2.0859080 (4)	total: 64.2ms	remaining: 578ms
5:	learn: 2.0364151	test: 2.0305396	best: 2.0305396 (5)	total: 67.3ms	remaining: 493ms
6:	learn: 1.9901870	test: 1.9846605	best: 1.9846605 (6)	total: 70.2ms	remaining: 431ms
7:	learn: 1.9476949	test: 1.9421501	best: 1.9421501 (7)	total: 73ms	remaining: 383ms
8:	learn: 1.9127733	test: 1.9073156	best: 1.9073156 (8)	total: 75.5ms	remaining: 344ms
9:	learn: 1.8794152	test: 1.8742761	best: 1.8742761 (9)	total: 78.2ms	remaining: 313ms
10:	learn: 1.8489374	test: 1.8441281	best: 1.8441281 (10)	total: 80.6ms	remaining: 286ms
11:	learn: 1.8225889	test: 1.8176029	best: 

<catboost.core.CatBoostRegressor at 0x7f6d924c1c10>

# Catboost Model

### Catboost Regressor parameters

In [34]:
start = time.time()

model = CatBoostRegressor(iterations=15000,
                          loss_function='RMSE',
                          # 0.001 - 0.1 A lower learning rate results in slower but more precise convergence
                          learning_rate=0.001,
                          # 4 - 10  Higher values allow the model to capture more complex interactions but can increase the risk of overfitting.
                          depth=16,
                          custom_metric=['MAPE', 'RMSE', 'MAE', 'R2'],
                          random_seed=42,
                          bagging_temperature=0.2,  # 0 - 1
                          # Can be 'Iter' or 'IncToDec'. 'Iter' stops training when the evaluation metric stops improving, and 'IncToDec' stops when the evaluation metric starts to worsen.
                          od_type='Iter',
                          metric_period=75,  # how frequently the evaluation metric is calculated during training
                          task_type='GPU',  # Enable GPU training
                          # number of iterations to wait for the evaluation metric to improve before stopping training.
                          od_wait=100,
                          )

# Log specific parameters of the CatBoost model
params_to_track = ['iterations', 'learning_rate', 'depth', 'loss_function', 'bagging_temperature',
                   'random_seed', 'metric_period', 'od_wait', 'task_type']
for param in params_to_track:
    param_value = model.get_params().get(param)
    mlflow.log_param(param, str(param_value))

In [35]:
# mlflow.end_run()

In [36]:
model.fit(X_train, y_train,
          eval_set=(X_valid, y_valid),
          use_best_model=True, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Metric R2 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	learn: 2.5004065	test: 2.4931000	best: 2.4931000 (0)	total: 177ms	remaining: 44m 16s
75:	learn: 2.3820903	test: 2.3750271	best: 2.3750271 (75)	total: 10.6s	remaining: 34m 37s
150:	learn: 2.2747992	test: 2.2679797	best: 2.2679797 (150)	total: 21.1s	remaining: 34m 38s
225:	learn: 2.1780433	test: 2.1714927	best: 2.1714927 (225)	total: 31.2s	remaining: 34m 1s
300:	learn: 2.0906529	test: 2.0843972	best: 2.0843972 (300)	total: 41.8s	remaining: 34m 2s
375:	learn: 2.0121518	test: 2.0062031	best: 2.0062031 (375)	total: 52.2s	remaining: 33m 51s
450:	learn: 1.9417086	test: 1.9360953	best: 1.9360953 (450)	total: 1m 2s	remaining: 33m 45s
525:	learn: 1.8786471	test: 1.8733604	best: 1.8733604 (525)	total: 1m 13s	remaining: 33m 40s
600:	learn: 1.8223725	test: 1.8174274	best: 1.8174274 (600)	total: 1m 24s	remaining: 33m 37s
675:	learn: 1.7723326	test: 1.7677365	best: 1.7677365 (675)	total: 1m 34s	remaining: 33m 19s
750:	learn: 1.7278259	test: 1.7235917	best: 1.7235917 (750)	total: 1m 44s	remaining: 

<catboost.core.CatBoostRegressor at 0x7f6d9e955160>

In [1]:
mlflow.catboost.log_model(model, "catboost_model")

best_iteration = model.get_best_iteration()

mlflow.log_metric('best_iteration', best_iteration)

NameError: name 'mlflow' is not defined

## Plot the Feature Importance

In [None]:

fea_imp = pd.DataFrame({'imp': model.feature_importances_, 'col': X.columns})
fea_imp = fea_imp.sort_values(['imp', 'col'], ascending=[
                              True, False]).iloc[-30:]

fig = go.Figure(data=[go.Bar(
    x=fea_imp['imp'],
    y=fea_imp['col'],
    orientation='h',
    marker=dict(color=fea_imp['imp'], colorbar=dict(title='Importance'))

)])

fig.update_layout(
    title='CatBoost - Feature Importance - All',
    yaxis=dict(title='Features'),
    xaxis=dict(title='Importance'),
    height=600,
    width=800
)

# fig.show()


# Save the Plotly figure as an HTML file
html_path = "temp/feature_importance_all.html"
pio.write_html(fig, html_path)

# Log the HTML file as an artifact in MLflow
mlflow.log_artifact(html_path)

### Log feature importance - mlflow

In [None]:
# Get the feature importance values and feature names from your CatBoost model
feature_importance = model.get_feature_importance()
feature_names = X.columns

# Create a DataFrame to store the feature importance values and feature names
feature_importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': feature_importance})

# Create a temporary file to save the feature importance DataFrame
with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.csv') as temp_file:
    # Save the feature importance DataFrame as a CSV file
    feature_importance_df.to_csv(temp_file, index=False)

# Log the feature importance CSV file as an artifact in MLflow
mlflow.log_artifact(temp_file.name, "feature_importance.csv")

In [None]:
# import pandas as pd
# import tempfile

# # Define the years sets
# years_sets = [
#     [2020, 2021],
#     [2019, 2020, 2021],
#     [2018, 2019, 2020, 2021],
#     [2021]
# ]

# # Create a DataFrame to store the feature importance values and feature names
# feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})

# # Insert the year sets as the first column in the DataFrame
# feature_importance_df.insert(0, 'Year Sets', years_sets)

# # Create a temporary file to save the feature importance DataFrame
# with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.csv') as temp_file:
#     # Save the feature importance DataFrame as a CSV file
#     feature_importance_df.to_csv(temp_file, index=False)

# # Log the feature importance CSV file as an artifact in MLflow
# mlflow.log_artifact(temp_file.name, "feature_importance.csv")

### Evalute Model

In [None]:
# model evaluation
from sklearn.metrics import mean_squared_error
print('Model evaluation:')
print(model.get_params())
print('RMSE:', np.sqrt(mean_squared_error(y_valid, model.predict(X_valid))))

In [None]:
print(model.get_best_score())

# Flatten and log the best scores as parameters in MLflow
best_scores = model.get_best_score()
for stage, metrics in best_scores.items():
    for metric, value in metrics.items():
        mlflow.log_metric(f'{stage}_{metric}', value)

In [None]:
# End the MLflow run
# mlflow.end_run()

### Finish without shap

## SHAP

In [None]:
# import shap

# pool = cb.Pool(X_valid, y_valid)


# # Compute SHAP values
# shap_values = model.get_feature_importance(pool, type='ShapValues')

# # Convert SHAP values to a DataFrame
# shap_df = pd.DataFrame(shap_values[:, :-1], columns=X_valid.columns)

# # Log the SHAP values as an artifact in MLflow
# shap_df.to_csv("shap_values.csv", index=False)
# mlflow.log_artifact("shap_values.csv")

In [None]:
# # Exclude the constant offset column from shap_values
# shap_values = shap_values[:, :-1]

# # Create a SHAP summary plot
# shap.summary_plot(shap_values, X_valid)

# # Save the plot as an artifact in MLflow
# shap_plot_path = 'shap_summary_plot.png'
# shap.summary_plot(shap_values, X_valid, show=False)
# plt.savefig(shap_plot_path)
# mlflow.log_artifact(shap_plot_path)

# Evaluation


## Predictions vs Actual Prices

In [None]:
# data_pred_tmp = data_pred.drop(['PricePerUnit'], axis=1)
predictions = model.predict(data_pred)

# Create a DataFrame with the predictions
predictions_df = pd.DataFrame(predictions, columns=['Predictions'])

# Save the predictions to a CSV file
predictions_df.to_csv('temp/predictions.csv', index=False)

# Log the predictions CSV file as an artifact in MLflow
mlflow.log_artifact('temp/predictions.csv', 'predictions')

In [None]:
predictions

In [None]:
actual_prices = data_pred.PricePerUnit

In [None]:

import plotly.graph_objects as go

# Create a line plot using Plotly
fig = go.Figure()
fig.add_trace(go.Scatter(x=list(range(len(actual_prices))),
              y=actual_prices, mode='lines', name='Actual'))
fig.add_trace(go.Scatter(x=list(range(len(predictions))),
              y=predictions, mode='lines', name='Predicted'))

fig.update_layout(
    title='Actual vs. Predicted Prices for 2022',
    xaxis=dict(title='Time'),
    yaxis=dict(title='Price')
)

# fig.show()

# Save the Plotly figure as an HTML file
html_path = "temp/actual_predicted.html"
pio.write_html(fig, html_path)

# Log the HTML file as an artifact in MLflow
mlflow.log_artifact(html_path)

In [None]:
rmse = mean_squared_error(actual_prices, predictions, squared=False)

In [None]:
rmse

In [None]:
error = actual_prices - predictions

In [None]:
plt.scatter(actual_prices, predictions, c=error, cmap='coolwarm')
plt.plot(np.linspace(min(actual_prices), max(actual_prices), 100), np.linspace(
    min(actual_prices), max(actual_prices), 100), color='black', linestyle='--')
plt.xlabel('Actual Prices')
plt.ylabel('Predicted Prices')
plt.colorbar()
plt.show()

In [None]:
# Create a scatter plot using Plotly
fig = go.Figure(data=go.Scatter(
    x=actual_prices,
    y=predictions,
    mode='markers',
    marker=dict(
        size=8,
        color=error,
        colorscale='Viridis',
        showscale=True,
        colorbar=dict(title='Error')
    )
))

fig.update_layout(
    title='Actual vs. Predicted Prices',
    xaxis=dict(title='Actual Prices'),
    yaxis=dict(title='Predicted Prices')
)

# fig.show()

# Save the Plotly figure as an HTML file
html_path = "temp/actual_predicted_errors.html"
pio.write_html(fig, html_path)

# Log the HTML file as an artifact in MLflow
mlflow.log_artifact(html_path)

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error


mse = mean_squared_error(actual_prices, predictions)
rmse = mean_squared_error(actual_prices, predictions, squared=False)
mae = mean_absolute_error(actual_prices, predictions)
r2 = r2_score(actual_prices, predictions)
mape = mean_absolute_percentage_error(actual_prices, predictions)

In [None]:
mlflow.log_metric("MSE", mse)
mlflow.log_metric("RMSE", rmse)
mlflow.log_metric("MAE", mae)
mlflow.log_metric("MAPE", mape)
mlflow.log_metric("R-squared", r2)

# mlflow.log_metric()

In [None]:
print('MSE:', mse)
print('RMSE:', rmse)
print('MAE:', mae)
print('MAPE:', mape)
print('R-squared:', r2)

https://towardsdatascience.com/ad-demand-forecast-with-catboost-lightgbm-819e5073cd3e

https://towardsdatascience.com/understanding-feature-importance-and-how-to-implement-it-in-python-ff0287b20285

In [None]:
# End the MLflow run
mlflow.end_run()