In [196]:
import mlflow.sklearn
import pandas as pd
import plotly.graph_objects as go


import my_functions as mf

In [197]:
# instance_type = "g5.xlarge"
# operating_system = 'Windows'
# purchase_option = "Partial Upfront"

instance_type = input("Enter the instance type: ")
operating_system = input("Enter the operating system: ")
purchase_option = input("Enter the purchase option: ")

In [198]:
# Import data for prediction

data = pd.read_csv(
    f'./data/{instance_type}_{operating_system}_{purchase_option}.csv')

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   YearQuarter          10 non-null     object 
 1   PricePerUnit         10 non-null     float64
 2   LeaseContractLength  10 non-null     float64
 3   PurchaseOption       10 non-null     object 
 4   vCPU                 10 non-null     float64
 5   DiskType             10 non-null     object 
 6   operatingSystem      10 non-null     object 
 7   Memory               10 non-null     float64
 8   networkPerformance   10 non-null     float64
 9   StorageSize          10 non-null     float64
 10  Quarter              10 non-null     float64
 11  year                 10 non-null     int64  
dtypes: float64(7), int64(1), object(4)
memory usage: 1.1+ KB


In [199]:
data_actual = data[data['year'] != 2023]

actual_prices = data_actual.set_index('YearQuarter')['PricePerUnit']

data = data.set_index('YearQuarter')[['LeaseContractLength', 'PurchaseOption',
                                      'vCPU', 'Memory', 'operatingSystem',
                                      'networkPerformance', 'StorageSize',
                                      'DiskType', 'Quarter', 'year']]

data_pred = data[data['year'] > 2022]

## Prepare the Data

In [200]:
# %%  =========== Prepare the Data for regression============

# Map binary categorical columns to numerical
categorical_binary = ['PurchaseOption']
data[categorical_binary] = data[categorical_binary].apply(
    mf.binary_map)

# Write the categorical values as a list
categorical = ['operatingSystem', 'DiskType']
categorical2numeric = pd.get_dummies(
    data[categorical], drop_first=False)

categorical2numeric_pred = pd.get_dummies(
    data[categorical], drop_first=False)

# Add the above results to the original dataframe df
data = pd.concat([data, categorical2numeric], axis=1)
data.drop(columns=categorical, axis=1, inplace=True)

In [201]:
# Add missing categorical feature if necessary
if 'DiskType_NVMe SSD' not in data.columns:
    data['DiskType_NVMe SSD'] = 0

operatingSystem_RHEL_HA = 0
operatingSystem_Windows = 0
operatingSystem_Linux = 0

if 'operatingSystem_RHEL_HA' not in data.columns:
    data.insert(9, 'operatingSystem_RHEL_HA', operatingSystem_RHEL_HA)

if 'operatingSystem_Windows' not in data.columns:
    data.insert(10, 'operatingSystem_Windows',
                operatingSystem_Windows)

if 'operatingSystem_Linux' not in data.columns:
    data.insert(8, 'operatingSystem_Linux',
                operatingSystem_Linux)

In [202]:
# RUN_ID = input("Enter the model RUN ID you want to use: ")
path1 = "file:///media/gfragi/data/BarraCuda/mlruns/mlruns"
path2 = "artifacts/catboost_model"
# g4dn.12xlarge_Windows_Partial Upfront.csv
logged_model = f"{path1}/977346654452183974/9fa9c1ec775a46bcb670cb8aac6d321b/{path2}"

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)

In [203]:

# Predict on a Pandas DataFrame.
predictions = loaded_model.predict(pd.DataFrame(data))

# Create a DataFrame with the predictions and set the index to match the actual prices
predictions_df = pd.DataFrame(
    predictions, columns=['Predictions'], index=data.index)


# # Save the predictions to a CSV file
# predictions_df.to_csv(
#     f'predResults/{instance_type}_predictions.csv', index=False)

In [204]:
import plotly.io as pio

pio.renderers.default = "browser"

# Create a line plot using Plotly
fig = go.Figure()
fig.add_trace(go.Scatter(x=actual_prices.index,
              y=actual_prices, mode='lines+markers', name='Actual'))
fig.add_trace(go.Scatter(x=predictions_df.index,
              y=predictions, mode='lines+markers', name='Predicted'))

fig.update_layout(
    title=f'Forecast for the quarters of 2023 for {instance_type}',
    xaxis=dict(title='Year Quarter'),
    yaxis=dict(title='Price ($) per hour'),
    template='presentation'
)

fig.show()

# Save the Plotly figure as an HTML file
# html_path = "temp/actual_predicted.html"
# pio.write_html(fig, html_path)

# # Log the HTML file as an artifact in MLflow
# mlflow.log_artifact(html_path)

Opening in existing browser session.


/snap/core20/current/lib/x86_64-linux-gnu/libstdc++.so.6: version `GLIBCXX_3.4.29' not found (required by /lib/x86_64-linux-gnu/libproxy.so.1)
Failed to load module: /home/gfragi/snap/code/common/.cache/gio-modules/libgiolibproxy.so
Gtk-Message: 06:46:11.678: Failed to load module "canberra-gtk-module"
Gtk-Message: 06:46:11.678: Failed to load module "canberra-gtk-module"


In [205]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error


mse = mean_squared_error(actual_prices, predictions)
rmse = mean_squared_error(actual_prices, predictions, squared=False)
mae = mean_absolute_error(actual_prices, predictions)
r2 = r2_score(actual_prices, predictions)
mape = mean_absolute_percentage_error(actual_prices, predictions)

ValueError: Found input variables with inconsistent numbers of samples: [6, 10]

In [None]:
print('MSE:', mse)
print('RMSE:', rmse)
print('MAE:', mae)
print('MAPE:', mape)
print('R-squared:', r2)

MSE: 34.64502444447058
RMSE: 5.886002416281409
MAE: 5.880372817441975
MAPE: 0.6569875914834162
R-squared: -410.89957652977847
