In [181]:
# Importing other packages
import tempfile
import timeit

import matplotlib.pyplot as plt
import neptune
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import seaborn as sns

# Make "browser" the default Plotly renderer, so fig.show() opens each figure
# in the system web browser instead of rendering it inline in the notebook.
pio.renderers.default = "browser"

In [182]:
# Load the pricing data set; EffectiveDate is parsed to datetime while reading.
# (Plain string literal: the path has no placeholders, so no f-string is needed.)
df = pd.read_csv('../data/amazon_22.csv', parse_dates=['EffectiveDate'])

In [183]:
# Number of rows per 'Instance Family' value
df['Instance Family'].value_counts()

Memory optimized                   213171
Storage optimized                  131324
General purpose                    115748
Compute optimized                   84582
GPU instance                        24350
FPGA Instances                       1040
Machine Learning ASIC Instances       128
Name: Instance Family, dtype: int64

In [184]:
# Number of rows per 'TermType' value
df['TermType'].value_counts()

Reserved    570343
Name: TermType, dtype: int64

[101787:101787:0615/210150.536750:ERROR:shared_context_state.cc(898)] SharedContextState context lost via ARB/EXT_robustness. Reset status = GL_UNKNOWN_CONTEXT_RESET_KHR
[101787:101787:0615/210150.541501:ERROR:gpu_service_impl.cc(1010)] Exiting GPU process because some drivers can't recover from errors. GPU process will restart shortly.
[101735:101735:0615/210150.594058:ERROR:gpu_process_host.cc(953)] GPU process exited unexpectedly: exit_code=8704
libva error: vaGetDriverNameByIndex() failed with unknown libva error, driver_name = (null)
[0615/215525.751701:ERROR:ptracer.cc(567)] ptrace: Input/output error (5)
[0615/215525.764602:ERROR:elf_dynamic_array_reader.h(64)] tag not found
[0615/215525.778165:ERROR:directory_reader_posix.cc(42)] opendir /home/gfragi/snap/brave/244/.config/BraveSoftware/Brave-Browser/Crash Reports/attachments/c44933c0-081d-4278-b30b-395834dad43f: No such file or directory (2)
[101735:101735:0615/215525.787378:ERROR:gpu_process_host.cc(953)] GPU process exited u

In [148]:
# Drop the rows belonging to the instance families listed below
excluded_families = ["FPGA Instances", "Machine Learning ASIC Instances"]
df = df[~df['Instance Family'].isin(excluded_families)]

In [149]:
# Number of rows per 'Location' value
df['Location'].value_counts()

AsiaPacific     170419
EU              136519
USEast           75502
USWest           56837
AWSGovCloud      33972
SouthAmerica     19694
Canada           19568
MiddleEast       16352
Europe           15920
Africa           12544
Name: Location, dtype: int64

In [150]:
# Merge USEast and USWest into US, and EU into Europe.
# The nested dict restricts the replacement to the 'Location' column, so an
# identical string appearing in any other column is left untouched.
df = df.replace({'Location': {'USEast': 'US', 'USWest': 'US', 'EU': 'Europe'}})

In [151]:
# Re-check the number of rows per 'Location' value after the replacement
df['Location'].value_counts()

AsiaPacific     170419
Europe          152439
US              132339
AWSGovCloud      33972
SouthAmerica     19694
Canada           19568
MiddleEast       16352
Africa           12544
Name: Location, dtype: int64

In [152]:
# Keep only the locations that have data in all years by dropping the ones below
dropped_locations = ["Africa", "MiddleEast", "Canada", "SouthAmerica", "AWSGovCloud"]
df = df[~df['Location'].isin(dropped_locations)]

In [153]:
# Derive calendar-quarter features from EffectiveDate.
#
# The columns are added with `assign`, which returns a new frame, instead of
# writing into `df` column by column: `df` is the result of boolean filtering,
# and in-place column assignment on such a frame can emit pandas'
# SettingWithCopyWarning.

# Dictionary mapping quarter numbers to season labels
seasons = {1: 'Q1', 2: 'Q2', 3: 'Q3', 4: 'Q4'}

# Convert EffectiveDate to datetime (kept as a safeguard in case the column
# was not already parsed when the file was loaded)
effective_date = pd.to_datetime(df['EffectiveDate'])

# Quarter number (1-4) of each effective date
quarter = effective_date.dt.quarter

# Existing EffectiveDate keeps its position; Quarter and Seasonality are
# appended, in that order, as the last two columns
df = df.assign(
    EffectiveDate=effective_date,
    Quarter=quarter,
    Seasonality=quarter.map(seasons),
)

In [154]:
# Display the frame, which now includes the Quarter and Seasonality columns
df

Unnamed: 0,SKU,OfferTermCode,RateCode,PricePerUnit,Instance Type,Instance Family,LeaseContractLength,PurchaseOption,OfferingClass,Product Family,...,Tenancy,Operating System,License Model,year,Network Performance,EffectiveDate,DiskType,StorageSize,Quarter,Seasonality
0,222AY99RA8W7WFR4,38NPMPTW36,222AY99RA8W7WFR4.38NPMPTW36.6YS6EN2CT7,0.0605,r6gd.medium,Memory optimized,3,Partial Upfront,standard,Compute Instance,...,Dedicated,Red Hat Enterprise Linux with HA,No License required,2021,10.0,2021-03-01,NVMe SSD,59,1,Q1
1,222AY99RA8W7WFR4,4NA7Y494T4,222AY99RA8W7WFR4.4NA7Y494T4.6YS6EN2CT7,0.1403,r6gd.medium,Memory optimized,1,No Upfront,standard,Compute Instance,...,Dedicated,Red Hat Enterprise Linux with HA,No License required,2021,10.0,2021-03-01,NVMe SSD,59,1,Q1
2,222AY99RA8W7WFR4,7NE97W5U4E,222AY99RA8W7WFR4.7NE97W5U4E.6YS6EN2CT7,0.1466,r6gd.medium,Memory optimized,1,No Upfront,convertible,Compute Instance,...,Dedicated,Red Hat Enterprise Linux with HA,No License required,2021,10.0,2021-03-01,NVMe SSD,59,1,Q1
3,222AY99RA8W7WFR4,BPH4J8HBKS,222AY99RA8W7WFR4.BPH4J8HBKS.6YS6EN2CT7,0.1231,r6gd.medium,Memory optimized,3,No Upfront,standard,Compute Instance,...,Dedicated,Red Hat Enterprise Linux with HA,No License required,2021,10.0,2021-03-01,NVMe SSD,59,1,Q1
4,222AY99RA8W7WFR4,CUZHX8X6JH,222AY99RA8W7WFR4.CUZHX8X6JH.6YS6EN2CT7,0.0721,r6gd.medium,Memory optimized,1,Partial Upfront,convertible,Compute Instance,...,Dedicated,Red Hat Enterprise Linux with HA,No License required,2021,10.0,2021-03-01,NVMe SSD,59,1,Q1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
558490,ZZZU9HQBJNWNW7AD,BPH4J8HBKS,ZZZU9HQBJNWNW7AD.BPH4J8HBKS.6YS6EN2CT7,0.1470,c5ad.xlarge,Compute optimized,3,No Upfront,standard,Compute Instance,...,Shared,RHEL,No License required,2020,10.0,2020-06-01,NVMe SSD,150,2,Q2
558491,ZZZU9HQBJNWNW7AD,CUZHX8X6JH,ZZZU9HQBJNWNW7AD.CUZHX8X6JH.6YS6EN2CT7,0.1340,c5ad.xlarge,Compute optimized,1,Partial Upfront,convertible,Compute Instance,...,Shared,RHEL,No License required,2020,10.0,2020-06-01,NVMe SSD,150,2,Q2
558492,ZZZU9HQBJNWNW7AD,HU7G6KETJZ,ZZZU9HQBJNWNW7AD.HU7G6KETJZ.6YS6EN2CT7,0.1210,c5ad.xlarge,Compute optimized,1,Partial Upfront,standard,Compute Instance,...,Shared,RHEL,No License required,2020,10.0,2020-06-01,NVMe SSD,150,2,Q2
558493,ZZZU9HQBJNWNW7AD,R5XV2EPZQZ,ZZZU9HQBJNWNW7AD.R5XV2EPZQZ.6YS6EN2CT7,0.1080,c5ad.xlarge,Compute optimized,3,Partial Upfront,convertible,Compute Instance,...,Shared,RHEL,No License required,2020,10.0,2020-06-01,NVMe SSD,150,2,Q2


In [155]:
# Summarise the columns: non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 455197 entries, 0 to 558494
Data columns (total 24 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   SKU                  455197 non-null  object        
 1   OfferTermCode        455197 non-null  object        
 2   RateCode             455197 non-null  object        
 3   PricePerUnit         455197 non-null  float64       
 4   Instance Type        455197 non-null  object        
 5   Instance Family      455197 non-null  object        
 6   LeaseContractLength  455197 non-null  int64         
 7   PurchaseOption       455197 non-null  object        
 8   OfferingClass        455197 non-null  object        
 9   Product Family       455197 non-null  object        
 10  Location             455197 non-null  object        
 11  Current Generation   455197 non-null  object        
 12  vCPU                 455197 non-null  int64         
 13  Memory        

In [168]:
# Calculate statistical metrics of PricePerUnit by (year, Instance Family) group.
# Named aggregation yields the final column names directly; 'median' keeps
# its own name.
stats = (
    df.groupby(['year', 'Instance Family'])['PricePerUnit']
    .agg(
        mean_price='mean',
        std_deviation='std',
        median='median',
        maximum='max',
        minimum='min',
    )
    .reset_index()
)

In [172]:
import plotly.express as px

# Line chart of the yearly median price, one coloured trace per instance family
tick_font = dict(size=20, family='Arial')

fig = px.line(
    stats,
    x='year',
    y=['median'],
    color='Instance Family',
    template='plotly_dark',
    title='Evolution of median price by Instance Family',
)

# Draw a marker on every data point in addition to the connecting line
fig.update_traces(mode='lines+markers')

# Fix the y-axis range to 0-7
fig.update_yaxes(range=[0, 7])

# Axis titles and tick fonts, a horizontal legend, and the title/base fonts
fig.update_layout(
    xaxis={'title': 'Year', 'tickfont': tick_font},
    yaxis={'title': 'Price per Unit', 'tickfont': tick_font},
    legend={
        'orientation': 'h',
        'yanchor': 'bottom',
        'y': 1.02,
        'xanchor': 'right',
        'x': 1,
    },
    title={'font': dict(size=24, family='Arial')},
    font={'size': 18, 'family': 'Arial'},
)

fig.show()

Opening in existing browser session.


/snap/core20/current/lib/x86_64-linux-gnu/libstdc++.so.6: version `GLIBCXX_3.4.29' not found (required by /lib/x86_64-linux-gnu/libproxy.so.1)
Failed to load module: /home/gfragi/snap/code/common/.cache/gio-modules/libgiolibproxy.so
Gtk-Message: 14:45:21.589: Failed to load module "canberra-gtk-module"
Gtk-Message: 14:45:21.589: Failed to load module "canberra-gtk-module"


In [158]:
# Scatter of every row's PricePerUnit against its DataFrame index, coloured by
# price, with one facet column per year.
#
# `labels` is looked up by column name, not by argument name: the (unnamed)
# index passed as `x` is labelled "index" and the y/colour column is
# "PricePerUnit". Keys such as `x`/`y` would be silently ignored.
fig = px.scatter(df, x=df.index, y='PricePerUnit', template='plotly_dark',
                 labels={'index': 'Data points', 'PricePerUnit': 'Price per unit'},
                 color='PricePerUnit', title='Density of Price per year', facet_col='year',
                 category_orders={'year': [2016, 2017, 2018, 2019, 2020, 2021, 2022]})

# fig.write_html("results/stats/density.html")
fig.show()

libva error: vaGetDriverNameByIndex() failed with unknown libva error, driver_name = (null)
/snap/core20/current/lib/x86_64-linux-gnu/libstdc++.so.6: version `GLIBCXX_3.4.29' not found (required by /lib/x86_64-linux-gnu/libproxy.so.1)
Failed to load module: /home/gfragi/snap/code/common/.cache/gio-modules/libgiolibproxy.so


Opening in existing browser session.


Gtk-Message: 14:19:11.315: Failed to load module "canberra-gtk-module"
Gtk-Message: 14:19:11.316: Failed to load module "canberra-gtk-module"


In [180]:
import plotly.express as px

# 3-D scatter of price by year and instance family, one colour per family.
#
# `labels` must be keyed by column name ('PricePerUnit'), not by the argument
# name ('z'); a 'z' key is silently ignored and the raw column name is shown.
fig = px.scatter_3d(df, x='year', y='Instance Family', z='PricePerUnit', template='plotly_dark',
                    labels={'PricePerUnit': 'Price per Hour'},
                    color='Instance Family', title='Density of Price per Year',
                    category_orders={'year': [2016, 2017, 2018, 2019, 2020, 2021, 2022]})

fig.update_layout(
    # Horizontal legend anchored at the bottom-left of the figure
    legend=dict(orientation='h', yanchor='bottom',
                y=-0.1, xanchor='left', x=0),
    font=dict(size=16, family='Arial'),
    scene=dict(
        xaxis=dict(title_font=dict(size=20, family='Arial')),
        yaxis=dict(title_font=dict(size=20, family='Arial')),
        zaxis=dict(title_font=dict(size=20, family='Arial')),
        # x and y axis titles are blanked; only the z axis carries a title
        xaxis_title='', yaxis_title='', zaxis_title='Price($) per Hour'
    ),
    title=dict(font=dict(size=24, family='Arial'))
)

fig.update_traces(marker=dict(size=8))

fig.show()

/snap/core20/current/lib/x86_64-linux-gnu/libstdc++.so.6: version `GLIBCXX_3.4.29' not found (required by /lib/x86_64-linux-gnu/libproxy.so.1)
Failed to load module: /home/gfragi/snap/code/common/.cache/gio-modules/libgiolibproxy.so
Gtk-Message: 14:59:36.219: Failed to load module "canberra-gtk-module"
Gtk-Message: 14:59:36.220: Failed to load module "canberra-gtk-module"


Opening in existing browser session.


libva error: vaGetDriverNameByIndex() failed with unknown libva error, driver_name = (null)


In [162]:
import plotly.express as px

# Scatter of instance family against year, coloured by price on a continuous
# Viridis scale.
#
# `labels` must be keyed by column name; keys named after the arguments
# ('x', 'y', 'color') are silently ignored. 'Instance Family' needs no entry
# because the column name is already the desired label.
fig = px.scatter(df, x='year', y='Instance Family', color='PricePerUnit', template='plotly_dark',
                 labels={'year': 'Year', 'PricePerUnit': 'Price per Hour'},
                 color_continuous_scale='Viridis', title='Density of Price per Year')

fig.update_layout(
    legend=dict(orientation='v', yanchor='bottom',
                y=0.5, xanchor='right', x=0),
    font=dict(size=16, family='Arial'),
    xaxis=dict(title_font=dict(size=20, family='Arial')),
    yaxis=dict(title_font=dict(size=20, family='Arial')),
    title=dict(font=dict(size=24, family='Arial'))
)

fig.update_traces(marker=dict(size=8))

fig.show()

/snap/core20/current/lib/x86_64-linux-gnu/libstdc++.so.6: version `GLIBCXX_3.4.29' not found (required by /lib/x86_64-linux-gnu/libproxy.so.1)
Failed to load module: /home/gfragi/snap/code/common/.cache/gio-modules/libgiolibproxy.so
Gtk-Message: 14:21:19.383: Failed to load module "canberra-gtk-module"
Gtk-Message: 14:21:19.384: Failed to load module "canberra-gtk-module"


Opening in existing browser session.


libva error: vaGetDriverNameByIndex() failed with unknown libva error, driver_name = (null)
