In [9]:
# Importing other packages
import tempfile
import timeit

import matplotlib.pyplot as plt
import neptune
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import seaborn as sns

# Make "browser" the default Plotly renderer used by fig.show().
pio.renderers.default = "browser"

In [10]:
# Load the pricing CSV and parse EffectiveDate as a datetime column.
# (Plain string literal: the path has no placeholders, so no f-string is needed.)
df = pd.read_csv('./data/amazon_22.csv', parse_dates=['EffectiveDate'])

In [11]:
# Count rows per instanceFamily value.
df['instanceFamily'].value_counts()

Memory optimized                   213171
Storage optimized                  131324
General purpose                    115748
Compute optimized                   84582
GPU instance                        24350
FPGA Instances                       1040
Machine Learning ASIC Instances       128
Name: instanceFamily, dtype: int64

In [12]:
# Count rows per TermType value.
df['TermType'].value_counts()

Reserved    570343
Name: TermType, dtype: int64

In [13]:
# Drop the two instance families that are excluded from the analysis
excluded_families = ["FPGA Instances", "Machine Learning ASIC Instances"]
df = df[~df['instanceFamily'].isin(excluded_families)]

In [14]:
# Count rows per Location value.
df['Location'].value_counts()

AsiaPacific     178379
EU              137095
USEast           75870
USWest           59045
AWSGovCloud      33972
SouthAmerica     19694
Canada           19568
Europe           16656
MiddleEast       16352
Africa           12544
Name: Location, dtype: int64

In [15]:
# Merge USEast and USWest into US, and EU into Europe.
# The replacement is limited to the Location column: calling replace on the
# whole frame would also rewrite any other column that happens to contain
# one of these exact strings.
region_aliases = {'USEast': 'US', 'USWest': 'US', 'EU': 'Europe'}
df = df.assign(Location=df['Location'].replace(region_aliases))

In [16]:
# Count rows per Location value again, after the region labels were merged.
df['Location'].value_counts()

AsiaPacific     178379
Europe          153751
US              134915
AWSGovCloud      33972
SouthAmerica     19694
Canada           19568
MiddleEast       16352
Africa           12544
Name: Location, dtype: int64

In [17]:
# # keep only the countries that have data in all years
# df = df[(df['Location'] != "Africa") & (df['Location'] != "MiddleEast") & (df['Location']
#                                                                            != "Canada") & (df['Location'] != "SouthAmerica") & (df['Location'] != "AWSGovCloud")]

In [18]:
# Quarter number -> label lookup ('Q1' .. 'Q4')
seasons = {quarter: f'Q{quarter}' for quarter in (1, 2, 3, 4)}

# Make sure EffectiveDate holds datetimes before using the .dt accessor
df['EffectiveDate'] = pd.to_datetime(df['EffectiveDate'])

# Calendar quarter (1-4) of each effective date, and its text label
quarter_of_year = df['EffectiveDate'].dt.quarter
df['Quarter'] = quarter_of_year
df['Seasonality'] = quarter_of_year.map(seasons)

In [19]:
# Discard every row whose year is 2023
not_2023 = df['year'].ne(2023)
df = df[not_2023]

In [20]:
# Show the DataFrame as this cell's output.
df

Unnamed: 0,SKU,OfferTermCode,RateCode,PricePerUnit,instanceType,instanceFamily,LeaseContractLength,PurchaseOption,OfferingClass,Product Family,...,operatingSystem,License Model,year,TermType,Network Performance,EffectiveDate,DiskType,StorageSize,Quarter,Seasonality
0,222AY99RA8W7WFR4,38NPMPTW36,222AY99RA8W7WFR4.38NPMPTW36.6YS6EN2CT7,0.0605,r6gd.medium,Memory optimized,3,Partial Upfront,standard,Compute Instance,...,Red Hat Enterprise Linux with HA,No License required,2021,Reserved,10.0,2021-03-01,NVMe SSD,59,1,Q1
1,222AY99RA8W7WFR4,4NA7Y494T4,222AY99RA8W7WFR4.4NA7Y494T4.6YS6EN2CT7,0.1403,r6gd.medium,Memory optimized,1,No Upfront,standard,Compute Instance,...,Red Hat Enterprise Linux with HA,No License required,2021,Reserved,10.0,2021-03-01,NVMe SSD,59,1,Q1
2,222AY99RA8W7WFR4,7NE97W5U4E,222AY99RA8W7WFR4.7NE97W5U4E.6YS6EN2CT7,0.1466,r6gd.medium,Memory optimized,1,No Upfront,convertible,Compute Instance,...,Red Hat Enterprise Linux with HA,No License required,2021,Reserved,10.0,2021-03-01,NVMe SSD,59,1,Q1
3,222AY99RA8W7WFR4,BPH4J8HBKS,222AY99RA8W7WFR4.BPH4J8HBKS.6YS6EN2CT7,0.1231,r6gd.medium,Memory optimized,3,No Upfront,standard,Compute Instance,...,Red Hat Enterprise Linux with HA,No License required,2021,Reserved,10.0,2021-03-01,NVMe SSD,59,1,Q1
4,222AY99RA8W7WFR4,CUZHX8X6JH,222AY99RA8W7WFR4.CUZHX8X6JH.6YS6EN2CT7,0.0721,r6gd.medium,Memory optimized,1,Partial Upfront,convertible,Compute Instance,...,Red Hat Enterprise Linux with HA,No License required,2021,Reserved,10.0,2021-03-01,NVMe SSD,59,1,Q1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
570338,ZZZU9HQBJNWNW7AD,BPH4J8HBKS,ZZZU9HQBJNWNW7AD.BPH4J8HBKS.6YS6EN2CT7,0.1470,c5ad.xlarge,Compute optimized,3,No Upfront,standard,Compute Instance,...,RHEL,No License required,2020,Reserved,10.0,2020-06-01,NVMe SSD,150,2,Q2
570339,ZZZU9HQBJNWNW7AD,CUZHX8X6JH,ZZZU9HQBJNWNW7AD.CUZHX8X6JH.6YS6EN2CT7,0.1340,c5ad.xlarge,Compute optimized,1,Partial Upfront,convertible,Compute Instance,...,RHEL,No License required,2020,Reserved,10.0,2020-06-01,NVMe SSD,150,2,Q2
570340,ZZZU9HQBJNWNW7AD,HU7G6KETJZ,ZZZU9HQBJNWNW7AD.HU7G6KETJZ.6YS6EN2CT7,0.1210,c5ad.xlarge,Compute optimized,1,Partial Upfront,standard,Compute Instance,...,RHEL,No License required,2020,Reserved,10.0,2020-06-01,NVMe SSD,150,2,Q2
570341,ZZZU9HQBJNWNW7AD,R5XV2EPZQZ,ZZZU9HQBJNWNW7AD.R5XV2EPZQZ.6YS6EN2CT7,0.1080,c5ad.xlarge,Compute optimized,3,Partial Upfront,convertible,Compute Instance,...,RHEL,No License required,2020,Reserved,10.0,2020-06-01,NVMe SSD,150,2,Q2


In [21]:
# Print the column names, non-null counts and dtypes of the DataFrame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 554079 entries, 0 to 570342
Data columns (total 25 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   SKU                  554079 non-null  object        
 1   OfferTermCode        554079 non-null  object        
 2   RateCode             554079 non-null  object        
 3   PricePerUnit         554079 non-null  float64       
 4   instanceType         554079 non-null  object        
 5   instanceFamily       554079 non-null  object        
 6   LeaseContractLength  554079 non-null  int64         
 7   PurchaseOption       554079 non-null  object        
 8   OfferingClass        554079 non-null  object        
 9   Product Family       554079 non-null  object        
 10  Location             554079 non-null  object        
 11  Current Generation   554079 non-null  object        
 12  vCPU                 554079 non-null  int64         
 13  Memory        

In [22]:
# Summary statistics of PricePerUnit for each (year, instanceFamily) group.
# 'median' keeps its name; the other aggregates get descriptive column names.
price_by_group = df.groupby(['year', 'instanceFamily'])['PricePerUnit']
column_labels = {'mean': 'mean_price', 'std': 'std_deviation',
                 'max': 'maximum', 'min': 'minimum'}
stats = (
    price_by_group
    .agg(['mean', 'std', 'median', 'max', 'min'])
    .reset_index()
    .rename(columns=column_labels)
)

In [23]:
# Count how many rows of the stats table exist for each year.
stats['year'].value_counts()

2016    5
2017    5
2018    5
2019    5
2020    5
2021    5
2022    5
Name: year, dtype: int64

In [25]:
# Distinct instance families present in df (returned as a NumPy array)
instFamilies = pd.unique(df['instanceFamily'])
instFamilies

array(['Memory optimized', 'Compute optimized', 'Storage optimized',
       'GPU instance', 'General purpose'], dtype=object)

In [26]:

# # Loop through each instanceFamily and save the corresponding subset as a new DataFrame
# for instance_family in instFamilies:
#     subset = df[df['instanceFamily'] == instance_family]
#     subset.to_csv(f'./data/{instance_family}_subset.csv', index=False)

In [27]:
# Line chart of the yearly median PricePerUnit, one coloured trace per
# instance family. `px` comes from the notebook's import cell.
# `y` is passed as a column name (not a one-element list) so Plotly Express
# stays in long-form mode and hover text reads "median" instead of the
# generic wide-form "variable"/"value" fields.
fig = px.line(stats, x='year', y='median',
              color='instanceFamily', template='plotly_dark',
              title='Evolution of median price by Instance Family')

# Show a marker on every yearly data point in addition to the line
fig.update_traces(mode='lines+markers')

# Fixed y-axis window; points above 7 fall outside the visible range
fig.update_yaxes(range=[0, 7])

fig.update_layout(
    xaxis=dict(title='Year', tickfont=dict(size=20, family='Arial')),
    yaxis=dict(title='Price per Unit', tickfont=dict(size=20, family='Arial')),
    # Horizontal legend anchored just above the top-right corner of the plot
    legend=dict(orientation='h', yanchor='bottom',
                y=1.02, xanchor='right', x=1),
    title=dict(font=dict(size=24, family='Arial')),
    font=dict(size=18, family='Arial')
)

fig.show()

In [35]:
# Scatter of every row's PricePerUnit against its DataFrame index, with one
# facet column per year and points coloured by price.
#
# Plotly Express looks up `labels` by column name, not by argument name, so
# the keys must be the names px gives to the plotted data. The index passed
# as `x` is exposed under its own name, or 'index' when it is unnamed.
# NOTE(review): if a data column shares the index's name, px falls back to a
# different key and the x label will not apply.
index_label = df.index.name or 'index'

fig = px.scatter(df, x=df.index, y='PricePerUnit', template='plotly_dark',
                 labels={index_label: 'Data points',
                         'PricePerUnit': 'Price per unit'},
                 color='PricePerUnit', title='Density of Price per year', facet_col='year',
                 category_orders={'year': [2016, 2017, 2018, 2019, 2020, 2021, 2022]})

# Global font: change family/size here to restyle all text in the figure
fig.update_layout(
    font=dict(
        family='Arial',
        size=16,
    )
)
# fig.write_html("results/stats/density.html")
fig.show()

In [30]:
# 3-D scatter: year (x) vs. instance family (y) vs. PricePerUnit (z), coloured
# by instance family. `px` comes from the notebook's import cell.
# `labels` is keyed by column name ('PricePerUnit'), which is what Plotly
# Express expects; a key of 'z' would be silently ignored.
fig = px.scatter_3d(df, x='year', y='instanceFamily', z='PricePerUnit', template='plotly_dark',
                    labels={'PricePerUnit': 'Price per Hour'},
                    color='instanceFamily', title='Density of Price per Year',
                    category_orders={'year': [2016, 2017, 2018, 2019, 2020, 2021, 2022]})

fig.update_layout(
    # Horizontal legend placed below the scene, aligned to the left edge
    legend=dict(orientation='h', yanchor='bottom',
                y=-0.1, xanchor='left', x=0),
    font=dict(size=16, family='Arial'),
    scene=dict(
        xaxis=dict(title_font=dict(size=20, family='Arial')),
        yaxis=dict(title_font=dict(size=20, family='Arial')),
        zaxis=dict(title_font=dict(size=20, family='Arial')),
        # x and y axis titles are blanked; only the z axis carries a title
        xaxis_title='', yaxis_title='', zaxis_title='Price($) per Hour'
    ),
    title=dict(font=dict(size=24, family='Arial'))
)

fig.update_traces(marker=dict(size=8))

fig.show()

In [32]:
# Scatter of instance family against year, coloured by PricePerUnit on the
# Viridis scale. `px` comes from the notebook's import cell.
# `labels` is keyed by column name, as Plotly Express requires; keys such as
# 'x', 'y' or 'color' do not match any column and would be ignored. The
# 'instanceFamily' axis keeps its column name as its label, so it needs no entry.
fig = px.scatter(df, x='year', y='instanceFamily', color='PricePerUnit', template='plotly_dark',
                 labels={'year': 'Year', 'PricePerUnit': 'Price per Hour'},
                 color_continuous_scale='Viridis', title='Density of Price per Year')

fig.update_layout(
    legend=dict(orientation='v', yanchor='bottom',
                y=0.5, xanchor='right', x=0),
    font=dict(size=16, family='Arial'),
    xaxis=dict(title_font=dict(size=20, family='Arial')),
    yaxis=dict(title_font=dict(size=20, family='Arial')),
    title=dict(font=dict(size=24, family='Arial'))
)

fig.update_traces(marker=dict(size=8))

fig.show()

In [34]:
# 2-D histogram counting rows per (year, instance family) cell.
# `go` is already imported in the notebook's import cell.
fig = go.Figure(data=go.Histogram2d(
    x=df['year'],
    y=df['instanceFamily'],
    colorscale='YlGnBu',
    zauto=True,
    nbinsx=len(df['year'].unique()),
    nbinsy=len(df['instanceFamily'].unique()),
    # The colorbar title is set on the trace itself: this trace does not use
    # layout.coloraxis, so a title placed there would never be displayed.
    colorbar=dict(title='Density'),
))

fig.update_layout(
    title='Density of Offers per Year',
    xaxis=dict(title='Year'),
    yaxis=dict(title='Instance Family'),
)

fig.show()


# 'Viridis'
# 'Cividis'
# 'Hot'
# 'Jet'
# 'Rainbow'
# 'Electric'
# 'Blues'
# 'Greens'
# 'Reds'
# 'YlOrRd'
# 'YlGnBu'