In [17]:
import numpy as np
from sklearn.cluster import SpectralClustering
import pandas as pd

# Set random seed for reproducibility
np.random.seed(42)

# Simulation parameters
num_providers = 3
num_regions = 3
min_dcs_per_provider = 2
max_dcs_per_provider = 5
num_vms_per_dc = 8
vm_processing_capability = 1500  # in MIPS
vm_num_cpus = 2
vm_ram = 4  # in Gb
vm_storage_capacity = 8  # in Gb
num_data = 200  # Number of data
data_size_range = (300, 1000)  # Data size range in Mb
task_size_range = (200, 1000)  # Task size range in MI
Re = 0.7  # Provider revenues per task execution ($)
C_penalty = 0.0025  # Penalty per violation ($)
num_dc_list = [6, 9, 12, 15]

inter_region_bw_capacity = 500  # in Mb/s
inter_region_bw_delay = 150  # in ms

intra_region_bw_capacity = 1000  # in Mb/s
intra_region_bw_delay = 50  # in ms

intra_dc_bw_capacity = 8000  # in Mb/s
intra_dc_bw_delay = 10  # in ms

# Pricing information
operating_cost = {
    'Provider 1': {
        'US': [0.020, 0.006, 0.001, 0.0015, 0.002, 0.004, 0.008],
        'EU': [0.025, 0.006, 0.001, 0.0015, 0.002, 0.004, 0.008],
        'AS': [0.027, 0.0066, 0.001, 0.0015, 0.002, 0.004, 0.008],
    },
    'Provider 2': {
        'US': [0.020, 0.0096, 0.001, 0.0015, 0.002, 0.004, 0.008],
        'EU': [0.018, 0.0096, 0.001, 0.0015, 0.002, 0.004, 0.008],
        'AS': [0.020, 0.0096, 0.001, 0.0015, 0.002, 0.004, 0.008],
    },
    'Provider 3': {
        'US': [0.0095, 0.00120, 0.001, 0.0015, 0.002, 0.004, 0.008],
        'EU': [0.0090, 0.0096, 0.001, 0.0015, 0.002, 0.004, 0.008],
        'AS': [0.0080, 0.0090, 0.001, 0.0015, 0.002, 0.004, 0.008],
    },
}

# Simulate data for each provider and region
for num_dcs in num_dc_list:
    provider_data = []  # Reset provider data for each iteration
    for provider_id in range(1, num_providers + 1):
        for region in ['US', 'EU', 'AS']:
            for dc_id in range(1, num_dcs + 1):                
                for data_id in range(1, num_data + 1):
                    data_size = np.random.uniform(data_size_range[0], data_size_range[1])
                    task_size = np.random.uniform(task_size_range[0], task_size_range[1])
                    
                    base_response_time = 180
                    response_time_variation = np.random.normal(loc=0, scale=10)
                    response_time = max(0, base_response_time + response_time_variation)
                    
                    # Include bandwidth and delay information in the data
                    vm_data = {
                        'provider_id': provider_id,
                        'region': region,
                        'dc_id': dc_id,
                        'num_vms': num_vms_per_dc,
                        'vm_processing_capability': vm_processing_capability,
                        'vm_num_cpus': vm_num_cpus,
                        'vm_ram': vm_ram,
                        'vm_storage_capacity': vm_storage_capacity,
                        'cpu_cost': operating_cost[f'Provider {provider_id}'][region][0],
                        'storage_cost': operating_cost[f'Provider {provider_id}'][region][1],
                        'intra_dc_bw_cost': operating_cost[f'Provider {provider_id}'][region][2],
                        'inter_region_bw_cost': operating_cost[f'Provider {provider_id}'][region][3],
                        'intra_region_bw_cost': operating_cost[f'Provider {provider_id}'][region][4],
                        'response_time_slo': response_time,
                        'availability_slo': 0.95,
                        'task_count': np.random.choice([1000, 2000, 3000, 5000, 7000, 10000]),
                        'data_size': data_size,
                        'task_size': task_size,
                        'inter_region_bw_capacity': inter_region_bw_capacity,
                        'inter_region_bw_delay': inter_region_bw_delay,
                        'intra_region_bw_capacity': intra_region_bw_capacity,
                        'intra_region_bw_delay': intra_region_bw_delay,
                        'intra_dc_bw_capacity': intra_dc_bw_capacity,
                        'intra_dc_bw_delay': intra_dc_bw_delay,
                    }
                    provider_data.append(vm_data)

    # 'provider_data' includes task-related features
    # 'provider_data' includes task-related features
features = [
    'cpu_cost', 'storage_cost', 'intra_dc_bw_cost', 'inter_region_bw_cost',
    'total_cost', 'task_count', 'data_size', 'task_size', 'intra_region_bw_cost',
    'response_time_slo', 'availability_slo',
    'inter_region_bw_capacity', 'inter_region_bw_delay',
    'intra_region_bw_capacity', 'intra_region_bw_delay',
    'intra_dc_bw_capacity', 'intra_dc_bw_delay', 'provider_id',
]

data = np.array([[vm.get(feature, 0) for feature in features] for vm in provider_data])

# Create a DataFrame using Pandas
df = pd.DataFrame(data, columns=features)

# Add provider information
provider_info = []
for provider_id in range(1, num_providers + 1):
    num_dcs = np.random.randint(min_dcs_per_provider, max_dcs_per_provider + 1)
    provider_info.extend([f'Provider {provider_id}'] * (num_regions * num_dcs * num_data))

# Ensure that the length of provider_info matches the total number of rows in the DataFrame
provider_info = provider_info[:len(df)]

df['provider'] = provider_info

# Print the DataFrame
print(df.head())

# Data Identification Phase: Spectral Clustering
n_clusters = 3  # Adjust based on the data

# Use Spectral Clustering to identify clusters and correlate with SLA violations
spectral = SpectralClustering(n_clusters=n_clusters, affinity='nearest_neighbors', n_neighbors=5)
cluster_labels = spectral.fit_predict(data)

# Identify data most likely to cause SLA violations if not replicated
# Here, we assume that clusters with higher density represent potential SLA violation areas
cluster_density = np.bincount(cluster_labels)
most_critical_clusters = np.argsort(cluster_density)[-2:]  # Select the top 2 densest clusters

critical_data_indices = []
for cluster in most_critical_clusters:
    cluster_indices = np.where(cluster_labels == cluster)[0]
    critical_data_indices.extend(cluster_indices)

# Critical data information
critical_data = data[critical_data_indices]

# Assuming you have the critical_data from the last part of your script
# Extract necessary information for calculations
data_sizes = critical_data[:, features.index('data_size')]
data_transfer_times = critical_data[:, features.index('response_time_slo')]

# Updated Data Transfer Cost Calculation
data_transfer_cost = (
    (data_sizes / critical_data[:, features.index('intra_dc_bw_capacity')]) * critical_data[:, features.index('intra_dc_bw_cost')] +
    (data_sizes / critical_data[:, features.index('inter_region_bw_capacity')]) * critical_data[:, features.index('inter_region_bw_cost')] +
    (data_sizes / critical_data[:, features.index('intra_region_bw_capacity')]) * critical_data[:, features.index('intra_region_bw_cost')]
)

# 1. Data Transfer Time Ratio
longest_data_transfer_time = np.max(data_transfer_times)
data_transfer_time_ratios = data_transfer_times / longest_data_transfer_time

# 2. Virtual Machine's Load
queuing_capacities = np.random.uniform(1, 10, size=len(critical_data))
processing_capacities = np.random.uniform(1, 10, size=len(critical_data))
vm_loads = queuing_capacities + processing_capacities

# 3. Availability
MRFD = 2
availability_slo = critical_data[:, features.index('availability_slo')]
availability = 1 - (1 - 1 / MRFD) ** MRFD

# 4. Profit
num_tasks_executed = critical_data[:, features.index('task_count')]
revenues_per_task_execution = Re
task_execution_revenues = num_tasks_executed * revenues_per_task_execution

data_transfer_volume = data_sizes / critical_data[:, features.index('intra_dc_bw_capacity')]
data_transfer_revenue = data_transfer_cost * data_transfer_volume

cpu_cost = critical_data[:, features.index('cpu_cost')]
storage_cost = critical_data[:, features.index('storage_cost')]
intra_dc_bw_cost = critical_data[:, features.index('intra_dc_bw_cost')]
task_execution_cost = num_tasks_executed * (cpu_cost + storage_cost + intra_dc_bw_cost)

data_management_cost = data_transfer_cost

operating_cost_vm = task_execution_cost + data_management_cost

profit_vm = task_execution_revenues - operating_cost_vm

# Display Results
for i in range(len(critical_data)):
    print(f"Data {i+1} - Data Transfer Time Ratio: {data_transfer_time_ratios[i]:.4f}, VM Load: {vm_loads[i]:.4f}, "
          f"Availability: {availability[i]:.4f}, Profit: {profit_vm[i]:.4f}")

ValueError: Length of values (4800) does not match length of index (27000)