In [215]:
import numpy as np
from sklearn.cluster import SpectralClustering
import skfuzzy as fuzz

# Set random seed for reproducibility
np.random.seed(42)

# Simulation parameters
num_providers = 3
num_regions = 3
min_dcs_per_provider = 2
max_dcs_per_provider = 5
num_vms_per_dc = 8
vm_processing_capability = 1500  # in MIPS
vm_num_cpus = 2
vm_ram = 4  # in Gb
vm_storage_capacity = 8  # in Gb
num_data = 200  # Number of data
data_size_range = (300, 1000)  # Data size range in Mb
task_size_range = (200, 1000)  # Task size range in MI
Re = 0.7  # Provider revenues per task execution ($)
C_penalty = 0.0025  # Penalty per violation ($)
num_dc_list = [6, 9, 12, 15]

inter_region_bw_capacity = 500  # in Mb/s
inter_region_bw_delay = 150  # in ms

intra_region_bw_capacity = 1000  # in Mb/s
intra_region_bw_delay = 50  # in ms

intra_dc_bw_capacity = 8000  # in Mb/s
intra_dc_bw_delay = 10  # in ms

# Pricing information
provider_pricing = {
    'Provider 1': {
        'US': [0.020, 0.006, 0.001, 0.0015, 0.002, 0.004, 0.008],
        'EU': [0.025, 0.006, 0.001, 0.0015, 0.002, 0.004, 0.008],
        'AS': [0.027, 0.0066, 0.001, 0.0015, 0.002, 0.004, 0.008],
    },
    'Provider 2': {
        'US': [0.020, 0.0096, 0.001, 0.0015, 0.002, 0.004, 0.008],
        'EU': [0.018, 0.0096, 0.001, 0.0015, 0.002, 0.004, 0.008],
        'AS': [0.020, 0.0096, 0.001, 0.0015, 0.002, 0.004, 0.008],
    },
    'Provider 3': {
        'US': [0.0095, 0.00120, 0.001, 0.0015, 0.002, 0.004, 0.008],
        'EU': [0.0090, 0.0096, 0.001, 0.0015, 0.002, 0.004, 0.008],
        'AS': [0.0080, 0.0090, 0.001, 0.0015, 0.002, 0.004, 0.008],
    },
}

avg_response_times = {}

# Simulate data for each provider and region
# Vary the number of data centers
# Vary the number of data centers
for num_dcs in num_dc_list:
    provider_data = []  # Reset provider data for each iteration
    for provider_id in range(1, num_providers + 1):
        for region in ['US', 'EU', 'AS']:
            for dc_id in range(1, num_dcs + 1):                
                for data_id in range(1, num_data + 1):
                    data_size = np.random.uniform(data_size_range[0], data_size_range[1])
                    task_size = np.random.uniform(task_size_range[0], task_size_range[1])
                    
                    base_response_time = 180
                    response_time_variation = np.random.normal(loc=0, scale=10)
                    response_time = max(0, base_response_time + response_time_variation)
                    
                    # Include bandwidth and delay information in the data
                    vm_data = {
                        'provider_id': provider_id,
                        'region': region,
                        'dc_id': dc_id,
                        'num_vms': num_vms_per_dc,
                        'vm_processing_capability': vm_processing_capability,
                        'vm_num_cpus': vm_num_cpus,
                        'vm_ram': vm_ram,
                        'vm_storage_capacity': vm_storage_capacity,
                        'cpu_price': provider_pricing[f'Provider {provider_id}'][region][0],
                        'storage_price': provider_pricing[f'Provider {provider_id}'][region][1],
                        'intra_dc_bw_price': provider_pricing[f'Provider {provider_id}'][region][2],
                        'inter_region_bw_price': provider_pricing[f'Provider {provider_id}'][region][3],
                        'intra_region_bw_price': provider_pricing[f'Provider {provider_id}'][region][4],
                        'response_time_slo': response_time,
                        'availability_slo': 0.95,
                        'task_count': np.random.choice([1000, 2000, 3000, 5000, 7000, 10000]),
                        'data_size': data_size,
                        'task_size': task_size,
                        'inter_region_bw_capacity': inter_region_bw_capacity,
                        'inter_region_bw_delay': inter_region_bw_delay,
                        'intra_region_bw_capacity': intra_region_bw_capacity,
                        'intra_region_bw_delay': intra_region_bw_delay,
                        'intra_dc_bw_capacity': intra_dc_bw_capacity,
                        'intra_dc_bw_delay': intra_dc_bw_delay,
                    }
                    provider_data.append(vm_data)

    # 'provider_data' includes task-related features
    features = [
        'cpu_price', 'storage_price', 'intra_dc_bw_price', 'inter_region_bw_price',
        'total_cost', 'task_count', 'data_size', 'task_size',
        'response_time_slo', 'availability_slo',
        'inter_region_bw_capacity', 'inter_region_bw_delay',
        'intra_region_bw_capacity', 'intra_region_bw_delay',
        'intra_dc_bw_capacity', 'intra_dc_bw_delay' 'provider_id',
    ]
    data = np.array([[vm.get(feature, 0) for feature in features] for vm in provider_data])

    # Calculate average response time
    avg_response_time = np.mean(data[:, features.index('response_time_slo')])
    print(f"Number of Data Centers: {num_dcs}, Average Response Time: {avg_response_time:.2f} ms")


# Data Identification Phase: Spectral Clustering
n_clusters = 3  # Adjust based on the data

# Use Spectral Clustering to identify clusters and correlate with SLA violations
spectral = SpectralClustering(n_clusters=n_clusters, affinity='nearest_neighbors', n_neighbors=5)
cluster_labels = spectral.fit_predict(data)

# Identify data most likely to cause SLA violations if not replicated
# Here, we assume that clusters with higher density represent potential SLA violation areas
cluster_density = np.bincount(cluster_labels)
most_critical_clusters = np.argsort(cluster_density)[-2:]  # Select the top 2 densest clusters

critical_data_indices = []
for cluster in most_critical_clusters:
    cluster_indices = np.where(cluster_labels == cluster)[0]
    critical_data_indices.extend(cluster_indices)

# Critical data information
critical_data = data[critical_data_indices]

# Replica Placement Phase

# Integrate the four main parameters into the fuzzy inference system
# (response time, availability, cost, potential for data correlation)
response_time = np.arange(0, 1.1, 0.1)
availability = np.arange(0, 1.1, 0.1)
cost = np.arange(0, 1.1, 0.1)
data_correlation = np.arange(0, 1.1, 0.1)

# Define membership functions for new parameters
response_time_low = fuzz.trimf(response_time, [0, 0, 0.5])
response_time_high = fuzz.trimf(response_time, [0.5, 1, 1])

availability_low = fuzz.trimf(availability, [0, 0, 0.5])
availability_high = fuzz.trimf(availability, [0.5, 1, 1])

cost_low = fuzz.trimf(cost, [0, 0, 0.5])
cost_high = fuzz.trimf(cost, [0.5, 1, 1])

data_correlation_low = fuzz.trimf(data_correlation, [0, 0, 0.5])
data_correlation_high = fuzz.trimf(data_correlation, [0.5, 1, 1])

# Apply the membership functions to input and output variables
data_transfer_time_ratio_high = fuzz.trimf(response_time, [0.6, 1, 1])
data_transfer_time_ratio_medium = fuzz.trimf(response_time, [0.3, 0.5, 0.7])
data_transfer_time_ratio_low = fuzz.trimf(response_time, [0, 0, 0.3])

virtual_machine_load_high = fuzz.trimf(availability, [0.6, 1, 1])
virtual_machine_load_medium = fuzz.trimf(availability, [0.3, 0.5, 0.7])
virtual_machine_load_low = fuzz.trimf(availability, [0, 0, 0.3])

data_availability_nr = fuzz.trimf(cost, [0, 0, 0.5])
data_availability_r = fuzz.trimf(cost, [0.5, 1, 1])

provider_profit_very_low = fuzz.trimf(data_correlation, [0, 0, 0.2])
provider_profit_low = fuzz.trimf(data_correlation, [0.2, 0.4, 0.6])
provider_profit_medium = fuzz.trimf(data_correlation, [0.4, 0.6, 0.8])
provider_profit_high = fuzz.trimf(data_correlation, [0.6, 0.8, 1])
provider_profit_very_high = fuzz.trimf(data_correlation, [0.8, 1, 1])

placement_potential_very_low = fuzz.trimf(data_correlation, [0, 0, 0.2])
placement_potential_low = fuzz.trimf(data_correlation, [0.2, 0.4, 0.6])
placement_potential_medium = fuzz.trimf(data_correlation, [0.4, 0.6, 0.8])
placement_potential_high = fuzz.trimf(data_correlation, [0.6, 0.8, 1])
placement_potential_very_high = fuzz.trimf(data_correlation, [0.8, 1, 1])

# Fuzzy inference rules for new parameters
rule6 = np.fmin(np.fmin(data_transfer_time_ratio_low, virtual_machine_load_low), data_availability_nr)
rule7 = np.fmin(np.fmin(data_transfer_time_ratio_high, virtual_machine_load_high), data_availability_r)
rule8 = np.fmin(np.fmax(data_correlation_low, data_correlation_high), np.fmax(provider_profit_low, placement_potential_medium))

# Aggregate the rules for new parameters
aggregated = np.fmax(np.fmax(rule6, rule7), rule8)

# Defuzzify the aggregated result
placement_potential_crisp = fuzz.defuzz(data_correlation, aggregated, 'centroid')


# Calculate the amount of SLA violations for each task count considering replication period and weight
sla_violations_count = {}
replication_period = 32  # Replication period in tasks
weight = 0.8  # Weight for violating tasks
threshold_response_time = weight * 180  # Threshold response time for violating tasks

# Print Results
print("Cluster Labels:", cluster_labels)
#print("Critical Data Information:", critical_data)
print("Placement Potential:", placement_potential_crisp)


Number of Data Centers: 6, Average Response Time: 179.91 ms
Number of Data Centers: 9, Average Response Time: 180.04 ms
Number of Data Centers: 12, Average Response Time: 179.99 ms
Number of Data Centers: 15, Average Response Time: 179.99 ms




Cluster Labels: [1 1 1 ... 0 0 1]
Placement Potential: 0.5187265917602997


In [216]:
# Calculate the average response time for each task count
average_response_time = {}

for task_count in [1000, 2000, 3000, 5000, 7000, 10000]:
    task_response_times = []
    for vm in provider_data:
        if vm['task_count'] == task_count:
            response_time = vm.get('response_time_slo', 0)
            task_response_times.append(response_time)

    # Calculate the average response time for the current task count
    if task_response_times:
        average_response_time[task_count] = np.mean(task_response_times)
    else:
        average_response_time[task_count] = 0

# Print the results
print("Average Response Time (in ms) for each task:")
for task_count, avg_rt in average_response_time.items():
    print(f"Task Count: {task_count}, Average Response Time: {avg_rt:.2f} ms")


Average Response Time (in ms) for each task:
Task Count: 1000, Average Response Time: 180.15 ms
Task Count: 2000, Average Response Time: 180.02 ms
Task Count: 3000, Average Response Time: 179.72 ms
Task Count: 5000, Average Response Time: 180.06 ms
Task Count: 7000, Average Response Time: 180.02 ms
Task Count: 10000, Average Response Time: 179.98 ms


In [217]:
# Calculate the amount of SLA violations for each task count and provider
sla_violations_count = {}

# Calculate SLA violations count for each task count and provider
for task_count in [1000, 2000, 3000, 5000, 7000, 10000]:
    sla_violations_count[task_count] = {f'Provider {i}': 0 for i in range(1, num_providers + 1)}
    for vm in provider_data:
        if vm['task_count'] == task_count and vm['response_time_slo'] > threshold_response_time:
            sla_violations_count[task_count][f'Provider {vm["provider_id"]}'] += 1

# Print the results
print("SLA Violations in Terms of Response Time (Per Provider):")
for task_count, violations_per_provider in sla_violations_count.items():
    print(f"Task Count: {task_count}")
    for provider, violations_count in violations_per_provider.items():
        print(f"  {provider}: SLA Violations: {violations_count}")


SLA Violations in Terms of Response Time (Per Provider):
Task Count: 1000
  Provider 1: SLA Violations: 1497
  Provider 2: SLA Violations: 1526
  Provider 3: SLA Violations: 1455
Task Count: 2000
  Provider 1: SLA Violations: 1514
  Provider 2: SLA Violations: 1457
  Provider 3: SLA Violations: 1501
Task Count: 3000
  Provider 1: SLA Violations: 1514
  Provider 2: SLA Violations: 1532
  Provider 3: SLA Violations: 1524
Task Count: 5000
  Provider 1: SLA Violations: 1525
  Provider 2: SLA Violations: 1465
  Provider 3: SLA Violations: 1535
Task Count: 7000
  Provider 1: SLA Violations: 1491
  Provider 2: SLA Violations: 1552
  Provider 3: SLA Violations: 1482
Task Count: 10000
  Provider 1: SLA Violations: 1457
  Provider 2: SLA Violations: 1468
  Provider 3: SLA Violations: 1502


In [218]:
# Calculate Effective Network Usage for each task count
effective_network_usage = {}

for task_count in task_counts:
    task_indices = np.where(data[:, features.index('task_count')] == task_count)[0]
    if len(task_indices) > 0:
        total_data_transfer_time = np.sum(
            data[task_indices, features.index('data_size')] /
            data[task_indices, features.index('inter_region_bw_capacity')] +
            data[task_indices, features.index('data_size')] /
            data[task_indices, features.index('intra_region_bw_capacity')] +
            data[task_indices, features.index('data_size')] /
            data[task_indices, features.index('intra_dc_bw_capacity')]
        )

        # Normalize the Effective Network Usage
        effective_network_usage = total_data_transfer_time / (replication_period * task_count) 
        print(f"Task Count: {task_count}, Normalized Effective Network Usage: {effective_network_usage:.2f}")
    else:
        print(f"Task Count: {task_count}, No data available.")


Task Count: 1000, Normalized Effective Network Usage: 0.28
Task Count: 2000, Normalized Effective Network Usage: 0.14
Task Count: 3000, Normalized Effective Network Usage: 0.10
Task Count: 5000, Normalized Effective Network Usage: 0.06
Task Count: 7000, Normalized Effective Network Usage: 0.04
Task Count: 10000, Normalized Effective Network Usage: 0.03


In [219]:
# Calculate average total monetary profit per provider
providers = ['Provider 1', 'Provider 2', 'Provider 3']

for provider in providers:
    provider_indices = np.where(np.array([vm['provider_id'] for vm in provider_data]) == providers.index(provider) + 1)[0]
    if len(provider_indices) > 0:
        total_sla_violations = np.sum(data[provider_indices, features.index('response_time_slo')] > threshold_response_time)
        total_monetary_profit = np.sum(
            (data[provider_indices, features.index('total_cost')] * (1 - placement_potential_crisp)) +
            (data[provider_indices, features.index('task_count')] * Re) -
            (total_sla_violations * C_penalty)
        )
        average_monetary_profit = total_monetary_profit / len(provider_indices)
        print(f"{provider}, Average Monetary Profit: ${average_monetary_profit:.2f}")
    else:
        print(f"{provider}, No data available.")


Provider 1, Average Monetary Profit: $3222.08
Provider 2, Average Monetary Profit: $3236.78
Provider 3, Average Monetary Profit: $3252.34
