In [1]:
import simpy
import random
from sklearn.cluster import SpectralClustering
import numpy as np

class VirtualMachine:
    """A simulated virtual machine that executes tasks and plans data replication.

    Each VM belongs to one data center and region, records every task it
    processes in ``data_access_log`` and can use that log to choose items
    worth replicating and a VM to host each replica.
    """

    # SLO and billing constants (previously hard-coded inside process_task).
    slo_rt = 60  # response-time SLO, compared against the scaled response time
    slo_ma = 0.95  # minimum availability ratio a VM needs to host a replica
    near_violation_weight = 0.8  # fraction of slo_rt treated as "near violation"
    violation_penalty = 0.0025  # deducted for each task that exceeds slo_rt
    revenue_per_task = 0.7  # credited to the data center per processed task

    def __init__(self, env, id, processing_capability, num_cpus, ram, storage_capacity, data_center, region):
        """Store the VM's static specification and reset its counters.

        Args:
            env: Simulation environment; ``env.now`` and ``env.timeout`` are used.
            id: Identifier printed in the per-task report.
            processing_capability: Divisor applied to a task's size to get its
                compute time.
            num_cpus: Stored as given; not read by this class.
            ram: Stored as given; not read by this class.
            storage_capacity: Compared against an item's size when choosing a
                replica host.
            data_center: Owning data center (provides ``vms``, ``id``,
                ``calculate_cost`` and the revenue / near-violation counters).
            region: Region label used for same-region checks.
        """
        self.env = env
        self.id = id
        self.processing_capability = processing_capability
        self.num_cpus = num_cpus
        self.ram = ram
        self.storage_capacity = storage_capacity
        self.data_center = data_center
        self.region = region
        self.uptime = 0
        self.total_time = 0
        self.data_access_log = {}

    def update_availability(self, time_passed, is_up=True):
        """Account for ``time_passed`` units of observed time.

        Args:
            time_passed: Length of the interval being recorded.
            is_up: Whether the VM was up during the interval. Defaults to True
                so existing one-argument calls count the interval as uptime;
                previously ``uptime`` was never increased, which drove the
                availability to 0 after the first call.
        """
        self.total_time += time_passed
        if is_up:
            self.uptime += time_passed

    def get_availability(self):
        """Return uptime / total_time, or 1 when no time has been recorded."""
        if self.total_time == 0:
            return 1  # Assuming always available when no time has passed
        return self.uptime / self.total_time

    def process_task(self, task, target_dc, data_transfer_time=0):
        """SimPy process that executes ``task`` and records the outcome.

        Waits for compute time plus ``data_transfer_time``, classifies the
        response time against the SLO, updates the data center's revenue and
        near-violation counters, and writes an entry to ``data_access_log``.

        Args:
            task: Dict with at least ``'id'`` and ``'size'``.
            target_dc: Data center handed to ``data_center.calculate_cost``.
            data_transfer_time: Extra delay added to the compute time.

        Yields:
            The timeout event covering the total processing time.
        """
        start_time = self.env.now
        total_processing_time = task['size'] / self.processing_capability + data_transfer_time
        yield self.env.timeout(total_processing_time)
        elapsed = self.env.now - start_time

        # NOTE(review): the elapsed simulation time is scaled by 60 before it
        # is compared with the SLO; the intended unit conversion is not
        # visible here - confirm.
        response_time = elapsed * 60
        near_threshold = self.near_violation_weight * self.slo_rt

        penalty = 0
        near_violation = False
        if response_time > self.slo_rt:
            penalty = self.violation_penalty
        elif response_time > near_threshold:
            near_violation = True  # Close to, but not over, the SLO

        cost = self.data_center.calculate_cost(task['size'], target_dc) - penalty
        self.data_center.total_revenue += (self.revenue_per_task - penalty)
        self.data_center.near_violations += int(near_violation)

        # The log is keyed by task id. The access bookkeeping below is what
        # adjust_replicas() reads; it was previously never written.
        entry = self.data_access_log.setdefault(
            task['id'],
            {'access_frequency': 0, 'tasks': [], 'accessing_vms': []},
        )
        entry['access_frequency'] += 1
        entry['tasks'].append(task['id'])
        if self not in entry['accessing_vms']:
            entry['accessing_vms'].append(self)
        entry.update({
            'response_time': response_time,
            'penalty': penalty,
            'near_violation': near_violation,
            'cost': cost,
        })
        print(f"VM {self.id} processed Task {task['id']} - Response Time={response_time}, Penalty={penalty}, Near Violation={near_violation}")

    def calculate_correlation_matrix(self, data_accessed, P):
        """Build the symmetric task-overlap matrix for ``data_accessed``.

        Entry (i, j) with i != j is the number of tasks listed by both item i
        and item j, divided by ``P``; the diagonal is 0.

        Args:
            data_accessed: Sequence of dicts, each with a ``'tasks'`` iterable.
            P: Normalising divisor.

        Returns:
            An ``(n, n)`` numpy array of floats.

        Raises:
            ZeroDivisionError: If ``P`` is 0 and there are at least two items.
        """
        n = len(data_accessed)
        correlation_matrix = np.zeros((n, n))
        # Build each task set once instead of once per pair.
        task_sets = [set(item['tasks']) for item in data_accessed]

        for i in range(n):
            for j in range(i + 1, n):
                overlap = len(task_sets[i] & task_sets[j]) / P
                correlation_matrix[i, j] = overlap
                correlation_matrix[j, i] = overlap
        return correlation_matrix

    def perform_spectral_clustering(self, correlation_matrix, K):
        """Cluster items using ``correlation_matrix`` as a precomputed affinity.

        Args:
            correlation_matrix: Square affinity matrix.
            K: Requested number of clusters.

        Returns:
            One integer label per row of ``correlation_matrix``.
        """
        n_items = len(correlation_matrix)
        if n_items <= K:
            # No more items than clusters: give each item its own label
            # rather than asking the solver for more clusters than samples.
            return np.arange(n_items)
        sc = SpectralClustering(n_clusters=K, affinity='precomputed')
        return sc.fit_predict(correlation_matrix)

    def select_data_to_replicate(self, data_accessed, group_labels):
        """Pick the items worth replicating from each cluster.

        Within every cluster an item is kept when its ``'access_frequency'``
        is at least the cluster average and ``is_in_same_region`` accepts it.

        Args:
            data_accessed: Sequence of item dicts.
            group_labels: Cluster label for each item, in the same order.

        Returns:
            List of selected item dicts, grouped by cluster in order of first
            appearance.
        """
        groups = {}
        for item, label in zip(data_accessed, group_labels):
            groups.setdefault(label, []).append(item)

        data_to_replicate = []
        for group in groups.values():
            avg_freq = sum(d['access_frequency'] for d in group) / len(group)
            data_to_replicate.extend(
                d for d in group
                if d['access_frequency'] >= avg_freq and self.is_in_same_region(d)
            )
        return data_to_replicate

    def is_in_same_region(self, data_item):
        """Return True when every VM in ``data_item['accessing_vms']`` shares this VM's region."""
        return all(vm.region == self.region for vm in data_item['accessing_vms'])

    def adjust_replicas(self, P, K):
        """Plan replica placement from this VM's access log.

        Args:
            P: Normalising divisor for the correlation matrix.
            K: Number of clusters requested from spectral clustering.

        Returns:
            List of ``(data_item, best_vm)`` pairs for every selected item that
            has a suitable host; empty when the log is empty.
        """
        data_accessed = [
            {
                'id': data_id,
                'access_frequency': entry.get('access_frequency', 0),
                'tasks': entry.get('tasks', []),
                'accessing_vms': entry.get('accessing_vms', []),
                # NOTE(review): process_task does not record a data size, so
                # this defaults to 0 and the storage filter accepts every VM
                # until a caller stores 'size' in the log entry.
                'size': entry.get('size', 0),
            }
            for data_id, entry in self.data_access_log.items()
        ]
        if not data_accessed:
            return []

        correlation_matrix = self.calculate_correlation_matrix(data_accessed, P)
        group_labels = self.perform_spectral_clustering(correlation_matrix, K)
        data_to_replicate = self.select_data_to_replicate(data_accessed, group_labels)

        placements = []
        for data_item in data_to_replicate:
            best_vm = self.find_best_vm_for_replication(data_item)
            if best_vm is not None:
                placements.append((data_item, best_vm))
        return placements

    def find_best_vm_for_replication(self, data_item):
        """Choose the cheapest eligible VM to host a replica of ``data_item``.

        A VM of this data center is eligible when its storage capacity covers
        ``data_item['size']`` and its availability is at least ``slo_ma``.
        Eligible VMs in the same data center are preferred, then those in the
        same region, then any eligible VM.

        Returns:
            The chosen VM, or None when no VM is eligible.
        """
        suitable_vms = [
            vm for vm in self.data_center.vms
            if vm.storage_capacity >= data_item['size'] and vm.get_availability() >= self.slo_ma
        ]
        same_dc_vms = [vm for vm in suitable_vms if vm.data_center.id == self.data_center.id]
        same_region_vms = [vm for vm in suitable_vms if vm.region == self.region and vm not in same_dc_vms]

        # First non-empty tier wins: same DC, then same region, then anything.
        candidates = same_dc_vms or same_region_vms or suitable_vms
        if not candidates:
            return None
        return min(
            candidates,
            key=lambda vm: self.data_center.calculate_cost(data_item['size'], vm.data_center),
        )

class DataCenter:
    """A data center that owns a set of VMs and prices work and transfers.

    Transfers are classified as intra-DC (same ``id``), intra-region (same
    ``region``, different ``id``) or inter-region (everything else).
    """

    def __init__(self, env, id, num_vms, vm_specs, pricing, bw_params, region):
        """Create the data center and its ``num_vms`` virtual machines.

        Args:
            env: Simulation environment handed to every VM.
            id: Identifier of this data center; also embedded in VM ids.
            num_vms: Number of VMs to create.
            vm_specs: Keyword arguments for each VM; ``'storage_capacity'`` is
                also read by ``calculate_cost``.
            pricing: Dict with ``'cpu_price'``, ``'storage_price'`` and a
                ``'bw_price'`` dict keyed by link type.
            bw_params: Dict keyed by link type, each value holding
                ``'capacity'`` and ``'delay'``.
            region: Region label of this data center.
        """
        self.env = env
        self.id = id
        self.vm_specs = vm_specs
        self.pricing = pricing
        self.bw_params = bw_params
        self.total_revenue = 0
        self.near_violations = 0
        self.region = region
        self.vms = [VirtualMachine(env, f"VM_{id}_{i}", **vm_specs, data_center=self, region=self.region) for i in range(num_vms)]

    def get_total_revenue(self):
        """Return the revenue accumulated so far."""
        return self.total_revenue

    def get_near_violations_count(self):
        """Return the number of near-SLA-violations recorded so far."""
        return self.near_violations

    def _link_type(self, target_dc):
        """Classify the link to ``target_dc`` as 'intra_dc', 'intra_region' or 'inter_region'."""
        if self.id == target_dc.id:
            return 'intra_dc'
        # Compare region labels directly. The previous check compared the
        # second '_'-separated token of the ids, which is the same constant
        # for ids such as "DC_Provider_1_US" and so matched every pair.
        if self.region == target_dc.region:
            return 'intra_region'
        return 'inter_region'

    def calculate_cost(self, task_size, target_dc):
        """Return CPU + storage + bandwidth cost of serving ``task_size`` towards ``target_dc``.

        Raises:
            KeyError: If ``pricing['bw_price']`` has no entry for the link type.
        """
        # NOTE(review): the meaning of the divisor 107 is not visible in this
        # file - confirm before changing it.
        cpu_cost = task_size / 107 * self.pricing['cpu_price']
        storage_cost = self.vm_specs['storage_capacity'] * self.pricing['storage_price']
        bw_cost = task_size * self.pricing['bw_price'][self._link_type(target_dc)]
        return cpu_cost + storage_cost + bw_cost

    def calculate_data_transfer_time(self, task_size, target_dc):
        """Return the time to move ``task_size`` to ``target_dc``.

        Computed as ``task_size / capacity + delay / 1000`` using the
        bandwidth parameters of the link type between the two data centers.
        """
        link = self.bw_params[self._link_type(target_dc)]
        return task_size / link['capacity'] + link['delay'] / 1000  # Convert delay from ms to seconds

class CloudProvider:
    """A cloud provider holding one data center per configured region."""

    def __init__(self, env, id, num_regions, dcs_per_region, vm_specs, pricing, bw_params):
        """Create ``num_regions`` data centers, one per leading key of ``pricing``.

        Args:
            env: Simulation environment passed to every data center.
            id: Provider identifier, embedded in each data center id.
            num_regions: How many region keys of ``pricing`` to use.
            dcs_per_region: ``(low, high)`` bounds for ``random.randint``; the
                drawn value is passed to ``DataCenter`` as its VM count.
            vm_specs: VM specification forwarded to each data center.
            pricing: Dict mapping region key to that region's price table.
            bw_params: Bandwidth parameters forwarded to each data center.
        """
        self.env = env
        self.id = id
        self.pricing = pricing
        self.bw_params = bw_params

        # Region names come from the pricing table, in its key order.
        region_names = list(pricing.keys())
        data_centers = []
        for index in range(num_regions):
            region_name = region_names[index]
            dc_name = f"DC_{id}_{region_name}"
            vm_count = random.randint(*dcs_per_region)
            data_centers.append(
                DataCenter(
                    env,
                    dc_name,
                    vm_count,
                    vm_specs,
                    pricing[region_name],
                    bw_params,
                    region=region_name,
                )
            )
        self.regions = data_centers

    def find_target_dc(self, task):
        """Return a uniformly random data center; ``task`` is not consulted."""
        chosen = random.choice(self.regions)
        return chosen

# Bandwidth parameters per link type. 'delay' is divided by 1000 (ms -> s)
# where the transfer time is computed.
bw_params = {
    'inter_region': dict(capacity=500, delay=150),
    'intra_region': dict(capacity=1000, delay=50),
    'intra_dc': dict(capacity=8000, delay=10),
}

# Simulation parameters
num_providers = 3
num_regions = 3
dcs_per_provider = (2, 5)
# NOTE(review): num_vms_per_dc is not referenced by the code visible in this
# file - confirm whether it is still needed.
num_vms_per_dc = 8
vm_specs = dict(
    processing_capability=1500,  # in MIPS
    num_cpus=2,
    ram=4,  # in Gb
    storage_capacity=8,  # in Gb
)
num_data = 10000
task_size_range = (200, 1000)  # Task size range in MI


def _region_prices(cpu_price, storage_price, intra_dc_bw_price):
    """Build one region's price table; the inter-region bandwidth price is 0.008 everywhere."""
    return {
        'cpu_price': cpu_price,
        'storage_price': storage_price,
        'bw_price': {'intra_dc': intra_dc_bw_price, 'inter_region': 0.008},
    }


# Pricing structure: provider -> region -> prices.
# NOTE(review): no 'intra_region' bandwidth price is defined here, although
# DataCenter.calculate_cost looks one up for intra-region transfers.
pricing = {
    'Provider_1': {
        'US': _region_prices(0.020, 0.006, 0.001),
        'EU': _region_prices(0.025, 0.006, 0.0015),
        'AS': _region_prices(0.027, 0.0066, 0.002),
    },
    'Provider_2': {
        'US': _region_prices(0.020, 0.0096, 0.001),
        'EU': _region_prices(0.018, 0.008, 0.0015),
        'AS': _region_prices(0.020, 0.0096, 0.002),
    },
    'Provider_3': {
        'US': _region_prices(0.0095, 0.0012, 0.001),
        'EU': _region_prices(0.0090, 0.0096, 0.0015),
        'AS': _region_prices(0.0080, 0.0090, 0.002),
    },
}

# Shared simulation environment for every provider, data center and VM.
env = simpy.Environment()
P = 32  # Number of violating tasks
K = 3  # Number of clusters

# Build one CloudProvider per pricing entry named 'Provider_<n>'.
providers = []
for i in range(num_providers):
    provider_key = f'Provider_{i+1}'  # Must match a key of the pricing dictionary
    if provider_key not in pricing:
        print(f"Warning: Pricing not found for {provider_key}")
        continue
    provider = CloudProvider(
        env,
        provider_key,
        num_regions,
        dcs_per_provider,
        vm_specs,
        pricing[provider_key],
        bw_params,
    )
    providers.append(provider)

# One task per data item, each with a random size drawn from task_size_range.
tasks = [dict(id=i, size=random.randint(*task_size_range)) for i in range(num_data)]

# Optional per-data-center report (disabled):
#for provider in providers:
    #for dc in provider.regions:
        #print(f"Data Center {dc.id}: Total Revenue={dc.get_total_revenue()}, Near Violations={dc.get_near_violations_count()}")

# Schedule every task on a random VM, then run the simulation to completion.
for task in tasks:
    # Source and target data centers are drawn from the same random provider.
    random_provider = random.choice(providers)
    source_dc = random.choice(random_provider.regions)
    target_dc = random.choice(random_provider.regions)

    # Prefer target VMs whose region matches the source; otherwise any target VM.
    preferred_vms = [vm for vm in target_dc.vms if vm.region == source_dc.region]
    vm_pool = preferred_vms or target_dc.vms
    random_vm = random.choice(vm_pool)

    data_transfer_time = source_dc.calculate_data_transfer_time(task['size'], target_dc)
    task_process = random_vm.process_task(task, target_dc, data_transfer_time)
    env.process(task_process)
env.run()


VM VM_DC_Provider_1_US_1 processed Task 9034 - Response Time=10.1, Penalty=0, Near Violation=False
VM VM_DC_Provider_3_US_0 processed Task 9609 - Response Time=10.1, Penalty=0, Near Violation=False
VM VM_DC_Provider_2_US_1 processed Task 1213 - Response Time=10.1475, Penalty=0, Near Violation=False
VM VM_DC_Provider_1_US_0 processed Task 1771 - Response Time=10.1475, Penalty=0, Near Violation=False
VM VM_DC_Provider_3_AS_1 processed Task 5000 - Response Time=10.1475, Penalty=0, Near Violation=False
VM VM_DC_Provider_1_US_0 processed Task 6537 - Response Time=10.1475, Penalty=0, Near Violation=False
VM VM_DC_Provider_3_EU_1 processed Task 8763 - Response Time=10.1475, Penalty=0, Near Violation=False
VM VM_DC_Provider_3_AS_3 processed Task 2526 - Response Time=10.195, Penalty=0, Near Violation=False
VM VM_DC_Provider_3_EU_0 processed Task 5396 - Response Time=10.195, Penalty=0, Near Violation=False
VM VM_DC_Provider_2_US_1 processed Task 5734 - Response Time=10.195, Penalty=0, Near Viola