In [30]:
import simpy
import random
from sklearn.cluster import SpectralClustering
import numpy as np


class NetworkLink:
    def __init__(self, bandwidth, cost):
        self.bandwidth = bandwidth
        self.cost = cost

class Region:
    def __init__(self, id):
        self.id = id
        self.intra_dc_link = None
        self.intra_region_link = None
        self.data_centers = []

    def add_data_center(self, data_center):
        self.data_centers.append(data_center)
        data_center.region = self

    def set_network_links(self, intra_dc_link, intra_region_link):
        self.intra_dc_link = intra_dc_link
        self.intra_region_link = intra_region_link

class VirtualMachine:
    def __init__(self, env, id, processing_capability, num_cpus, ram, storage_capacity, data_center, provider, slo_ma):
        self.env = env
        self.id = id
        self.processing_capability = processing_capability
        self.num_cpus = num_cpus
        self.ram = ram
        self.storage_capacity = storage_capacity
        self.data_center = data_center
        self.provider = provider
        self.slo_ma = slo_ma
        self.is_offered = False
        self.is_rented = False
        self.total_time = 0
        self.uptime = 0
        self.data_access_log = {}

    def offer_to_federated_cloud(self):
        self.is_offered = True
    
    def rent_from_federated_cloud(self):
        self.is_rented = True

    def update_availability(self, time_passed):
        # Call this method periodically to update uptime and total_time
        self.total_time += time_passed  # Update with the actual time passed
        
    def get_availability(self):
        if self.total_time == 0:
            return 1  # Assuming always available when no time has passed
        return self.uptime / self.total_time
    
    def process_task(self, task, target_dc, data_transfer_time=0):
        start_time = self.env.now
        total_processing_time = task['size'] / self.processing_capability + data_transfer_time
        yield self.env.timeout(total_processing_time)
        end_time = self.env.now

        # Calculate response time and check for SLA violation or near violation
        response_time = end_time - start_time
        sla_rt = 180  # SLORT in seconds
        w = 0.8  # Define the weight factor
        th_rt = w * sla_rt  # Threshold for near SLA violation

        penalty = 0
        near_violation = False
        if response_time > sla_rt:
            penalty = 0.0025  # Penalty per violation
        elif response_time > th_rt:
            near_violation = True  # Task is near to causing SLA violation

        cost = self.data_center.calculate_cost(task['size'], target_dc) - penalty
        self.data_center.total_revenue += (0.7 - penalty)  # Update total revenue for the data center
        self.data_center.near_violations += int(near_violation)  # Update near violations count for the data center

        self.data_access_log[task['id']] = {
            'response_time': response_time,
            'penalty': penalty,
            'near_violation': near_violation,
            'access_frequency': 1,  # Initialize access frequency
            'tasks': [task],        # Initialize list of tasks with the current task
        }
    
    def calculate_correlation_matrix(self, data_accessed, P):
        n = len(data_accessed)
        correlation_matrix = np.zeros((n, n))

        for i in range(n):
            for j in range(n):
                if i != j:
                    # T_k(di) is the set of tasks accessing data di
                    T_k_di = data_accessed[i]['tasks']
                    T_k_dj = data_accessed[j]['tasks']
                    common_tasks = len(set(T_k_di).intersection(set(T_k_dj)))
                    correlation_matrix[i, j] = common_tasks / P
                else:
                    correlation_matrix[i, j] = 0
        return correlation_matrix

    def perform_spectral_clustering(self, correlation_matrix, K):
        # Apply spectral clustering
        sc = SpectralClustering(n_clusters=K, affinity='precomputed')
        group_labels = sc.fit_predict(correlation_matrix)
        return group_labels

    def select_data_to_replicate(self, data_accessed, group_labels):
        # Calculate average access frequency for each group
        available_vms = [vm for vm in self.data_center.vms if vm.get_availability() >= self.slo_ma]
        unique_labels = set(group_labels)
        groups = {label: [] for label in unique_labels}

        for i, label in enumerate(group_labels):
            groups[label].append(data_accessed[i])

        data_to_replicate = []

        for label, group in groups.items():
            avg_freq = sum(d['access_frequency'] for d in group) / len(group)
            data_to_replicate.extend([d for d in group if d['access_frequency'] >= avg_freq])
            data_to_replicate = [d for d in data_to_replicate if self.is_in_same_region(d)]

        return data_to_replicate
    
    def is_in_same_region(self, data_item):
        # Logic to check if the data item is frequently accessed within the same region
        return all(vm.region == self.region for vm in data_item['accessing_vms'])
    
    # Update the adjust_replicas method to include the new strategy
    def adjust_replicas(self, P, K):
        # This list is populated with actual data accessed information
        data_accessed = [{'id': d, 'access_frequency': self.data_access_log[d]['access_frequency'], 'tasks': self.data_access_log[d]['tasks']} for d in self.data_access_log]
        
        correlation_matrix = self.calculate_correlation_matrix(data_accessed, P)
        group_labels = self.perform_spectral_clustering(correlation_matrix, K)
        data_to_replicate = self.select_data_to_replicate(data_accessed, group_labels)

        for data_item in data_to_replicate:
            # Find the best VM considering both response time and availability
            best_vm = self.find_best_vm_for_replication(data_item)
    
    def find_best_vm_for_replication(self, data_item):
        # Filter VMs based on storage capacity
        suitable_vms = [vm for vm in self.data_center.vms if vm.storage_capacity >= data_item['size'] and vm.get_availability() >= self.slo_ma]

        # Prioritize VMs in the same data center or region to minimize bandwidth costs and transfer time
        same_dc_vms = [vm for vm in suitable_vms if vm.data_center.id == self.data_center.id]
        same_region_vms = [vm for vm in suitable_vms if vm.region == self.region and vm not in same_dc_vms]

        # Select the best VM based on the lowest cost, prioritizing same DC, then same region
        best_vm = None

        if same_dc_vms:
            best_vm = min(same_dc_vms, key=lambda vm: self.data_center.calculate_cost(data_item['size'], vm.data_center))
        elif same_region_vms:
            best_vm = min(same_region_vms, key=lambda vm: self.data_center.calculate_cost(data_item['size'], vm.data_center))
        else:
            best_vm = min(suitable_vms, key=lambda vm: self.data_center.calculate_cost(data_item['size'], vm.data_center)) if suitable_vms else None

        return best_vm

class DataCenter:
    def __init__(self, env, id, region, provider, num_vms, vm_specs, pricing, bw_params):
        self.env = env
        self.id = id
        self.region = region
        self.provider = provider
        slo_ma_const = 0.95  # SLOMA in seconds
        self.vms = [VirtualMachine(env, f"VM_{self.id}_{i}", **vm_specs, data_center=self, provider=provider, slo_ma=slo_ma_const) for i in range(num_vms)]
        self.pricing = pricing
        self.vm_specs = vm_specs
        self.bw_params = bw_params
        self.total_revenue = 0
        self.near_violations = 0
        region.add_data_center(self) 

        provider_id = id.split('_')[1] 
        # Now use the provider_id to access the pricing dictionary
        intra_dc_cost = pricing[provider_id]['bw_price']['intra_dc']
        intra_region_cost = pricing[provider_id]['bw_price']['intra_region']
        intra_dc_bandwidth = bw_params['intra_dc']['capacity']
        intra_region_bandwidth = bw_params['intra_region']['capacity']

        # Set network link costs based on the provider's pricing structure
        region_id = self.id.split('_')[0]
        intra_dc_cost = pricing[region_id]['bw_price']['intra_dc']
        intra_region_cost = pricing[region_id]['bw_price']['intra_region']
        intra_dc_bandwidth = bw_params['intra_dc']['capacity']
        intra_region_bandwidth = bw_params['intra_region']['capacity']

        self.region.set_network_links(
            NetworkLink(intra_dc_bandwidth, intra_dc_cost),
            NetworkLink(intra_region_bandwidth, intra_region_cost)
        )

    def get_total_revenue(self):
        return self.total_revenue
    
    def get_near_violations_count(self):
        return self.near_violations
    
    def calculate_cost(self, task_size, target_dc):
        cpu_cost = task_size / 107 * self.pricing['cpu_price']
        storage_cost = self.vm_specs['storage_capacity'] * self.pricing['storage_price']

        if self.id == target_dc.id:
            # Intra-DC transfer
            bw_cost = task_size * self.pricing['bw_price']['intra_dc']
        elif self.id.split('_')[1] == target_dc.id.split('_')[1]:
            # Intra-region transfer
            bw_cost = task_size * self.pricing['bw_price']['intra_region']
        else:
            # Inter-region transfer
            bw_cost = task_size * self.pricing['bw_price']['inter_region']

        return cpu_cost + storage_cost + bw_cost

    def calculate_data_transfer_time(self, task_size, target_dc):
        if self.id == target_dc.id:
            # Intra-DC transfer
            bw_capacity = self.bw_params['intra_dc']['capacity']
            bw_delay = self.bw_params['intra_dc']['delay']
        elif self.id.split('_')[1] == target_dc.id.split('_')[1]:
            # Intra-region transfer
            bw_capacity = self.bw_params['intra_region']['capacity']
            bw_delay = self.bw_params['intra_region']['delay']
        else:
            # Inter-region transfer
            bw_capacity = self.bw_params['inter_region']['capacity']
            bw_delay = self.bw_params['inter_region']['delay']

        data_transfer_time = task_size / bw_capacity + bw_delay / 1000  # Convert delay from ms to seconds
        return data_transfer_time

class CloudProvider:
    def __init__(self, env, id, regions, dcs_per_region, vm_specs, pricing, bw_params):
        self.env = env
        self.id = id
        self.regions = regions
        self.vm_specs = vm_specs
        self.pricing = pricing
        self.bw_params = bw_params
        self.data_centers = [DataCenter(env, f"DC_{self.id}_{i}", region, self, random.randint(*dcs_per_region), vm_specs, pricing, bw_params) for i, region in enumerate(regions)]
        self.offered_vms = []
        self.rented_vms = []

    def offer_vms(self, num_vms):
        # Implement logic to mark VMs as offered
        for vm in self.vms[:num_vms]:
            vm.offer_to_federated_cloud()
            self.offered_vms.append(vm)

    def rent_vms(self, provider, num_vms):
        # Implement logic to rent VMs from another provider
        available_vms = [vm for vm in provider.offered_vms if not vm.is_rented]
        for vm in available_vms[:num_vms]:
            vm.rent_from_federated_cloud()
            self.rented_vms.append(vm)
            provider.offered_vms.remove(vm)
    
    def find_target_dc(self, task):
        # Simplified logic: Randomly select a data center
        return random.choice(self.regions)


class FederatedCloud:
    def __init__(self, env, num_providers, num_regions, dcs_per_region, vm_specs, pricing, bw_params):
        self.env = env
        self.regions = [Region(i) for i in range(num_regions)]
        self.providers = [CloudProvider(env, f'Provider_{i+1}', self.regions, dcs_per_region, vm_specs, pricing[f'Provider_{i+1}'], bw_params) for i in range(num_providers)]
        
    def get_all_data_centers(self):
        return [dc for provider in self.providers for dc in provider.regions]
    
    def get_all_vms(self):
        return [vm for dc in self.get_all_data_centers() for vm in dc.vms]
    
    def manage_vms(self):
        all_vms = self.get_all_vms()
        owned_vms = [vm for vm in all_vms if vm.provider is not None and not vm.is_offered and not vm.is_rented]
        offered_vms = [vm for vm in all_vms if vm.is_offered]
        rented_vms = [vm for vm in all_vms if vm.is_rented]
        managed_vms = owned_vms + offered_vms + rented_vms
        return managed_vms
    

# Bandwidth parameters
bw_params = {
    'inter_region': {'capacity': 500, 'delay': 150},
    'intra_region': {'capacity': 1000, 'delay': 50},
    'intra_dc': {'capacity': 8000, 'delay': 10}
}

# Simulation parameters
num_providers = 3
num_regions = 3
dcs_per_provider = (2, 5)
num_vms_per_dc = 8
vm_specs = {
    'processing_capability': 1500,  # in MIPS
    'num_cpus': 2,
    'ram': 4,  # in Gb
    'storage_capacity': 8  # in Gb
}
num_data = 200
task_size_range = (200, 1000)  # Task size range in MI

# Pricing structure
pricing = {
    'Provider_1': {
        'US': {'cpu_price': 0.020, 'storage_price': 0.006, 'bw_price': {'intra_dc': 0.001, 'inter_region': 0.008}},
        'EU': {'cpu_price': 0.025, 'storage_price': 0.006, 'bw_price': {'intra_dc': 0.0015, 'inter_region': 0.008}},
        'AS': {'cpu_price': 0.027, 'storage_price': 0.0066, 'bw_price': {'intra_dc': 0.002, 'inter_region': 0.008}}
    },
    'Provider_2': {
        'US': {'cpu_price': 0.020, 'storage_price': 0.0096, 'bw_price': {'intra_dc': 0.001, 'inter_region': 0.008}},
        'EU': {'cpu_price': 0.018, 'storage_price': 0.008, 'bw_price': {'intra_dc': 0.0015, 'inter_region': 0.008}},
        'AS': {'cpu_price': 0.020, 'storage_price': 0.0096, 'bw_price': {'intra_dc': 0.002, 'inter_region': 0.008}}
    },
    'Provider_3': {
        'US': {'cpu_price': 0.0095, 'storage_price': 0.0012, 'bw_price': {'intra_dc': 0.001, 'inter_region': 0.008}},
        'EU': {'cpu_price': 0.0090, 'storage_price': 0.0096, 'bw_price': {'intra_dc': 0.0015, 'inter_region': 0.008}},
        'AS': {'cpu_price': 0.0080, 'storage_price': 0.0090, 'bw_price': {'intra_dc': 0.002, 'inter_region': 0.008}}
    }
}

env = simpy.Environment()
P = 32 # Number of violating tasks
K = 3 #Number of Cluster


# Create the federated cloud system
federated_cloud = FederatedCloud(env, num_providers, num_regions, dcs_per_provider, vm_specs, pricing, bw_params)

# Define a number of tasks to be processed
tasks = [{'id': i, 'size': random.randint(*task_size_range)} for i in range(num_data)]

# Start simulation
for task in tasks:
    # Example logic for selecting source and target data centers with geographic considerations
    source_dc = random.choice(federated_cloud.get_all_data_centers())
    target_dc = random.choice(federated_cloud.get_all_data_centers())

    # Include logic to prefer VMs in the same region if possible
    preferred_vms = [vm for vm in target_dc.vms if vm.region == source_dc.region]
    random_vm = random.choice(preferred_vms if preferred_vms else target_dc.vms)

    data_transfer_time = source_dc.calculate_data_transfer_time(task['size'], target_dc)
    env.process(random_vm.process_task(task, target_dc, data_transfer_time))

# Run the simulation
env.run()

KeyError: 'Provider'