# In [1]:


import simpy
import random
import numpy as np
import pandas as pd
import time
import skfuzzy as fuzz
from skfuzzy import control as ctrl
from sklearn.cluster import SpectralClustering
import threading
import logging
import traceback


# Module-level flag; nothing in the visible part of this file reads it
# (presumably polled by worker threads defined elsewhere -- TODO confirm).
stop_threads = False

logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')

logger = logging.getLogger(__name__)

# Result tables. The CloudProvider.collect_*_vms methods rebind these names
# through `global` statements. No `global` statement is needed here: a name
# assigned at module scope is already a module global.
managed_vms_df = pd.DataFrame()
offered_vms_df = pd.DataFrame()
owned_vms_df = pd.DataFrame()
rented_vms_df = pd.DataFrame()

# Constants definition
NUM_PROVIDERS = 3
NUM_REGIONS = 3
MIN_DCS_PER_PROVIDER = 2
MAX_DCS_PER_PROVIDER = 5
VM_PER_DC = 8
NUM_TASKS = 100
TASK_SIZE_RANGE = (200, 1000)
NUM_DATA_ITEMS = 200
DATA_SIZE_RANGE = (300, 1000)
REPLICATION_PERIOD = 32
REVENUE_PER_TASK = 0.7
SLORT = 180
w = 0.8
# Response-time threshold: task and VM response times are compared against it
# to flag SLA violations.
T_hRT = w * SLORT
VM_PROCESSING_CAPACITY = 1500
# Legacy spelling kept for any code that still uses it; aliased so the two
# names can never drift apart.
VM_processing_capability = VM_PROCESSING_CAPACITY
MIN_AVAILABILITY = 0.95
CPENALTY = 0.0025
VM_RENT_PRICE = 0.01
VM_STORAGE_CAPACITY = 8000
# Link characteristics per locality tier. The DELAY_* values are divided by
# 1000 wherever they are added to a transfer time.
BW_INTRA_DC = 8000
DELAY_INTRA_DC = 10
BW_INTRA_REGION = 1000
DELAY_INTRA_REGION = 50
BW_INTER_REGION = 500
DELAY_INTER_REGION = 150
K = 3

# Integer ids used on VMs -> keys used in PRICING_TABLE.
REGION_MAP = {0: 'US', 1: 'EU', 2: 'AS'}
PROVIDER_MAP = {0: 'Provider1', 1: 'Provider2', 2: 'Provider3'}

# Price lookup: PRICING_TABLE[<kind>][<provider name>][<region name>].
# 'BW_PRICE' additionally carries three flat per-tier prices.
PRICING_TABLE = {
    'CPU_PRICE': {
        'Provider1': {'US': 0.020, 'EU': 0.025, 'AS': 0.027},
        'Provider2': {'US': 0.020, 'EU': 0.018, 'AS': 0.020},
        'Provider3': {'US': 0.0095, 'EU': 0.0090, 'AS': 0.0080}
    },
    'STORAGE_PRICE': {
        'Provider1': {'US': 0.0060, 'EU': 0.0066, 'AS': 0.0066},
        'Provider2': {'US': 0.0096, 'EU': 0.0096, 'AS': 0.0096},
        'Provider3': {'US': 0.0120, 'EU': 0.0096, 'AS': 0.0090}
    },
    'BW_PRICE': {
        'IntraDC': 0.001,
        'IntraRegion': 0.002,
        'InterRegions': 0.0080,
        'Provider1': {'US': 0.0015, 'EU': 0.0020, 'AS': 0.0040},
        'Provider2': {'US': 0.0015, 'EU': 0.0020, 'AS': 0.0040},
        'Provider3': {'US': 0.0015, 'EU': 0.0020, 'AS': 0.0040}
    }
}

class VirtualMachine:
    """A simulated virtual machine that stores data files and executes tasks.

    Every instance appends itself to the class-level ``vm_list`` when it is
    constructed. ``bandwidths`` and ``delays`` are class-wide lookup tables
    installed through :meth:`set_network_characteristics`.
    """

    vm_list = []
    bandwidths = {}
    delays = {}

    @classmethod
    def set_network_characteristics(cls, bandwidths, delays):
        """Install the class-wide bandwidth and delay lookup tables."""
        cls.bandwidths = bandwidths
        cls.delays = delays

    def __init__(self, vm_id, provider_id, region_id, dc_id, storage_capacity, storage_price_per_gb, transfer_price_per_gb, processing_capacity, env, providers, data_files, owned=None, failure_probability=10.0):
        """Create a VM, compute its initial costs and register it in ``vm_list``.

        ``provider_id`` and ``region_id`` must be keys of ``PROVIDER_MAP`` and
        ``REGION_MAP``. ``data_files`` is a list of data-file dicts that is
        stored by reference (not copied).

        Raises:
            ValueError: propagated from :meth:`calculate_dpc` when a data file
                lacks the ``'size'`` or ``'access_frequencies'`` key.
        """
        self.vm_id = vm_id
        self.provider_id = provider_id
        self.region_id = region_id
        self.dc_id = dc_id
        self.storage_capacity = storage_capacity
        self.storage_price_per_gb = storage_price_per_gb
        self.transfer_price_per_gb = transfer_price_per_gb
        # NOTE(review): the `processing_capacity` argument is ignored and the
        # module constant is used instead; kept as-is because callers may rely
        # on it -- confirm whether the argument should be honoured.
        self.processing_capacity = VM_PROCESSING_CAPACITY
        self.env = env
        self.providers = providers
        self.data_files = data_files
        self.tasks_assigned = []
        self.tasks_processed = []
        self.queue_capacity = 500
        self.used_storage = 0
        self.owned = owned
        self.offered = False
        self.rented = False
        # The cost helpers read the attributes assigned above, so they must
        # run after them.
        self.process_cost = self.calculate_tpc()
        self.storage_cost, self.transfer_cost = self.calculate_dpc()
        self.rent_price = VM_RENT_PRICE
        self.failure_probability = failure_probability
        self.response_time = None
        self.sla_violation = False   # boolean flag set by calculate_response_time
        self.sla_satisfactions = 0
        self.sla_violations = 0      # counter updated by execute_task
        VirtualMachine.vm_list.append(self)

    @staticmethod
    def set_network_characteristics_and_update():
        """Regenerate network characteristics and publish them class-wide.

        Declared as a static method so it can be called on the class (as
        before) and also on an instance, which previously raised TypeError
        because the function takes no ``self``.
        """
        bandwidths, delays, costs = generate_network_characteristics()
        VirtualMachine.set_network_characteristics(bandwidths, delays)
        VMCosts.set_costs(costs)

    def calculate_bandwidth_and_delay(self, target_vm):
        """Return ``(bandwidth, delay, transfer_cost)`` for the link to *target_vm*.

        The tier is chosen by locality: same data center, else same region,
        else inter-region. ``transfer_cost`` is this VM's per-unit transfer
        price regardless of tier.
        """
        if self.dc_id == target_vm.dc_id:
            bandwidth = BW_INTRA_DC
            delay = DELAY_INTRA_DC
        elif self.region_id == target_vm.region_id:
            bandwidth = BW_INTRA_REGION
            delay = DELAY_INTRA_REGION
        else:
            bandwidth = BW_INTER_REGION
            delay = DELAY_INTER_REGION

        transfer_cost = self.transfer_price_per_gb
        return bandwidth, delay, transfer_cost

    def execute_task(self, task, src_vm=None, bandwidth=None, delay=None):
        """Execute *task* on this VM and record its timing, cost and SLA outcome.

        Args:
            task: task object; its ``response_time``, ``sla_violation``,
                ``transfer_cost``, ``transfer_time``, ``processing_time`` and
                ``enu`` attributes are written.
            src_vm: VM the task input comes from. When falsy, intra-DC link
                values are used.
            bandwidth, delay: accepted for interface compatibility; the values
                passed in are always overwritten by the locality-based ones.

        Returns:
            ``(processing_time, transfer_time)``, or ``(None, None)`` when a
            ``KeyError`` is raised while processing.
        """
        try:
            if src_vm:
                # Same tier selection as calculate_bandwidth_and_delay.
                bandwidth, delay, _ = self.calculate_bandwidth_and_delay(src_vm)
            else:
                bandwidth, delay = BW_INTRA_DC, DELAY_INTRA_DC  # Default values for same DC

            transfer_time = task.size / bandwidth + delay / 1000  # Convert delay to seconds
            processing_time = task.size / self.processing_capacity
            task.response_time = transfer_time + processing_time

            task.sla_violation = task.response_time > T_hRT

            task.transfer_cost = self.calculate_transfer_cost(task.size)
            task.transfer_time = transfer_time
            task.processing_time = processing_time

            # Calculate and set the execution cost for the task
            task.calculate_execution_cost(self)

            self.tasks_processed.append(task)

            # Simulate data accesses
            for data_id in task.data_requirements:
                task.access_data(data_id, self)

            # Effective Network Usage (ENU): n_rfa counts the entries of
            # task.data_accesses equal to 1, n_fa is the number of entries.
            n_rfa = sum(1 for access in task.data_accesses if access == 1)
            n_fa = len(task.data_accesses)
            n_lfa = n_fa - n_rfa
            denominator = n_fa + n_lfa
            # Guard against an empty access vector (would divide by zero).
            task.enu = (n_rfa + n_fa) / denominator if denominator else 0.0

            # Update SLA tracking. The counter is `sla_violations`; the
            # similarly named `sla_violation` is a boolean flag and must not
            # be incremented.
            if task.sla_violation:
                self.sla_violations += 1
                self.sla_satisfactions = 0
            else:
                self.sla_satisfactions += 1

            return processing_time, transfer_time
        except KeyError as e:
            logger.error(f"Error executing task {task.task_id} on VM {self.vm_id}: {e}")
            return None, None

    def calculate_tpc(self):
        """Return the CPU cost of all processed tasks at this VM's provider/region price.

        Raises:
            KeyError: if the provider or region id is not in the lookup maps.
        """
        provider = PROVIDER_MAP[self.provider_id]
        region_name = REGION_MAP[self.region_id]
        cpu_price = PRICING_TABLE['CPU_PRICE'][provider][region_name]

        total_task_size = sum(task.size for task in self.tasks_processed)
        return (total_task_size / self.processing_capacity) * cpu_price

    def calculate_dpc(self):
        """Return ``(storage_cost, transfer_cost)`` for the files held on this VM.

        Storage cost is the total file size times the provider/region storage
        price. Transfer cost accumulates, for each file that has a best site,
        ``size * sum(access_frequencies) * best_cost``; accumulation stops
        once 100 files have contributed.

        Raises:
            ValueError: if the storage price lookup fails, or a data file
                lacks the ``'size'`` or ``'access_frequencies'`` key.
        """
        provider = PROVIDER_MAP[self.provider_id]
        region_name = REGION_MAP[self.region_id]

        try:
            storage_price = PRICING_TABLE['STORAGE_PRICE'][provider][region_name]
        except KeyError as e:
            raise ValueError(f"Invalid provider or region: {provider}, {region_name}") from e

        total_data_size = sum(data_file['size'] for data_file in self.data_files if 'size' in data_file)
        storage_cost = total_data_size * storage_price

        transfer_cost = 0
        data_files_processed = 0  # To limit the number of data files processed

        for data_file in self.data_files:
            if 'size' not in data_file or 'access_frequencies' not in data_file:
                raise ValueError("data_file must contain 'size' and 'access_frequencies' keys")

            best_site, best_cost = CloudProvider.find_best_site(self, data_file, self.providers)
            if best_site:
                transfer_cost += data_file['size'] * sum(data_file['access_frequencies']) * best_cost
                data_files_processed += 1

            if data_files_processed >= 100:  # Limit the number of processed data files to 100
                break

        return storage_cost, transfer_cost

    def add_task(self, task):
        """Append *task* to the list of assigned tasks."""
        self.tasks_assigned.append(task)

    def add_data_file(self, data_file):
        """Store *data_file* (a ``DataFile`` or a dict) unless its id is already present.

        Raises:
            ValueError: if *data_file* is neither a ``DataFile`` nor a dict.
        """
        if isinstance(data_file, DataFile):
            data_file_dict = data_file.to_dict()
        elif isinstance(data_file, dict):
            data_file_dict = data_file
        else:
            raise ValueError("data_file must be a DataFile instance or a dictionary with 'data_id' and 'size' keys")

        if not any(df['data_id'] == data_file_dict['data_id'] for df in self.data_files):
            self.data_files.append(data_file_dict)

    def has_data_file(self, data_file):
        """Return True if a stored file has the same ``'data_id'`` as *data_file*."""
        return any(df['data_id'] == data_file['data_id'] for df in self.data_files)

    def calculate_avg_availability(self):
        """Return the mean availability of the stored files (0.0 when there are none).

        Files whose dict is missing a required key are logged and skipped.
        """
        if not self.data_files:
            return 0.0

        availabilities = []
        for data_file in self.data_files:
            try:
                availability = DataFile(
                    data_id=data_file['data_id'],
                    size=data_file['size'],
                    creation_date=data_file['creation_date'],
                    modification_date=data_file['modification_date'],
                    blocks=data_file['blocks'],
                    replicas_per_block=data_file['replicas_per_block'],
                    access_frequencies=data_file['access_frequencies'],
                    access_count=data_file.get('number_of_accesses', 0)
                ).calculate_availability()
                availabilities.append(availability)
            except KeyError as e:
                logger.error(f"Error in data_file keys: {e}")

        return float(np.mean(availabilities)) if availabilities else 0.0

    def calculate_used_storage(self):
        """Return the summed ``'size'`` of all stored files that have one."""
        return sum(data_file['size'] for data_file in self.data_files if 'size' in data_file)

    def calculate_available_storage(self):
        """Return storage capacity minus used storage."""
        return self.storage_capacity - self.calculate_used_storage()

    def calculate_bandwidth_cost(self):
        """Return the bandwidth cost of the stored files, priced by locality tier.

        Each file is charged ``size`` times the intra-DC, intra-region or
        inter-region price depending on how its ``'dc_id'``/``'region_id'``
        compare with this VM's.
        """
        bw_intra_dc = PRICING_TABLE['BW_PRICE']['IntraDC']
        bw_intra_region = PRICING_TABLE['BW_PRICE']['IntraRegion']
        bw_inter_region = PRICING_TABLE['BW_PRICE']['InterRegions']

        bandwidth_cost = 0

        for data_file in self.data_files:
            # Dicts produced by DataFile.to_dict() carry no location keys, so
            # indexing them directly raised KeyError. The file sits in this
            # VM's own data_files list, so fall back to the VM's location.
            # NOTE(review): confirm this is the intended default.
            file_dc = data_file.get('dc_id', self.dc_id)
            file_region = data_file.get('region_id', self.region_id)

            if self.dc_id == file_dc:
                # Intra-DC transfer
                bandwidth_cost += bw_intra_dc * data_file['size']
            elif self.region_id == file_region:
                # Intra-Region transfer
                bandwidth_cost += bw_intra_region * data_file['size']
            else:
                # Inter-Region transfer
                bandwidth_cost += bw_inter_region * data_file['size']

        return bandwidth_cost

    def is_storage_full(self):
        """Return True when no storage is left (available storage <= 0)."""
        return self.calculate_available_storage() <= 0

    def calculate_storage_cost(self, data_size_gb):
        """Return *data_size_gb* times this VM's storage price."""
        return data_size_gb * self.storage_price_per_gb

    def calculate_transfer_cost(self, data_size_gb):
        """Return *data_size_gb* times this VM's transfer price."""
        return data_size_gb * self.transfer_price_per_gb

    def calculate_processing_time(self, vm_processing_capacity):
        """Return total processed task size divided by *vm_processing_capacity*."""
        total_task_size = sum(task.size for task in self.tasks_processed)
        return total_task_size / vm_processing_capacity

    def calculate_response_time(self):
        """Compute, store and return this VM's aggregate response time.

        The value is ``(sum of task transfer times + processing time) * 1000``.
        Also sets the boolean ``sla_violation`` flag by comparing the result
        with ``T_hRT``.
        """
        processing_time = self.calculate_processing_time(VM_PROCESSING_CAPACITY)
        # Each processed task is expected to carry a transfer_time attribute.
        transfer_time = sum(task.transfer_time for task in self.tasks_processed)
        self.response_time = (transfer_time + processing_time) * 1000
        self.sla_violation = self.response_time > T_hRT
        return self.response_time

    def calculate_queue_capacity(self):
        """Return total processed task size divided by the queue capacity."""
        total_task_size = sum(task.size for task in self.tasks_processed if hasattr(task, 'size'))
        return total_task_size / self.queue_capacity

    def calculate_processing_capacity_usage(self):
        """Return total processed task size divided by the processing capacity."""
        busy_mips = sum(task.size for task in self.tasks_processed if hasattr(task, 'size'))
        return busy_mips / self.processing_capacity

    def calculate_vm_load(self):
        """Return the mean of queue usage and processing-capacity usage."""
        queue_cap = self.calculate_queue_capacity()
        process_cap = self.calculate_processing_capacity_usage()
        return 0.5 * (queue_cap + process_cap)

    def calculate_revenues(self):
        """Return the number of processed tasks times ``REVENUE_PER_TASK``."""
        return len(self.tasks_processed) * REVENUE_PER_TASK

    def calculate_task_processing_cost(self):
        """Return the CPU cost of processed tasks that have a numeric size.

        Raises:
            ValueError: if the provider/region id is unknown or no CPU price
                is listed for the pair.
        """
        provider = PROVIDER_MAP.get(self.provider_id)
        region_name = REGION_MAP.get(self.region_id)
        if provider is None or region_name is None:
            raise ValueError(f"Invalid provider_id {self.provider_id} or region_id {self.region_id}")

        cpu_price = PRICING_TABLE.get('CPU_PRICE', {}).get(provider, {}).get(region_name)
        if cpu_price is None:
            raise ValueError(f"CPU price not found for provider {provider} and region {region_name}")

        return sum(
            (t.size / self.processing_capacity) * cpu_price
            for t in self.tasks_processed
            if hasattr(t, 'size') and isinstance(t.size, (int, float))
        )

    def check_sla_violations(self):
        """Return True if any processed task has a response time above ``T_hRT``."""
        return any(
            task.response_time and task.response_time > T_hRT
            for task in self.tasks_processed
        )

    def calculate_penalties(self):
        """Return ``CPENALTY`` times the number of processed tasks flagged as SLA violations."""
        violations = sum(1 for t in self.tasks_processed if t.sla_violation)
        return violations * CPENALTY

    def calculate_expenditures(self):
        """Return this VM's total expenditures.

        Owned VMs pay processing, storage and transfer costs plus penalties;
        all other VMs pay the rent price plus penalties.
        """
        penalties = self.calculate_penalties()
        if self.owned:
            storage_cost, transfer_cost = self.calculate_dpc()
            tpc_cost = self.calculate_tpc()
            return tpc_cost + storage_cost + transfer_cost + penalties
        return self.rent_price + penalties

    def calculate_profit(self):
        """Return revenues minus expenditures."""
        return self.calculate_revenues() - self.calculate_expenditures()

    def calculate_placement_potential(self, data_group):
        """Return the mean per-item placement potential of *data_group* (0 when empty).

        NOTE(review): ``calculate_data_placement_potential`` is not defined in
        this class as visible here; a non-empty *data_group* needs it to be
        provided elsewhere.
        """
        total_potential = sum(self.calculate_data_placement_potential(d) for d in data_group)
        return total_potential / len(data_group) if data_group else 0

    def check_failure(self):
        """Return True when the VM is considered failed.

        A load above 0.20 always counts as a failure; otherwise a random draw
        is compared with ``failure_probability``.

        NOTE(review): ``random.random()`` is always below 1, so the default
        ``failure_probability`` of 10.0 makes this return True every time --
        confirm the intended scale.
        """
        vm_load_threshold = 0.20
        if self.calculate_vm_load() > vm_load_threshold:
            return True
        return random.random() < self.failure_probability

    def collect_metrics(self):
        """Return a dict snapshot of this VM's identifiers, load, costs and storage."""
        # Computed once and reused for both the tuple and the split entries.
        storage_cost, transfer_cost = self.calculate_dpc()
        return {
            'vm_id': self.vm_id,
            'data_files': [df['data_id'] for df in self.data_files],
            'provider_id': self.provider_id,
            'region_id': self.region_id,
            'avg_availability': self.calculate_avg_availability(),
            'vm_load': self.calculate_vm_load(),
            'placement_potential': self.calculate_placement_potential([]),
            'failure': self.check_failure(),
            'task_processing_cost': self.calculate_task_processing_cost(),
            'data_placement_cost': (storage_cost, transfer_cost),
            'revenues': self.calculate_revenues(),
            'expenditures': self.calculate_expenditures(),
            'profit': self.calculate_profit(),
            'used_storage': self.calculate_used_storage(),
            'available_storage': self.calculate_available_storage(),
            'queue_capacity_usage': self.calculate_queue_capacity(),
            'storage_cost': storage_cost,
            'transfer_cost': transfer_cost,
            'processing_capacity_usage': self.calculate_processing_capacity_usage()
        }

class DataFile:
    """A data file described by its size, block layout and access statistics."""

    def __init__(self, data_id, size, creation_date, modification_date, blocks, replicas_per_block, access_frequencies,
                 access_count=0, replication_factor=None):
        """Store the file's attributes.

        ``blocks`` and ``replicas_per_block`` are used as exponents by the
        availability formulas. ``replication_factor`` defaults to
        ``replicas_per_block`` when not given.
        """
        self.data_id = data_id
        self.size = size
        self.creation_date = creation_date
        self.modification_date = modification_date
        self.blocks = blocks
        self.replicas_per_block = replicas_per_block
        self.access_frequencies = access_frequencies
        self.access_count = access_count
        self.replication_factor = replication_factor if replication_factor is not None else replicas_per_block

    def access(self):
        """Record one access: bump the counter and refresh the modification time."""
        self.access_count += 1
        self.modification_date = time.time()

    def to_dict(self):
        """Return the file as a plain dict.

        ``access_count`` is exported under the key ``'number_of_accesses'``.
        """
        return {
            'data_id': self.data_id,
            'size': self.size,
            'creation_date': self.creation_date,
            'modification_date': self.modification_date,
            'blocks': self.blocks,
            'replicas_per_block': self.replicas_per_block,
            'number_of_accesses': self.access_count,
            'access_frequencies': self.access_frequencies,
            'replication_factor': self.replication_factor
        }

    def calculate_block_availability(self, MIN_AVAILABILITY=0.95):
        """Return ``1 - (1 - MIN_AVAILABILITY) ** replicas_per_block``."""
        P_ba_k = MIN_AVAILABILITY
        P_BA_k = 1 - (1 - P_ba_k) ** self.replicas_per_block
        return P_BA_k

    def calculate_availability(self):
        """Return ``1 - (1 - block_availability) ** blocks``."""
        block_availability = self.calculate_block_availability()
        return 1 - (1 - block_availability) ** self.blocks

    def calculate_dtt(self, vm, reqd, bandwidths, delays):
        """Return the mean transfer time of this file from *vm* to the VMs in *reqd*.

        For every VM in *reqd* other than *vm* the time is
        ``size / bandwidth + delay / 1000``, looked up in *bandwidths* and
        *delays* by the ``(vm.vm_id, vmr.vm_id)`` pair with intra-DC values as
        fallback. The sum is divided by ``len(reqd)`` (which counts *vm*
        itself if it is in the list). Returns 0 for an empty *reqd*.
        """
        if not reqd:
            logger.debug(f"No required VMs found for data file {self.data_id}")
            return 0

        total_transfer_time = 0
        for vmr in reqd:
            if vmr != vm:
                link = (vm.vm_id, vmr.vm_id)
                cap_bw = bandwidths.get(link, BW_INTRA_DC)
                delay = delays.get(link, DELAY_INTRA_DC)
                total_transfer_time += (self.size / cap_bw) + (delay / 1000.0)
        # reqd is known to be non-empty here.
        return total_transfer_time / len(reqd)

    def calculate_ldtt(self):
        """Return the transfer time over the slowest known link.

        Uses the smallest bandwidth and the largest delay registered on
        ``VirtualMachine``. Those tables start out empty, in which case
        ``min``/``max`` used to raise ValueError; the inter-region constants
        are used as the fallback instead.
        NOTE(review): inter-region is the slowest tier among the module
        constants -- confirm it is the intended default.
        """
        min_bw = min(VirtualMachine.bandwidths.values(), default=BW_INTER_REGION)
        max_delay = max(VirtualMachine.delays.values(), default=DELAY_INTER_REGION)
        return (self.size / min_bw) + (max_delay / 1000.0)

    def calculate_dttr(self, vm, reqd, bandwidths, delays):
        """Return ``dtt / ldtt`` (0 when ``ldtt`` is 0)."""
        dtt = self.calculate_dtt(vm, reqd, bandwidths, delays)
        ldtt = self.calculate_ldtt()
        return dtt / ldtt if ldtt != 0 else 0

    
class DataCenter:
    """A data center identified by provider, region and its own id.

    Each new instance is appended to the class-level ``dc_list``.
    """

    dc_list = []

    def __init__(self, provider_id, region_id, dc_id):
        """Record the identifiers, start with no VMs and register the instance."""
        self.provider_id, self.region_id, self.dc_id = provider_id, region_id, dc_id
        self.vms = list()
        DataCenter.dc_list += [self]

class CloudProvider:
    """A cloud provider that owns, offers, rents and manages virtual machines.

    Every instance appends itself to the class-level ``provider_list``.
    """

    provider_list = []  # Class variable to keep track of all providers

    def __init__(self, provider_id, owned_vms, rented_vms=None, providers=None, offered_vms=None):
        """Store the VM lists, derive ``managed_vms`` and register the provider.

        ``rented_vms`` and ``offered_vms`` default to fresh empty lists.
        """
        self.provider_id = provider_id
        self.owned_vms = owned_vms
        self.rented_vms = rented_vms if rented_vms is not None else []
        self.offered_vms = offered_vms if offered_vms is not None else []
        self.data_centers = []
        self.clients = []
        self.providers = providers
        self.manage_vms()
        CloudProvider.provider_list.append(self)

    def manage_vms(self):
        """Recompute ``managed_vms``: owned VMs that are not offered, plus rented VMs."""
        self.managed_vms = [vm for vm in self.owned_vms if vm not in self.offered_vms] + self.rented_vms

    @staticmethod
    def _build_vm_record(vm, response_time_of=None):
        """Return one table row (dict) of identifiers, costs and metrics for *vm*.

        ``response_time_of`` is an optional callable taking the VM; when it is
        None the VM's own ``calculate_response_time()`` is used.
        """
        storage_cost, transfer_cost = vm.calculate_dpc()
        tpc_cost = vm.calculate_tpc()
        return {
            'vm_id': vm.vm_id,
            'provider_id': vm.provider_id,
            'region_id': vm.region_id,
            'dc_id': vm.dc_id,
            'queue_capacity': vm.queue_capacity,
            'processing_capacity': vm.processing_capacity,
            'owned': vm.owned,
            'total_processed_tasks': len(vm.tasks_processed),
            'tpc': tpc_cost,
            'dpc': storage_cost + transfer_cost,
            'penalties': vm.calculate_penalties(),
            'expenditures': vm.calculate_expenditures(),
            'revenues': vm.calculate_revenues(),
            'profit': vm.calculate_profit(),
            'availability': vm.calculate_avg_availability(),
            'response_time': (
                response_time_of(vm) if response_time_of is not None
                else vm.calculate_response_time()
            ),
            'process_cost': tpc_cost,
            'storage_cost': storage_cost,
            'transfer_cost': transfer_cost
        }

    def _collect_records(self, vms, label, response_time_of=None, log_costs=False):
        """Return the rows for *vms*, skipping (and logging) any VM that fails.

        Collection is best-effort: a failing VM is dropped from the table, but
        the failure is logged with its traceback instead of at DEBUG level.
        """
        records = []
        for vm in vms:
            try:
                record = self._build_vm_record(vm, response_time_of)
            except Exception as e:
                logger.exception(f"Error collecting data for {label} VM {vm.vm_id}: {e}")
                continue
            if log_costs:
                logger.debug(
                    f"VM {vm.vm_id} - Storage Cost: {record['storage_cost']}, "
                    f"Transfer Cost: {record['transfer_cost']}, "
                    f"TPC Cost: {record['tpc']}, Expenditures: {record['expenditures']}"
                )
            records.append(record)
        return records

    def collect_managed_vms(self):
        """Rebuild the module-level ``managed_vms_df`` from this provider's managed VMs."""
        global managed_vms_df
        managed_vms_df = pd.DataFrame(self._collect_records(self.managed_vms, "Managed"))

    def collect_owned_vms(self):
        """Rebuild the module-level ``owned_vms_df`` from this provider's owned VMs.

        Unlike the other tables, the response time here is the per-task
        average from :meth:`calculate_average_response_time`.
        """
        global owned_vms_df
        owned_vms_df = pd.DataFrame(
            self._collect_records(self.owned_vms, "Owned", self.calculate_average_response_time)
        )

    def calculate_average_response_time(self, vm):
        """Return the sum of non-None task response times divided by the task count.

        Returns 0 when *vm* has processed no tasks.
        """
        total_response_time = sum(task.response_time for task in vm.tasks_processed if task.response_time is not None)
        return total_response_time / len(vm.tasks_processed) if vm.tasks_processed else 0

    def rent_vms_from_other_providers(self, other_providers):
        """Ask every other provider for VMs and add what they grant to ``rented_vms``."""
        for provider in other_providers:
            if provider != self:
                self.rented_vms.extend(provider.rent_vms(self))

    def rent_vms(self, target_provider):
        """Offer each of this provider's offered VMs to *target_provider*.

        The quoted cost is storage + task-processing + bandwidth cost of the
        VM. Accepted VMs are appended to this provider's ``rented_vms`` and
        returned.

        NOTE(review): ``target_provider.request_vm`` is not defined in this
        class as visible here; it must be provided elsewhere.
        """
        rented_vms = []
        for vm in self.offered_vms:
            storage_cost = vm.calculate_dpc()[0]
            process_cost = vm.calculate_task_processing_cost()
            # Must be called on the VM instance; calling it on the class
            # without an instance raised TypeError.
            bandwidth_cost = vm.calculate_bandwidth_cost()
            total_cost = storage_cost + process_cost + bandwidth_cost

            if target_provider.request_vm(vm, total_cost):
                rented_vms.append(vm)

        self.rented_vms.extend(rented_vms)
        return rented_vms

    def collect_rented_vms(self):
        """Rebuild the module-level ``rented_vms_df`` from this provider's rented VMs."""
        global rented_vms_df
        rented_vms_df = pd.DataFrame(self._collect_records(self.rented_vms, "rented"))

    def get_offered_vms(self):
        """Return the offered VMs of all registered providers, cheapest rent first."""
        offered_vms_list = [
            vm
            for provider in CloudProvider.provider_list
            for vm in provider.offered_vms
        ]
        return sorted(offered_vms_list, key=lambda x: x.rent_price)

    def collect_offered_vms(self):
        """Rebuild the module-level ``offered_vms_df`` from this provider's offered VMs."""
        global offered_vms_df
        offered_vms_df = pd.DataFrame(
            self._collect_records(self.offered_vms, "offered", log_costs=True)
        )

    def add_client(self, client):
        """Register *client* with this provider."""
        self.clients.append(client)

    def manage_tasks(self):
        """Execute the tasks of every registered client."""
        for client in self.clients:
            self.execute_tasks(client.tasks)

    def execute_tasks(self, tasks):
        """Run each task on the first suitable VM; tasks with no suitable VM are skipped."""
        for task in tasks:
            suitable_vm = self.find_suitable_vm(task)
            if suitable_vm:
                suitable_vm.execute_task(task)

    def find_suitable_vm(self, task):
        """Return the first VM with room for ``task.size`` and a load below 1, else None."""
        for dc in self.data_centers:
            for vm in dc.vms:
                if vm.calculate_used_storage() + task.size <= vm.storage_capacity and vm.calculate_vm_load() < 1:
                    return vm
        return None

    @staticmethod
    def find_best_site(vm, data_file, providers):
        """Return ``(best_site, min_cost)`` among the VMs that hold *data_file*.

        For every candidate the cost is its bandwidth towards *vm* times its
        transfer cost for the file size; the lowest-cost candidate wins.
        Returns ``(None, inf)`` when nobody holds the file or *providers* is
        empty or None.

        Declared static because it takes no ``self``: it keeps working when
        called on the class and now also works when called on an instance.
        """
        best_site = None
        min_cost = float('inf')

        for provider in providers or ():
            for dc in provider.data_centers:
                for candidate_vm in dc.vms:
                    if not candidate_vm.has_data_file(data_file):
                        continue
                    cap_bw, _, _ = candidate_vm.calculate_bandwidth_and_delay(vm)
                    transfer_cost = candidate_vm.calculate_transfer_cost(data_file['size'])
                    total_cost = cap_bw * transfer_cost

                    if total_cost < min_cost:
                        min_cost = total_cost
                        best_site = candidate_vm

        return best_site, min_cost

    def print_vms_dfs(self):
        """Print the four module-level VM tables."""
        print("Managed VMs DataFrame:")
        print(managed_vms_df)
        print("\nOffered VMs DataFrame:")
        print(offered_vms_df)
        print("\nOwned VMs DataFrame:")
        print(owned_vms_df)
        print("\nRented VMs DataFrame:")
        print(rented_vms_df)

    def aggregate_revenues(self):
        """Return the summed revenues of the managed VMs."""
        return sum(vm.calculate_revenues() for vm in self.managed_vms)

    def aggregate_expenditures(self):
        """Return the summed expenditures of the managed VMs."""
        return sum(vm.calculate_expenditures() for vm in self.managed_vms)

    def aggregate_profit(self):
        """Return the summed profit of the managed VMs."""
        return sum(vm.calculate_profit() for vm in self.managed_vms)

    def collect_vm_data(self, vm):
        """Return a dict of identifiers, costs and storage figures for one VM."""
        # calculate_dpc takes no arguments; the former extra argument raised
        # TypeError.
        storage_cost, transfer_cost = vm.calculate_dpc()
        return {
            'vm_id': vm.vm_id,
            'provider_id': vm.provider_id,
            'region_id': vm.region_id,
            'dc_id': vm.dc_id,
            'queue_capacity': vm.queue_capacity,
            # The VM attribute is `processing_capacity`; the output key keeps
            # its original name for consumers of this dict.
            'process_capacity': vm.processing_capacity,
            'owned': vm.owned,
            'total_processed_tasks': len(vm.tasks_processed),
            'tpc': vm.calculate_tpc(),
            'dpc': storage_cost + transfer_cost,
            'penalties': vm.calculate_penalties(),
            'expenditures': vm.calculate_expenditures(),
            'revenues': vm.calculate_revenues(),
            'profit': vm.calculate_profit(),
            'availability': vm.calculate_avg_availability(),
            'used_storage': vm.calculate_used_storage(),
            'storage_cost': storage_cost,
            'transfer_cost': transfer_cost,
            'storage_price': vm.storage_price_per_gb
        }


class Task:
    """A unit of work placed on a VM, with its timing and cost bookkeeping.

    Besides the values passed in, the constructor draws ``data_size`` from
    ``DATA_SIZE_RANGE`` and ``mips`` from 50..150 (both inclusive) using the
    ``random`` module, and stamps creation/modification dates with
    ``time.time()``.
    """

    def __init__(self, task_id, size, data_requirements, dc_id, region_id, access_frequencies, response_time=None):
        """Create a task.

        Args:
            task_id: identifier reported in ``collect_metrics``.
            size: task size; used for processing time, transfer cost and
                execution cost.
            data_requirements: ids of the data items the task needs.
            dc_id: id of the data center the task is attached to.
            region_id: id of the region the task is attached to.
            access_frequencies: container updated by
                ``calculate_access_frequency``.
            response_time: optional initial response time; any falsy value
                (including ``None``) is stored as ``0.0``.
        """
        self.task_id = task_id
        self.size = size
        self.data_requirements = data_requirements
        self.region_id = region_id
        self.dc_id = dc_id
        self.data_size = random.randint(DATA_SIZE_RANGE[0], DATA_SIZE_RANGE[1])
        self.mips = random.randint(50, 150)
        self.transfer_time = 0
        self.response_time = response_time or 0.0
        self.sla_violation = False
        self.enu = 0
        self.data_accesses = np.zeros(NUM_DATA_ITEMS)  # per-data-id access counter
        self.data_accesses_list = []  # access history, in call order
        self.access_frequencies = access_frequencies
        self.creation_date = time.time()
        self.modification_date = time.time()
        self.execution_cost = 0
        # Defined up front so the attribute can be read before
        # calculate_transfer_cost() has ever been called.
        self.transfer_cost = 0

    def calculate_access_frequency(self, data_id, frequency):
        """Add ``frequency`` to the entry stored under ``data_id``.

        The entry is first reset to 0 when ``data_id`` is not ``in``
        ``self.access_frequencies``.
        """
        if data_id not in self.access_frequencies:
            self.access_frequencies[data_id] = 0
        self.access_frequencies[data_id] += frequency

    def access_data(self, data_id, vm):
        """Record one access to ``data_id``.

        The task-side counter and history are always updated. If ``vm``
        holds a data file with that id, its ``number_of_accesses`` is
        incremented and the task's modification date is refreshed.
        """
        self.data_accesses[data_id] += 1
        self.data_accesses_list.append(data_id)
        data_file = next((df for df in vm.data_files if df['data_id'] == data_id), None)
        if data_file:
            data_file['number_of_accesses'] += 1
            self.modification_date = time.time()

    def calculate_processing_time(self, process_capacity):
        """Return ``size / process_capacity`` as a float.

        Raises:
            ZeroDivisionError: if ``process_capacity`` is zero.
        """
        processing_time = self.size / process_capacity
        return processing_time if isinstance(processing_time, float) else float(processing_time)

    def calculate_transfer_time(self, src_vm, dest_vm, data_size, bandwidth, delay):
        """Return ``data_size / bandwidth + delay / 1000``.

        ``src_vm`` and ``dest_vm`` are accepted for interface symmetry but
        are not used in the computation.
        """
        return data_size / bandwidth + delay / 1000.0

    def calculate_transfer_cost(self, vm, src_vm):
        """Price moving this task between ``src_vm`` and ``vm``.

        The bandwidth price tier is chosen from the topology only: same data
        center id, else same region id, else inter-region. The cost is
        ``self.size`` times that price; it is stored on ``self.transfer_cost``
        and returned.
        """
        bw_prices = PRICING_TABLE['BW_PRICE']
        if vm.dc_id == src_vm.dc_id:
            bw_price = bw_prices['IntraDC']
        elif vm.region_id == src_vm.region_id:
            bw_price = bw_prices['IntraRegion']
        else:
            bw_price = bw_prices['InterRegions']

        self.transfer_cost = self.size * bw_price
        return self.transfer_cost

    def set_transfer_time(self, src_vm, dest_vm, bandwidth, delay):
        """Compute the transfer time for ``self.data_size`` and refresh the response time."""
        self.transfer_time = self.calculate_transfer_time(src_vm, dest_vm, self.data_size, bandwidth, delay)
        self.calculate_response_time()

    def calculate_response_time(self):
        """Update ``response_time`` and ``sla_violation``.

        Processing time is computed against the module constant
        ``VM_PROCESSING_CAPACITY`` (not a particular VM). The response time
        is ``(transfer_time + processing_time) * 1000`` and the SLA is
        flagged as violated when it exceeds ``T_hRT``.
        """
        processing_time = self.calculate_processing_time(VM_PROCESSING_CAPACITY)
        self.response_time = (self.transfer_time + processing_time) * 1000
        self.sla_violation = self.response_time > T_hRT

    def calculate_execution_cost(self, vm):
        """Return (and store) the CPU cost of running this task on ``vm``.

        The cost is ``size / vm.processing_capacity`` multiplied by the CPU
        price looked up for the VM's provider and region.

        Raises:
            ValueError: if the provider/region id is unknown or no CPU price
                is listed for that combination.
        """
        provider = PROVIDER_MAP.get(vm.provider_id)
        region_name = REGION_MAP.get(vm.region_id)
        if provider is None or region_name is None:
            raise ValueError(f"Invalid provider_id {vm.provider_id} or region_id {vm.region_id}")

        cpu_price = PRICING_TABLE.get('CPU_PRICE', {}).get(provider, {}).get(region_name)
        if cpu_price is None:
            raise ValueError(f"CPU price not found for provider {provider} and region {region_name}")

        self.execution_cost = (self.size / vm.processing_capacity) * cpu_price
        return self.execution_cost

    def collect_metrics(self):
        """Return a dict snapshot of the task's identity, timing and cost fields."""
        return {
            'task_id': self.task_id,
            'size': self.size,
            'data_requirements': self.data_requirements,
            'dc_id': self.dc_id,
            'region_id': self.region_id,
            'transfer_time': self.transfer_time,
            'response_time': self.response_time,
            'sla_violation': self.sla_violation,
            'enu': self.enu,
            'creation_date': self.creation_date,
            'modification_date': self.modification_date,
            'data_accesses': self.data_accesses_list,
            'execution_cost': self.execution_cost,
        }

class VMCosts:
    """Class-level registry of transfer costs keyed by ``(vm1_id, vm2_id)``."""

    # Shared by all users of the class; replaced wholesale by set_costs().
    costs = {}

    @classmethod
    def set_costs(cls, costs):
        """Replace the registry with ``costs``."""
        cls.costs = costs

    @classmethod
    def get_cost(cls, vm1_id, vm2_id):
        """Return the registered cost for the ordered pair of VM ids.

        Pairs that are not registered fall back to the intra-DC bandwidth
        price from ``PRICING_TABLE``.
        """
        fallback_price = PRICING_TABLE['BW_PRICE']['IntraDC']
        pair = (vm1_id, vm2_id)
        return cls.costs.get(pair, fallback_price)

def initialize_simulation_parameters():
    """Return empty placeholders for a simulation run.

    Returns:
        A 5-tuple ``(providers, task_list, vm_list, placement_decision,
        reimplemented_results)``: three fresh empty lists, ``None`` for the
        placement decision, and a fresh empty results list. Callers are
        expected to populate them.
    """
    return [], [], [], None, []

def collect_simulation_data(providers, vm_list, task_list):
    """Assemble per-VM and per-task metrics into two DataFrames.

    Args:
        providers: accepted for interface compatibility; not used here.
        vm_list: iterable of objects exposing ``collect_metrics()``.
        task_list: rows for the task frame. When it is a list or tuple,
            every element that exposes a callable ``collect_metrics`` is
            replaced by its metrics dict (so a list of task objects yields
            one column per metric instead of a single object column);
            other elements, and any other container type, are handed to
            ``pd.DataFrame`` unchanged.

    Returns:
        ``(vm_metrics_df, task_df)``.
    """
    vm_metrics = [vm.collect_metrics() for vm in vm_list]

    task_rows = task_list
    if isinstance(task_list, (list, tuple)):
        task_rows = [
            item.collect_metrics() if callable(getattr(item, 'collect_metrics', None)) else item
            for item in task_list
        ]

    task_df = pd.DataFrame(task_rows)
    vm_metrics_df = pd.DataFrame(vm_metrics)
    return vm_metrics_df, task_df

def generate_network_characteristics():
    """Build bandwidth and delay tables for every ordered pair of VM ids.

    VM ids have the form ``VM{provider}-{region}-{dc}-{index}``. A pair in
    the same (provider, region, dc) gets the intra-DC figures, a pair that
    shares only (provider, region) gets the intra-region figures, and all
    other pairs get the inter-region figures.

    NOTE(review): DC ids here run from MIN_DCS_PER_PROVIDER through
    MAX_DCS_PER_PROVIDER inclusive, whereas SimulationSetup.setup_environment
    numbers DCs from 0 -- confirm the two id schemes are meant to line up.

    Returns:
        ``(bandwidths, delays, costs)`` where the first two are dicts keyed
        by ``(vm1_id, vm2_id)`` and ``costs`` is an empty dict.
    """
    site_vms = {
        (p, r, dc): [f"VM{p}-{r}-{dc}-{v}" for v in range(VM_PER_DC)]
        for p in range(NUM_PROVIDERS)
        for r in range(NUM_REGIONS)
        for dc in range(MIN_DCS_PER_PROVIDER, MAX_DCS_PER_PROVIDER + 1)
    }

    bandwidths = {}
    delays = {}
    for src_site, src_vms in site_vms.items():
        for dst_site, dst_vms in site_vms.items():
            # The link class depends only on the two sites, so it is
            # decided once per site pair rather than once per VM pair.
            if src_site == dst_site:
                link_bw, link_delay = BW_INTRA_DC, DELAY_INTRA_DC
            elif src_site[:2] == dst_site[:2]:
                link_bw, link_delay = BW_INTRA_REGION, DELAY_INTRA_REGION
            else:
                link_bw, link_delay = BW_INTER_REGION, DELAY_INTER_REGION

            for src_vm in src_vms:
                for dst_vm in dst_vms:
                    pair = (src_vm, dst_vm)
                    bandwidths[pair] = link_bw
                    delays[pair] = link_delay

    costs = {}
    return bandwidths, delays, costs


def get_storage_price(provider_id, region_id):
    """Look up the storage price for a provider/region id pair.

    Raises:
        KeyError: if either id is missing from PROVIDER_MAP / REGION_MAP.
    """
    provider, region = PROVIDER_MAP[provider_id], REGION_MAP[region_id]
    storage_prices = PRICING_TABLE['STORAGE_PRICE']
    return storage_prices[provider][region]

def get_transfer_price(provider_id, region_id):
    """Look up the per-provider bandwidth price for a provider/region id pair.

    Raises:
        KeyError: if either id is missing from PROVIDER_MAP / REGION_MAP.
    """
    provider, region = PROVIDER_MAP[provider_id], REGION_MAP[region_id]
    bandwidth_prices = PRICING_TABLE['BW_PRICE']
    return bandwidth_prices[provider][region]


class SimulationSetup:
    """Factory helpers that build the simulated topology and seed it with tasks."""

    @staticmethod
    def setup_environment(num_providers, num_regions, max_dcs_per_provider, vm_per_dc, vm_storage_capacity, vm_processing_capacity):
        """Create the simpy environment, providers, data centers and VMs.

        For every provider, ``max_dcs_per_provider`` data centers are created
        in each region (DC ids start at 0 in every region), each holding
        ``vm_per_dc`` VMs named ``VM{provider}-{region}-{dc}-{index}``.

        Returns:
            ``(env, providers, vm_list)`` where ``vm_list`` is every VM in
            creation order.

        Raises:
            KeyError: if a provider or region id has no pricing entry.
        """
        env = simpy.Environment()
        providers = []
        vm_list = []

        for provider_id in range(num_providers):
            owned_vms = []
            data_centers = []
            for region_id in range(num_regions):
                # Prices depend only on (provider, region), so look them up
                # once per region instead of once per VM.
                storage_price_per_gb = get_storage_price(provider_id, region_id)
                transfer_price_per_gb = get_transfer_price(provider_id, region_id)

                for dc_id in range(max_dcs_per_provider):
                    dc = DataCenter(provider_id, region_id, dc_id)
                    data_centers.append(dc)
                    for vm_index in range(vm_per_dc):
                        vm = VirtualMachine(
                            vm_id=f"VM{provider_id}-{region_id}-{dc_id}-{vm_index}",
                            provider_id=provider_id,
                            region_id=region_id,
                            dc_id=dc_id,
                            storage_capacity=vm_storage_capacity,
                            storage_price_per_gb=storage_price_per_gb,
                            transfer_price_per_gb=transfer_price_per_gb,
                            processing_capacity=vm_processing_capacity,
                            env=env,
                            # The same list object is shared with every VM and
                            # provider; it is filled in as providers are built.
                            providers=providers,
                            data_files=[]
                        )
                        dc.vms.append(vm)
                        owned_vms.append(vm)
                        vm_list.append(vm)

            provider = CloudProvider(provider_id, owned_vms, providers=providers)
            provider.data_centers = data_centers
            providers.append(provider)

        return env, providers, vm_list

    @staticmethod
    def assign_tasks_to_vms(providers):
        """Create ``NUM_TASKS`` random tasks and hand them out round-robin.

        VMs are visited in provider / data-center / VM order, wrapping
        around until ``NUM_TASKS`` tasks have been created. Each task is
        passed to ``vm.add_task`` and also collected in the returned list.

        If the topology contains no VM at all, nothing can be assigned: a
        warning is logged and an empty list is returned (instead of looping
        forever).

        Returns:
            list of the created ``Task`` objects, in creation order.
        """
        task_list = []
        vms = [vm for provider in providers for dc in provider.data_centers for vm in dc.vms]

        if NUM_TASKS > 0 and not vms:
            logger.warning("No VMs available; cannot assign any of the %d tasks", NUM_TASKS)
            return task_list

        for task_id in range(NUM_TASKS):
            vm = vms[task_id % len(vms)]
            size = random.randint(*TASK_SIZE_RANGE)
            data_requirements = random.sample(range(NUM_DATA_ITEMS), random.randint(1, 5))
            # One frequency per required data item, in the same order.
            access_frequencies = [random.uniform(0.1, 1.0) for _ in data_requirements]
            task = Task(task_id, size, data_requirements, vm.dc_id, vm.region_id, access_frequencies)
            task_list.append(task)
            vm.add_task(task)

        return task_list



def generate_data_files(num_data_items, current_time):
    """Create ``num_data_items`` random data-file records.

    Each record is built as a ``DataFile`` and immediately converted with
    ``to_dict()``. Size, block count and replicas-per-block are random
    integers stored as floats; ten access frequencies are drawn from
    ``[0.1, 1.0]``; both dates are ``current_time`` and the access count
    starts at 0.

    Returns:
        list of the ``to_dict()`` results, with ids ``0 .. num_data_items-1``.
    """
    data_files = []
    for data_id in range(num_data_items):
        # Draw order (size, blocks, replicas, frequencies) is kept fixed so a
        # seeded run produces the same sequence.
        size = float(random.randint(*DATA_SIZE_RANGE))
        blocks = float(random.randint(1, 4))
        replicas_per_block = float(random.randint(1, 5))
        access_frequencies = [random.uniform(0.1, 1.0) for _ in range(10)]

        entry = DataFile(
            data_id=data_id,
            size=size,
            creation_date=current_time,
            modification_date=current_time,
            blocks=blocks,
            replicas_per_block=replicas_per_block,
            access_frequencies=access_frequencies,
            access_count=0
        )
        data_files.append(entry.to_dict())

    logger.debug(f"Finished generating {len(data_files)} data files")
    return data_files

def assign_data_files_to_vms(providers, data_files):
    """Place each data file on the first VM that can hold it.

    VMs are visited in provider / data-center / VM order. For every VM the
    not-yet-placed files are tried in list order; a file is added when the
    VM is not full and its available storage is at least the file's size.
    Each data id is placed at most once. Files that fit nowhere are left
    unplaced; only the debug log reports the count.

    Returns:
        the ``data_files`` list that was passed in.
    """
    placed_ids = set()
    every_vm = (
        vm
        for provider in providers
        for dc in provider.data_centers
        for vm in dc.vms
    )

    for vm in every_vm:
        for data_file in data_files:
            file_id = data_file['data_id']
            if file_id in placed_ids:
                continue
            if vm.is_storage_full():
                continue
            if vm.calculate_available_storage() >= data_file['size']:
                vm.add_data_file(data_file)
                placed_ids.add(file_id)

    logger.debug(f"Total assigned data files: {len(placed_ids)}/{len(data_files)}")
    return data_files

class FuzzyLogicSystem:
    """Fuzzy controller that scores a replica placement.

    Four inputs (data transfer time ratio, VM load, data availability,
    profit) and one output (placement potential) are all defined over the
    same universe, 0 to 1 in steps of 0.01.
    """

    def __init__(self):
        """Create the fuzzy variables and their membership functions."""
        self.universe = np.arange(0, 1.01, 0.01)
        self.data_transfer_time_ratio = ctrl.Antecedent(self.universe, 'data_transfer_time_ratio')
        self.vm_load = ctrl.Antecedent(self.universe, 'vm_load')
        self.data_availability = ctrl.Antecedent(self.universe, 'data_availability')
        self.profit = ctrl.Antecedent(self.universe, 'profit')
        self.placement_potential = ctrl.Consequent(self.universe, 'placement_potential')

        # Inputs: three auto-generated terms for the first two (the rules
        # refer to them as poor / average / good), two named terms each for
        # availability and profit.
        self.data_transfer_time_ratio.automf(3)
        self.vm_load.automf(3)
        self.data_availability.automf(2, names=['not_respected', 'respected'])
        self.profit.automf(2, names=['non_profitable', 'profitable'])

        # Output: five triangular terms, given as (term, [left, peak, right]).
        output_triangles = (
            ('very_low', [0, 0, 0.2]),
            ('low', [0.1, 0.3, 0.5]),
            ('medium', [0.4, 0.5, 0.6]),
            ('high', [0.5, 0.7, 0.9]),
            ('very_high', [0.8, 1, 1]),
        )
        for term, corners in output_triangles:
            self.placement_potential[term] = fuzz.trimf(self.placement_potential.universe, corners)

    def define_control_system(self):
        """Build the rule base and return a ready-to-use simulation.

        Returns:
            ``ctrl.ControlSystemSimulation`` wrapping a ``ctrl.ControlSystem``
            made of the rules listed below, in table order.
        """
        dttr = self.data_transfer_time_ratio
        load = self.vm_load
        availability = self.data_availability
        profit = self.profit
        potential = self.placement_potential

        def any_of(d, l, a, p):
            # Antecedent that fires when at least one of the four terms holds.
            return dttr[d] | load[l] | availability[a] | profit[p]

        def all_of(d, l, a, p):
            # Antecedent that fires only when all four terms hold.
            return dttr[d] & load[l] & availability[a] & profit[p]

        # Columns: combiner, transfer-time ratio, VM load, availability,
        # profit, resulting placement potential.
        # NOTE(review): the two rows with (good, good, respected, profitable)
        # share an antecedent but map to different outputs (very_high and
        # medium) -- confirm which one is intended.
        rule_table = [
            (any_of, 'poor', 'poor', 'not_respected', 'non_profitable', 'very_low'),
            (any_of, 'average', 'average', 'respected', 'non_profitable', 'very_low'),
            (all_of, 'good', 'good', 'respected', 'profitable', 'very_high'),
            (all_of, 'good', 'poor', 'respected', 'profitable', 'very_high'),
            (all_of, 'average', 'poor', 'respected', 'profitable', 'high'),
            (all_of, 'poor', 'poor', 'respected', 'profitable', 'medium'),
            (all_of, 'good', 'average', 'respected', 'profitable', 'high'),
            (all_of, 'average', 'average', 'respected', 'profitable', 'medium'),
            (all_of, 'poor', 'average', 'respected', 'profitable', 'low'),
            (all_of, 'good', 'good', 'respected', 'profitable', 'medium'),
            (all_of, 'average', 'good', 'respected', 'profitable', 'low'),
            (all_of, 'poor', 'good', 'respected', 'profitable', 'very_low'),
            (all_of, 'good', 'poor', 'not_respected', 'profitable', 'medium'),
            (all_of, 'average', 'poor', 'not_respected', 'profitable', 'low'),
            (all_of, 'poor', 'poor', 'not_respected', 'profitable', 'very_low'),
            (all_of, 'good', 'average', 'not_respected', 'profitable', 'low'),
            (all_of, 'average', 'average', 'not_respected', 'profitable', 'very_low'),
            (all_of, 'poor', 'average', 'not_respected', 'profitable', 'very_low'),
            (all_of, 'good', 'good', 'not_respected', 'profitable', 'very_low'),
            (all_of, 'average', 'good', 'not_respected', 'profitable', 'very_low'),
            (all_of, 'poor', 'good', 'not_respected', 'profitable', 'very_low'),
            (all_of, 'good', 'poor', 'respected', 'non_profitable', 'low'),
            (all_of, 'average', 'poor', 'respected', 'non_profitable', 'very_low'),
            (all_of, 'poor', 'poor', 'respected', 'non_profitable', 'very_low'),
            (all_of, 'good', 'average', 'respected', 'non_profitable', 'very_low'),
            (all_of, 'average', 'average', 'respected', 'non_profitable', 'very_low'),
            (all_of, 'poor', 'average', 'respected', 'non_profitable', 'very_low'),
        ]

        rules = [
            ctrl.Rule(combine(d, l, a, p), potential[outcome])
            for combine, d, l, a, p, outcome in rule_table
        ]

        placement_control = ctrl.ControlSystem(rules)
        return ctrl.ControlSystemSimulation(placement_control)

def evaluate_placement_potential(data_group, candidate_vms, req_d, bandwidths, delays, placement_decision):
    """Score every candidate VM for hosting ``data_group``.

    For each candidate, four quantities are sampled once per data item
    (transfer-time ratio from the stored file's ``calculate_dttr``, plus the
    VM's load, average availability and profit), averaged with ``np.mean``,
    fed to ``placement_decision`` as its four inputs, and the computed
    ``placement_potential`` output is recorded.

    Args:
        data_group: iterable of mappings carrying a ``'data_id'``.
        candidate_vms: iterable of VMs to score.
        req_d, bandwidths, delays: forwarded unchanged to ``calculate_dttr``.
        placement_decision: object with ``input``/``output`` mappings and a
            ``compute()`` method.

    Returns:
        list of ``(vm, placement_potential)`` tuples, one per candidate, in
        candidate order.

    Raises:
        ValueError: if a candidate does not hold one of the group's files.
    """
    scored = []

    for vm in candidate_vms:
        ratio_samples = []
        load_samples = []
        availability_samples = []
        profit_samples = []

        for item in data_group:
            # Locate the first stored file with the requested id.
            for stored in vm.data_files:
                if stored['data_id'] == item['data_id']:
                    break
            else:
                raise ValueError(f"Data file with id {item['data_id']} not found in VM {vm.vm_id}")

            ratio_samples.append(stored.calculate_dttr(vm, req_d, bandwidths, delays))
            load_samples.append(vm.calculate_vm_load())
            availability_samples.append(vm.calculate_avg_availability())
            profit_samples.append(vm.calculate_profit())

        mean_ratio = np.mean(ratio_samples)
        mean_load = np.mean(load_samples)
        mean_availability = np.mean(availability_samples)
        mean_profit = np.mean(profit_samples)

        placement_decision.input['data_transfer_time_ratio'] = mean_ratio
        placement_decision.input['vm_load'] = mean_load
        placement_decision.input['data_availability'] = mean_availability
        placement_decision.input['profit'] = mean_profit

        placement_decision.compute()
        scored.append((vm, placement_decision.output['placement_potential']))

    return scored

def select_candidate_vms(data_id, providers):
    """Return the VMs that hold ``data_id`` and have room for one more copy.

    A VM qualifies when one of its data files has the given id and the VM's
    used storage plus that file's size does not exceed its storage capacity.
    VMs are returned in provider / data-center / VM order.
    """
    def holds_with_headroom(vm):
        stored = next((entry for entry in vm.data_files if entry['data_id'] == data_id), None)
        if not stored:
            return False
        return vm.calculate_used_storage() + stored['size'] <= vm.storage_capacity

    return [
        vm
        for provider in providers
        for dc in provider.data_centers
        for vm in dc.vms
        if holds_with_headroom(vm)
    ]


def adjust_data_files(vm, data_clusters, providers, placement_decision):
    """Re-tune replicas for every clustered data item that ``vm`` holds.

    For each data id in each cluster that is present on ``vm``:
    the replica count is adjusted first, but only if none of the VM's
    processed tasks is flagged with an SLA violation; then fuzzy replica
    placement is run for the whole cluster the id belongs to.
    """
    for cluster_items in data_clusters.values():
        for data_id in cluster_items:
            held = next((entry for entry in vm.data_files if entry['data_id'] == data_id), None)
            if not held:
                continue

            sla_breached = any(task.sla_violation for task in vm.tasks_processed)
            if not sla_breached:
                ReplicaManagement.Adjust_Replicas_Number(vm)
            ProposedReplicationAlgorithm.Fuzzy_Replicas_Placement(vm, cluster_items, providers, placement_decision)

def periodic_replication_check(providers, placement_decision, interval=10):
    """Run the replication algorithm over all managed VMs until told to stop.

    Each pass visits every VM in every provider's ``managed_vms`` and invokes
    ``ProposedReplicationAlgorithm.proposed_replication_algorithm`` on it,
    then sleeps ``interval`` seconds. The loop ends once the module-level
    ``stop_threads`` flag is true when checked at the top of a pass.

    Args:
        providers: providers whose managed VMs are checked.
        placement_decision: forwarded to the replication algorithm.
        interval: seconds to sleep between passes.
    """
    global stop_threads
    ThRT_satisfied = int(w * SLORT)

    while True:
        if stop_threads:
            break

        logger.debug("Entered periodic_replication_check loop")
        for provider in providers:
            logger.debug(f"Checking provider: {provider}")
            for vm in provider.managed_vms:
                logger.debug(f"Checking VM: {vm.vm_id}")
                ProposedReplicationAlgorithm.proposed_replication_algorithm(
                    vm,
                    REPLICATION_PERIOD,
                    SLORT,
                    MIN_AVAILABILITY,
                    ThRT_satisfied,
                    K,
                    providers,
                    placement_decision,
                )

        time.sleep(interval)
        logger.debug("Finished one iteration of periodic_replication_check loop")

    logger.debug("Exiting periodic_replication_check loop")


def calculate_group_placement_potential(data_group, candidate_vm, placement_decision=None, req_d=None):
    """Return the placement potential of ``candidate_vm`` for ``data_group``.

    ``evaluate_placement_potential`` already averages its inputs over the
    whole group, so it is called once with the single candidate and the
    resulting score is returned.

    Args:
        data_group: data items to place; an empty group scores 0.
        candidate_vm: the VM being evaluated.
        placement_decision: fuzzy simulation to use. When omitted, a
            module-level ``placement_decision`` is used if one exists.
        req_d: forwarded to ``evaluate_placement_potential``.

    Returns:
        The placement potential, or 0 when ``data_group`` is empty.

    Raises:
        ValueError: if no placement decision object is available.
    """
    if not data_group:
        return 0

    if placement_decision is None:
        # Backward compatibility: earlier callers relied on a global of this name.
        placement_decision = globals().get('placement_decision')
    if placement_decision is None:
        raise ValueError("placement_decision is required to evaluate placement potential")

    scored = evaluate_placement_potential(
        data_group,
        [candidate_vm],
        req_d,
        VirtualMachine.bandwidths,
        VirtualMachine.delays,
        placement_decision,
    )
    # One candidate in, one (vm, potential) pair out.
    return scored[0][1] if scored else 0

class SpectralClusteringAlgorithm:
    """Helpers that group correlated data items with spectral clustering."""

    def __init__(self):
        pass

    def calculate_correlations(self, tasks):
        """Count how often each pair of data ids is required by the same task.

        Args:
            tasks: iterable of objects with ``task_id`` and
                ``data_requirements``.

        Returns:
            ``(correlation_matrix, data_id_list)`` where row/column ``i`` of
            the square matrix corresponds to ``data_id_list[i]``. The
            diagonal stays 0.
        """
        logger.debug(f"Calculating correlations for tasks: {[task.task_id for task in tasks]}")
        data_id_list = list({data_id for task in tasks for data_id in task.data_requirements})
        data_id_index = {data_id: index for index, data_id in enumerate(data_id_list)}

        correlation_matrix = np.zeros((len(data_id_list), len(data_id_list)))

        for task in tasks:
            for i, data_id_i in enumerate(task.data_requirements):
                for j, data_id_j in enumerate(task.data_requirements):
                    if i != j:
                        correlation_matrix[data_id_index[data_id_i]][data_id_index[data_id_j]] += 1

        logger.debug(f"Correlation matrix: \n{correlation_matrix}")
        return correlation_matrix, data_id_list

    def perform_spectral_clustering(self, correlation_matrix, num_clusters):
        """Cluster a precomputed affinity matrix and return one label per row."""
        sc = SpectralClustering(n_clusters=num_clusters, affinity='precomputed', random_state=0)
        labels = sc.fit_predict(correlation_matrix)
        logger.debug(f"Spectral clustering labels: {labels}")
        return labels

    def apply_spectral_clustering(self, data_group):
        """Cluster data files into ``K`` groups by the tasks they share.

        The affinity between two files is the number of tasks they have in
        common divided by ``REPLICATION_PERIOD``.

        Args:
            data_group: sequence of mappings, each with a ``'tasks'`` entry.

        Returns:
            dict mapping cluster label to the list of data files in it;
            empty when ``data_group`` is empty.

        Raises:
            KeyError: if a data file has no ``'tasks'`` key.
        """
        logger.debug(f"Initializing data group with {len(data_group)} items.")

        if not data_group:
            # Nothing to cluster; the clustering call cannot handle an empty matrix.
            return {}

        # Build each task set once instead of once per pair.
        task_sets = []
        for item in data_group:
            if 'tasks' not in item:
                raise KeyError("One of the data files does not have a 'tasks' key")
            task_sets.append(set(item['tasks']))

        n = len(task_sets)
        correlation_matrix = np.zeros((n, n))
        for i in range(n):
            for j in range(i + 1, n):
                # The measure is symmetric, so fill both halves at once.
                shared = len(task_sets[i] & task_sets[j]) / REPLICATION_PERIOD
                correlation_matrix[i][j] = shared
                correlation_matrix[j][i] = shared

        self.print_data_group_and_matrix(data_group, correlation_matrix)

        labels = self.perform_spectral_clustering(correlation_matrix, K)

        clusters = {}
        for label, data_file in zip(labels, data_group):
            clusters.setdefault(label, []).append(data_file)

        return clusters

    def print_data_group_and_matrix(self, data_group_log, correlation_matrix):
        """Log each data file's id, task ids and access frequencies, then the matrix."""
        logger.debug(f"Data Group Size: {len(data_group_log)}")
        for item in data_group_log:
            # Log task ids rather than whole task objects for readability.
            task_ids = [task.task_id for task in item['tasks']]
            logger.debug(f"Data ID: {item['data_id']}, Tasks: {task_ids}, Access Frequencies: {item['access_frequencies']}")
        logger.debug(f"Correlation Matrix:\n{correlation_matrix}")

    @staticmethod
    def group_data_by_labels(data_id_list, labels):
        """Return ``{label: [data_id, ...]}`` pairing ids with labels positionally."""
        groups = {}
        for data_id, label in zip(data_id_list, labels):
            groups.setdefault(label, []).append(data_id)
        logger.debug(f"Grouped data: {groups}")
        return groups

    @staticmethod
    def print_clusters(clusters):
        """Print every cluster label followed by the ids of its data files."""
        for label, data_files in clusters.items():
            print(f"Cluster {label}:")
            for data_file in data_files:
                print(f"  Data ID: {data_file['data_id']}")
        # Call this function after clustering
#    print_clusters(data_clusters)


class DataProcessingUtilities:
    """Stateless helpers for looking up data files and ranking data clusters.

    All helpers are static, so they can be called on the class or on an
    instance.
    """

    def __init__(self):
        pass

    @staticmethod
    def sort_data_clusters_by_frequency(data_clusters, vm):
        """Return the clusters as ``(label, data_ids)`` pairs, most accessed first.

        Clusters are ranked by ``calculate_average_access_frequency`` of
        their data ids on ``vm``, in descending order.
        """
        sorted_data_clusters = sorted(
            data_clusters.items(),
            key=lambda x: DataProcessingUtilities.calculate_average_access_frequency(x[1], vm),
            reverse=True,
        )
        logger.debug(f"Sorted data clusters: {sorted_data_clusters}")
        return sorted_data_clusters

    @staticmethod
    def find_data_file_by_id(data_id, vms):
        """Return the first data file with ``data_id`` found on ``vms``.

        Args:
            data_id: id to look for.
            vms: a list of VMs, or a single VM (wrapped into a list).

        Returns:
            The matching data-file mapping, or ``None`` if no VM holds it.
        """
        if not isinstance(vms, list):
            vms = [vms]
        for vm in vms:
            for data_file in vm.data_files:
                if data_file['data_id'] == data_id:
                    return data_file
        return None

    @staticmethod
    def calculate_average_access_frequency(data_group, vm):
        """Return the mean total access frequency of ``data_group`` on ``vm``.

        Each data id contributes the sum of its file's
        ``access_frequencies``; ids not stored on ``vm`` contribute 0 but
        still count in the divisor. An empty group yields 0.
        """
        total_access_frequency = 0
        for data_id in data_group:
            data_file = DataProcessingUtilities.find_data_file_by_id(data_id, [vm])
            if data_file:
                total_access_frequency += sum(data_file['access_frequencies'])
        return total_access_frequency / len(data_group) if data_group else 0


class ReplicaManagement:
    """Replica identification (Algorithm 3) and replica-count adjustment (Algorithm 2).

    All operations are static. They live in a single class so that none of
    them is lost to a later redefinition of the same class name.
    """

    @staticmethod
    def group_data_by_cluster(labels):
        """Return ``{cluster_id: [positions]}`` for cluster ids ``0 .. K-1``.

        Every cluster id below ``K`` is present, possibly with an empty list.

        Raises:
            KeyError: if a label falls outside ``0 .. K-1``.
        """
        clusters = {i: [] for i in range(K)}
        for index, cluster_id in enumerate(labels):
            clusters[cluster_id].append(index)
        return clusters

    @staticmethod
    def Replica_Identification(vm, k):
        """Identify groups of correlated data that ``vm`` does not yet hold.

        Data ids required by the VM's processed tasks are clustered into
        ``k`` groups by co-occurrence; ids already stored on the VM are
        dropped from each group; groups are then ordered by average access
        frequency, highest first.

        Returns:
            list of groups (lists of data ids); empty when the processed
            tasks reference no data at all.
        """
        logger.info("Running Replica_Identification")
        logger.info("Algorithm 3 Step 1: Initialize CG")
        CG = []

        logger.info("Algorithm 3 Step 2: Extracting groups using spectral clustering")
        tasks = vm.tasks_processed
        # calculate_correlations / perform_spectral_clustering are instance
        # methods, so they need an instance rather than the bare class.
        clustering = SpectralClusteringAlgorithm()
        correlation_matrix, data_id_list = clustering.calculate_correlations(tasks)
        if not data_id_list:
            # No data referenced: nothing to cluster.
            logger.info("Algorithm 3: no data requirements found; no groups identified")
            return []
        labels = clustering.perform_spectral_clustering(correlation_matrix, k)
        data_clusters = SpectralClusteringAlgorithm.group_data_by_labels(data_id_list, labels)

        logger.info("Algorithm 3 Step 3: Removing data from group if found in VM")
        for Gg in data_clusters.values():
            Gg[:] = [
                data_id for data_id in Gg
                if not DataProcessingUtilities.find_data_file_by_id(data_id, [vm])
            ]

        logger.info("Algorithm 3 Step 4: Calculating average access frequency and sorting groups")
        for Gg in data_clusters.values():
            # NOTE(review): frequencies are looked up on `vm`, but step 3 has
            # just removed every id that `vm` holds, so each group scores 0
            # here -- confirm where frequencies for remote data should come from.
            access_freq = DataProcessingUtilities.calculate_average_access_frequency(Gg, vm)
            CG.append((Gg, access_freq))

        # Stable sort: groups with equal frequency keep their cluster order.
        CG.sort(key=lambda x: x[1], reverse=True)

        logger.info("Algorithm 3 Step 5: Sorted correlated groups by average access frequency")

        correlated_groups = [group for group, _ in CG]
        print("Here identification of replication is applied", correlated_groups)
        return correlated_groups

    @staticmethod
    def Adjust_Replicas_Number(vm):
        """Clamp replication factors and drop surplus replicas from ``vm``.

        For every replica in ``vm.data_files`` the minimum replica factor
        (MRF) is estimated; a ``replication_factor`` above it is lowered to
        it. Replicas are then visited from least to most accessed and each
        one whose ``replica_per_block`` is at least its own MRF is removed
        from the VM.
        """
        logger.info("Algorithm 2 Step 1: Select set of stored replicas")
        replicas = vm.data_files

        # Replicas are dicts (unhashable), so each MRF is remembered by
        # object identity for use in the deletion pass.
        mrf_by_replica = {}

        for replica in replicas:
            logger.info(f"Algorithm 2 Step 2: Processing replica {replica['data_id']}")
            logger.debug(f"Replica before calculating total access frequency: {replica}")

            mrf = estimate_min_replica_factor(replica)
            mrf_by_replica[id(replica)] = mrf

            try:
                total_access_freq_value = ProposedReplicationAlgorithm.total_access_freq(replica)
                logger.debug(f"Total access frequency for replica {replica['data_id']}: {total_access_freq_value}")
            except Exception as e:
                # Best effort: report the failure and move on to the next replica.
                logger.error(f"Error calculating total access frequency for replica {replica['data_id']}: {e}")
                traceback.print_exc()
                continue

            logger.info(f"Algorithm 2 Step 3: MRF = {mrf}, Total Access Frequency = {total_access_freq_value}")

            if 'replication_factor' in replica and replica['replication_factor'] > mrf:
                replica['replication_factor'] = mrf

        # sorted() returns a new list, so removing from vm.data_files below
        # does not disturb the iteration.
        sorted_replicas = sorted(replicas, key=lambda x: x.get('number_of_accesses', 0))
        logger.info("Algorithm 2 Step 4: Sorted replicas by access frequency")

        for replica in sorted_replicas:
            # NOTE(review): data files are created with `replicas_per_block`;
            # confirm that 'replica_per_block' is the key actually stored.
            if replica.get('replica_per_block', 0) >= mrf_by_replica[id(replica)]:
                logger.info(f"Algorithm 2 Step 5: Deleting replica {replica['data_id']} from VM {vm.vm_id}")
                vm.data_files.remove(replica)
                logger.info(f"Deleted replica {replica['data_id']} from VM {vm.vm_id}")

class ProposedReplicationAlgorithm:
    """Namespace for the replication strategy (Algorithm 1 and Algorithm 4).

    Every method is static; the class carries no instance state.
    """

    def __init__(self):
        pass

    @staticmethod
    def total_access_freq(replica):
        """Return the sum of a replica's access frequencies.

        Args:
            replica: a dict with an ``'access_frequencies'`` key, an object
                with an ``access_frequencies`` attribute, or a plain list of
                numbers.

        Returns:
            The summed frequencies, or 0 (after logging an error) when the
            input matches none of the supported shapes.
        """
        logger.debug(f"Calculating total access frequency for replica: {replica}, Type: {type(replica)}")

        if isinstance(replica, dict):
            if 'access_frequencies' in replica:
                logger.debug(f"Access frequencies found in replica dict: {replica['access_frequencies']}")
                return sum(replica['access_frequencies'])
            logger.error(f"'access_frequencies' key not found in replica dict: {replica}")
        elif hasattr(replica, 'access_frequencies'):
            logger.debug(f"Access frequencies found in replica object: {replica.access_frequencies}")
            return sum(replica.access_frequencies)
        elif isinstance(replica, list):
            logger.debug(f"Access frequencies found in list: {replica}")
            return sum(replica)

        logger.error(f"Unknown replica type or missing 'access_frequencies': {replica}, Type: {type(replica)}")
        return 0

    @staticmethod
    def proposed_replication_algorithm(vm, REPLICATION_PERIOD, SLORT, MIN_AVAILABILITY, ThRT_satisfied, K, providers, placement_decision):
        """Algorithm 1: react to SLA violations / satisfactions recorded on ``vm``.

        Walks ``vm.tasks_processed`` and maintains ``vm.sla_violations`` and
        ``vm.sla_satisfactions``.  When the violation counter reaches
        ``REPLICATION_PERIOD`` the correlated groups are identified and
        placed; when the satisfaction counter reaches ``ThRT_satisfied`` the
        replica numbers are adjusted.

        Args:
            vm: VM whose processed tasks are inspected.
            REPLICATION_PERIOD: number of violations that triggers replication.
            SLORT: response-time objective; the threshold is ``w * SLORT``.
            MIN_AVAILABILITY: kept for interface compatibility; not used.
            ThRT_satisfied: number of satisfied tasks that triggers replica
                adjustment.  ``None`` falls back to ``REPLICATION_PERIOD``.
            K: forwarded to ``ReplicaManagement.Replica_Identification``.
            providers: forwarded to ``Fuzzy_Replicas_Placement``.
            placement_decision: forwarded to ``Fuzzy_Replicas_Placement``.
        """
        ThRT = w * SLORT
        # The caller's counter threshold used to be overwritten with
        # min(ThRT, MIN_AVAILABILITY), i.e. an availability ratio below 1, so
        # a single satisfied task already triggered replica adjustment.
        if ThRT_satisfied is None:
            # Fallback so callers that passed None keep working.
            ThRT_satisfied = REPLICATION_PERIOD
        logger.info(f"Algorithm 1 Step 1: Initialize ThRT = {ThRT}")

        for task in vm.tasks_processed:
            logger.info(f"Algorithm 1 Step 2: Processing task {task.task_id} on VM {vm.vm_id}")

            if task.response_time > ThRT:
                logger.info(f"Algorithm 1 Step 3: SLA violation detected for task {task.task_id}")
                vm.sla_violations += 1
                vm.sla_satisfactions = 0  # a violation breaks the satisfaction streak
            else:
                logger.info(f"Algorithm 1 Step 4: SLA satisfaction for task {task.task_id}")
                vm.sla_satisfactions += 1

            logger.debug(f"SLA Violations: {vm.sla_violations}, SLA Satisfactions: {vm.sla_satisfactions}")

            if vm.sla_violations >= REPLICATION_PERIOD:
                logger.info("Algorithm 1 Step 5: Replication period reached")
                CG = ReplicaManagement.Replica_Identification(vm, K)
                logger.info(f"Identified correlated groups: {CG}")
                for Gg in CG:
                    logger.info(f"Placing replicas for correlated group {Gg}")
                    ProposedReplicationAlgorithm.Fuzzy_Replicas_Placement(vm, Gg, providers, placement_decision)
                vm.sla_violations = 0

            if vm.sla_satisfactions >= ThRT_satisfied:
                logger.info("Algorithm 1 Step 6: Threshold of satisfied tasks reached")
                ReplicaManagement.Adjust_Replicas_Number(vm)
                vm.sla_satisfactions = 0

        logger.info("Exiting proposed_replication_algorithm")

    @staticmethod
    def Fuzzy_Replicas_Placement(vm, group, providers, placement_decision):
        """Algorithm 4: score candidate VMs with the fuzzy system and place one replica.

        For every data id in ``group`` the candidate VMs are scored through
        ``placement_decision`` (inputs: data transfer time ratio, VM load,
        data availability, profit).  The data item belonging to the single
        best-scoring candidate is then copied from ``vm.data_files`` to that
        candidate via ``add_data_file``.

        Args:
            vm: VM that currently stores the data files of ``group``.
            group: iterable of data ids forming one correlated group.
            providers: forwarded to ``select_candidate_vms``.
            placement_decision: fuzzy control simulation exposing ``input``,
                ``compute()`` and ``output``.
        """
        logger.info("Algorithm 4 Step 1: Select set of managed VMs")
        # (candidate VM, placement potential, data id that was scored)
        placement_potentials = []
        for data_id in group:
            candidate_vms = select_candidate_vms(data_id, providers)
            if not candidate_vms:
                continue
            for candidate_vm in candidate_vms:
                # A candidate is scored only when this lookup, restricted to
                # that single VM, returns a data file.
                data_file = DataProcessingUtilities.find_data_file_by_id(data_id, [candidate_vm])
                if data_file is None:
                    continue
                data_file_instance = DataFile(
                    data_id=data_file['data_id'],
                    size=data_file['size'],
                    creation_date=data_file['creation_date'],
                    modification_date=data_file['modification_date'],
                    blocks=data_file['blocks'],
                    replicas_per_block=data_file['replicas_per_block'],
                    access_frequencies=data_file['access_frequencies'],
                    access_count=data_file['number_of_accesses']
                )
                dttr = data_file_instance.calculate_dttr(candidate_vm, [vm], VirtualMachine.bandwidths, VirtualMachine.delays)
                vm_load = candidate_vm.calculate_vm_load()
                data_availability = candidate_vm.calculate_avg_availability()
                profit = candidate_vm.calculate_profit()

                placement_decision.input['data_transfer_time_ratio'] = dttr
                placement_decision.input['vm_load'] = vm_load
                placement_decision.input['data_availability'] = data_availability
                placement_decision.input['profit'] = profit

                placement_decision.compute()
                placement_potential = placement_decision.output['placement_potential']
                placement_potentials.append((candidate_vm, placement_potential, data_id))

        if placement_potentials:
            # Keep the data id together with its score: relying on the loop
            # variable after the loop always placed the LAST id of the group,
            # whatever item the winning candidate had been scored for.
            best_vm, best_potential, best_data_id = max(placement_potentials, key=lambda x: x[1])
            if best_vm:
                data_file = next((df for df in vm.data_files if df['data_id'] == best_data_id), None)
                if data_file:
                    best_vm.add_data_file(data_file)
                    print(f"Placed data file {best_data_id} in VM {best_vm.vm_id} with placement potential {best_potential}")

def simulate_task_execution(env, providers, task_list, vm_list, placement_decision, reimplemented_results):
    """SimPy process: drain the VMs' assigned-task queues and record metrics.

    Round-robins over ``vm_list``, popping one task per VM per pass, executing
    it against a randomly chosen source VM and appending rows to the module
    level ``task_metrics_df``, ``fuzzy_input_df`` and ``combined_csv_data``.
    One simulated time unit (``env.timeout(1)``) elapses per fully recorded
    task.  The loop ends when ``len(task_list)`` tasks have been executed or
    when a complete pass finds every queue empty.

    Args:
        env: SimPy environment, used only for ``env.timeout``.
        providers: not used by this function.
        task_list: only its length is used, as the number of tasks to execute.
        vm_list: VMs whose ``tasks_assigned`` queues are drained.
        placement_decision: not used by this function.
        reimplemented_results: list receiving ``(task_count, mean_response_time)``
            tuples at the configured task-count checkpoints.

    Yields:
        ``env.timeout(1)`` after every task that was recorded completely.
    """
    global task_metrics_df, fuzzy_input_df, combined_csv_data
    total_tasks = 0
    CounterSLA_violations = 0
    CounterSLA_satisfactions = 0
    T_hRT_satisfied = 32

    start_time = time.time()

    response_times = []
    avg_response_times = {}
    task_intervals = [1000, 2000, 3000, 5000, 7000, 10000]

    processed_tasks = set()  # ids of tasks that were recorded completely

    while total_tasks < len(task_list):
        dequeued_any = False

        for vm in vm_list:
            if not vm.tasks_assigned:
                continue

            task = vm.tasks_assigned.pop(0)
            dequeued_any = True

            if task.task_id in processed_tasks:
                continue

            src_vm = random.choice(vm_list)

            # Network characteristics depend on how far apart the two VMs are.
            if vm.dc_id == src_vm.dc_id:
                bandwidth, delay = BW_INTRA_DC, DELAY_INTRA_DC
            elif vm.region_id == src_vm.region_id:
                bandwidth, delay = BW_INTRA_REGION, DELAY_INTRA_REGION
            else:
                bandwidth, delay = BW_INTER_REGION, DELAY_INTER_REGION

            try:
                processing_time, transfer_time = vm.execute_task(task, src_vm, bandwidth, delay)
                if processing_time is None or transfer_time is None:
                    raise ValueError("execute_task returned None values")

            except Exception:
                logger.error(traceback.format_exc())  # Log the full traceback
                continue

            total_tasks += 1

            transfer_cost = vm.calculate_transfer_cost(task.size)
            task.transfer_cost = transfer_cost

            response_time = task.response_time if task.response_time is not None else 0.0
            response_times.append(response_time)

            if total_tasks in task_intervals:
                avg_response_times[total_tasks] = np.mean(response_times)
                reimplemented_results.append((total_tasks, avg_response_times[total_tasks]))

            try:
                availability = vm.calculate_avg_availability()
            except AttributeError as e:
                logger.error(f"AttributeError: {e}")
                continue

            try:
                task_metrics_df.loc[len(task_metrics_df)] = [
                    task.task_id, vm.vm_id, vm.provider_id, vm.region_id, response_time, task.sla_violation, task.enu, task.transfer_cost
                ]
            except Exception as e:
                logger.error(f"Error inserting task metrics for task {task.task_id}: {e}")
                logger.error(traceback.format_exc())  # Log the full traceback
                continue

            data_transfer_time_ratio = None  # stays None if no required data file yields a ratio

            for data_id in task.data_requirements:
                data_file = DataProcessingUtilities.find_data_file_by_id(data_id, vm_list)
                if data_file is not None:
                    if 'number_of_accesses' not in data_file:
                        data_file['number_of_accesses'] = 0  # Ensure the key exists

                    data_file_instance = DataFile(
                        data_id=data_file['data_id'],
                        size=data_file['size'],
                        creation_date=data_file['creation_date'],
                        modification_date=data_file['modification_date'],
                        blocks=data_file['blocks'],
                        replicas_per_block=data_file['replicas_per_block'],
                        access_frequencies=data_file['access_frequencies'],
                        access_count=data_file['number_of_accesses']
                    )
                    try:
                        # VMs that hold a replica of this data item.
                        reqd = [vmr for vmr in vm_list if any(df['data_id'] == data_id for df in vmr.data_files)]

                        dtt = data_file_instance.calculate_dtt(vm, reqd, VirtualMachine.bandwidths, VirtualMachine.delays)
                        ldtt = data_file_instance.calculate_ldtt()
                        data_transfer_time_ratio = dtt / ldtt if ldtt != 0 else 0

                        fuzzy_input_df.loc[len(fuzzy_input_df)] = [
                            task.task_id, task.size, vm.vm_id, data_file['data_id'], data_file['size'],
                            response_time, data_transfer_time_ratio, availability, vm.calculate_vm_load(), vm.calculate_profit()
                        ]
                    except Exception as e:
                        logger.error(f"Error calculating data_transfer_time_ratio for data_id {data_id} in task {task.task_id}: {e}")
                        logger.error(traceback.format_exc())  # Log the full traceback
                        continue

            if data_transfer_time_ratio is None:
                # If data_transfer_time_ratio is not calculated, skip this task
                logger.error(f"data_transfer_time_ratio is not calculated for task {task.task_id}")
                continue

            # Compare the normalised local value: task.response_time may be
            # None, which would raise a TypeError in the comparison.
            if response_time > T_hRT:
                task.sla_violation = True
                CounterSLA_violations += 1
                CounterSLA_satisfactions = 0
            else:
                CounterSLA_satisfactions += 1

            # NOTE(review): T_hRT is the response-time threshold (w * SLORT)
            # yet it is used here as a violation *count* threshold, and both
            # branches below call Adjust_Replicas_Number -- confirm whether
            # REPLICATION_PERIOD / the replication step was intended.
            if CounterSLA_violations >= T_hRT:
                ReplicaManagement.Adjust_Replicas_Number(vm)
                CounterSLA_violations = 0

            if CounterSLA_satisfactions >= T_hRT_satisfied:
                ReplicaManagement.Adjust_Replicas_Number(vm)
                CounterSLA_satisfactions = 0

            combined_csv_data.append([src_vm.vm_id, vm.vm_id, task.task_id, data_transfer_time_ratio, bandwidth, delay, availability, task.data_accesses_list[0] if task.data_accesses_list else None, vm.vm_id])

            processed_tasks.add(task.task_id)
            yield env.timeout(1)

        if not dequeued_any:
            # No queue held a task during this pass and nothing was yielded,
            # so the state cannot change any more.  Without this exit the
            # loop would spin forever and block the whole SimPy run.
            logger.warning(
                f"No assigned tasks left after {total_tasks} of {len(task_list)} tasks; stopping task execution"
            )
            break

    end_time = time.time()
    execution_time = end_time - start_time
    print(f"Simulation Execution Time: {execution_time:.2f} seconds")
    print(f"Total tasks executed: {total_tasks}")


def calculate_rented_resources_percentage(providers):
    """Return rented VMs as a percentage of owned VMs across all providers.

    Yields 0.0 when the providers own no VM at all.
    """
    owned_total = 0
    for provider in providers:
        owned_total += len(provider.owned_vms)

    rented_total = 0
    for provider in providers:
        rented_total += len(provider.rented_vms)

    if not owned_total:
        return 0.0
    ratio = rented_total / owned_total
    return ratio * 100

def calculate_popularity_degree(access_times):
    """Return the popularity degree, i.e. the number of recorded access times.

    The received access times are also echoed to stdout.
    """
    print("here to check the popularity degree", access_times)
    popularity_degree = len(access_times)
    return popularity_degree

def estimate_min_replica_factor(data_file):
    """Estimate the minimum replica factor of a data file.

    The estimate is half of ``data_file['replicas_per_block']`` truncated to
    an integer, and never less than 1.
    """
    halved = int(data_file['replicas_per_block'] / 2)
    if halved > 1:
        min_replica_factor = halved
    else:
        min_replica_factor = 1
    logger.debug(f"Estimated minimum replica factor for data_id {data_file['data_id']}: {min_replica_factor}")
    return min_replica_factor

def collect_task_metrics(task_list):
    """Build a DataFrame with one row of timing and cost figures per task.

    Columns: ``task_id``, ``response_time``, ``execution_cost`` and
    ``transfer_cost``, each read from the attribute of the same name.
    """
    rows = [
        {
            'task_id': task.task_id,
            'response_time': task.response_time,
            'execution_cost': task.execution_cost,
            'transfer_cost': task.transfer_cost,
        }
        for task in task_list
    ]
    return pd.DataFrame(rows)


def collect_vm_metrics(providers):
    """Build a DataFrame with one row of identity, capacity and cost figures per VM.

    Every VM of every data center of every provider contributes one row, in
    traversal order.
    """
    def describe(dc, vm):
        # Only the first element of the pair is reported as 'storage_cost';
        # the complete result is fetched again below for the 'dpc' column.
        storage_cost, _ = vm.calculate_dpc()
        return {
            'vm_id': vm.vm_id,
            'provider_id': vm.provider_id,
            'region_id': vm.region_id,
            'dc_id': dc.dc_id,
            'queue_capacity': vm.queue_capacity,
            'process_capacity': vm.processing_capacity,
            'owned': vm.owned,
            'total_processed_tasks': len(vm.tasks_processed),
            'tpc': vm.calculate_tpc(),
            'dpc': vm.calculate_dpc(),
            'penalties': vm.calculate_penalties(),
            'expenditures': vm.calculate_expenditures(),
            'revenues': vm.calculate_revenues(),
            'profit': vm.calculate_profit(),
            'vm_load': vm.calculate_vm_load(),
            'availability': vm.calculate_avg_availability(),
            'response_time': vm.calculate_response_time(),
            'process_cost': vm.calculate_task_processing_cost(),
            'storage_cost': storage_cost,
            'storage_price': vm.storage_price_per_gb,
        }

    rows = [
        describe(dc, vm)
        for provider in providers
        for dc in provider.data_centers
        for vm in dc.vms
    ]
    return pd.DataFrame(rows)

def collect_overall_metrics(providers):
    """Return a one-row DataFrame with VM counts summed over all providers.

    Columns: ``num_rented_vms``, ``num_managed_vms``, ``num_owned_vms`` and
    ``num_offered_vms``.
    """
    def count(attribute):
        return sum(len(getattr(provider, attribute)) for provider in providers)

    column_sources = (
        ('num_rented_vms', 'rented_vms'),
        ('num_managed_vms', 'managed_vms'),
        ('num_owned_vms', 'owned_vms'),
        ('num_offered_vms', 'offered_vms'),
    )
    totals = {column: count(attribute) for column, attribute in column_sources}
    return pd.DataFrame([totals])

def print_task_metrics(task_metrics):
    """Print the task metrics as an index-free table under a banner line."""
    print("=== Task Metrics ===")
    table = pd.DataFrame(task_metrics).to_string(index=False)
    print(table)

def print_vm_metrics(vm_metrics):
    """Print the VM metrics as an index-free table under a banner line."""
    print("=== VM Metrics ===")
    table = pd.DataFrame(vm_metrics).to_string(index=False)
    print(table)

def print_overall_metrics(overall_metrics):
    """Print every ``key: value`` pair of ``overall_metrics`` under a banner line."""
    print("=== Overall Metrics ===")
    for entry in overall_metrics.items():
        name, figure = entry
        print(f"{name}: {figure}")

# Module-level result containers.
# NOTE(review): simulate_task_execution appends (task_count, mean_response_time)
# tuples to its own `reimplemented_results` parameter; whether this module-level
# list is the object passed in depends on initialize_simulation_parameters -- confirm.
reimplemented_results = []

# One row per executed task, appended by simulate_task_execution.
task_metrics_df = pd.DataFrame(columns=['task_id', 'vm_id', 'provider_id', 'region_id', 'response_time', 'sla_violation', 'enu', 'transfer_cost'])
# One row per (task, required data file) pair, appended by simulate_task_execution;
# written to CSV by run_simulation.
fuzzy_input_df = pd.DataFrame(columns=[ 'task_id', 'task_size', 'vm_id', 'data_id', 'data_size', 'response_time', 'dttr', 'availability', 'load', 'profit'])
# Raw per-task rows (source VM, target VM, task id, transfer ratio, bandwidth,
# delay, availability, first data access, target VM) appended by simulate_task_execution.
combined_csv_data = []


def verify_task_assignment(providers):
    """Log a warning for every VM without tasks and every task without data.

    Walks all VMs of all data centers of all providers; nothing is modified
    and nothing is returned.
    """
    every_vm = (
        vm
        for provider in providers
        for dc in provider.data_centers
        for vm in dc.vms
    )
    for vm in every_vm:
        if not vm.tasks_assigned:
            logger.warning(f"VM {vm.vm_id} has no tasks assigned.")
        for task in vm.tasks_assigned:
            if task.data_requirements:
                continue
            logger.warning(f"Task {task.task_id} in VM {vm.vm_id} has no data requirements.")

def ensure_tasks_have_data(tasks):
    """Remove, in place, every task that has no data requirements.

    A warning is logged for each removed task.  The list object passed in is
    the one that gets updated, so callers holding a reference see the result.

    Args:
        tasks: mutable list of task objects exposing ``task_id`` and
            ``data_requirements``.
    """
    kept = []
    for task in tasks:
        if task.data_requirements:
            kept.append(task)
        else:
            logger.warning(f"Task {task.task_id} has no data requirements.")
    # Removing from the list while iterating over it skips the element that
    # follows each removal, so consecutive data-less tasks used to survive.
    # Collect the survivors first and swap the contents in one step.
    tasks[:] = kept

def initialize_data_group(task_list, vm):
    """Group tasks by the data files they require that are stored on ``vm``.

    Returns a list of dicts, one per distinct data id found in
    ``vm.data_files``, in first-seen order.  Each dict carries ``'data_id'``,
    ``'tasks'`` (the requesting task objects) and ``'access_frequencies'``
    (one ``1`` per request).  Data ids that ``vm`` does not store are ignored.
    """
    data_group = []
    entry_by_id = {}  # data id -> its entry in data_group

    for task in task_list:
        for data_file_id in task.data_requirements:
            entry = entry_by_id.get(data_file_id)
            if entry is not None:
                # Known data id: record one more requesting task and access.
                entry['tasks'].append(task)
                entry['access_frequencies'].append(1)
                continue

            stored_on_vm = any(df['data_id'] == data_file_id for df in vm.data_files)
            if stored_on_vm:
                entry = {
                    'data_id': data_file_id,
                    'tasks': [task],
                    'access_frequencies': [1],
                }
                entry_by_id[data_file_id] = entry
                data_group.append(entry)

    # Log task ids instead of the task objects to keep the message readable.
    data_group_log = [
        {
            'data_id': entry['data_id'],
            'tasks': [t.task_id for t in entry['tasks']],
            'access_frequencies': entry['access_frequencies'],
        }
        for entry in data_group
    ]
    logger.debug(f"No another one Initialized data group: {data_group_log}")

    return data_group


def run_simulation():
    """Build the simulated cloud, run the task-execution process and export results.

    Steps, in order: set up providers/VMs, generate data files and distribute
    them, assign tasks, pre-compute SLA counters, cluster the data group of
    the first VM, build the fuzzy placement system, start the periodic
    replication thread, run the SimPy environment until time 100, then write
    ``vm_metrics_11.csv``, ``task_metrics_11.csv`` and
    ``fuzzy_input_parameters_11.csv`` and print the metrics.
    """
    global task_metrics_df, fuzzy_input_df, combined_csv_data

    # Initialize simulation parameters
    # NOTE(review): providers, task_list, vm_list and placement_decision are
    # all rebound further down; only reimplemented_results from this call is
    # still in use afterwards.
    providers, task_list, vm_list, placement_decision, reimplemented_results = initialize_simulation_parameters()

    # Initialize the environment and providers
    num_providers = NUM_PROVIDERS
    num_regions = NUM_REGIONS
    max_dcs_per_provider = MAX_DCS_PER_PROVIDER
    vm_per_dc = VM_PER_DC
    vm_storage_capacity = VM_STORAGE_CAPACITY
    vm_processing_capacity = VM_PROCESSING_CAPACITY

    env, providers, vm_list = SimulationSetup.setup_environment(
        num_providers, num_regions, max_dcs_per_provider,
        vm_per_dc, vm_storage_capacity, vm_processing_capacity
    )
    VirtualMachine.set_network_characteristics_and_update()
    current_time = time.time()  # Set the current time for data file creation

    data_files = generate_data_files(NUM_DATA_ITEMS, current_time)
    assign_data_files_to_vms(providers, data_files)

    # Ensure tasks have data and assign tasks to VMs
    task_list = SimulationSetup.assign_tasks_to_vms(providers)

    # Function to check SLA violations and satisfactions
    def check_sla_violations_and_satisfactions(vm_list):
        """Recompute response times of processed tasks and bump each VM's SLA counters."""
        for vm in vm_list:
            for task in vm.tasks_processed:
                task.calculate_response_time()
                if task.sla_violation:
                    vm.sla_violations += 1
                else:
                    vm.sla_satisfactions += 1

    # Runs before the SimPy process below is started.
    check_sla_violations_and_satisfactions(vm_list)

    # Only the data files held by the first VM are considered for grouping.
    data_group = initialize_data_group(task_list, vm_list[0])  # Pass the first VM

    # Create an instance of the SpectralClusteringAlgorithm class
    sc_algorithm = SpectralClusteringAlgorithm()

    # Apply spectral clustering
    # NOTE(review): data_clusters is not read again inside this function.
    data_clusters = sc_algorithm.apply_spectral_clustering(data_group)

    fuzzy_system = FuzzyLogicSystem()
    placement_decision = fuzzy_system.define_control_system()

    # Start periodic replication check thread
    periodic_check_thread = threading.Thread(target=periodic_replication_check, args=(providers,))
    periodic_check_thread.start()

    # Run the task-execution process for 100 simulated time units.
    simulation_time = 100
    env.process(simulate_task_execution(env, providers, task_list, vm_list, placement_decision, reimplemented_results))
    env.run(until=simulation_time)

    # Collect VM data for analysis
    for provider in providers:
        provider.collect_managed_vms()
        provider.collect_owned_vms()
        provider.collect_rented_vms()
        provider.collect_offered_vms()

    # NOTE(review): the join gives up after one second and nothing in this
    # function signals the thread to finish; the module defines a
    # `stop_threads` flag that periodic_replication_check presumably polls --
    # confirm it gets set somewhere, otherwise the thread outlives this call.
    if periodic_check_thread.is_alive():
        periodic_check_thread.join(timeout=1)

    # Collect task metrics
    task_df = collect_task_metrics(task_list)
    vm_metrics = [vm.collect_metrics() for vm in vm_list]

    # Save results to CSV
    vm_metrics_df = pd.DataFrame(vm_metrics)
    vm_metrics_df.to_csv('vm_metrics_11.csv', index=False)
    task_df.to_csv('task_metrics_11.csv', index=False)
    # In-place de-duplication of the module-level frame (hence the `global` above).
    fuzzy_input_df.drop_duplicates(inplace=True)  # Ensure this line has access to fuzzy_input_df
    fuzzy_input_df.to_csv('fuzzy_input_parameters_11.csv', index=False)

    # Print metrics
    # Both frames are printed twice: once directly here and once through the
    # print_*_metrics helpers below.
    print("Task Metrics:")
    print(task_df)
    print("VM Metrics:")
    print(vm_metrics_df)

    print_task_metrics(task_df)
    print_vm_metrics(vm_metrics_df)

    rented_resources_percentage = calculate_rented_resources_percentage(providers)
    logger.debug(f"Rented Resources Percentage: {rented_resources_percentage:.2f}%")

# Script entry point: run the whole simulation only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    run_simulation()


2024-08-05 16:25:27,902 - DEBUG - Finished generating 200 data files
2024-08-05 16:25:27,912 - DEBUG - Total assigned data files: 200/200
2024-08-05 16:25:27,914 - DEBUG - No another one Initialized data group: [{'data_id': 4, 'tasks': [3], 'access_frequencies': [1]}, {'data_id': 2, 'tasks': [6, 35, 42, 84], 'access_frequencies': [1, 1, 1, 1]}, {'data_id': 1, 'tasks': [16], 'access_frequencies': [1]}, {'data_id': 0, 'tasks': [17, 73, 98], 'access_frequencies': [1, 1, 1]}, {'data_id': 6, 'tasks': [19, 24], 'access_frequencies': [1, 1]}, {'data_id': 3, 'tasks': [40, 76], 'access_frequencies': [1, 1]}, {'data_id': 9, 'tasks': [75], 'access_frequencies': [1]}, {'data_id': 8, 'tasks': [76], 'access_frequencies': [1]}, {'data_id': 5, 'tasks': [93], 'access_frequencies': [1]}, {'data_id': 10, 'tasks': [93], 'access_frequencies': [1]}]
2024-08-05 16:25:27,915 - DEBUG - Initializing data group with 10 items.
2024-08-05 16:25:27,915 - DEBUG - Data Group Size: 10
2024-08-05 16:25:27,916 - DEBUG -

Task Metrics:
    task_id  response_time  execution_cost  transfer_cost
0         0       0.625917        0.010373         1.1670
1         1       1.088333        0.008307         0.9345
2         2       1.323333        0.005867         0.6600
3         3       1.098333        0.008387         0.9435
4         4       1.488667        0.006693         0.7530
..      ...            ...             ...            ...
95       95       1.920667        0.011952         2.6560
96       96       2.126000        0.013338         2.9640
97       97       2.462000        0.015606         3.4680
98       98       0.446667        0.004284         0.9520
99       99       0.658333        0.006570         1.4600

[100 rows x 4 columns]
VM Metrics:
         vm_id                                         data_files  \
0    VM0-0-0-0             [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12]   
1    VM0-0-0-1   [11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 24]   
2    VM0-0-0-2  [23, 25, 26, 27, 28, 29, 30, 31