# UCB-Improved Algorithm

In [1]:
import numpy as np
import csv
import math

## Class BanditAlgorithm Initialization

In [2]:
class BanditAlgorithm:
    def __init__(self, name):
        self.name = name
        self.results = []

    def add_result(self, param, iteration, total_reward, suboptimal_arms_count, total_regret, zeros_count, ones_count):
        self.results.append([param, iteration, total_reward, suboptimal_arms_count, round(total_regret, 2), zeros_count, ones_count])

    def save_results_to_csv(self, filename):
        with open(filename, mode='w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(['Timestep', 'Iteration', 'Total Reward', 'Suboptimal Arms Count', 'Total Regret', 'Zeros Count', 'Ones Count'])
            for result in self.results:
                writer.writerow(result)

    def calculate_average_results(self):
        avg_results = {}
        for result in self.results:
            param = result[0]
            if param not in avg_results:
                avg_results[param] = [0, 0, 0, 0, 0]
            avg_results[param][0] += result[2]  # Total Reward
            avg_results[param][1] += result[3]  # Suboptimal Arms Count
            avg_results[param][2] += result[4]  # Total Regret
            avg_results[param][3] += result[5]  # Zeros Count
            avg_results[param][4] += result[6]  # Ones Count
        
        for param in avg_results:
            avg_results[param] = [param] + [x / 100 for x in avg_results[param]]
        return list(avg_results.values())

## Definition UCB-Improved

In [3]:
def UCB_Improved_simulation(algorithm, arm_means, time_horizon):
    num_arms = len(arm_means)
    rewards = np.zeros(num_arms)
    pulls = np.zeros(num_arms)
    total_reward = 0
    suboptimal_arms_count = 0
    total_regret = 0
    zeros_count = 0
    ones_count = 0

    delta = 1
    B = list(range(num_arms))
    UCB_Improved_max = np.zeros(num_arms)

    first_phase_end = int(np.floor(0.5 * np.log2(time_horizon / np.exp(1))))
    
    t = 0  # Laufvariable für Phase
    z = 0  # Zeitschritte
    
    while z < time_horizon:
        if t < first_phase_end:
            for m in B:
                max_pulls = math.ceil((2 * np.log(time_horizon * delta**2)) / delta**2)
                if pulls[m] <= max_pulls:
                    reward = np.random.binomial(1, arm_means[m])
                    rewards[m] += reward
                    pulls[m] += 1
                    total_reward += reward
                    total_regret += np.max(arm_means) - arm_means[m]
                    if reward == 0:
                        zeros_count += 1
                    else:
                        ones_count += 1
                    if m != np.argmax(arm_means):
                        suboptimal_arms_count += 1
                    z += 1  # Erhöhe die Zeitschritte nach jedem Ziehen
                    if z >= time_horizon:
                        break
            
            if z < time_horizon:  # Nur eliminieren, wenn noch Zeit übrig ist
                UCB_Improved_max = max((rewards[j] / pulls[j]) - np.sqrt((np.log(time_horizon * delta**2)) / (2 * max_pulls)) for j in B)
                B = [k for k in B if (rewards[k] / pulls[k]) + np.sqrt((np.log(time_horizon * delta**2)) / (2 * max_pulls)) >= UCB_Improved_max]
                delta /= 2
                t += 1  # Nächste Phase starten
        else:
            # Zweite Phase
            if len(B) == 1:
                best_arm = B[0]
            else:
                best_arm = max(B, key=lambda k: rewards[k] / pulls[k])
            reward = np.random.binomial(1, arm_means[best_arm])
            total_reward += reward
            pulls[best_arm] += 1
            total_regret += np.max(arm_means) - arm_means[best_arm]
            if reward == 0:
                zeros_count += 1
            else:
                ones_count += 1
            if best_arm != np.argmax(arm_means):
                suboptimal_arms_count += 1
            z += 1  # Erhöhe die Zeitschritte nach jedem Ziehen


    return total_reward, suboptimal_arms_count, total_regret, zeros_count, ones_count

## Run Simulation Function

In [4]:
def run_simulation(algorithm, parameters, arm_means):
    for iteration in range(1, 101):
        for param in parameters:
            total_reward, suboptimal_arms_count, total_regret, zeros_count, ones_count = UCB_Improved_simulation(algorithm, arm_means, param)
            algorithm.add_result(param, iteration, total_reward, suboptimal_arms_count, total_regret, zeros_count, ones_count)

## UCB-Improved for different time horizons

In [5]:
time_horizons = [2, 3, 100, 200, 2000, 10000, 20000, 40000, 60000, 80000, 100000]

# Beispiel-Algorithmen
algorithms = [
    BanditAlgorithm("8_UCB-Improved"),
]

# Beispiel-Mittelwerte der Arme
arm_means = np.array([0.495, 0.5])

# Simulation durchführen und Ergebnisse speichern
for algorithm in algorithms:
    run_simulation(algorithm, time_horizons, arm_means)
    results_path = r'C:/Users/canis/OneDrive/Dokumente/uni/uni-surface/FSS 2024/BA/bachelorarbeit_vrlfg/BA/github/BA_code/2_algorithms_results'
    algorithm.save_results_to_csv(f'{results_path}/{algorithm.name}_results_subopt_ver3.csv')
    avg_results = algorithm.calculate_average_results()
    with open(f'{results_path}/{algorithm.name}_average_results_subopt_ver3.csv', mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['Timestep', 'Average Total Reward', 'Average Suboptimal Arms', 'Average Regret', 'Average Zeros Count', 'Average Ones Count'])
        for result in avg_results:
            writer.writerow(result)

  best_arm = max(B, key=lambda k: rewards[k] / pulls[k])
