# UCB-Improved Algorithm

In [16]:
import numpy as np
import csv
import math

# Class BanditAlgorithm: Initialization

In [17]:
class BanditAlgorithm:
    """Collect per-run results of a bandit algorithm and export them.

    Every stored row has the layout
    ``[param, iteration, total_reward, suboptimal_arms_count, total_regret,
    zeros_count, ones_count]``, where ``param`` is the value the run was
    parameterised with (written to the CSV under the 'Time Horizon' column).
    """

    def __init__(self, name):
        """Create an empty result container labelled with ``name``."""
        self.name = name
        self.results = []

    def add_result(self, param, iteration, total_reward, suboptimal_arms_count, total_regret, zeros_count, ones_count):
        """Append one run to ``self.results``; the regret is rounded to 2 decimals."""
        self.results.append([
            param,
            iteration,
            total_reward,
            suboptimal_arms_count,
            round(total_regret, 2),
            zeros_count,
            ones_count,
        ])

    def save_results_to_csv(self, filename):
        """Write a header row followed by all stored rows to ``filename``."""
        with open(filename, mode='w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(['Time Horizon', 'Iteration', 'Total Reward', 'Suboptimal Arms Count', 'Total Regret', 'Zeros Count', 'Ones Count'])
            writer.writerows(self.results)

    def calculate_average_results(self):
        """Return the per-parameter mean of every recorded metric.

        Returns:
            A list with one entry per distinct ``param`` (in order of first
            appearance), each of the form ``[param, avg_total_reward,
            avg_suboptimal_arms_count, avg_total_regret, avg_zeros_count,
            avg_ones_count]``. An empty list if nothing was recorded.
        """
        sums = {}
        counts = {}
        for result in self.results:
            param = result[0]
            if param not in sums:
                sums[param] = [0, 0, 0, 0, 0]
                counts[param] = 0
            counts[param] += 1
            # Columns 2..6 hold the five metrics; 0 and 1 are param and iteration.
            for position, value in enumerate(result[2:7]):
                sums[param][position] += value

        # Divide by the number of rows actually stored for this parameter
        # rather than assuming a fixed number of repetitions.
        return [
            [param] + [total / counts[param] for total in totals]
            for param, totals in sums.items()
        ]

## Definition UCB-Improved

In [18]:
def UCB_Improved_simulation(algorithm, arm_means, time_horizon):
    """Simulate one run of an arm-elimination (UCB-Improved style) bandit.

    Rewards are Bernoulli draws with success probability ``arm_means[arm]``.
    The run never plays more than ``int(time_horizon)`` rounds in total.

    Args:
        algorithm: Unused; kept so existing callers keep working.
        arm_means: Sequence of success probabilities, one per arm.
        time_horizon: Total number of rounds available.

    Returns:
        Tuple ``(total_reward, suboptimal_arms_count, total_regret,
        zeros_count, ones_count)``.

    Raises:
        ValueError: If ``arm_means`` is empty or ``time_horizon`` is not
            positive.
    """
    K = len(arm_means)
    if K == 0:
        raise ValueError("arm_means must contain at least one arm")
    if time_horizon <= 0:
        raise ValueError("time_horizon must be positive")

    T_k = np.zeros(K)  # Number of times each arm has been played
    X_k = np.zeros(K)  # Sum of rewards for each arm
    zeros_count = 0
    ones_count = 0
    total_reward = 0
    suboptimal_arms_count = 0
    total_regret = 0

    # Loop invariants: the optimal arm and its mean never change during a run.
    optimal_arm = np.argmax(arm_means)
    best_mean = np.max(arm_means)

    remaining = int(time_horizon)  # Rounds still available in the budget

    def play_arm(arm):
        """Pull ``arm`` once and update all running statistics."""
        nonlocal zeros_count, ones_count, total_reward
        nonlocal suboptimal_arms_count, total_regret, remaining
        reward = np.random.binomial(1, arm_means[arm])
        T_k[arm] += 1
        X_k[arm] += reward
        total_reward += reward
        if reward == 0:
            zeros_count += 1
        else:
            ones_count += 1
        if arm != optimal_arm:
            suboptimal_arms_count += 1
            total_regret += best_mean - arm_means[arm]
        remaining -= 1

    B_t = set(range(K))  # Arms that have not been eliminated yet
    delta_t = 1

    # Exploration phase
    max_t = int(math.floor(0.5 * math.log2(time_horizon / math.exp(1))))
    for _ in range(max_t + 1):
        # With a single surviving arm, or no budget left, there is nothing
        # more to explore.
        if len(B_t) <= 1 or remaining <= 0:
            break

        log_term = math.log(time_horizon * delta_t ** 2)
        n_plays = int(math.ceil(2 * log_term / delta_t ** 2))
        # NOTE(review): each round adds n_plays fresh pulls per arm (original
        # behaviour). The published algorithm samples until the arm's *total*
        # count reaches n_plays - confirm which variant is intended.
        for arm in sorted(B_t):
            for _ in range(min(n_plays, remaining)):
                play_arm(arm)

        # If the budget ran out mid-round some arms may be unplayed; skip the
        # elimination step instead of dividing by a zero play count.
        if remaining <= 0:
            break

        means = {arm: X_k[arm] / T_k[arm] for arm in B_t}
        radius = {arm: math.sqrt(log_term / (2 * T_k[arm])) for arm in B_t}
        best_lower_bound = max(means[arm] - radius[arm] for arm in B_t)

        # Keep only arms whose upper confidence bound still reaches the best
        # lower confidence bound.
        B_t = {arm for arm in B_t if means[arm] + radius[arm] >= best_lower_bound}

        delta_t /= 2

    # Exploitation phase: spend the rest of the budget on the empirically
    # best surviving arm (unplayed arms rank last).
    best_arm = max(
        sorted(B_t),
        key=lambda arm: X_k[arm] / T_k[arm] if T_k[arm] > 0 else float('-inf'),
    )
    for _ in range(remaining):
        play_arm(best_arm)

    return total_reward, suboptimal_arms_count, total_regret, zeros_count, ones_count

## Run Simulation Function

In [19]:
def run_simulation(algorithm, parameters, arm_means, iterations=100):
    """Run the UCB-Improved simulation repeatedly and record every outcome.

    Args:
        algorithm: Result container; must provide ``add_result`` and is also
            forwarded to ``UCB_Improved_simulation``.
        parameters: Iterable of time horizons to simulate.
        arm_means: Arm means passed through to the simulation.
        iterations: Number of repetitions per parameter (default 100, the
            previously hard-coded value). Iterations are numbered from 1.
    """
    for iteration in range(1, iterations + 1):
        for param in parameters:
            outcome = UCB_Improved_simulation(algorithm, arm_means, param)
            # outcome = (total_reward, suboptimal_arms_count, total_regret,
            #            zeros_count, ones_count)
            algorithm.add_result(param, iteration, *outcome)


## UCB-Improved for different time horizons

In [20]:
# Larger sweep, currently disabled:
# time_horizons = [2, 3, 100, 200, 2000, 10000, 20000, 40000, 60000, 80000, 100000]

time_horizons = [2, 3, 100, 200, 2000]

# Example algorithms
algorithms = [
    BanditAlgorithm("UCB-Improved"),
]

# Example arm means
arm_means = np.array([0.9, 0.8])

# Directory the result files are written to (same for every algorithm).
results_path = r'C:/Users/canis/OneDrive\Dokumente/uni/uni-surface/FSS 2024/BA/bachelorarbeit_vrlfg/BA/github/BA_code/2_algorithms_results'

# Run the simulation and save the results
for algorithm in algorithms:
    run_simulation(algorithm, time_horizons, arm_means)
    # Join with '/' instead of a bare backslash: '\{' inside an f-string is an
    # invalid escape sequence, and '/' is accepted as a separator on Windows.
    algorithm.save_results_to_csv(f'{results_path}/{algorithm.name}_results.csv')
    avg_results = algorithm.calculate_average_results()
    with open(f'{results_path}/{algorithm.name}_average_results.csv', mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['Timestep', 'Average Total Reward', 'Average Suboptimal Arms', 'Average Regret', 'Average Zeros Count', 'Average Ones Count'])
        for result in avg_results:
            writer.writerow(result)
