In [None]:
def UCB_Improved_simulation(algorithm, arm_means, time_horizon):
    """Simulate an Improved-UCB style arm-elimination strategy on Bernoulli arms.

    The simulation runs in two stages:

    1. Elimination rounds. With a gap estimate ``delta_t`` that starts at 1
       and is halved after every round, each still-active arm is sampled
       until it has ``ceil(2 * log(T * delta_t**2) / delta_t**2)`` pulls.
       Arms whose upper confidence bound falls below the best lower
       confidence bound are then removed from the active set.
    2. Remaining budget. If a single arm is left it is played until the
       horizon is reached; otherwise the active arm with the highest upper
       confidence index is played on every step.

    Exactly ``time_horizon`` pulls are performed in total.

    Args:
        algorithm: Not used by this function; kept so the call signature
            stays compatible with existing callers.
        arm_means: Sequence of Bernoulli success probabilities, one per arm.
        time_horizon: Total number of pulls to simulate (must be >= 1).

    Returns:
        A tuple ``(total_reward, suboptimal_arms_count, total_regret,
        zeros_count, ones_count)`` where ``suboptimal_arms_count`` is the
        number of pulls of an arm other than ``argmax(arm_means)``,
        ``total_regret`` is the accumulated gap in means for those pulls, and
        ``zeros_count`` / ``ones_count`` count the observed rewards.

    Raises:
        ValueError: If ``arm_means`` is empty or ``time_horizon`` is below 1.
    """
    K = len(arm_means)
    if K == 0:
        raise ValueError("arm_means must contain at least one arm")
    if time_horizon < 1:
        raise ValueError("time_horizon must be at least 1")

    T_k = np.zeros(K)  # Number of times each arm has been played
    X_k = np.zeros(K)  # Sum of rewards for each arm
    zeros_count = 0
    ones_count = 0
    total_reward = 0
    suboptimal_arms_count = 0
    total_regret = 0
    pulls = 0  # Pulls performed so far; never allowed to exceed time_horizon

    # Loop-invariant: the reference arm used for regret accounting.
    best_arm = np.argmax(arm_means)
    best_mean = np.max(arm_means)

    def play_arm(arm):
        """Draw one Bernoulli reward from ``arm`` and update all statistics."""
        nonlocal zeros_count, ones_count, total_reward
        nonlocal suboptimal_arms_count, total_regret, pulls
        reward = np.random.binomial(1, arm_means[arm])
        pulls += 1
        T_k[arm] += 1
        X_k[arm] += reward
        total_reward += reward
        if reward == 0:
            zeros_count += 1
        else:
            ones_count += 1
        if arm != best_arm:
            suboptimal_arms_count += 1
            total_regret += best_mean - arm_means[arm]
        return reward

    def radius(arm, log_term):
        """Confidence radius of ``arm`` given the current log term."""
        return math.sqrt(log_term / (2 * T_k[arm]))

    B_t = set(range(K))  # Arms that have not been eliminated
    delta_t = 1
    max_t = int(math.floor(0.5 * math.log2(time_horizon / math.exp(1))))

    # Stage 1: elimination rounds.
    for _ in range(max_t + 1):
        if len(B_t) == 1 or pulls >= time_horizon:
            break

        log_term = math.log(time_horizon * delta_t ** 2)
        n_plays = int(math.ceil(2 * log_term / delta_t ** 2))

        # Sample round-robin so that a budget cut-off mid-round still leaves
        # the active arms with (nearly) equal sample counts.
        pending = [arm for arm in sorted(B_t) if T_k[arm] < n_plays]
        while pending and pulls < time_horizon:
            for arm in pending:
                if pulls >= time_horizon:
                    break
                play_arm(arm)
            pending = [arm for arm in pending if T_k[arm] < n_plays]

        if pulls >= time_horizon:
            # No pulls left, so elimination would have no effect; stopping
            # here also avoids dividing by zero for arms never sampled.
            break

        best_lower = max(
            X_k[arm] / T_k[arm] - radius(arm, log_term) for arm in B_t
        )
        for arm in list(B_t):
            if X_k[arm] / T_k[arm] + radius(arm, log_term) < best_lower:
                B_t.remove(arm)

        delta_t /= 2

    # Stage 2: spend whatever budget is left.
    # delta_t may have been halved past the last valid round, which can push
    # the log below zero; clamp it so the square root stays defined.
    final_log_term = max(math.log(time_horizon * delta_t ** 2), 0.0)
    while pulls < time_horizon:
        if len(B_t) == 1:
            (arm,) = B_t
        else:
            candidates = sorted(B_t)
            unsampled = [a for a in candidates if T_k[a] == 0]
            if unsampled:
                # An arm without samples has no empirical mean yet.
                arm = unsampled[0]
            else:
                index = {
                    a: X_k[a] / T_k[a] + radius(a, final_log_term)
                    for a in candidates
                }
                # Select the arm *key* with the largest index.
                arm = max(index, key=index.get)
        play_arm(arm)

    return total_reward, suboptimal_arms_count, total_regret, zeros_count, ones_count