# In [None]:
import numpy as np
import pandas as pd 

# A custom multi-armed bandit environment with Bernoulli-distributed rewards is provided.
# The reward of each arm is modeled by a Bernoulli distribution, so every pull returns 1 or 0.

class MultiArmedBandit:
    """Multi-armed bandit whose arms pay Bernoulli rewards (1 or 0).

    Arm ``i`` pays a reward of 1 with probability
    ``reward_probability_dist[i]`` and a reward of 0 otherwise.
    """

    def __init__(self, reward_probability_dist=None):
        """Store the per-arm success probabilities.

        Args:
            reward_probability_dist: Sequence with one success probability
                per arm. Defaults to ``[0.3, 0.5, 0.8]``.
        """
        # A fresh list is built per instance so the default is never shared
        # (and accidentally mutated) across bandits.
        if reward_probability_dist is None:
            reward_probability_dist = [0.3, 0.5, 0.8]
        self.reward_probability_dist = reward_probability_dist

    def step(self, action):
        """Pull the arm ``action`` and return its reward.

        Args:
            action: Zero-based index of the arm to pull.

        Returns:
            1 with probability ``reward_probability_dist[action]``, else 0
            (drawn with ``np.random.choice``).

        Raises:
            IndexError: If ``action`` is not in ``[0, number of arms)``.
                ``IndexError`` is a subclass of ``Exception``, so handlers
                written for the generic ``Exception`` still catch it.
        """
        n_arms = len(self.reward_probability_dist)
        # Valid arms are 0 .. n_arms - 1; negative indices are rejected too
        # instead of silently wrapping around to the end of the list.
        if not 0 <= action < n_arms:
            raise IndexError(
                "[MULTI ARMED BANDIT][ERROR] the action " + str(action)
                + " is out of range, total actions: " + str(n_arms)
            )
        p = self.reward_probability_dist[action]
        q = 1.0 - p
        # Sample from {0, 1} with P(0) = q and P(1) = p.
        return np.random.choice(2, p=[q, p])

import gym
import random

# 1. Create an eight-armed bandit here
def create_bandit(reward_dist):
    """Build a ``MultiArmedBandit`` from a list of per-arm reward probabilities.

    Args:
        reward_dist: Sequence with one reward probability per arm.

    Returns:
        A ``(bandit, num_arms)`` tuple: the environment and its arm count.
    """
    # Only a warning is printed when the values do not sum to one (within
    # 1e-3); the bandit is still created from the values as given.
    if abs(sum(reward_dist) - 1) > 0.1e-2:
        print("Adjust the reward distribution to sum to one..")
    # Use the argument itself rather than a module-level variable, so the
    # function works for whatever distribution the caller passes in.
    bandit = MultiArmedBandit(reward_probability_dist=reward_dist)
    num_arms = len(reward_dist)
    return bandit, num_arms
    
    
# 2. Experiment setup: the number of episodes, the steps per episode and the
#    eight-armed bandit used by the simulation.
tot_episodes = 250
tot_steps = 150
reward_distribution = [0.1, 0.06, 0.06, 0.4, 0.3, 0.02, 0.03, 0.03]

env, tot_arms = create_bandit(reward_distribution)
# The arm count is then taken directly from the distribution, replacing the
# value returned by create_bandit.
tot_arms = len(reward_distribution)



# 3. Write the code to compute the average rewards received by selecting the arms at random

def random_selection(bandit=None, n_arms=None, n_episodes=None, n_steps=None):
    """Play the bandit with uniformly random arm choices.

    Every argument is optional; an omitted argument falls back to the
    module-level ``env``, ``tot_arms``, ``tot_episodes`` or ``tot_steps``.

    Args:
        bandit: Object exposing ``step(action)`` that returns the reward.
        n_arms: Number of arms to choose from.
        n_episodes: Number of episodes to run.
        n_steps: Number of arm pulls per episode.

    Returns:
        A tuple ``(average_value_function, cumulated_reward_list,
        action_counter_array)``:

        * ``average_value_function`` - per-arm reward/pull ratio, averaged
          over the episodes (all zeros when no episode is run);
        * ``cumulated_reward_list`` - total reward of each episode;
        * ``action_counter_array`` - per-arm pull counts of the last episode
          (each offset by the 1e-5 starting value).
    """
    # Fall back to the notebook-level configuration for omitted arguments.
    bandit = env if bandit is None else bandit
    n_arms = tot_arms if n_arms is None else n_arms
    n_episodes = tot_episodes if n_episodes is None else n_episodes
    n_steps = tot_steps if n_steps is None else n_steps

    average_value_function = np.zeros(n_arms)
    cumulated_reward_list = []
    # Defined up front so the return value exists even with zero episodes.
    action_counter_array = np.full(n_arms, 1.0e-5)

    for _ in range(n_episodes):
        cumulated_reward = 0
        reward_counter_array = np.zeros(n_arms)
        # A tiny non-zero start avoids 0/0 for arms that are never pulled.
        action_counter_array = np.full(n_arms, 1.0e-5)
        for _ in range(n_steps):
            action = np.random.randint(low=0, high=n_arms)
            reward = bandit.step(action)
            reward_counter_array[action] += reward
            action_counter_array[action] += 1
            cumulated_reward += reward
        cumulated_reward_list.append(cumulated_reward)
        average_value_function += np.true_divide(
            reward_counter_array, action_counter_array
        )

    # Turn the per-episode sum into the average its name promises.
    if n_episodes > 0:
        average_value_function /= n_episodes
    return average_value_function, cumulated_reward_list, action_counter_array
    
# Run the random-selection baseline, then print the mean of the cumulated
# rewards it collected per episode.
(random_average_value_fn,
 random_rewards,
 random_actions) = random_selection()

out = np.mean(random_rewards)
print(out)

# 4. Save the result as a one-cell CSV file (no header row, no index column).
data = [out]
output = pd.DataFrame(data)
output.to_csv(
    '/code/output/output.csv',
    index=False,
    header=False,
)
   