In [1]:
import numpy as np

class KArmBandit:
    def __init__(self, k, M, rho, lambda_teacher, lambda_student, gamma_student):
        self.k = k
        self.M = M
        self.rho = rho
        self.lambda_teacher = lambda_teacher
        self.lambda_student = lambda_student
        self.gamma_student = gamma_student
        self.arm_values = np.zeros(k)  # Expected reward estimates for each arm
        self.history_student = []
        self.history_teacher = []

    def split_data(self, data):
        # Splits data into training, validation, and test sets
        n = len(data)
        return data[:int(0.6 * n)], data[int(0.6 * n):int(0.8 * n)], data[int(0.8 * n):]

    def batch_data(self, train_data, batch_size):
        # Divides data into M batches
        return np.array_split(train_data, batch_size)
    
    def train_student(self, s_r, s_b, d_u):
        # Placeholder function for student update on s_r, s_b, d_u
        pass

    def train_teacher(self, s_r):
        # Placeholder function for teacher update on s_r
        pass

    def get_top_arms(self, scores):
        # Selects top arms based on rho
        top_indices = np.argsort(scores)[-int(self.rho * len(scores)):]
        return top_indices

    def epsilon_non_greedy(self, dr, db, du):
        dr_train, dr_val, dr_test = self.split_data(dr)
        dr_batches = self.batch_data(dr_train, self.M)
        db_batches = self.batch_data(db, self.M)

        theta_student = np.zeros(self.k)  # Initialize student parameters
        for i in range(self.M):
            # Evaluate and score the current batch
            scores = np.dot(db_batches[i], theta_student)  # Example scoring
            top_winners = self.get_top_arms(scores)

            # Select top `rho` portion based on scores
            s_b = db_batches[i][top_winners]
            s_r = dr_batches[i]

            # Update teacher and student
            theta_teacher = self.train_teacher(s_r)
            theta_student = self.train_student(s_r, s_b, du)

        return theta_student


In [2]:
import numpy as np

# Sample data representing rewards for each arm
np.random.seed(42)
k = 5  # Number of arms
M = 10  # Number of batches
rho = 0.2  # Proportion of top actions to keep

# Placeholder datasets for Dr (rewards dataset), Db (action dataset), and Du (unused)
Dr = np.random.rand(100, k)
Db = np.random.rand(100, k)
Du = np.random.rand(20, k)

# Hyperparameters
lambda_teacher = 0.1
lambda_student = 0.05
gamma_student = 0.9

# Define the KArmBandit class as per the previous code, with these modifications:


In [4]:
# Initialize bandit
bandit = KArmBandit(k, M, rho, lambda_teacher, lambda_student, gamma_student)

# Run epsilon non-greedy algorithm
final_student_params = bandit.epsilon_non_greedy(Dr, Db, Du)
print("Final Student Parameters:", final_student_params)


Batch 1: Teacher Params: [0.03552587 0.04370196 0.0626958  0.0467139  0.0306611 ], Student Params: [0.60825912 0.34573048 0.4364451  0.55976072 0.64103153]
Batch 2: Teacher Params: [0.06819936 0.03576085 0.0308777  0.0671534  0.04621082], Student Params: [0.49946853 0.75400596 0.44224118 0.5659378  0.8056632 ]
Batch 3: Teacher Params: [0.06034853 0.04558606 0.04464928 0.04260004 0.04789575], Student Params: [0.84530412 0.31739472 0.45251562 0.73921765 0.58496373]
Batch 4: Teacher Params: [0.03425539 0.05412774 0.05624168 0.05041241 0.05060252], Student Params: [0.8268805  0.19353765 0.28405527 0.76235735 0.83299866]
Batch 5: Teacher Params: [0.04957494 0.05527252 0.05002577 0.0195101  0.05361435], Student Params: [0.84788414 0.57416604 0.70387378 0.47360125 0.63660464]
Batch 6: Teacher Params: [0.05342229 0.04287182 0.04375652 0.04111443 0.05296239], Student Params: [0.77647824 0.71992187 0.51624568 0.73630175 0.79647466]
Batch 7: Teacher Params: [0.03490986 0.04565798 0.06837589 0.063