In [1]:
import torch
import os

from src.rl.evaluators.evaluator_dqn import EvaluatorDQN
from src.rl.evaluators.evaluator_c51 import EvaluatorC51
from src.rl.evaluators.evaluator_qr import EvaluatorQR
from src.rl.evaluators.evaluator_iqn import EvaluatorIQN
from src.rl.evaluators.evaluator_fqf import EvaluatorFQF

In [2]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# modules (e.g. the src.rl evaluators) are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

# Random Baseline

In the following cells, we create random predictions to serve as a lower-bound baseline. On the development set, we compute the average return achieved by random rankings; on the test set, we write random-ranking predictions that can be submitted to the MIND competition for evaluation.

In [None]:
# Mode switch for the random baseline:
#   True  -> evaluate random rankings on the development split
#   False -> write random-ranking predictions for the test split
DEVELOPMENT = True

# Read the pickled behaviors file of the selected split.
# NOTE(review): read_pickled_data, DEV_PATH and TEST_PATH are not defined in
# this notebook's visible cells -- confirm they are imported before this cell.
eval_data = read_pickled_data([
    DEV_PATH if DEVELOPMENT else TEST_PATH,
    "preprocessed",
    "behaviors.pkl"
])

if DEVELOPMENT:
    # One random baseline is evaluated per seed
    SEEDS = [42, 100, 7]
    # Discount factor applied per rank position when computing the return
    GAMMA = 0.9
else:
    SEED = 42
    PREDICTIONS_DIR = os.path.join("./predictions", "random-baseline", f"{SEED}")
    # exist_ok=True replaces the check-then-create pattern: no race between
    # the existence check and the creation, and re-running the cell is a no-op
    os.makedirs(PREDICTIONS_DIR, exist_ok=True)


### Test Set

In [None]:
# Write one random ranking per test impression to
# <PREDICTIONS_DIR>/prediction.txt, one line per impression in the form
# "<impression_id> [r1,r2,...]".
if DEVELOPMENT:
    # SEED and PREDICTIONS_DIR are only defined when DEVELOPMENT is False, and
    # eval_data holds the development split in this mode, so skip instead of
    # failing with a NameError during "Run All".
    print("[INFO] DEVELOPMENT is set, skipping test set predictions")
else:
    columns = ["impression_id", "ranking"]
    records = []

    #! Set seed
    np.random.seed(SEED)

    for row in tqdm(eval_data.itertuples(), total=len(eval_data)):
        # Get impression id and number of candidates
        impression_id = row.id
        num_candidates = len(row.shown_news)

        # Create random ranking, add 1 (lowest rank is 1)
        ranking = np.random.permutation(num_candidates) + 1

        # Serialize the ranking directly as "[r1,r2,...]". Going through the
        # numpy array repr and re-parsing it is fragile: numpy abbreviates
        # long arrays with "...", which would corrupt the ranking.
        records.append((impression_id, f"[{','.join(map(str, ranking))}]"))

    predictions_path = os.path.join(PREDICTIONS_DIR, "prediction.txt")
    print(f"[INFO] writing predictions file to {predictions_path}")

    # Space-separated, no header, no index
    data_predictions = pd.DataFrame(records, columns=columns)
    data_predictions.to_csv(
        predictions_path,
        sep=' ',
        index=False,
        header=False
    )

### Development Set

We compute the average discounted return for several random baselines (one per seed), as well as the average over all of these baselines.

In [None]:
# Evaluate one random baseline per seed on the development split and report
# the per-seed mean/std of the discounted return, plus their averages.
if DEVELOPMENT:
    # Collect all average returns
    mean_returns = []
    std_returns = []

    # Evaluate multiple random baselines
    for seed in SEEDS:
        print(f"[INFO] evaluating random baseline, seed: {seed}")

        #! Set seed
        np.random.seed(seed)

        # Collect returns
        returns = []

        for row in tqdm(eval_data.itertuples(), total=len(eval_data)):
            shown_news = row.shown_news
            clicked_news = set(row.clicked_news)

            # Randomly order candidates through an index permutation.
            # Shuffling row.shown_news in place would reorder the very object
            # stored in eval_data, so every later seed (and every re-run of
            # this cell) would start from a different candidate order and the
            # per-seed results would no longer be reproducible.
            order = np.random.permutation(len(shown_news))

            # Compute return: a clicked item at position t contributes GAMMA**t
            G = 0
            for t, idx in enumerate(order):
                if shown_news[idx] in clicked_news:
                    G += GAMMA**t
            returns.append(G)

        # Compute average return
        returns = np.array(returns)
        mean_return = returns.mean()
        std_return = returns.std()
        print(f"[RESULT] Return: {mean_return:.4f} +/- {std_return:.4f}")

        mean_returns.append(mean_return)
        std_returns.append(std_return)

    # The reported "+/-" is the mean of the per-seed standard deviations,
    # not the spread of the per-seed means.
    total_mean = np.array(mean_returns).mean()
    total_mean_std = np.array(std_returns).mean()
    print(f"\n[RESULT] Average return over all baselines: {total_mean:.4f} +/- {total_mean_std:.4f}")
else:
    # SEEDS and GAMMA are only defined when DEVELOPMENT is True, so skip
    # instead of failing with a NameError during "Run All".
    print("[INFO] DEVELOPMENT is not set, skipping development set evaluation")

# DQN

In [4]:
# Use the GPU when torch reports one as available, otherwise the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Evaluation settings handed to the evaluator
# NOTE(review): `development` presumably selects the development split -- confirm in EvaluatorDQN
development = True
model_name = "DQN-n-m-noweight"

# Build the DQN evaluator, then run its set_evaluatee / evaluate steps
evaluator = EvaluatorDQN(
    development,
    model_name,
    device,
    seed=7,
)
evaluator.set_evaluatee()
evaluator.evaluate()

[INFO] device: cuda
[INFO] preparing predictions directory
[INFO] reading config files
[INFO] model checkpoints: ['dqn_1.pth', 'dqn_2.pth', 'dqn_3.pth', 'dqn_4.pth', 'dqn_5.pth', 'dqn_6.pth', 'dqn_7.pth', 'dqn_8.pth', 'dqn_final.pth']
[INFO] preparing data and sampler
[DONE] evaluator initialized
[INFO] evaluating 'DQN-n-m-noweight', checkpoint 'dqn_1.pth'


  0%|          | 0/376471 [00:00<?, ?it/s]

[RESULT] Return: 0.6432
[INFO] evaluating 'DQN-n-m-noweight', checkpoint 'dqn_2.pth'


  0%|          | 0/376471 [00:00<?, ?it/s]

[RESULT] Return: 0.7153
[INFO] evaluating 'DQN-n-m-noweight', checkpoint 'dqn_3.pth'


  0%|          | 0/376471 [00:00<?, ?it/s]

[RESULT] Return: 0.7122
[INFO] evaluating 'DQN-n-m-noweight', checkpoint 'dqn_4.pth'


  0%|          | 0/376471 [00:00<?, ?it/s]

[RESULT] Return: 0.7569
[INFO] evaluating 'DQN-n-m-noweight', checkpoint 'dqn_5.pth'


  0%|          | 0/376471 [00:00<?, ?it/s]

[RESULT] Return: 0.7648
[INFO] evaluating 'DQN-n-m-noweight', checkpoint 'dqn_6.pth'


  0%|          | 0/376471 [00:00<?, ?it/s]

[RESULT] Return: 0.7693
[INFO] evaluating 'DQN-n-m-noweight', checkpoint 'dqn_7.pth'


  0%|          | 0/376471 [00:00<?, ?it/s]

[RESULT] Return: 0.7748
[INFO] evaluating 'DQN-n-m-noweight', checkpoint 'dqn_8.pth'


  0%|          | 0/376471 [00:00<?, ?it/s]

[RESULT] Return: 0.7802
[INFO] evaluating 'DQN-n-m-noweight', checkpoint 'dqn_final.pth'


  0%|          | 0/376471 [00:00<?, ?it/s]

[RESULT] Return: 0.7803
[INFO] writing evaluation results file to c:\workbench\developer\drlnrs\src\models\DQN-n-m-noweight\predictions_7
[DONE] evaluation completed
