In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import argparse
from matplotlib import colors as mcolors

# Global plotting defaults for the notebook.
# The seaborn theme must be applied first: sns.set_theme() re-applies seaborn's
# plotting context, which overwrites matplotlib's "font.size". Any rcParams
# override therefore has to come after it, otherwise it is silently discarded.
sns.set_theme(style="whitegrid")
plt.rcParams.update({"font.size": 20})
dpi = 300  # default resolution for figures
cmap = 'coolwarm'  # default colormap name

In [None]:
def parse_args(argv=None):
    """Build the experiment configuration as an ``argparse.Namespace``.

    A notebook has no meaningful command line (``sys.argv`` holds the kernel's
    own arguments), so an explicit argument list is always handed to argparse.
    When ``argv`` is omitted, the notebook's fixed argument list is used.

    Args:
        argv: Optional sequence of command-line style strings, e.g.
            ``["--p_eval=97", "--n_layer=2"]``, parsed instead of the
            notebook's built-in argument list. Options that are not
            mentioned fall back to their declared defaults.

    Returns:
        argparse.Namespace with one attribute per option declared below.
    """
    parser = argparse.ArgumentParser(description='Hyperparameters')
    parser.add_argument('--main_seed', type=int, default=1, help='main seed for the experiments')

    ### Dataset hyperparams
    parser.add_argument('--p_eval', type=int, default=2048, help='p for mod p')
    parser.add_argument('--num_as', type=int, default=16, help='number of as')
    parser.add_argument('--num_cs', type=int, default=16, help='number of cs')
    parser.add_argument('--num_examples_per_prng', type=int, default=1, help='number of examples per PRNG')
    parser.add_argument('--total_examples', type=int, default=1_000_000, help='total number of examples')
    parser.add_argument('--context_len', type=int, default=256, help='context length')
    parser.add_argument('--chunk_size', type=int, default=32, help='chunk size')
    parser.add_argument('--period_min', type=int, default=0, help='min period of training')
    parser.add_argument('--period_max', type=int, default=512, help='max period of training')

    ### Model hyperparams
    parser.add_argument('--n_layer', type=int, default=1, help='number of layers')
    parser.add_argument('--n_head', type=int, default=1, help='number of heads')
    parser.add_argument('--n_embd', type=int, default=768, help='embedding dimension')
    parser.add_argument('--head_dim', type=int, default=768, help='head dimension')
    parser.add_argument('--act_name', type=str, default='relu', help='activation')

    ### Optimization hyperparams
    # parser.add_argument('--step', type = int, default = 2000) # number of training steps
    parser.add_argument('--num_steps', type=int, default=100_000, help='number of training steps')
    parser.add_argument('--warmup_steps', type=int, default=2048, help='number of warmup steps')
    parser.add_argument('--lr_trgt', type=float, default=3e-4, help='the target learning rate')
    parser.add_argument('--lr_init', type=float, default=1e-6, help='initial learning rate')
    parser.add_argument('--lr_min', type=float, default=1e-6, help='final learning rate')
    parser.add_argument('--batch_size', type=int, default=256, help='batch size')
    # adamw hyperparams
    parser.add_argument('--weight_decay', type=float, default=1.0, help='weight decay')
    parser.add_argument('--beta1', type=float, default=0.9, help='beta1')
    parser.add_argument('--beta2', type=float, default=0.99, help='beta2')

    ### Evaluation hyperparams
    parser.add_argument('--results_dir', type=str, default='./results', help='directory holding result tables')
    parser.add_argument('--plots_dir', type=str, default='./plots', help='directory for saved figures')

    # Other
    parser.add_argument('--shifts', type=int, default=0,
                        help='position of 1 to p_eval numbers in the sequence')

    if argv is None:
        # Argument list identifying the run analysed in this notebook.
        argv = [
            "--act_name=relu", "--context_len=256", "--batch_size=256", "--n_layer=1",
            "--p_eval=2048", "--total_examples=1000000", "--n_embd=768",
            "--n_head=1", "--head_dim=768", "--warmup_steps=2048", "--num_steps=100000",
            "--num_examples_per_prng=1", "--lr_trgt=3e-04", "--weight_decay=1.0",
        ]

    # Never pass None through to argparse: that would make it read sys.argv.
    return parser.parse_args(list(argv))

# Instantiate the run configuration; vocab_size is set equal to p_eval.
config = parse_args()
config.vocab_size = config.p_eval

# Seeding is currently disabled.
# NOTE(review): the original author was unsure whether this seeding covers
# numpy as well as torch -- confirm before re-enabling.
# np.random.seed(config.main_seed)
# torch.manual_seed(config.main_seed)
# torch.cuda.manual_seed(config.main_seed)

# Custom colormap: the same number of shades is taken from each base colormap
# and the per-colormap blocks are stacked in the order listed.
base_cmaps = ['Greys', 'Purples', 'Reds', 'Oranges', 'Blues', 'Greens']
n_base = len(base_cmaps)

# Shades per base colormap; n_base * N is always larger than p_eval.
N = config.p_eval // n_base + 1

# Sample only the 0.2-0.8 range of every colormap to avoid having several
# whites and blacks in the resulting colormap.
colors = np.vstack(
    [plt.get_cmap(cmap_name)(np.linspace(0.2, 0.8, N)) for cmap_name in base_cmaps]
)
custom_cmap = mcolors.ListedColormap(colors)

In [None]:
def get_rows_where_col_equals(df, col, value):
    """Return a copy of the rows of ``df`` whose ``col`` entry equals ``value``.

    Args:
        df: DataFrame to filter.
        col: Label of the column to compare.
        value: Value the column must equal for a row to be kept.

    Returns:
        A new DataFrame (original index labels preserved); modifying it does
        not affect ``df``.
    """
    matches = df[col].eq(value)
    selected = df[matches]
    return selected.copy()

In [None]:
# Path of the tab-separated evaluation table for the configured run; the file
# name encodes the run's hyperparameters.
eval_path = (
    f'{config.results_dir}/{config.act_name}/eval_with_train_r_star'
    f'_p{config.p_eval}_np1_Tn{config.context_len}_N{config.total_examples}'
    f'_ne{config.num_examples_per_prng}_n{config.n_embd}_h{config.n_head}_d{config.n_layer}'
    f'_I{config.main_seed}_lr{config.lr_trgt:0.6f}_Tw{config.warmup_steps}_T{config.num_steps}'
    f'_B{config.batch_size}_wd{config.weight_decay}.tab'
)

# Read the table (first column is the index) and shift token_idx up by one.
# NOTE(review): presumably converts 0-based token positions to 1-based -- confirm.
df_eval = (
    pd.read_csv(eval_path, sep='\t', index_col=0)
    .assign(token_idx=lambda frame: frame['token_idx'] + 1)
)

# Keep only the rows belonging to the largest token_idx for plotting.
df_plot = get_rows_where_col_equals(
    df_eval, col='token_idx', value=df_eval['token_idx'].max()
)

In [None]:
# Train/test token accuracy versus training step (log-scaled x axis), using
# the rows selected into df_plot.
colors = sns.color_palette('tab10', 10)
fontsize = 25

# Explicit Figure/Axes instead of pyplot's implicit "current axes", so every
# call below is guaranteed to target this figure.
fig, ax = plt.subplots(figsize=(7.5, 6))

# Basic plot settings
plt.setp(ax.spines.values(), color='black', linewidth=1)
ax.set_xscale('log')
ax.grid(True, which='both', linestyle='--', alpha=0.3)
ax.set_ylim(-0.05, 1.05)
ax.set_xlabel('steps', fontsize=fontsize)
ax.set_ylabel('Accuracy', fontsize=fontsize)
ax.tick_params(axis='both', which='major', labelsize=fontsize)
ax.tick_params(axis='x', which='both', bottom=True, labelbottom=True)
ax.tick_params(axis='y', which='both', left=True, labelleft=True)
# Reference line at perfect accuracy.
ax.axhline(y=1.0, linestyle='--', color='gray', alpha=0.5)
# ax.set_title(f'p = {config.p_eval}')

# Both curves are drawn with identical settings, including the error band:
# previously only the train curve used errorbar='se' while the test curve fell
# back to seaborn's default interval, so the two bands were not comparable.
curves = [
    ('train_token_accuracy', 'Train', colors[0]),
    ('test_token_accuracy', 'Test', colors[1]),
]
for column, label, color in curves:
    sns.lineplot(
        x='step', y=column, data=df_plot,
        linewidth=2.5, alpha=0.8, label=label, color=color,
        errorbar='se', ax=ax,
    )
# Alternative: pass data=df_eval instead of df_plot to plot the accuracy
# over all token positions rather than only the last one.
ax.legend(fontsize=fontsize)

## Save the figure. Replace the filename and directory with your own.
# fig.savefig(f'{config.plots_dir}/accs_main_{config.p_eval}_d{config.n_layer}_h{config.n_head}_ne{config.n_embd}.pdf', format='pdf', dpi=400, bbox_inches='tight')

plt.show()