In [None]:
import json
import pandas as pd
import seaborn as sns
import os
from sklearn.metrics import f1_score
from matplotlib import pyplot as plt
from pathlib import Path

# Ask CUDA to launch kernels synchronously so a failing kernel is reported at
# the call that caused it. The variable is set here, before torch is imported
# in a later cell.
# NOTE(review): this is a debugging aid that slows GPU work -- confirm it is
# still wanted for full training runs.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'


In [None]:
# Training configuration read by train() further down.
BATCH_SIZE = 64  # batch size of both the train and the test DataLoader
DEVICE = 'cuda'  # device the model is moved to and that is passed to train_model
LR = 1e-5  # learning rate of the Adam optimizer
NUM_ITER = 30  # number of train + test passes per cross-validation split

# Root directory that receives the per-split checkpoints and log.json.
out_dir_base = Path('data/exp_access/reliance_model')

In [None]:
import load_exp_results
# load() yields five values; the last two are not used in this notebook.
valid_user, task_result, f_score, _, _ = load_exp_results.load()

# Restrict both tables to rows that mention "RandomCueUser".
keep_f_score = f_score['condition'].map(lambda condition: 'RandomCueUser' in condition)
keep_task_result = task_result['user_class'].map(lambda user_class: 'RandomCueUser' in user_class)
f_score = f_score[keep_f_score]
task_result = task_result[keep_task_result]

In [None]:
import torch
import numpy as np

import inference
# Set up the project `inference` module; get_input() and Dataset below call
# inference.get_y and read its confidence / accuracy estimator tables.
# NOTE(review): presumably this loads the underlying models or datasets --
# confirm in inference.py.
inference.init()

def which_dtype(array):
    """Pick the torch dtype to use when turning ``array`` into a tensor.

    Args:
        array: A ``torch.Tensor`` (or subclass) or a NumPy array.

    Returns:
        The tensor's own dtype for tensors. For NumPy arrays:
        ``torch.long`` for integer dtypes, ``torch.float32`` for floating
        dtypes and ``torch.bool`` for booleans. ``None`` for any other NumPy
        dtype, which leaves dtype inference to ``torch.tensor``.
    """
    # isinstance rather than an exact type comparison, so Tensor subclasses
    # (e.g. nn.Parameter) are handled here instead of reaching the NumPy checks,
    # which cannot interpret a torch dtype.
    if isinstance(array, torch.Tensor):
        return array.dtype
    if np.issubdtype(array.dtype, np.integer):
        return torch.long
    if np.issubdtype(array.dtype, np.floating):
        return torch.float32
    if np.issubdtype(array.dtype, np.bool_):
        return torch.bool
    return None

def get_input(user_id, target_index):
    """Build the model inputs for one user and one target episode.

    Reads the module-level ``task_result`` frame and calls ``inference.get_y``.

    Args:
        user_id: Value matched against ``task_result['user_id']``.
        target_index: Position, after sorting the user's rows by
            ``episode_id``, of the episode whose decision is to be predicted.

    Returns:
        A tuple ``(src, mask, ys)``:

        * ``src`` -- dict of tensors keyed ``'middles'``, ``'cues'`` and
          ``'decisions'``, with one entry per episode of the user.
        * ``mask`` -- dict holding a boolean tensor under ``'middles'`` that is
          ``True`` for every episode after ``target_index``.
        * ``ys`` -- the ``'y'`` result of ``inference.get_y`` for the target
          episode, or ``None`` when the user has no episode at
          ``target_index`` (callers that pad short sequences must check this).
    """
    task_name = 'captcha-09az+capital-color'
    example = task_result[task_result['user_id'] == user_id].sort_values('episode_id')

    def model_output(row, key):
        # First element of what inference.get_y returns for this episode.
        return inference.get_y(row['dataset_name'], task_name, row['ground_truth'], [key])[0]

    src = dict()
    src['middles'] = np.vstack(
        example.apply(lambda raw: model_output(raw, 'middle'), axis=1).values
    ).astype(np.float32)
    src['cues'] = example['system_action_token'].values.copy()
    src['decisions'] = example['user_decision_token'].values.copy()

    src = {key: torch.tensor(value, dtype=which_dtype(value)) for key, value in src.items()}
    mask = {'middles': torch.zeros((src['middles'].shape[0]), dtype=torch.bool)}

    # Hide everything that comes after the target episode.
    mask['middles'][target_index+1:] = True

    # Only the target episode's output is returned, so compute just that row
    # instead of running get_y over every episode and discarding the rest.
    if target_index < len(example):
        ys = model_output(example.iloc[target_index], 'y')
    else:
        # The user has fewer episodes than target_index: nothing to look up.
        ys = None
    return src, mask, ys

class Dataset(torch.utils.data.Dataset):
    """Map-style dataset with one item per (user, episode) pair.

    Item ``idx`` belongs to user ``user_ids[idx // n_episode]`` and targets
    episode ``idx % n_episode``. Every sequence is padded to ``n_episode``
    entries, and the decisions from the target episode onwards are replaced by
    ``MASK_TOKEN`` (a module-level name that must be defined before items are
    fetched).

    Args:
        user_ids: Indexable collection of user ids handed to ``get_input``.
        rtn_acc: When true, every item also carries an accuracy estimate for
            the target episode.
        n_episode: Length every user's sequence is padded to.
    """

    # Label for positions without a real episode; -100 is the default
    # ``ignore_index`` of torch.nn.CrossEntropyLoss.
    IGNORE_INDEX = -100

    def __init__(self, user_ids, rtn_acc=False, n_episode=60):
        self.user_ids = user_ids
        self.n_episode = n_episode
        self.rtn_acc = rtn_acc

    def __len__(self):
        """Number of (user, episode) items."""
        return self.n_episode * len(self.user_ids)

    def __getitem__(self, idx):
        """Return ``((src, mask, episode_idx), label)`` or, with ``rtn_acc``,
        ``((src, mask, episode_idx), label, acc)``.

        An out-of-range ``idx`` surfaces as the ``IndexError`` raised by the
        ``user_ids`` lookup, which is also what ends plain iteration over the
        dataset (e.g. ``tuple(dataset)``).
        """
        user_id = self.user_ids[idx // self.n_episode]
        episode_idx = idx % self.n_episode
        src, mask, ys = get_input(user_id, episode_idx)
        length = src['middles'].shape[0]
        # True when the user has no real episode at the target position.
        is_padding = length <= episode_idx

        src = {key: self._padding(value, self.n_episode, 0) for key, value in src.items()}
        mask = {key: self._padding(value, self.n_episode, True) for key, value in mask.items()}

        if is_padding:
            # A 0-d tensor of the same dtype as real labels, so batches that mix
            # padded and real positions can still be collated.
            label = torch.tensor(self.IGNORE_INDEX, dtype=src['decisions'].dtype)
        else:
            # 0: AI, 1: SELF
            label = src['decisions'][episode_idx].clone() - 1
        # add [MASK] token
        src['decisions'][episode_idx:] = MASK_TOKEN

        if self.rtn_acc:
            if is_padding:
                # No episode exists at this position, so there is nothing to score.
                acc = float('nan')
            else:
                task_name = 'captcha-09az+capital-color'
                conf = inference.instance_confidence_calculators[task_name].calc_score(
                    ys.reshape((1, 36, 5))
                )
                acc = inference.acc_estimators[task_name].predict_proba(conf.reshape((1, 1)))[0,1]
            return (src, mask, episode_idx), label, acc
        return (src, mask, episode_idx), label

    def _padding(self, array, length, value=None):
        """Pad ``array`` along its first dimension up to ``length`` entries.

        Args:
            array: Tensor to pad.
            length: Required size of the first dimension.
            value: Fill value for the added entries; ``None`` leaves them
                uninitialised.

        Returns:
            ``array`` itself when it already has ``length`` entries, otherwise
            a new tensor of that length.

        Raises:
            ValueError: If ``array`` is longer than ``length``.
        """
        orig_length = array.shape[0]
        if orig_length == length:
            return array
        if orig_length > length:
            raise ValueError('Input array too long.')
        # Derive the pad shape from array.shape rather than from array[:1], so a
        # zero-length input is padded as well.
        pad_shape = (length - orig_length, *array.shape[1:])
        if value is None:
            addition = array.new_empty(pad_shape)
        else:
            addition = array.new_full(pad_shape, value)
        return torch.concat([array, addition])

In [None]:
from sklearn.model_selection import StratifiedKFold

import trust_model
import train_model
from const import MASK_TOKEN
def train(out_dir_base, targets=None):
    """Train and evaluate the reliance model with 3-fold stratified cross-validation.

    Uses the module-level ``f_score`` frame, the ``Dataset`` class and the
    ``BATCH_SIZE`` / ``DEVICE`` / ``LR`` / ``NUM_ITER`` settings.

    Args:
        out_dir_base: ``pathlib.Path`` of the output directory (created if
            missing). Split ``k`` writes ``<out_dir_base>/<k>/<i>.pth`` (the
            model state dict) after every iteration ``i``; the collected
            metrics are written to ``<out_dir_base>/log.json`` at the end.
        targets: ``None`` trains the full model; any other value is forwarded
            as ``targets`` to the ablation model.
    """
    out_dir_base.mkdir(exist_ok=True, parents=True)

    # One row per f_score index entry, labelled "<condition>-<rate>". The label
    # is the stratification class; the index values are later used as user ids.
    ids = pd.DataFrame([
        {'name': f'{condition}-{rate}', 'index': user_id}
        for (condition, rate), group in f_score.groupby(['condition', 'rate'])
        for user_id in group.index
    ]).set_index('index')

    skf = StratifiedKFold(n_splits=3, random_state=42, shuffle=True)
    results = list()

    # Architecture shared by the full and the ablation model.
    model_kwargs = dict(n_head=16, n_feature=128, dropout=0.2, n_hidden=2048, n_layers=3, n_out=2)

    for i_split, (idxs_train, idxs_test) in enumerate(skf.split(ids.index, ids['name'])):
        out_dir = out_dir_base / str(i_split)
        out_dir.mkdir(exist_ok=True)

        user_ids_train = ids.index[idxs_train]
        user_ids_test = ids.index[idxs_test]
        dataset_train, dataset_test = Dataset(np.hstack(user_ids_train)), Dataset(np.hstack(user_ids_test))
        # Materialise every item once up front so the epochs below reuse them
        # instead of rebuilding each item through Dataset.__getitem__.
        dataset_train, dataset_test = map(tuple, [dataset_train, dataset_test])

        dataloader_train = torch.utils.data.DataLoader(dataset_train, batch_size=BATCH_SIZE, shuffle=True)
        dataloader_test = torch.utils.data.DataLoader(dataset_test, batch_size=BATCH_SIZE, shuffle=False)

        if targets is None:
            # full
            model = trust_model.SimpleTransformerEncoder_Access(**model_kwargs)
        else:
            # ablation
            model = trust_model.SimpleTransformerEncoder_Access_Ablation(targets=targets, **model_kwargs)
        model = model.to(DEVICE)
        optimizer = torch.optim.Adam(model.parameters(), lr=LR)
        criterion = torch.nn.CrossEntropyLoss()

        for i in range(NUM_ITER):
            result_train = train_model.train(model, optimizer, criterion, dataloader_train, device=DEVICE)
            result_test = train_model.test(model, criterion, dataloader_test, device=DEVICE)
            results.append(
                dict(i_split=i_split, i=i, train=True, **result_train)
            )
            results.append(
                dict(i_split=i_split, i=i, train=False, **result_test)
            )
            print(i, result_train, result_test)
            # Keep a checkpoint for every iteration of every split.
            torch.save(model.state_dict(), out_dir/f'{i}.pth')
    results = pd.DataFrame(results)
    results.to_json(out_dir_base/'log.json')

# Train the full (non-ablated) model. out_dir_base is defined once, in the
# configuration cell near the top of the notebook.
train(out_dir_base)


In [None]:
import pandas as pd
from matplotlib import pyplot as plt

results = pd.read_json(out_dir_base/'log.json')

# Split the log into its evaluation and training rows once.
eval_log = results[~results['train']]
train_log = results[results['train']]

# Each panel: (rows to plot, metric on the y axis, y-extent of the marker at i=19).
panels = [
    (eval_log, 'loss', (0.4, 0.65)),
    (eval_log, 'correct', (0.6, 0.85)),
    (train_log, 'loss', (0.4, 0.65)),
]
for panel_rows, metric, (y_low, y_high) in panels:
    sns.lineplot(data=panel_rows, x='i', y=metric)
    plt.vlines(19, y_low, y_high)
    plt.show()

# Per-iteration mean and standard deviation of the evaluation rows.
eval_by_iter = eval_log.groupby('i')
eval_by_iter.mean(), eval_by_iter.std()