In [1]:
import pandas as pd
import torch
import numpy as np
import json

from sklearn.model_selection import train_test_split

In [2]:
import inference
import trust_model
import train_model

# Run the project-local inference module's setup once, before get_input calls
# inference.get_y (presumably loads models/datasets — TODO confirm in inference.py).
inference.init()

In [3]:
# Instantiate the transformer encoder used in this notebook.
# NOTE: the training cell constructs a fresh model with the same
# hyper-parameters and rebinds this name before training starts.
model = trust_model.SimpleTransformerEncoder2(
    n_head=16,
    n_feature=128,
    dropout=0.1,
    n_hidden=2048,
    n_layers=3,
    n_out=2,
)

In [4]:
def which_dtype(array):
    """Choose the torch dtype to use when converting ``array`` to a tensor.

    Args:
        array: A ``torch.Tensor`` (or subclass) or a NumPy array.

    Returns:
        * the tensor's own dtype when ``array`` is already a tensor;
        * ``torch.long`` for NumPy integer arrays;
        * ``torch.float32`` for NumPy floating arrays (float64 is narrowed);
        * ``torch.bool`` for NumPy boolean arrays;
        * ``None`` for any other NumPy dtype, which leaves dtype inference
          to ``torch.tensor``.
    """
    # isinstance (not ``type(...) ==``) so Tensor subclasses such as
    # nn.Parameter are recognised instead of falling through to the NumPy
    # checks, where a torch dtype cannot be interpreted.
    if isinstance(array, torch.Tensor):
        return array.dtype
    if np.issubdtype(array.dtype, np.integer):
        return torch.long
    if np.issubdtype(array.dtype, np.floating):
        return torch.float32
    if np.issubdtype(array.dtype, np.bool_):
        return torch.bool
    return None

In [5]:
# Load the raw answer records from a path relative to the notebook's working
# directory. Later code in this notebook reads (at least) the 'user_id',
# 'mode', 'action', 'answer', 'AI_answer', 'instance_conf',
# 'instance_conf_shown', 'dataset_name' and 'ground_truth' columns.
captcha_answers = pd.read_json('data/paper_data.json')
def action_to_number(a):
    """Encode an action value as an integer: ``'AI'`` -> 0, anything else -> 2."""
    return 0 if a == 'AI' else 2
# Replace the raw action values with their integer codes (see action_to_number).
captcha_answers['action'] = captcha_answers['action'].map(action_to_number)

# The six modes kept for this experiment: RANDOM0, RANDOM12, ..., RANDOM60.
RANDOM_MODES = {f'RANDOM{step}' for step in range(0, 61, 12)}

# Keep only the rows recorded under one of those modes.
captcha_answers = captcha_answers.loc[captcha_answers['mode'].isin(RANDOM_MODES)]
user_ids = captcha_answers['user_id']

def get_input(user_id, target_index, n=1, no_mask=False):
    """Build the model inputs and padding masks for one user's answer sequence.

    Selects the rows of the module-level ``captcha_answers`` frame that belong
    to ``user_id`` (ordered by index) and turns them into tensors with one
    entry per row.

    Args:
        user_id: Value matched against ``captcha_answers['user_id']``.
        target_index: Position in the user's sequence that is being predicted.
        n: Number of positions, starting at ``target_index``, left unmasked.
        no_mask: Accepted for interface compatibility; not used by this
            function.

    Returns:
        ``(src, mask)`` — two dicts sharing the keys ``'middles'``,
        ``'instance_confs'``, ``'domain_confs'``, ``'actions'`` and
        ``'feedbacks'``.

        * ``src[key]`` is a tensor whose first dimension is the number of rows
          for this user. ``'domain_confs'`` is filled with -100 everywhere;
          ``'instance_confs'`` is -100 wherever ``instance_conf_shown`` is
          False. ``'actions'`` holds 0/1 (the stored code 2 is folded to 1).
          ``'feedbacks'`` holds 2 when the stored action is 0, otherwise 0 if
          ``answer == AI_answer`` and 1 if not.
        * ``mask[key]`` is a 1-D bool tensor that is True from
          ``target_index + n`` onward.
    """
    example = captcha_answers[captcha_answers['user_id'] == user_id].sort_index()

    def middle_features(raw):
        # First element of what inference.get_y returns for the 'middle' request.
        return inference.get_y(raw['dataset_name'], 'captcha-09az+capital-color', raw['ground_truth'], ['middle'])[0]

    def calc_feedback(raw):
        if raw['action'] == 0:
            return 2
        return 0 if raw['answer'] == raw['AI_answer'] else 1

    src = dict()
    src['middles'] = np.vstack(example.apply(middle_features, axis=1).values).astype(np.float32)
    src['instance_confs'] = example['instance_conf'].values[:, np.newaxis].copy()
    # Fill with -100 directly. Starting from np.empty_like and multiplying by 0
    # leaves NaN wherever the uninitialised buffer happened to hold NaN/inf.
    src['domain_confs'] = np.full_like(src['instance_confs'], -100)

    actions = example['action'].values.copy()
    actions[actions == 2] = 1
    src['actions'] = actions
    src['feedbacks'] = example.apply(calc_feedback, axis=1).values

    src = {key: torch.tensor(value, dtype=which_dtype(value)) for key, value in src.items()}

    # Hide the instance confidence wherever it was not shown to the user.
    # ``== False`` is kept deliberately: it is an element-wise comparison.
    src['instance_confs'][example['instance_conf_shown'].values == False] = -100

    # Every input shares the same mask: positions from target_index + n on are hidden.
    mask = dict()
    for name, value in src.items():
        mask[name] = torch.zeros((value.shape[0]), dtype=torch.bool)
        mask[name][target_index + n:] = True

    return src, mask

In [6]:

class Dataset(torch.utils.data.Dataset):
    """One sample per (user, episode position), built on top of ``get_input``.

    Sample ``idx`` belongs to user ``user_ids[idx // n_episode]`` and targets
    position ``idx % n_episode`` of that user's sequence. Every sequence is
    padded to ``n_episode`` entries.

    Args:
        user_ids: Indexable collection of user ids.
        with_trust: When True the label is read from ``src['trust']``;
            otherwise it is the action at the target position.
            NOTE(review): ``get_input`` as written in this notebook does not
            return a ``'trust'`` entry, so ``with_trust=True`` raises
            ``KeyError`` unless ``get_input`` is extended.
        n: Number of consecutive positions predicted per sample.
        no_mask: Forwarded to ``get_input``.
        n_episode: Fixed sequence length every sample is padded to.
    """

    def __init__(self, user_ids, with_trust=False, n=1, no_mask=False, n_episode=60):
        self.user_ids = user_ids
        self.n_episode = n_episode
        self.with_trust = with_trust
        self.n = n
        self.no_mask = no_mask

    def __len__(self):
        """Number of samples: one per user per episode position."""
        return self.n_episode * len(self.user_ids)

    def __getitem__(self, idx):
        """Return ``((src, mask, episode_idx), label)`` for sample ``idx``.

        ``src`` and ``mask`` are the dicts from ``get_input`` padded to
        ``n_episode`` (inputs with 0, masks with True). ``'actions'`` and
        ``'feedbacks'`` are overwritten with 3 from the target position on.
        The label is -100 when the user has no entry at the target position.
        """
        user_id = self.user_ids[idx // self.n_episode]
        episode_idx = idx % self.n_episode
        src, mask = get_input(user_id, episode_idx, self.n, self.no_mask)
        length = src['middles'].shape[0]
        src = {key: self._padding(value, self.n_episode, 0) for key, value in src.items()}
        mask = {key: self._padding(value, self.n_episode, True) for key, value in mask.items()}

        if length <= episode_idx:
            # Target lies beyond this user's real sequence.
            label = self._ignored_label()
        elif not self.with_trust:
            label = src['actions'][episode_idx].clone()
        elif self.n == 1:
            label = src['trust'][episode_idx].clone()
        else:
            stop = min(episode_idx + self.n, length)
            label = [src['trust'][i].clone() for i in range(episode_idx, stop)]
            # Pad with ignore labels so the list always has n entries.
            label += [torch.tensor(-100)] * max(0, episode_idx + self.n - length)

        # Labels were cloned above, so it is now safe to hide the target and
        # everything after it behind the [MASK] token (3).
        src['actions'][episode_idx:] = 3
        src['feedbacks'][episode_idx:] = 3

        return (src, mask, episode_idx), label

    def _ignored_label(self):
        """Return the -100 "ignore" label with the same structure as a real one.

        Real labels are a list of ``n`` tensors only when ``with_trust`` is
        set and ``n != 1``; in every other configuration they are a single
        scalar tensor. The ignore label mirrors that so samples in one batch
        have a consistent label structure.
        """
        if self.with_trust and self.n != 1:
            return [torch.tensor(-100)] * self.n
        return torch.tensor(-100)

    def _padding(self, array, length, value=None):
        """Extend ``array`` along dim 0 to exactly ``length`` entries.

        Args:
            array: Tensor to pad.
            length: Required size of the first dimension.
            value: Fill value for the added entries; ``None`` leaves them
                uninitialised.

        Returns:
            ``array`` itself when it already has ``length`` entries, otherwise
            a new tensor with the padding appended.

        Raises:
            ValueError: If ``array`` is longer than ``length``.
        """
        orig_length = array.shape[0]
        if orig_length == length:
            return array
        if orig_length > length:
            raise ValueError('Input array too long.')
        # Allocate the filler from the target shape rather than from array[:1],
        # so an empty input is padded correctly as well.
        pad_shape = (length - orig_length, *array.shape[1:])
        if value is None:
            addition = array.new_empty(pad_shape)
        else:
            addition = array.new_full(pad_shape, value)
        return torch.concat([array, addition])


In [7]:
# For every user, collect the set of modes their rows were recorded under.
user_ids = captcha_answers.groupby('user_id')['mode'].apply(set)

# Keep, per mode, the users seen under exactly that one mode.
# sorted(): RANDOM_MODES is a set of strings, so its iteration order can change
# between interpreter runs; a fixed order keeps the user/sample order reproducible.
users_groupby_mode = {mode: user_ids[user_ids == {mode}].index.tolist() for mode in sorted(RANDOM_MODES)}

# Split 80/20 within each mode so both partitions contain users from every mode.
user_ids_train, user_ids_test = zip(*(
    train_test_split(users_groupby_mode[mode], random_state=42, test_size=0.2)
    for mode in users_groupby_mode
))

dataset_train = Dataset(np.hstack(user_ids_train), with_trust=False, n=1)
dataset_test = Dataset(np.hstack(user_ids_test), with_trust=False, n=1)
# tuple() iterates each dataset once, so every sample is built here and the
# DataLoaders later index the already-built tuples.
dataset_train, dataset_test = map(tuple, [dataset_train, dataset_test])

In [8]:
# --- Training configuration ---
batch_size = 128
# Use the GPU when one is available; fall back to CPU instead of failing on .to('cuda').
# NOTE(review): train_model.train/test are not given the device — confirm they
# take it from the model rather than assuming CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
lr = 1e-5
num_iter = 50
out_dir = 'results'

dataloader_train = torch.utils.data.DataLoader(dataset_train, batch_size=batch_size, shuffle=True)
dataloader_test = torch.utils.data.DataLoader(dataset_test, batch_size=batch_size, shuffle=False)

# Move the model to its device before building the optimizer, as the
# torch.optim documentation recommends.
model = trust_model.SimpleTransformerEncoder2(
    n_head=16, n_feature=128, dropout=0.1, n_hidden=2048, n_layers=3, n_out=2
).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
criterion = torch.nn.CrossEntropyLoss()

# One result dict per epoch, as returned by train_model.train / train_model.test.
train_results = list()
test_results = list()

for i in range(num_iter):
    train_results.append(train_model.train(model, optimizer, criterion, dataloader_train))
    test_results.append(train_model.test(model, criterion, dataloader_test))
    loss = test_results[-1]['loss']
    print(i, train_results[-1], test_results[-1])

0 {'loss': 0.6788532733917236} {'loss': 0.6453506350517273, 'correct': 0.75}
1 {'loss': 0.6412965655326843} {'loss': 0.6044837832450867, 'correct': 0.7489583492279053}
2 {'loss': 0.6075401306152344} {'loss': 0.5701028108596802, 'correct': 0.7541666626930237}
3 {'loss': 0.5765319466590881} {'loss': 0.5444603562355042, 'correct': 0.7598958611488342}
4 {'loss': 0.5508473515510559} {'loss': 0.5211256742477417, 'correct': 0.7682291865348816}
5 {'loss': 0.527217447757721} {'loss': 0.5003982782363892, 'correct': 0.7770833373069763}
6 {'loss': 0.4995335042476654} {'loss': 0.4885295033454895, 'correct': 0.7828124761581421}
7 {'loss': 0.4884278476238251} {'loss': 0.4754669964313507, 'correct': 0.7822916507720947}
8 {'loss': 0.47342780232429504} {'loss': 0.4672887921333313, 'correct': 0.7880208492279053}
9 {'loss': 0.460038959980011} {'loss': 0.4626694321632385, 'correct': 0.7895833253860474}
10 {'loss': 0.45185792446136475} {'loss': 0.4589918553829193, 'correct': 0.7911458611488342}
11 {'loss': 