Set `debug` for mini_local mode. The configuration is loaded from a YAML file for convenience, so only the 'directory args' need to be provided.

debug -> internal mode (disable below modes)

mini_local -> mini_test in local if True

mini_lambda -> search hyperparameter if True

main_lambda -> try main_test if True

In [None]:
from IPython import get_ipython

In [None]:
# Master switch: when True the notebook runs in internal/debug mode.
debug = True

In [None]:
import yaml
import argparse

from pathlib import Path

parser = argparse.ArgumentParser()
parser.add_argument('--yaml_path', default='configs/baseline.yaml', type=str)
parser.add_argument('--mode', default='mini_local', type=str)
# An explicit empty argument list is parsed, so the defaults above always win
# (presumably because sys.argv belongs to the notebook kernel -- confirm).
args = parser.parse_args(args=[])

# NOTE: yaml.Loader can construct arbitrary Python objects; only load trusted
# configuration files with it.
# The context manager closes the file handle that a bare open() would leak.
with open(args.yaml_path, 'r') as config_file:
    config = yaml.load(config_file, Loader=yaml.Loader)

# Keep only the section that belongs to the selected mode.
config = config[args.mode]

# Name of the config file without folder and extension
# ('configs/baseline.yaml' -> 'baseline'); used to label output folders.
yaml_name = Path(args.yaml_path).stem

In [None]:
import os
# Create the top-level output folders. Each folder is handled independently so
# that an already existing one does not prevent the others from being created.
for folder in ('models', 'res', 'cache', 'supervised_model'):
    os.makedirs(folder, exist_ok=True)

In [None]:
import torch

# Pick the best available accelerator: CUDA first, then Apple MPS, and fall
# back to the CPU so the notebook still runs on hosts without either.
if torch.cuda.is_available():
    device = torch.device('cuda')
elif getattr(torch.backends, 'mps', None) is not None and torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

Set most parts of the hyperparameters

In [None]:
# Settings of the available sentence-embedding models, keyed by a short alias.
which_model = {
    'mine': dict(
        sent_emb_model_path='sorryhyun/sentence-embedding-klue-large',
        use_cls=False,
        hidden_size=1024,
    ),
}

In [None]:
# Settings of the embedding model used throughout the notebook.
model_set_dict = which_model['mine']

# Caching of the computed representations is switched off.
save_cache = False

# Worker processes for data loading: 4 only when debug is exactly False.
if debug is False:
    num_proc = 4
else:
    num_proc = 0

Download pre-split files via *(deprecated hyperlink)*

Access to the shared files is limited to a-ha employees.

In [None]:
from datasets import load_from_disk

# Load the pre-split question/answer datasets from disk. When sharding is
# enabled only shard 0 is kept; the test sets are cut into half as many shards
# as the training sets.
if config['shard_num'] > 1:
    train_question_dataset, train_answer_dataset = (
        load_from_disk(path).shard(config['shard_num'], 0)
        for path in ('data/train_question_dataset', 'data/train_answer_dataset')
    )
    test_question_dataset, test_answer_dataset = (
        load_from_disk(path).shard(int(config['shard_num']/2), 0)
        for path in ('data/test_question_dataset', 'data/test_answer_dataset')
    )
else:
    (train_question_dataset, train_answer_dataset,
     test_question_dataset, test_answer_dataset) = (
        load_from_disk(path)
        for path in (
            'data/train_question_dataset',
            'data/train_answer_dataset',
            'data/test_question_dataset',
            'data/test_answer_dataset',
        )
    )

# Hold out 10% of each training set; the 'test' part of the split is used as
# the dev set later on.
train_question_dataset, train_answer_dataset = (
    dataset.train_test_split(test_size=0.1)
    for dataset in (train_question_dataset, train_answer_dataset)
)

In [None]:
from datasets import Dataset

def extract_contents(q_dataset, a_dataset):
    """Build one-column ``Dataset`` objects from a question and an answer dataset.

    NOTE(review): the column names below are redacted placeholders (marked
    ``# embargo``) and ``paragraph`` is not defined anywhere in the visible
    code, so this function raises ``NameError`` as written. The real column
    selection has to be restored before it can run.

    Args:
        q_dataset: iterable of question rows, indexed by the keys
            'a', 'b', 'c' and 'd'.
        a_dataset: iterable of answer rows, indexed by the keys
            'a', 'b' and 'c'.

    Returns:
        A pair ``(question_dataset, answer_dataset)``, each created with
        ``Dataset.from_dict`` from a dict holding a single 'paragraph' column.
    """
    # These two initial values are overwritten below before they are read.
    question_contents_extracted = {}
    answer_contents_extracted = {}

    # One tuple of the four selected fields per question row.
    question_extracted = [(x['a'], x['b'], x['c'], x['d']) for x in q_dataset] # embargo

    # Transpose the row tuples into one sequence per field.
    # NOTE(review): unpacking fails when q_dataset is empty -- confirm callers
    # never pass an empty dataset.
    a,b,c,d = zip(*question_extracted)
    question_contents_extracted = {
        'paragraph': list(paragraph)
    }
    # Same procedure for the answers, with three fields per row.
    answer_extracted = [(x['a'], x['b'], x['c']) for x in a_dataset] # embargo
    a, b, c = zip(*answer_extracted)
    answer_contents_extracted = {
        'paragraph': list(paragraph)
    }

    return Dataset.from_dict(question_contents_extracted), Dataset.from_dict(answer_contents_extracted)

# Extract the contents of the training part of the train/dev split.
q_contents, a_contents = extract_contents(train_question_dataset['train'], train_answer_dataset['train'])

In [None]:
from funcs.preprocessors import Preprocessor
# A single preprocessor instance is shared by every split in this notebook.
preprocessor = Preprocessor(debug=debug, config=config)

# Preprocess the training questions first, then the training answers.
q_preprocessed, a_preprocessed = (
    preprocessor.preprocess(contents) for contents in (q_contents, a_contents)
)

The function make_reps() automatically concatenates the representations of the question and the question title.

In [None]:
from funcs.embedders import Embedder
from funcs.tokenized_loader import Tokenized_loader
from torch.utils.data import DataLoader
import numpy as np

def make_reps(preprocessed_data, is_question=True):
    """Tokenize preprocessed contents and compute their representations.

    Args:
        preprocessed_data: output of ``preprocessor.preprocess``.
        is_question: forwarded to ``Tokenized_loader.set_for_rep``.

    Returns:
        A tuple ``(representations, indices, split_contents_flatten)`` where
        ``representations`` is the result of ``Embedder.run_rep_mean_pool``
        converted to a numpy array on the CPU, and the other two values are
        passed through unchanged from ``set_for_rep``.
    """
    loader = Tokenized_loader(
        sent_emb_model_path=model_set_dict['sent_emb_model_path'],
        config=config,
    )
    prepared = loader.set_for_rep(preprocessed_data, is_question=is_question)
    # set_for_rep returns five values; the second one is not needed here.
    output_buffer, _, tokenized, indices, split_contents_flatten = prepared

    batches = DataLoader(
        tokenized,
        batch_size=config['batch_size'],
        num_workers=num_proc,
        pin_memory=True,
    )

    embedder = Embedder(
        sent_emb_model_path=model_set_dict['sent_emb_model_path'],
        device=device,
        config=config,
    )
    pooled = embedder.run_rep_mean_pool(output_buffer, batches)
    representations = pooled.detach().cpu().numpy()

    # Drop the local reference to the embedder before returning.
    del embedder
    return representations, indices, split_contents_flatten

# Embed the training questions and answers separately ...
q_reps, q_indices, q_flatten = make_reps(q_preprocessed, is_question=True)
a_reps, a_indices, a_flatten = make_reps(a_preprocessed, is_question=False)

# ... then merge them, questions first, so that row i of `reps` belongs to
# flatten_texts[i].
reps = np.concatenate([q_reps, a_reps])
flatten_texts = q_flatten + a_flatten

In [None]:
# import h5py, json
# if save_cache:
#     with h5py.File(f'cache/3q_reps', 'w') as hdf5_file:
#         dset = hdf5_file.create_dataset('q', data=q_reps, chunks=True, compression="gzip", compression_opts=9)
#     with open(f'cache/3q_indices.json','w') as f: json.dump(q_indices,f)

#     with h5py.File(f'cache/3a_reps', 'w') as hdf5_file:
#         dset = hdf5_file.create_dataset('a', data=a_reps, chunks=True, compression="gzip", compression_opts=9)
#     with open(f'cache/3a_indices.json','w') as f: json.dump(a_indices,f)

In [None]:
import numpy as np
from utils.customized_com_det import community_detection

def sample_and_cluster(reps, flatten_texts, threshold=0.6):
    """Sample representations, cluster them and label every sampled item.

    Which rows are clustered depends on the run mode:
    * ``debug`` is truthy: a random subset of at most 2000 rows,
    * ``'main'`` in ``config['mode']``: all rows,
    * otherwise: a random subset of at most ``config['sample_size']`` rows.

    Args:
        reps: 2-D numpy array of representations, row-aligned with
            ``flatten_texts``.
        flatten_texts: sequence holding one text per row of ``reps``.
        threshold: similarity threshold forwarded to ``community_detection``.

    Returns:
        A tuple ``(cluster_criteria, cluster_idx_dict, cluster_text_dict,
        flatten_cluster)``:
        * ``cluster_criteria``: tensor stacking the representation of the
          first member of every cluster,
        * ``cluster_idx_dict``: cluster label -> member indices (positions in
          the sampled data),
        * ``cluster_text_dict``: cluster label -> member texts,
        * ``flatten_cluster``: one label per sampled item; items outside every
          cluster get the extra label ``len(clusters)``.
    """
    if debug:
        sample_size = min(2000, len(reps))
    elif 'main' in config['mode']:
        sample_size = None  # use everything
    else:
        sample_size = min(config['sample_size'], len(reps))

    if sample_size is None:
        sampled_reps = reps
        sampled_texts = flatten_texts
    else:
        # The size is capped at len(reps) above because sampling without
        # replacement cannot draw more rows than exist.
        sampled_indices = np.random.choice(len(reps), size=sample_size, replace=False)
        sampled_reps = reps[sampled_indices]
        sampled_texts = [flatten_texts[x] for x in sampled_indices]

    clusters = community_detection(
        sampled_reps,
        batch_size=config['batch_size'],
        min_community_size=config['comm_size'],
        threshold=threshold,
        show_progress_bar=True,
    )

    # Cluster members are positions in the *sampled* data (they index
    # sampled_texts below), so the criteria must be read from sampled_reps,
    # not from the full reps array.
    cluster_criteria = torch.stack(
        [torch.from_numpy(sampled_reps[members[0], :]) for members in clusters]
    )

    # out of clustered data would be regarded as 'ooc'
    # this will be the last label in supervised model
    ooc = len(clusters)
    # One slot per sampled item, whatever sample size was actually used.
    flatten_cluster = [ooc] * len(sampled_texts)

    cluster_idx_dict = {}  # cluster_label : index_of_reps
    cluster_text_dict = {}  # cluster_label : sentence
    for label, members in enumerate(clusters):
        cluster_idx_dict[label] = members
        cluster_text_dict[label] = [sampled_texts[x] for x in members]
        for member in members:
            flatten_cluster[member] = label

    return cluster_criteria, cluster_idx_dict, cluster_text_dict, flatten_cluster

# Run one clustering pass with the configured cosine threshold.
(cluster_criteria,
 cluster_idx_dict,
 cluster_text_dict,
 flatten_cluster) = sample_and_cluster(reps, flatten_texts, threshold=config['cosine_threshold'])

# Per cluster: the member texts, and a same-length list repeating the label.
clustered_data = list(cluster_text_dict.values())
cluster_label = [[label] * len(texts) for label, texts in cluster_text_dict.items()]

Sample & test average performance. This is for mini-test-lambda.

In [None]:
from transformers import set_seed
from utils.utils import get_flatten
import pickle, json
import os

# Create the per-config output folders. Each folder is handled independently,
# so an already existing one does not prevent the others from being created.
for parent in ('models', 'res', 'cache'):
    os.makedirs(f'{parent}/{yaml_name}', exist_ok=True)

# One clustering run per seed; the artefacts are stored under the run index n.
for n, seed_num in enumerate(config['seed_list']):
    set_seed(seed_num)
    cluster_criteria, cluster_idx_dict, cluster_text_dict, flatten_cluster =\
        sample_and_cluster(reps, flatten_texts, threshold=config['cosine_threshold'])

    # Per cluster: the member texts, and a same-length list repeating the label.
    clustered_data = list(cluster_text_dict.values())
    cluster_label = [[key] * len(texts) for key, texts in cluster_text_dict.items()]

    # Flatten both nested lists into aligned text/label columns.
    train_text, _ = get_flatten(clustered_data)
    train_label, _ = get_flatten(cluster_label)
    train_dataset = Dataset.from_dict({'text':train_text, 'label':train_label})
    train_dataset.save_to_disk(f'cache/{yaml_name}/train_dataset_{n}')

    torch.save(cluster_criteria, f'models/{yaml_name}/cluster_criteria_{n}')
    with open(f'res/{yaml_name}/cluster_dict_{n}.pickle', 'wb') as f:
        pickle.dump(cluster_idx_dict, f, pickle.HIGHEST_PROTOCOL)
    with open(f'res/{yaml_name}/cluster_text_dict_{n}.json', 'w') as f:
        json.dump(cluster_text_dict, f, ensure_ascii=False, indent=4)

    torch.cuda.empty_cache()

Apply the same procedure to the held-out splits, dev and test.

In [None]:
# Dev split: the 'test' part of the earlier train_test_split.
dev_q_contents, dev_a_contents = extract_contents(
    train_question_dataset['test'],
    train_answer_dataset['test'],
)

# Preprocess the dev questions first, then the dev answers.
dev_q_preprocessed, dev_a_preprocessed = (
    preprocessor.preprocess(contents)
    for contents in (dev_q_contents, dev_a_contents)
)

q_reps, q_indices, q_flatten = make_reps(dev_q_preprocessed, is_question=True)
a_reps, a_indices, a_flatten = make_reps(dev_a_preprocessed, is_question=False)

# Questions first, so that row i of dev_reps belongs to dev_flatten_texts[i].
dev_reps = np.concatenate([q_reps, a_reps])
dev_flatten_texts = q_flatten + a_flatten

In [None]:
# Test split: the datasets loaded from the dedicated test files.
test_q_contents, test_a_contents = extract_contents(
    test_question_dataset,
    test_answer_dataset,
)

# Preprocess the test questions first, then the test answers.
test_q_preprocessed, test_a_preprocessed = (
    preprocessor.preprocess(contents)
    for contents in (test_q_contents, test_a_contents)
)

q_reps, q_indices, q_flatten = make_reps(test_q_preprocessed, is_question=True)
a_reps, a_indices, a_flatten = make_reps(test_a_preprocessed, is_question=False)

# Questions first, so that row i of test_reps belongs to test_flatten_texts[i].
test_reps = np.concatenate([q_reps, a_reps])
test_flatten_texts = q_flatten + a_flatten

Predict their cluster, like pseudo-labeling

In [None]:
def predict_cluster(rep_tensor, cluster_criteria, threshold=0.6):
    """Assign every representation to its most similar cluster criterion.

    Args:
        rep_tensor: 2-D numpy array with one representation per row.
        cluster_criteria: 2-D tensor with one criterion vector per cluster.
        threshold: minimum cosine similarity required to accept the best
            cluster; rows below it get the out-of-cluster label.

    Returns:
        A list with one int label per row of ``rep_tensor``. The
        out-of-cluster label equals ``len(cluster_criteria)``.
    """
    rep_tensor = torch.from_numpy(rep_tensor)
    ooc = len(cluster_criteria)

    # Without any criterion (or any row) there is nothing to compare; an
    # argmax over an empty dimension would raise, so answer directly.
    if ooc == 0 or rep_tensor.shape[0] == 0:
        return [ooc] * rep_tensor.shape[0]

    eps = 1e-8
    rep_norm = torch.nn.functional.normalize(rep_tensor, p=2, dim=1).to(device)
    criteria_norm = torch.nn.functional.normalize(cluster_criteria, p=2, dim=1).to(device)

    # Cosine similarity of every row with every criterion; values below eps
    # (including all negative similarities) are raised to eps.
    sim_mt = torch.mm(rep_norm, criteria_norm.t()).clamp(min=eps)

    best_cluster = sim_mt.argmax(dim=1)
    best_sim = sim_mt.gather(1, best_cluster.unsqueeze(1)).squeeze(1)

    # Apply the threshold to all rows at once instead of reading the matrix
    # element by element from Python.
    labels = torch.where(
        best_sim >= threshold,
        best_cluster,
        torch.full_like(best_cluster, ooc),
    )
    return labels.tolist()

In [None]:
# Pseudo-label the dev and test splits with the criteria of every clustering
# run. The dataset variables are rebound on each pass, so after the loop they
# hold the labels of the last run.
for n, _ in enumerate(config['seed_list']):
    cluster_criteria_path = f'models/{yaml_name}/cluster_criteria_{n}'
    cluster_criteria = torch.load(cluster_criteria_path, map_location=device)

    dev_labels = predict_cluster(
        dev_reps, cluster_criteria, threshold=config['cosine_threshold'])
    dev_dataset = Dataset.from_dict({
        'text': dev_flatten_texts,
        'label': dev_labels,
    })

    test_labels = predict_cluster(
        test_reps, cluster_criteria, threshold=config['cosine_threshold'])
    test_dataset = Dataset.from_dict({
        'text': test_flatten_texts,
        'label': test_labels,
    })

Evaluate via distillation using a pseudo-supervised model

In [None]:
from funcs.trainer import BERT_Trainer
import json

BERT_model_name = 'monologg/koelectra-base-v3-discriminator'
predicted_res = []
achieved_scores = []

# 'main' runs keep their checkpoints and results apart from the mini runs.
is_main_run = 'main' in args.mode
model_root = 'supervised_model_main' if is_main_run else 'supervised_model'
result_root = 'res_main' if is_main_run else 'res'

for n, seed in enumerate(config['seed_list']):
    set_seed(seed)

    # makedirs also creates the parent folder, which the earlier setup code
    # never creates for the 'main' variants.
    output_path = f'{model_root}/{yaml_name}_seed_{seed}'
    os.makedirs(output_path, exist_ok=True)

    # Use the artefacts of the n-th clustering run for the n-th seed. Without
    # this reload every seed would train and evaluate on whatever datasets
    # were left over from the last clustering run.
    cluster_criteria = torch.load(
        f'models/{yaml_name}/cluster_criteria_{n}', map_location=device)
    train_dataset = load_from_disk(f'cache/{yaml_name}/train_dataset_{n}')

    dev_labels = predict_cluster(dev_reps, cluster_criteria, threshold=config['cosine_threshold'])
    dev_dataset = Dataset.from_dict({'text': dev_flatten_texts, 'label': dev_labels})

    test_labels = predict_cluster(test_reps, cluster_criteria, threshold=config['cosine_threshold'])
    test_dataset = Dataset.from_dict({'text': test_flatten_texts, 'label': test_labels})

    # NOTE(review): dev/test labels may contain the out-of-cluster label
    # len(cluster_criteria), which lies outside range(num_labels) unless
    # BERT_Trainer reserves an extra class itself -- confirm.
    bert_trainer = BERT_Trainer(
        pretrained_model_path=BERT_model_name, output_path=output_path,
        debug=debug, num_labels=len(cluster_criteria), config=config)

    bert_trainer.train_with_optimization(
        train_dataset=train_dataset, dev_dataset=dev_dataset,
        mode=config['mode'], batch_size=config['batch_size'])
    predicted_labels = bert_trainer.test(test_dataset=test_dataset)
    # NOTE(review): `label_ids` is stored under the name "predicted" -- verify
    # that it holds predictions rather than the reference labels.
    predicted_res.append(list(predicted_labels.label_ids))
    achieved_scores.append(predicted_labels.metrics['test_f1'])

    # Release the trainer before the next seed starts.
    del bert_trainer
    torch.cuda.empty_cache()

# Cast to plain floats so the summary stays JSON-serialisable whatever numeric
# type the metric values have.
total_res = {
    'mean_score': float(np.mean(achieved_scores)),
    'score_std': float(np.std(achieved_scores)),
}

os.makedirs(result_root, exist_ok=True)
with open(f'{result_root}/{yaml_name}_results.json', 'w') as f:
    json.dump(total_res, f, indent=4)
with open(f'{result_root}/{yaml_name}_results_labels.pickle', 'wb') as f:
    pickle.dump(predicted_res, f)