In [None]:
%pip install transformers==4.15.0

In [None]:
import csv
import os
import argparse
import random
from tqdm import tqdm, trange

import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler

#from modeling import BertConfig, BertForSequenceClassification
from transformers import BertForSequenceClassification, BertConfig, AdamW, BertTokenizer

import json

In [None]:
# --- Task / optimisation settings shared by the cells below -----------------
max_seq_length = 128   # every encoded sequence is truncated / padded to this many tokens
n_class = 4            # number of candidate answers scored per question
BATCH_SIZE = 3         # DataLoader batch size (counted in questions)
EPOCH = 1              # number of passes over the training set
lr = 5e-5              # learning rate handed to the optimizer

# --- Input locations ---------------------------------------------------------
data_dir = "."
vocab_file = "vocab.txt"

# Tokenizer built from the local vocabulary file; do_lower_case=False means the
# tokenizer itself does not lower-case its input.
tokenizer = BertTokenizer(vocab_file=vocab_file, do_lower_case=False)

In [None]:
class InputExample:
    """One raw (untokenized) example for sequence classification.

    Attributes:
        guid: Unique id for the example.
        text_a: string. Untokenized text of the first sequence.
        text_b: (Optional) string. Untokenized text of the second sequence.
        text_c: (Optional) string. Untokenized text of a third sequence.
        label: (Optional) string. Label of the example.
    """

    def __init__(self, guid, text_a, text_b=None, label=None, text_c=None):
        """Store every argument unchanged on the instance.

        Args:
            guid: Unique id for the example.
            text_a: string. The first sequence; the only one that is required.
            text_b: (Optional) string. The second sequence.
            label: (Optional) string. The label of the example.
            text_c: (Optional) string. The third sequence.
        """
        self.guid, self.label = guid, label
        self.text_a, self.text_b, self.text_c = text_a, text_b, text_c


class InputFeatures:
    """Model-ready encoding of a single example.

    Attributes:
        input_ids: value passed as ``input_ids``.
        input_mask: value passed as ``input_mask``.
        segment_ids: value passed as ``segment_ids``.
        label_id: value passed as ``label_id``.
    """

    def __init__(self, input_ids, input_mask, segment_ids, label_id):
        """Keep the four encoded fields exactly as given."""
        self.label_id = label_id
        self.input_ids, self.input_mask, self.segment_ids = (
            input_ids, input_mask, segment_ids)


class c3Processor():
    """Loads the three HW3 multiple-choice JSON splits and builds examples.

    Each JSON file is read as a list of ``[passage_lines, questions]`` entries
    where every question is a dict with ``"question"``, ``"choice"`` and (for
    train/dev) ``"answer"`` keys. All text is lower-cased on load.

    After construction ``self.D[0]``, ``self.D[1]`` and ``self.D[2]`` hold the
    train, dev and test records. A record is the flat list
    ``[passage, question, choice_0, ..., choice_3, answer]``; questions with
    fewer than four choices are padded with empty strings.
    """

    # File names of the train / dev / test splits, in the order used by ``self.D``.
    SPLIT_FILES = ("train_HW3dataset.json", "dev_HW3dataset.json", "test_HW3dataset.json")
    # Number of choice slots in every record (shorter questions are padded).
    N_CHOICES = 4

    def __init__(self, data_dir="data"):
        """Read all three splits from ``data_dir``.

        Args:
            data_dir: Directory containing the split files. Defaults to
                ``"data"``, the location that used to be hard-coded.

        Raises:
            ValueError: if a question offers more than ``N_CHOICES`` choices
                (it would otherwise shift the answer column of the record).

        Side effects:
            Re-seeds the global ``random`` module with 42 so the shuffle of the
            training split is reproducible.
        """
        random.seed(42)
        self.D = [[], [], []]

        for sid, file_name in enumerate(self.SPLIT_FILES):
            with open(os.path.join(data_dir, file_name), "r", encoding="utf8") as f:
                data = list(json.load(f))
            if sid == 0:
                # Only the training split is shuffled.
                random.shuffle(data)
            for entry in data:
                passage = '\n'.join(entry[0]).lower()
                for qa in entry[1]:
                    choices = [choice.lower() for choice in qa["choice"]]
                    if len(choices) > self.N_CHOICES:
                        raise ValueError(
                            "question %r has %d choices, at most %d are supported"
                            % (qa["question"], len(choices), self.N_CHOICES))
                    choices += [''] * (self.N_CHOICES - len(choices))
                    if sid != 2:
                        answer = qa["answer"].lower()
                    else:
                        # The test split is read without using an answer; the
                        # first choice is stored as a placeholder so that test
                        # records have the same layout as train/dev records.
                        answer = choices[0]
                    self.D[sid].append(
                        [passage, qa["question"].lower()] + choices + [answer])

    def get_train_examples(self, data_dir):
        """Return the examples of the training split (``data_dir`` is unused)."""
        return self._create_examples(
                self.D[0], "train")

    def get_test_examples(self, data_dir):
        """Return the examples of the test split (``data_dir`` is unused)."""
        return self._create_examples(
                self.D[2], "test")

    def get_dev_examples(self, data_dir):
        """Return the examples of the dev split (``data_dir`` is unused)."""
        return self._create_examples(
                self.D[1], "dev")

    def get_labels(self):
        """Return the label strings: the indices of the choice slots."""
        return [str(k) for k in range(self.N_CHOICES)]

    def _create_examples(self, data, set_type):
        """Expand every record into one ``InputExample`` per choice slot.

        Args:
            data: list of records ``[passage, question, choice_0..3, answer]``.
            set_type: split name used as the prefix of each ``guid``.

        Returns:
            A flat list with ``N_CHOICES`` consecutive examples per record.
            All examples of a record share the same label: the index (as a
            string) of the choice equal to the record's answer. If several
            choices are equal to the answer the last one is used.

        Raises:
            ValueError: if no choice of a record equals its answer. Previously
                such a record silently inherited the label of the preceding
                record (or failed with a NameError on the first record).
        """
        examples = []
        answer_pos = 2 + self.N_CHOICES
        for i, record in enumerate(data):
            passage, question = record[0], record[1]
            choices, gold = record[2:answer_pos], record[answer_pos]

            matches = [k for k, choice in enumerate(choices) if choice == gold]
            if not matches:
                raise ValueError(
                    "%s record %d: answer %r matches none of the choices %r"
                    % (set_type, i, gold, choices))
            label = str(matches[-1])

            for k, choice in enumerate(choices):
                examples.append(
                        InputExample(guid="%s-%s-%s" % (set_type, i, k),
                                     text_a=passage, text_b=choice,
                                     label=label, text_c=question))

        return examples



def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer, n_choices=None):
    """Encode examples and group them into one feature list per question.

    Every example is encoded as
    ``[CLS] text_a [SEP] text_c [SEP] text_b [SEP]`` with segment id 0 for the
    first part (through the first ``[SEP]``) and 1 for the rest, then padded
    with zeros up to ``max_seq_length``.

    Args:
        examples: flat list of objects with ``text_a``, ``text_b``, ``text_c``
            and ``label`` attributes; consecutive examples are put in the same group.
        label_list: list of label strings; a label's position is its id.
        max_seq_length: length of every encoded sequence.
        tokenizer: object providing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        n_choices: number of consecutive examples per group. Defaults to
            ``len(label_list)``, so the function no longer depends on a
            module-level ``n_class`` being defined.

    Returns:
        A list of groups, each a list of ``InputFeatures``. A trailing group
        is kept even if it is incomplete; an empty trailing group is dropped.

    Raises:
        KeyError: if an example's label is not in ``label_list``.
    """
    print("#examples", len(examples))

    if n_choices is None:
        n_choices = len(label_list)

    label_map = {label: i for i, label in enumerate(label_list)}

    features = [[]]
    for example in examples:
        tokens_a = tokenizer.tokenize(example.text_a)
        tokens_b = tokenizer.tokenize(example.text_b)
        tokens_c = tokenizer.tokenize(example.text_c)

        # Four positions are reserved for [CLS] and the three [SEP] tokens.
        _truncate_seq_tuple(tokens_a, tokens_b, tokens_c, max_seq_length - 4)
        second_segment = tokens_c + ["[SEP]"] + tokens_b

        tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + second_segment + ["[SEP]"]
        segment_ids = [0] * (len(tokens_a) + 2) + [1] * (len(second_segment) + 1)

        input_ids = list(tokenizer.convert_tokens_to_ids(tokens))

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length (a negative count pads nothing).
        padding = [0] * (max_seq_length - len(input_ids))
        input_ids += padding
        input_mask += padding
        segment_ids += padding

        features[-1].append(
                InputFeatures(
                        input_ids=input_ids,
                        input_mask=input_mask,
                        segment_ids=segment_ids,
                        label_id=label_map[example.label]))
        if len(features[-1]) == n_choices:
            features.append([])

    if len(features[-1]) == 0:
        features = features[:-1]
    print('#features', len(features))
    return features




def _truncate_seq_tuple(tokens_a, tokens_b, tokens_c, max_length):
    """Truncates a sequence tuple in place to the maximum length."""

    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.
    while True:
        total_length = len(tokens_a) + len(tokens_b) + len(tokens_c)
        if total_length <= max_length:
            break
        if len(tokens_a) >= len(tokens_b) and len(tokens_a) >= len(tokens_c):
            tokens_a.pop()
        elif len(tokens_b) >= len(tokens_a) and len(tokens_b) >= len(tokens_c):
            tokens_b.pop()
        else:
            tokens_c.pop()            


def accuracy(out, labels):
    """Count the rows of ``out`` whose arg-max equals the corresponding label.

    Despite its name this returns the *number* of correct predictions, not a
    ratio; callers divide by the number of rows themselves.

    Args:
        out: 2-D array-like of scores, one row per item.
        labels: array-like of integer labels, one per row of ``out``. It is
            flattened first, so an ``(n, 1)`` column no longer broadcasts
            against the ``(n,)`` predictions into an ``(n, n)`` comparison.

    Returns:
        The number of rows predicted correctly (a NumPy integer).
    """
    predictions = np.argmax(np.asarray(out), axis=1)
    gold = np.asarray(labels).reshape(-1)
    return np.sum(predictions == gold)

In [None]:
do_train = True
do_eval = True
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()


# Seed every random number generator used by the notebook.
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
if n_gpu > 0:
    torch.cuda.manual_seed_all(42)

# NOTE(review): bert_config is loaded here but not used by the cells visible in
# this notebook; the model below takes its configuration from the checkpoint.
bert_config = BertConfig.from_json_file("bert_config.json")

processor = c3Processor()
label_list = processor.get_labels()



train_examples = None
num_train_steps = None
if do_train:
    train_examples = processor.get_train_examples(data_dir)
    # train_examples holds n_class entries per question and a batch holds
    # BATCH_SIZE questions.
    num_train_steps = int(
        len(train_examples) / n_class / BATCH_SIZE * EPOCH)

model = BertForSequenceClassification.from_pretrained('bert-base-chinese', return_dict=True, num_labels=4)
model.to(device)
model.num_labels = 4
# nnv = NNView().to(device)
# The classification head is replaced by one that emits a single score per
# (passage, question, choice) sequence. num_labels stays 4 so that the model's
# loss regroups those scores as logits.view(-1, 4), i.e. one row of four
# choice scores per question.
model.classifier = nn.Linear(model.config.hidden_size, 1).to(device)

if n_gpu > 1:
    model = torch.nn.DataParallel(model)

# Parameters whose name contains one of these fragments get no weight decay.
# The fragments are matched as substrings: parameter names are full dotted
# paths (e.g. "...attention.self.query.bias"), so the previous exact
# comparison `n in no_decay` never matched and the second group stayed empty.
no_decay = ['bias', 'LayerNorm.weight']
# transformers' AdamW reads the per-group key 'weight_decay'; the previous key
# 'weight_decay_rate' was ignored, leaving weight decay at its default of 0.
optimizer_parameters = [
    {'params': [p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
    ]

optimizer = AdamW(optimizer_parameters,
                        lr=lr)

In [None]:
# Encode the training and dev examples (one group of n_class features per question).
train_features = convert_examples_to_features(train_examples, label_list, max_seq_length, tokenizer)
eval_examples = processor.get_dev_examples(data_dir)
eval_features = convert_examples_to_features(eval_examples, label_list, max_seq_length, tokenizer)

# Regroup the dev features into nested lists: one entry per question, holding
# the n_class encoded sequences of that question.
choice_slots = range(n_class)
input_ids = [[group[k].input_ids for k in choice_slots] for group in eval_features]
input_mask = [[group[k].input_mask for k in choice_slots] for group in eval_features]
segment_ids = [[group[k].segment_ids for k in choice_slots] for group in eval_features]
# All features of a question carry the same label, so the first one is used.
label_id = [[group[0].label_id] for group in eval_features]

all_input_ids, all_input_mask, all_segment_ids, all_label_ids = (
    torch.tensor(column, dtype=torch.long)
    for column in (input_ids, input_mask, segment_ids, label_id)
)

# The dev set is iterated in its original order.
eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
eval_sampler = SequentialSampler(eval_data)
eval_dataloader = DataLoader(eval_data, batch_size=BATCH_SIZE, sampler=eval_sampler)

In [None]:
best_accuracy = 0
print("***** Running training *****")
# print() does not interpolate printf-style arguments; format explicitly.
print("  Num examples = %d" % len(train_examples))
print("  Batch size = %d" % BATCH_SIZE)
print("  Num steps = %d" % num_train_steps)

# Regroup the training features into nested lists: one entry per question,
# holding the n_class encoded sequences of that question.
input_ids = []
input_mask = []
segment_ids = []
label_id = []
for f in train_features:
    input_ids.append([])
    input_mask.append([])
    segment_ids.append([])
    for i in range(n_class):
        input_ids[-1].append(f[i].input_ids)
        input_mask[-1].append(f[i].input_mask)
        segment_ids[-1].append(f[i].segment_ids)
    # All features of a question carry the same label.
    label_id.append([f[0].label_id])

all_input_ids = torch.tensor(input_ids, dtype=torch.long)
all_input_mask = torch.tensor(input_mask, dtype=torch.long)
all_segment_ids = torch.tensor(segment_ids, dtype=torch.long)
all_label_ids = torch.tensor(label_id, dtype=torch.long)

train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)


for _ in range(EPOCH):
    # ---- one pass over the training data ----
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids = batch
        # Flatten (questions, n_class, seq_len) to (questions * n_class, seq_len)
        # so that every choice is encoded as its own sequence.
        targetlen = input_ids.size(2)
        input_ids = input_ids.view(-1,targetlen)
        input_mask = input_mask.view(-1,targetlen)
        segment_ids = segment_ids.view(-1,targetlen)

        outputs = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids)
        loss = outputs.loss
        if n_gpu > 1:
            loss = loss.mean() # mean() to average on multi-gpu.
        loss.backward()
        tr_loss += loss.item()
        nb_tr_examples += input_ids.size(0)
        nb_tr_steps += 1
        optimizer.step()
        model.zero_grad()

    # ---- evaluation on the dev set ----
    # This loop is deliberately kept at cell level: the following cell reads
    # the `logits` and `label_ids` variables it leaves behind.
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    logits_all = []
    for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        label_ids = label_ids.to(device)
        targetlen = input_ids.size(2)
        input_ids = input_ids.view(-1,targetlen)
        input_mask = input_mask.view(-1,targetlen)
        segment_ids = segment_ids.view(-1,targetlen)
        with torch.no_grad():
            outputs = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids)
            tmp_eval_loss = outputs.loss
            logits = outputs.logits

        logits = logits.detach().cpu().numpy()
        # One row of n_class choice scores per question.
        logits = logits.reshape((-1,n_class))
        label_ids = label_ids.to('cpu').numpy()
        for i in range(len(logits)):
            logits_all += [logits[i]]

        # accuracy() returns the number of correctly answered questions.
        tmp_eval_accuracy = accuracy(logits, label_ids.reshape(-1))

        eval_loss += tmp_eval_loss.mean().item()
        eval_accuracy += tmp_eval_accuracy

        # Count questions, not flattened sequences: input_ids has n_class rows
        # per question at this point, which made the reported accuracy n_class
        # times too small.
        nb_eval_examples += label_ids.shape[0]
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_examples

    if do_train:
        result = {'eval_loss': eval_loss,
                    'eval_accuracy': eval_accuracy,
                    'loss': tr_loss/nb_tr_steps}
    else:
        result = {'eval_loss': eval_loss,
                    'eval_accuracy': eval_accuracy}

    print("***** Eval results *****")
    for key in sorted(result.keys()):
        print("  %s = %s" % (key, str(result[key])))

    # Keep the weights of the best epoch seen so far.
    if eval_accuracy >= best_accuracy:
        torch.save(model.state_dict(), "model_best.pt")
        best_accuracy = eval_accuracy

# Restore the best checkpoint and store it under the final file name.
model.load_state_dict(torch.load("model_best.pt"))
torch.save(model.state_dict(), "model.pt")

model.load_state_dict(torch.load("model.pt"))


In [None]:
# Debug check: `logits` and `label_ids` are not defined in this cell. They are
# the variables left over from the last dev batch of the evaluation loop in
# the training cell, so this cell only works after that cell has run.
print(logits.shape)
print(label_ids.shape)

In [None]:
def _evaluate(dataloader):
    """Run the model over ``dataloader`` without gradient tracking.

    Each batch holds (questions, n_class, seq_len) tensors plus one label per
    question. Returns ``(mean_loss, accuracy, logits_per_question)`` where the
    mean loss is averaged over batches, accuracy is the fraction of questions
    answered correctly and ``logits_per_question`` is a list with one array of
    n_class scores per question, in dataloader order.
    """
    model.eval()
    total_loss, n_correct = 0, 0
    n_steps, n_questions = 0, 0
    per_question_logits = []
    for input_ids, input_mask, segment_ids, label_ids in dataloader:
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        label_ids = label_ids.to(device)
        # Flatten to one sequence per (question, choice) pair.
        targetlen = input_ids.size(2)
        input_ids = input_ids.view(-1,targetlen)
        input_mask = input_mask.view(-1,targetlen)
        segment_ids = segment_ids.view(-1,targetlen)
        with torch.no_grad():
            outputs = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids)
            batch_loss, logits = outputs.loss, outputs.logits

        logits = logits.detach().cpu().numpy().reshape((-1,n_class))
        label_ids = label_ids.to('cpu').numpy()
        per_question_logits.extend(logits)

        total_loss += batch_loss.mean().item()
        # accuracy() returns the number of correctly answered questions.
        n_correct += accuracy(logits, label_ids.reshape(-1))

        # Count questions, not flattened sequences (input_ids has n_class rows
        # per question), otherwise the accuracy is n_class times too small.
        n_questions += label_ids.shape[0]
        n_steps += 1

    return total_loss / n_steps, n_correct / n_questions, per_question_logits


global_step = 0

# ---- dev set ----
print("***** Running evaluation *****")
# print() does not interpolate printf-style arguments; format explicitly.
print("  Num examples = %d" % len(eval_examples))
print("  Batch size = %d" % BATCH_SIZE)

eval_loss, eval_accuracy, logits_all = _evaluate(eval_dataloader)

if do_train:
    result = {'eval_loss': eval_loss,
                'eval_accuracy': eval_accuracy,
                'global_step': global_step,
                'loss': tr_loss/nb_tr_steps}
else:
    result = {'eval_loss': eval_loss,
                'eval_accuracy': eval_accuracy}

# The dev metrics were previously computed but never shown.
print("***** Eval results *****")
for key in sorted(result.keys()):
    print("  %s = %s" % (key, str(result[key])))

# ---- test set ----
eval_examples = processor.get_test_examples(data_dir)
eval_features = convert_examples_to_features(
    eval_examples, label_list, max_seq_length, tokenizer)

print("***** Running evaluation *****")
print("  Num examples = %d" % len(eval_examples))
print("  Batch size = %d" % BATCH_SIZE)

input_ids = []
input_mask = []
segment_ids = []
label_id = []

for f in eval_features:
    input_ids.append([])
    input_mask.append([])
    segment_ids.append([])
    for i in range(n_class):
        input_ids[-1].append(f[i].input_ids)
        input_mask[-1].append(f[i].input_mask)
        segment_ids[-1].append(f[i].segment_ids)
    label_id.append([f[0].label_id])

all_input_ids = torch.tensor(input_ids, dtype=torch.long)
all_input_mask = torch.tensor(input_mask, dtype=torch.long)
all_segment_ids = torch.tensor(segment_ids, dtype=torch.long)
all_label_ids = torch.tensor(label_id, dtype=torch.long)

eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
eval_sampler = SequentialSampler(eval_data)
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=BATCH_SIZE)

# NOTE(review): the test records built by c3Processor carry a placeholder
# answer, so the loss and accuracy returned here are presumably not
# meaningful; only the logits are used below.
eval_loss, eval_accuracy, logits_all = _evaluate(eval_dataloader)

# ---- write predictions ----
predict_path = "predict.csv"
with open(predict_path, "w", newline='') as fcsv:
    writer = csv.writer(fcsv)
    writer.writerow(['index','answer'])
    for i, question_logits in enumerate(logits_all):
        # Answers are written 1-based. np.argmax replaces the manual scan that
        # started from max_idx = 0.0 / max_val = -5.0 and therefore wrote
        # "1.0" whenever every logit of a question was below -5. It is also
        # the rule accuracy() uses (first maximum wins on ties).
        writer.writerow([str(i), str(int(np.argmax(question_logits)) + 1)])