In [None]:
try:
    from google.colab import drive
    drive.mount('/content/drive')
except ModuleNotFoundError as e:
    print("not in colab")
    pass
import os
base_dir = "/content/drive/MyDrive/semeval2022"
if not os.path.exists(base_dir):
  !pip install -r requirements.txt
  base_dir = ""
else:
  !pip install -r /content/drive/MyDrive/semeval2022/requirements.txt
  !cp -rf /content/drive/MyDrive/semeval2022/*.py . 
  !cp -rf /content/drive/MyDrive/semeval2022/utils .
  !cp -rf /content/drive/MyDrive/semeval2022/model .

In [None]:
from utils.util import get_entity_vocab, get_reader, train_model, create_model, save_model, parse_args, get_tagset, wnut_iob, write_submit_result, load_model, luke_iob
import time

In [None]:
# Experiment configuration.
# Encoder name passed to get_reader / create_model / save_model below.
# This cell used to assign "distilbert-base-uncased" and "roberta-base" first
# and immediately overwrite them; only the last assignment ever took effect,
# so it is now the single assignment. Swap the string to try another encoder.
encoder_model = "bert-base-uncased"

# Track identifier; it is also used as a relative path fragment for the
# data files and the output directory.
track = "EN-English/en"
train_file = os.path.join(base_dir, "training_data/{}_train.conll".format(track))
dev_file = os.path.join(base_dir, "training_data/{}_dev.conll".format(track))
output_dir = os.path.join(base_dir, track, "{}-train".format(encoder_model))
# NOTE: the training cell reassigns submission_file to a path next to the
# saved model before it is used there.
submission_file = os.path.join(base_dir, "submission", "{}.pred.conll".format(track))

# Tag vocabulary handed to the reader and the model (imported from utils.util).
iob_tagging = wnut_iob
# Forwarded to create_model / load_model as use_crf.
use_crf = False

In [None]:
# --- Data -------------------------------------------------------------------
# Both splits are read with the same reader settings; only the file differs.
entity_vocab = get_entity_vocab()
reader_options = dict(
    target_vocab=iob_tagging,
    encoder_model=encoder_model,
    max_instances=-1,
    max_length=100,
    entity_vocab=entity_vocab,
)
train_data = get_reader(file_path=train_file, **reader_options)
dev_data = get_reader(file_path=dev_file, **reader_options)

# --- Model + training -------------------------------------------------------
model = create_model(
    train_data=train_data,
    dev_data=dev_data,
    tag_to_id=iob_tagging,
    dropout_rate=0.1,
    batch_size=32,
    stage='fit',
    lr=2e-5,
    encoder_model=encoder_model,
    num_gpus=1,
    use_crf=use_crf,
)
trainer = train_model(model=model, out_dir=output_dir, epochs=20, monitor="f1")

# --- Persist and reload -----------------------------------------------------
# Original note: use pytorch lightning's saver here.
out_model_path, best_checkpoint = save_model(
    trainer=trainer,
    out_dir=output_dir,
    model_name=encoder_model,
    timestamp=time.time(),
)
# From here on the predictions file lives in the same directory as the saved model.
model_dir = os.path.dirname(out_model_path)
submission_file = os.path.join(model_dir, "{}.pred.conll".format(track))

# Replace the in-memory model with the one loaded from best_checkpoint.
model = load_model(best_checkpoint, iob_tagging, use_crf=use_crf)

# --- Predict on the dev split -----------------------------------------------
# record_data is inspected in the cells below (presumably a pandas DataFrame
# with 'label', 'pred' and 'word' columns -- confirm in utils.util).
record_data = write_submit_result(model, dev_data, submission_file)

In [None]:
# Preview record_data, the value returned by write_submit_result in the
# training cell (presumably a pandas DataFrame, so this shows its first rows).
record_data.head()

In [None]:
# Tally record_data per distinct (label, pred, word) combination.
# Assumes record_data is a pandas DataFrame with these three columns.
group_keys = ['label', 'pred', 'word']
record_data.groupby(group_keys).count()

In [None]:
# Keep only the rows whose 'pred' value differs from 'label', then tally
# them per (label, pred) pair.
error_data = record_data[record_data["label"].ne(record_data["pred"])]
ans = (
    error_data
    .groupby(by=['label', 'pred'])
    .count()
    .reset_index()
)
# Print the string rendering of the whole table instead of relying on the
# cell's display output.
print(ans.to_string())
# Optional follow-up left disabled by the author: ans.sort_values('count')

In [None]:
# Per-(label, pred) tally of the mismatching rows, shown as this cell's
# output; unlike the previous cell, reset_index() is not applied.
error_data.groupby(['label', 'pred']).agg('count')