In [1]:
from itertools import chain

import nltk
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

# imports
from vocab_mismatch_utils import *
from data_formatter_utils import *
from datasets import DatasetDict
from datasets import Dataset
from datasets import list_datasets
from datasets import load_dataset, load_metric
import transformers
import pandas as pd
import operator
from collections import OrderedDict
from tqdm import tqdm, trange
from seqeval.metrics import sequence_labeling

import collections
import os
import random
import unicodedata
from typing import List, Optional, Tuple

from transformers.tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
from transformers.utils import logging
import torch
logger = logging.get_logger(__name__)
import numpy as np
import copy
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer() 
from word_forms.word_forms import get_word_forms
from functools import partial
import matplotlib.ticker as mticker

import numpy as np
from sklearn.dummy import DummyClassifier

import matplotlib.pyplot as plt
# Global matplotlib text settings: DejaVu Serif at size 20.
# NOTE(review): the font family is assigned twice — directly through rcParams
# here and again through plt.rc(...) below, which also sets the size.
plt.rcParams["font.family"] = "DejaVu Serif"
font = {'family' : 'DejaVu Serif',
        'size'   : 20}
plt.rc('font', **font)

# Task name -> sub-directory (joined under the external data root) that holds
# the task's train/dev/test TSV files. Only sst3 lives under a different name.
FILENAME_CONFIG = {
    "sst3": "sst-tenary",
    **{
        task: task
        for task in ("cola", "mnli", "snli", "mrpc", "qnli", "conll2003", "en_ewt")
    },
}

# Task name -> (first input column, second input column or None).
TASK_CONFIG = {
    "wiki-text": ("text", None),
    "sst3": ("text", None),
    "cola": ("sentence", None),
    "mnli": ("premise", "hypothesis"),
    "snli": ("premise", "hypothesis"),
    "mrpc": ("sentence1", "sentence2"),
    "qnli": ("question", "sentence"),
    "conll2003": ("tokens", None),
    "en_ewt": ("tokens", None),
}

# Token-classification task name -> column that stores the per-token tags.
TAG_CONFIG = dict(conll2003="ner_tags", en_ewt="upos")

# Directory handed to load_dataset(...) as its cache location.
cache_dir = "../tmp/"

Random Classifier for Token Classification

In [2]:
# Task selection.
task_name = "conll2003"

# Seed every RNG the notebook touches (torch, numpy, stdlib random).
# WARNING: the seed affects the reported numbers; try a few different values.
seed = 8
for seed_rng in (torch.manual_seed, np.random.seed, random.seed):
    seed_rng(seed)

In [3]:
# Load the train / validation / test splits for the selected task.
# Token-level tasks are fetched through load_dataset; the tuple holds the
# positional arguments (dataset path and, where needed, configuration name).
HF_DATASET_ARGS = {
    "conll2003": ("conll2003",),
    "en_ewt": ("universal_dependencies", "en_ewt"),
}

if task_name in HF_DATASET_ARGS:
    dataset = load_dataset(*HF_DATASET_ARGS[task_name], cache_dir=cache_dir)
    train_df = dataset["train"]
    eval_df = dataset["validation"]
    test_df = dataset["test"]
else:
    # Every other task is read from local TSV files and wrapped in a Dataset so
    # that all tasks expose the same interface downstream.
    # NOTE(review): external_output_dirname is not defined in this notebook;
    # presumably it comes from one of the star imports — confirm.
    task_dir = os.path.join(external_output_dirname, FILENAME_CONFIG[task_name])
    train_df, eval_df, test_df = (
        Dataset.from_pandas(
            pd.read_csv(os.path.join(task_dir, split_file), delimiter="\t")
        )
        for split_file in ("train.tsv", "dev.tsv", "test.tsv")
    )

Reusing dataset conll2003 (../tmp/conll2003/conll2003/1.0.0/40e7cb6bcc374f7c349c83acd1e9352a4f09474eb691f64f364ee62eb65d0ca6)


In [4]:
# Derive the tag inventory (label_list), the tag -> id mapping (label_to_id)
# and the number of tags (num_labels) for the NER task.
if task_name == "conll2003":
    from datasets import ClassLabel

    def get_label_list(labels):
        """Return the sorted list of distinct tags found in ``labels``.

        ``labels`` is an iterable of per-example tag sequences.
        """
        unique_labels = set()
        for label in labels:
            unique_labels.update(label)
        return sorted(unique_labels)

    features = train_df.features
    label_column_name = TAG_CONFIG[task_name]
    if isinstance(features[label_column_name].feature, ClassLabel):
        label_list = features[label_column_name].feature.names
        # No need to convert the labels since they are already ints.
        label_to_id = {i: i for i in range(len(label_list))}
    else:
        # Tags are not ClassLabel-encoded: collect them from the training split.
        # (This previously read from `inoculation_train_df`, a name that is
        # never defined in this notebook.)
        label_list = get_label_list(train_df[label_column_name])
        label_to_id = {label: i for i, label in enumerate(label_list)}
    num_labels = len(label_list)

In [5]:
# Show the tag names; label_list is only built for the conll2003 task.
if task_name == "conll2003":
    print(label_list)

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']


In [None]:
# Collect the three splits under one mapping; later cells look them up by name.
datasets = {"train": train_df, "validation": eval_df, "test": test_df}

# Report how many examples each split contains.
print(
    "**** Dataset Statistics ****",
    f"training example = {len(train_df)}",
    f"validation example = {len(eval_df)}",
    f"testing example = {len(test_df)}",
    "****************************",
    sep="\n",
)

In [None]:
# Inspect a single test example (index 1).
test_df[1]

In [None]:
# Random baseline: a dummy classifier that only uses the label probabilities.
# Flatten the tags of every training sentence into one long list.
all_labels_train = list(
    chain.from_iterable(example[TAG_CONFIG[task_name]] for example in train_df)
)
# Placeholder inputs, one per tag; only the labels carry information here.
mock_x = [0] * len(all_labels_train)
dummy_clf = DummyClassifier(strategy="stratified")
dummy_clf.fit(mock_x, all_labels_train)

In [185]:
# Draw baseline predictions for every test sentence and collect them next to
# the gold tags in the layout each metric expects:
#   * conll2003 -> one list of tag names per sentence (for seqeval)
#   * otherwise -> one flat list of stringified tags (for classification_report)
actual_labels = []
predicted_labels = []
for example in test_df:
    gold_tags = example[TAG_CONFIG[task_name]]
    # DummyClassifier ignores the feature values, so pass a placeholder of the
    # same length (as in fit) instead of the gold tags themselves.
    dummy_labels = dummy_clf.predict([0] * len(gold_tags))
    predicted_tags = dummy_labels.tolist()
    if task_name == "conll2003":
        # need to do ner special handlings
        actual_labels.append([label_list[label_to_id[tag]] for tag in gold_tags])
        predicted_labels.append([label_list[label_to_id[tag]] for tag in predicted_tags])
    else:
        actual_labels.extend(str(tag) for tag in gold_tags)
        predicted_labels.extend(str(tag) for tag in predicted_tags)

In [186]:
# Score the random baseline.
if task_name != "conll2003":
    # Flat tag lists: per-class precision / recall / F1 report.
    from sklearn.metrics import classification_report
    print(classification_report(actual_labels, predicted_labels, digits=5))
else:
    # Per-sentence tag-name lists: scored with the seqeval metric.
    seqeval = load_metric("seqeval")
    results = seqeval.compute(predictions=predicted_labels, references=actual_labels)
    print(results)

{'LOC': {'precision': 0.02319451765946231, 'recall': 0.026378896882494004, 'f1': 0.02468443197755961, 'number': 1668}, 'MISC': {'precision': 0.009930486593843098, 'recall': 0.014245014245014245, 'f1': 0.011702750146284378, 'number': 702}, 'ORG': {'precision': 0.018469656992084433, 'recall': 0.025285972305839857, 'f1': 0.021346886912325287, 'number': 1661}, 'PER': {'precision': 0.015561015561015561, 'recall': 0.02350030921459493, 'f1': 0.018723823601872382, 'number': 1617}, 'overall_precision': 0.01758530183727034, 'overall_recall': 0.023725212464589234, 'overall_f1': 0.02019897497738921, 'overall_accuracy': 0.6927533110800043}


CRF

In [187]:
# Configuration for the CRF section.
# NOTE: TASK_CONFIG is re-declared here; unlike the definition at the top of
# the notebook, this copy has no "wiki-text" entry.
TASK_CONFIG = {
    "sst3": ("text", None),
    "cola": ("sentence", None),
    "mnli": ("premise", "hypothesis"),
    "snli": ("premise", "hypothesis"),
    "mrpc": ("sentence1", "sentence2"),
    "qnli": ("question", "sentence"),
    "conll2003" : ("tokens", None),
    "en_ewt" : ("tokens", None)
}
# WARNING: the BERT tokenizer and vocab file are not needed here;
# a basic whitespace tokenizer is enough.
modified_basic_tokenizer = ModifiedBasicTokenizer()
max_length = 128
per_device_train_batch_size = 128
per_device_eval_batch_size = 128
no_cuda = True
device = torch.device("cuda" if torch.cuda.is_available() and not no_cuda else "cpu")
n_gpu = torch.cuda.device_count() if not no_cuda else 1 # 1 means just on cpu
seed = 42
lr = 1e-3
num_train_epochs = 10
# Column names of the (first, second) input for the selected task.
sentence1_key, sentence2_key = TASK_CONFIG[task_name]

# Re-seed all RNGs with the seed used by this section.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if n_gpu > 0 and not no_cuda:
    # Was `args.seed`; no `args` object exists in this notebook.
    torch.cuda.manual_seed_all(seed)
# get the vocab i think?

In [188]:
def sanity_check_non_empty(sentece):
    """Return True when ``sentece`` is neither None nor empty.

    Args:
        sentece: any sized object (string, list of tokens, array, ...) or None.

    Returns:
        bool: False for None and for zero-length inputs, True otherwise.
    """
    # Identity check rather than `!= None`: array-like inputs overload `!=`
    # element-wise, which makes the combined condition raise.
    return sentece is not None and len(sentece) != 0

# create the vocab file
def _add_split_to_vocab(split_name, vocab):
    """Add every unseen token of ``datasets[split_name]`` to ``vocab`` in place.

    Tokens receive consecutive indices in order of first appearance. A split
    missing from ``datasets`` is ignored. Only single-input tasks
    (``sentence2_key is None``) are supported; sentence-pair tasks contribute
    nothing. Examples whose token list is None or empty are skipped, so they
    no longer reuse the tokens of the previous example.
    """
    if split_name not in datasets or sentence2_key is not None:
        return
    for example in tqdm(datasets[split_name]):
        sentence_tokens = example[sentence1_key]
        if not sanity_check_non_empty(sentence_tokens):
            continue
        for token in sentence_tokens:
            if token not in vocab:
                # Indices are dense, so the next free index is the current size.
                vocab[token] = len(vocab)


original_vocab = OrderedDict()
# Set to True to build the vocabulary from the training split alone.
train_data_only = False
vocab_splits = ["train"] if train_data_only else ["train", "validation", "test"]
for split_name in vocab_splits:
    _add_split_to_vocab(split_name, original_vocab)
# Next unused index, kept under its original name.
vocab_index = len(original_vocab)

100%|██████████| 14041/14041 [00:01<00:00, 10155.58it/s]
100%|██████████| 3250/3250 [00:00<00:00, 9136.55it/s]
100%|██████████| 3453/3453 [00:00<00:00, 10833.27it/s]


In [189]:
import string

# Feature alphabet: the 52 ASCII letters, lowercase first and then uppercase.
alpha = list(string.ascii_letters)

In [190]:
# Build the CRF inputs for the training split. Each token becomes a dict with
# one count per letter in `alpha`; characters outside `alpha` are ignored.
# Tags are stringified.
train_features = []
train_labels = []
for example in tqdm(datasets["train"]):
    token_dicts = []
    for token in example["tokens"]:
        letter_counts = dict.fromkeys(alpha, 0)
        for ch in token:
            # The dict keys are exactly the letters of `alpha`.
            if ch in letter_counts:
                letter_counts[ch] += 1
        token_dicts.append(letter_counts)
    train_features.append(token_dicts)
    train_labels.append([str(tag) for tag in example[TAG_CONFIG[task_name]]])

100%|██████████| 14041/14041 [00:03<00:00, 4245.50it/s]


In [191]:
# Hyper-parameters for the CRF tagger, gathered in one place.
crf_params = dict(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True,
)
crf = sklearn_crfsuite.CRF(**crf_params)

# Train on the per-token letter-count features and stringified tags.
crf.fit(train_features, train_labels)



CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=100)

In [192]:
# Label set exposed by the fitted CRF. The training tags were stringified
# before fitting, so these are strings rather than ints.
labels = list(crf.classes_)
labels

['3', '0', '7', '1', '2', '5', '4', '8', '6']

In [193]:
# Build the CRF inputs for the test split, in the same form as the training
# features: one letter-count dict per token, plus stringified tags.
test_features = []
test_labels = []
for example in tqdm(datasets["test"]):
    token_dicts = []
    for token in example["tokens"]:
        letter_counts = dict.fromkeys(alpha, 0)
        for ch in token:
            # The dict keys are exactly the letters of `alpha`.
            if ch in letter_counts:
                letter_counts[ch] += 1
        token_dicts.append(letter_counts)
    test_features.append(token_dicts)
    test_labels.append([str(tag) for tag in example[TAG_CONFIG[task_name]]])

100%|██████████| 3453/3453 [00:00<00:00, 4240.01it/s]


In [199]:
# Predict tags for the test split and score them.
y_pred = crf.predict(test_features)
if task_name != "conll2003":
    # Micro-averaged F1 over the flattened tag sequences.
    single_acc = metrics.flat_f1_score(test_labels, y_pred,
                          average='micro', labels=labels)
    print(single_acc)
else:
    # Map the stringified tag ids back to tag names, sentence by sentence,
    # then score with the seqeval metric.
    actual_labels, predicted_labels = [], []
    for i, predicted_ids in enumerate(y_pred):
        actual = [label_list[label_to_id[int(tag)]] for tag in test_labels[i]]
        pred = [label_list[label_to_id[int(tag)]] for tag in predicted_ids]
        assert len(actual) == len(pred)
        actual_labels.append(actual)
        predicted_labels.append(pred)
    seqeval = load_metric("seqeval")
    results = seqeval.compute(predictions=predicted_labels, references=actual_labels)
    print(results)

{'LOC': {'precision': 0.3340471092077088, 'recall': 0.2805755395683453, 'f1': 0.30498533724340177, 'number': 1668}, 'MISC': {'precision': 0.13043478260869565, 'recall': 0.038461538461538464, 'f1': 0.05940594059405941, 'number': 702}, 'ORG': {'precision': 0.28093023255813954, 'recall': 0.18181818181818182, 'f1': 0.22076023391812868, 'number': 1661}, 'PER': {'precision': 0.43007518796992483, 'recall': 0.35374149659863946, 'f1': 0.38819138106549034, 'number': 1617}, 'overall_precision': 0.34114129080488415, 'overall_recall': 0.24238668555240794, 'overall_f1': 0.2834075147500259, 'overall_accuracy': 0.8360288575428018}
