# Parts of Speech Exercise

1) Rivedi la lezione di "start" dell'esercizio, dopo di che prova ad applicare le logiche del NER al POS
2) Scaricare il brown corpus e la lista universale dei tags
3) Creare un file JSON da cui caricare i dati
     - EXAMPLE {"inputs": ["Cats", "are", "animals"], "tags":[6,2,4]}
     - Usare load_dataset("json", data_files="data.json") per importare
   


## Dataset preparation

In [1]:
import nltk
from nltk.corpus import brown

In [2]:
nltk.download("brown")
nltk.download("universal_tagset")

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\emili\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\emili\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [3]:
corpus = brown.tagged_sents(tagset="universal")

In [4]:
# List of lists
corpus

[[('The', 'DET'), ('Fulton', 'NOUN'), ('County', 'NOUN'), ('Grand', 'ADJ'), ('Jury', 'NOUN'), ('said', 'VERB'), ('Friday', 'NOUN'), ('an', 'DET'), ('investigation', 'NOUN'), ('of', 'ADP'), ("Atlanta's", 'NOUN'), ('recent', 'ADJ'), ('primary', 'NOUN'), ('election', 'NOUN'), ('produced', 'VERB'), ('``', '.'), ('no', 'DET'), ('evidence', 'NOUN'), ("''", '.'), ('that', 'ADP'), ('any', 'DET'), ('irregularities', 'NOUN'), ('took', 'VERB'), ('place', 'NOUN'), ('.', '.')], [('The', 'DET'), ('jury', 'NOUN'), ('further', 'ADV'), ('said', 'VERB'), ('in', 'ADP'), ('term-end', 'NOUN'), ('presentments', 'NOUN'), ('that', 'ADP'), ('the', 'DET'), ('City', 'NOUN'), ('Executive', 'ADJ'), ('Committee', 'NOUN'), (',', '.'), ('which', 'DET'), ('had', 'VERB'), ('over-all', 'ADJ'), ('charge', 'NOUN'), ('of', 'ADP'), ('the', 'DET'), ('election', 'NOUN'), (',', '.'), ('``', '.'), ('deserves', 'VERB'), ('the', 'DET'), ('praise', 'NOUN'), ('and', 'CONJ'), ('thanks', 'NOUN'), ('of', 'ADP'), ('the', 'DET'), ('City

In [5]:
import json
import os

In [6]:
def extract_tags(corpus):
    list_tags = []
    for sentence in corpus:
        for _, tag in sentence:
            list_tags.append(tag)

    return set(list_tags)

tag_names = extract_tags(corpus)

In [7]:
tag_names

{'.',
 'ADJ',
 'ADP',
 'ADV',
 'CONJ',
 'DET',
 'NOUN',
 'NUM',
 'PRON',
 'PRT',
 'VERB',
 'X'}

In [8]:
id2tag = {k:v for k,v in enumerate(tag_names)}
tag2id = {v:k for k,v in id2tag.items()}

In [9]:
tag2id

{'VERB': 0,
 'NOUN': 1,
 'ADJ': 2,
 'ADP': 3,
 'ADV': 4,
 'CONJ': 5,
 '.': 6,
 'NUM': 7,
 'DET': 8,
 'PRT': 9,
 'X': 10,
 'PRON': 11}

In [10]:
fpath = "data/brown.json"
def create_json_from_corpus(corpus):
    list_of_inputs = []

    for sentence in corpus:
        curr_object = {"inputs": [], "tags": []}
        for inp, tag in sentence:
            curr_object["inputs"].append(inp)
            curr_object["tags"].append(tag2id[tag])
        list_of_inputs.append(curr_object)

    with open(fpath, "w") as f:
        json.dump(list_of_inputs, f)

if not os.path.isfile(fpath):
    create_json_from_corpus(corpus)

## Tokenizer

In [15]:
from transformers import AutoTokenizer
from datasets import load_dataset

checkpoint = "distilbert-base-cased"

In [12]:
dataset = load_dataset("json", data_files=fpath)
dataset

Generating train split: 57340 examples [00:00, 146493.64 examples/s]


DatasetDict({
    train: Dataset({
        features: ['inputs', 'tags'],
        num_rows: 57340
    })
})

In [13]:
test_row = dataset["train"][0]
print(test_row)

{'inputs': ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.'], 'tags': [8, 1, 1, 2, 1, 0, 1, 8, 1, 3, 1, 2, 1, 1, 0, 6, 8, 1, 6, 3, 8, 1, 0, 1, 6]}


In [14]:
split = dataset["train"].train_test_split(test_size=0.3, seed=42)
split

DatasetDict({
    train: Dataset({
        features: ['inputs', 'tags'],
        num_rows: 40138
    })
    test: Dataset({
        features: ['inputs', 'tags'],
        num_rows: 17202
    })
})

In [16]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)



In [22]:
t = tokenizer(test_row["inputs"], is_split_into_words=True)
print(t.tokens())
print(t.word_ids())

['[CLS]', 'The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', 'Atlanta', "'", 's', 'recent', 'primary', 'election', 'produced', '`', '`', 'no', 'evidence', "'", "'", 'that', 'any', 'irregular', '##ities', 'took', 'place', '.', '[SEP]']
[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 11, 12, 13, 14, 15, 15, 16, 17, 18, 18, 19, 20, 21, 21, 22, 23, 24, None]


In [23]:
def align_tags(ids, tags):
    aligned_targets = []
    
    for wid in ids:
        if wid is None:
            # Qui abbiamo un token speciale da ignorare (id: None)
            label = -100
        else:
            label = tags[wid]

        aligned_targets.append(label)
            
    return aligned_targets

In [27]:
# Sanity check
print(len(align_tags(t.word_ids(), test_row["tags"])), len(t.tokens()))

32 32


In [34]:
def tokenize_fn(batch):
    tokenized_inputs = tokenizer(batch["inputs"], truncation=True, is_split_into_words=True)
    labels_batch = batch["tags"] # Original targets
    aligned_batch_labels = []
    for i, labels in enumerate(labels_batch):
        word_id = tokenized_inputs.word_ids(i)
        aligned_labels = align_tags(word_id, labels)
        aligned_batch_labels.append(aligned_labels)

    # Ricordiamo: il nostro target DEVE essere salvato in una colonna chiamata "labels"
    tokenized_inputs["labels"] = aligned_batch_labels

    return tokenized_inputs

In [35]:
tokenized_datasets = split.map(tokenize_fn, batched=True, remove_columns=split["train"].column_names)

Map: 100%|█████████████████████████████████████████████████████████████| 40138/40138 [00:03<00:00, 11351.34 examples/s]
Map: 100%|█████████████████████████████████████████████████████████████| 17202/17202 [00:01<00:00, 13178.74 examples/s]


In [36]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 40138
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 17202
    })
})

## Data Collator

In [29]:
from transformers import DataCollatorForTokenClassification

In [30]:
data_collator =  DataCollatorForTokenClassification(tokenizer=tokenizer)

In [37]:
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
batch["labels"]

tensor([[-100,    1,    1,    3,    1,    5,    1,    0,    4,    8,    2,    1,
            3,    8,    1,    1,    1,    1,    6, -100],
        [-100,    0,    1,    6,    6, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100]])

## Metrics

In [39]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import numpy as np

In [60]:
def compute_metrics(logits_and_labels):
    
    logits, labels = logits_and_labels
    predictions = np.argmax(logits, axis=-1)

    # Eliminiamo le predizioni e le labels dove non ci sono i -100
    clean_labels = [t for label in labels for t in label if t!=-100]
    clean_preds = [p for pred, targ in zip(predictions, labels) for p, t in zip(pred, targ) if t!=-100 ]

    accuracy = accuracy_score(clean_labels, clean_preds)
    f1 = f1_score(clean_labels, clean_preds, average="macro")

    return {
        "accuracy": accuracy,
        "f1": f1
    }

## Model, Trainer, TrainerArgs

In [61]:
from transformers import Trainer, TrainingArguments, AutoModelForTokenClassification

In [62]:
model = AutoModelForTokenClassification.from_pretrained(checkpoint, id2label=id2tag, label2id=tag2id)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [63]:
train_args = TrainingArguments(
    output_dir="my_pos_model",
    save_strategy="epoch",
    eval_strategy="epoch",
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01
)

In [64]:
trainer = Trainer(
    model=model,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    args=train_args
)

In [65]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.0484,0.046784,0.986674,0.963302
2,0.029,0.044557,0.98795,0.965363
3,0.0166,0.050217,0.988402,0.968191
4,0.0106,0.053689,0.9887,0.969614
5,0.0077,0.057711,0.988866,0.970157


TrainOutput(global_step=25090, training_loss=0.03067116279343582, metrics={'train_runtime': 2104.6829, 'train_samples_per_second': 95.354, 'train_steps_per_second': 11.921, 'total_flos': 2588388393132576.0, 'train_loss': 0.03067116279343582, 'epoch': 5.0})

## Inference

In [1]:
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
pos = pipeline("token-classification", model="my_pos_model/checkpoint-15054/", aggregation_strategy="simple", device=0)

In [80]:
pos("Snake, infiltrate the enemy fortress Outer Heaven")

[{'entity_group': 'NOUN',
  'score': 0.9998913,
  'word': 'Snake',
  'start': 0,
  'end': 5},
 {'entity_group': '.', 'score': 0.9999801, 'word': ',', 'start': 5, 'end': 6},
 {'entity_group': 'VERB',
  'score': 0.9998185,
  'word': 'infiltrate',
  'start': 7,
  'end': 17},
 {'entity_group': 'DET',
  'score': 0.9999366,
  'word': 'the',
  'start': 18,
  'end': 21},
 {'entity_group': 'NOUN',
  'score': 0.99992555,
  'word': 'enemy fortress',
  'start': 22,
  'end': 36},
 {'entity_group': 'ADJ',
  'score': 0.9991744,
  'word': 'Outer',
  'start': 37,
  'end': 42},
 {'entity_group': 'NOUN',
  'score': 0.9998031,
  'word': 'Heaven',
  'start': 43,
  'end': 49}]

In [4]:
pos("I am cooking a Genovese recipe")

[{'entity_group': 'PRON',
  'score': 0.9998221,
  'word': 'I',
  'start': 0,
  'end': 1},
 {'entity_group': 'VERB',
  'score': 0.9994816,
  'word': 'am cooking',
  'start': 2,
  'end': 12},
 {'entity_group': 'DET',
  'score': 0.99987876,
  'word': 'a',
  'start': 13,
  'end': 14},
 {'entity_group': 'NOUN',
  'score': 0.9994346,
  'word': 'Genovese recipe',
  'start': 15,
  'end': 30}]