<a href="https://colab.research.google.com/github/chinmay002/NLP/blob/main/Chapter_4_NER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers
!pip install accelerate>=0.20.1
!pip install datasets



In [2]:
#Restart the notebook

In [3]:
from transformers import AutoTokenizer
import pandas as pd
import numpy as np


In [4]:
from datasets import get_dataset_config_names,load_dataset

xtreme_subsets = get_dataset_config_names('xtreme')


In [5]:
panx_subsets = [s for s in xtreme_subsets if s.startswith('PAN')]
panx_subsets[:3]

['PAN-X.af', 'PAN-X.ar', 'PAN-X.bg']

In [6]:
load_dataset('xtreme',name = 'PAN-X.de')

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 20000
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
})

In [7]:
from collections import defaultdict
from datasets import DatasetDict

#actua; proportion of population speaking lanhuages in Swiss
langs = ['de','fr','it','en']
fracs = [0.05,0.1,0.084,0.059]

panx_ch = defaultdict(DatasetDict)

for lang,frac in zip(langs,fracs):
  #load data
  ds = load_dataset('xtreme',name = f'PAN-X.{lang}')
  #select only assigned prop data
  for split in ds:
    panx_ch[lang][split] = (ds[split].shuffle(seed = 0).select(range(int(frac * ds[split].num_rows))))

In [8]:
ds = load_dataset('xtreme',name='PAN-X.de')
ds['train'].select(range(int(ds['train'].num_rows*0.25)))

Dataset({
    features: ['tokens', 'ner_tags', 'langs'],
    num_rows: 5000
})

In [9]:
panx_ch['de']

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 500
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 500
    })
})

In [10]:
panx_ch['de']['train'][0]
#ner_tags mapping each entity to a class ID

{'tokens': ['2.000',
  'Einwohnern',
  'an',
  'der',
  'Danziger',
  'Bucht',
  'in',
  'der',
  'polnischen',
  'Woiwodschaft',
  'Pommern',
  '.'],
 'ner_tags': [0, 0, 0, 0, 5, 6, 0, 0, 5, 5, 6, 0],
 'langs': ['de',
  'de',
  'de',
  'de',
  'de',
  'de',
  'de',
  'de',
  'de',
  'de',
  'de',
  'de']}

In [11]:
tags = panx_ch['de']['train'].features['ner_tags'].feature
print(tags)

ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None)


In [12]:
def create_tag_names(batch):

  return {'new_tag_names':[tags.int2str(idx) for idx in  batch['ner_tags']]}

panx_de = panx_ch['de'].map(create_tag_names)

In [13]:
from collections import Counter
split2freqs = defaultdict(Counter)
for split, dataset in panx_de.items():
  for row in dataset["new_tag_names"]:
    for tag in row:
      if tag.startswith("B"):
        tag_type = tag.split("-")[1]
        split2freqs[split][tag_type] += 1
pd.DataFrame.from_dict(split2freqs, orient="index")

Unnamed: 0,LOC,ORG,PER
train,474,416,505
validation,246,214,254
test,239,235,271


##SentencePieceToenizer

In [14]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [15]:
from transformers import AutoTokenizer

bert_model_name = 'bert-base-cased'
xlmr_model_name = 'xlm-roberta-base'
bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
xlmr_tokenizer = AutoTokenizer.from_pretrained(xlmr_model_name)


In [16]:
text = 'Chinmay Maganur from Haveri Karnataka'
print(bert_tokenizer(text).tokens())
print(xlmr_tokenizer(text).tokens())

['[CLS]', 'Chin', '##ma', '##y', 'Ma', '##gan', '##ur', 'from', 'Have', '##ri', 'Karnataka', '[SEP]']
['<s>', '▁Chin', 'may', '▁Maga', 'nur', '▁from', '▁Have', 'ri', '▁Karnataka', '</s>']


#Custom Model for own task

In [17]:
#custome classifcation head for XLM-R.  XLM uses Roberta , we will use ROBERTA as base model. and augument with XLM-R settings

In [18]:
import torch.nn as nn
from transformers import XLMRobertaConfig
from transformers.modeling_outputs import TokenClassifierOutput
from transformers.models.roberta.modeling_roberta import RobertaModel,RobertaPreTrainedModel

In [19]:
'''
from transformers import AutoModelForTokenClassification, AutoTokenizer
import torch

# Load a pre-trained token-level classification model and tokenizer
model_name = "dbmdz/bert-large-cased-finetuned-conll03-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Define the input text
text = "She walks to the park."

# Tokenize the text
input_ids = tokenizer.encode(text, return_tensors='pt')
print(input_ids)
# Perform token-level classification
with torch.no_grad():
    output = model(input_ids)

print(output)

# Get the predicted logits for each token
logits = output.logits
print(logits)
# Predicted labels can be obtained by finding the argmax of the logits
predicted_labels = logits.argmax(dim=2)[0].tolist()

# Map labels to their respective tokens
tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
token_label_pairs = list(zip(tokens, predicted_labels))

# Display the results
for token, label_id in token_label_pairs:
    label = model.config.id2label[label_id]
    print(f"{token}: {label}")
'''

'\nfrom transformers import AutoModelForTokenClassification, AutoTokenizer\nimport torch\n\n# Load a pre-trained token-level classification model and tokenizer\nmodel_name = "dbmdz/bert-large-cased-finetuned-conll03-english"\ntokenizer = AutoTokenizer.from_pretrained(model_name)\nmodel = AutoModelForTokenClassification.from_pretrained(model_name)\n\n# Define the input text\ntext = "She walks to the park."\n\n# Tokenize the text\ninput_ids = tokenizer.encode(text, return_tensors=\'pt\')\nprint(input_ids)\n# Perform token-level classification\nwith torch.no_grad():\n    output = model(input_ids)\n\nprint(output)\n\n# Get the predicted logits for each token\nlogits = output.logits\nprint(logits)\n# Predicted labels can be obtained by finding the argmax of the logits\npredicted_labels = logits.argmax(dim=2)[0].tolist()\n\n# Map labels to their respective tokens\ntokens = tokenizer.convert_ids_to_tokens(input_ids[0])\ntoken_label_pairs = list(zip(tokens, predicted_labels))\n\n# Display the 

In [20]:
class XLMRobertaForTokenClassification(RobertaPreTrainedModel):
  config_class = XLMRobertaConfig

  def __init__(self,config):
    super().__init__(config)
    self.num_labels = config.num_labels
    self.roberta = RobertaModel(config,add_pooling_layer = False)#pool=false to ensure all tokens hiddenstates are returned and not just CLS
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    self.classifier = nn.Linear(config.hidden_size,config.num_labels)
    self.init_weights() #inherit from RobertaPreTrained  to load the pretreined weights for model body and randomly initialize the wieghts of our tokens classif head


  def forward(self,input_ids=None,attention_mask=None,token_type_ids =None,labels=None,**kwargs):
    outputs = self.roberta(input_ids,attention_mask = attention_mask,token_type_ids = token_type_ids,**kwargs)
    sequence_output = self.dropout(outputs[0])
    logits = self.classifier(sequence_output)
    loss = None
    if labels is not None:
      loss_fct = nn.CrossEntropyLoss()
      loss = loss_fct(logits.view(-1,self.num_labels),labels.view(-1))
    return TokenClassifierOutput(loss= loss,logits = logits, hidden_states=outputs.hidden_states,attentions = outputs.attentions)

In [21]:
#logits.view(-1)

In [22]:
index2tag = {idx:tag for idx,tag in enumerate(tags.names)}
tag2index = {tag:idx for idx,tag in enumerate(tags.names)}

In [23]:
print(index2tag)
print(tag2index)


{0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'}
{'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6}


In [24]:
from transformers import AutoConfig
xlmr_config = AutoConfig.from_pretrained(xlmr_model_name,num_labels = tags.num_classes,id2label = index2tag,label2id = tag2index)
#when we load model with automodel.from_pretrained. config file is automatically downloaded , if we want to modify something like num_classes ,
#then we can load the config first with params we would like to customize

In [25]:
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
xlmr_model = (XLMRobertaForTokenClassification.from_pretrained(xlmr_model_name,config = xlmr_config)).to(device)

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
text ='Jack Sparrow loves New York!'
xlmr_tokens = xlmr_tokenizer(text).tokens()
input_ids = xlmr_tokenizer.encode(text,return_tensors ='pt')
pd.DataFrame([xlmr_tokens, input_ids[0].numpy()], index=["Tokens", "Input IDs"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Tokens,<s>,▁Jack,▁Spar,row,▁love,s,▁New,▁York,!,</s>
Input IDs,0,21763,37456,15555,5161,7,2356,5753,38,2


In [27]:
outputs = xlmr_model(input_ids.to(device)).logits
predictions = torch.argmax(outputs,dim = -1)
print(f"Number of tokens in sequence: {len(xlmr_tokens)}")
print(f"Shape of outputs: {outputs.shape}")

Number of tokens in sequence: 10
Shape of outputs: torch.Size([1, 10, 7])


In [28]:
print(torch.argmax(outputs,dim=-1))
print(predictions)
print(outputs)

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0')
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0')
tensor([[[ 0.3510, -0.0175,  0.0277, -0.2717, -0.5207, -0.1324, -0.1182],
         [ 0.6105, -0.3184, -0.2569, -0.6354, -0.3722, -0.3873,  0.0837],
         [ 0.7122, -0.2820, -0.1677, -0.7065, -0.4956, -0.3153,  0.2556],
         [ 0.6435, -0.3202, -0.1960, -0.6538, -0.4139, -0.3739,  0.2433],
         [ 0.5979, -0.3601, -0.1524, -0.5731, -0.4892, -0.2949,  0.1870],
         [ 0.6535, -0.3851, -0.2545, -0.5590, -0.4678, -0.3769,  0.1904],
         [ 0.6010, -0.3161, -0.2738, -0.6273, -0.4731, -0.3616,  0.1695],
         [ 0.4922, -0.2891, -0.2424, -0.6891, -0.4254, -0.4085,  0.1170],
         [ 0.5883, -0.4111, -0.3914, -0.6481, -0.4516, -0.4138,  0.2788],
         [ 0.2976,  0.0693,  0.0382, -0.2389, -0.4660, -0.1176, -0.1756]]],
       device='cuda:0', grad_fn=<ViewBackward0>)


In [29]:
preds = [tags.names[p] for p in predictions[0].cpu().numpy()]
pd.DataFrame([xlmr_tokens,preds])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,<s>,▁Jack,▁Spar,row,▁love,s,▁New,▁York,!,</s>
1,O,O,O,O,O,O,O,O,O,O


In [30]:
def tag_text(text,tags,model,tokenizer):
  tokens = tokenizer(text).tokens()
  input_ids = xlmr_tokenizer.encode(text,return_tensors ='pt')
  outputs = model(input_ids.to(device)).logits
  predictions = torch.argmax(outputs,dim = -1)
  preds = [tags.names[p] for p in predictions[0].cpu().numpy()]
  return pd.DataFrame([xlmr_tokens,preds])

In [31]:
xlmr_tokenizer

#xlmr_tokenizer = AutoTokenizer.from_pretrained(xlmr_model_name)


XLMRobertaTokenizerFast(name_or_path='xlm-roberta-base', vocab_size=250002, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	250001: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False, special=True),
}

In [32]:
words ,labels = panx_de['train'][0]['tokens'],panx_de['train'][0]['ner_tags']
words,labels

(['2.000',
  'Einwohnern',
  'an',
  'der',
  'Danziger',
  'Bucht',
  'in',
  'der',
  'polnischen',
  'Woiwodschaft',
  'Pommern',
  '.'],
 [0, 0, 0, 0, 5, 6, 0, 0, 5, 5, 6, 0])

In [33]:
tokenized_input = xlmr_tokenizer(panx_de['train'][0]['tokens'],is_split_into_words=True)#we cant send tokens to encode, we need to send text
xlmr_tokenizer(panx_de['train'][0]['tokens'],is_split_into_words=True)

{'input_ids': [0, 70101, 176581, 19, 142, 122, 2290, 708, 1505, 18363, 18, 23, 122, 127474, 15439, 13787, 14, 15263, 18917, 663, 6947, 19, 6, 5, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [34]:
xlmr_tokenizer(panx_de['train'][0]['tokens'])

{'input_ids': [[0, 70101, 2], [0, 176581, 19, 2], [0, 142, 2], [0, 122, 2], [0, 2290, 708, 1505, 2], [0, 18363, 18, 2], [0, 23, 2], [0, 122, 2], [0, 127474, 15439, 2], [0, 13787, 14, 15263, 18917, 2], [0, 663, 6947, 19, 2], [0, 6, 5, 2]], 'attention_mask': [[1, 1, 1], [1, 1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1], [1, 1, 1, 1]]}

In [35]:
tokens = xlmr_tokenizer.convert_ids_to_tokens(tokenized_input['input_ids'])
pd.DataFrame([tokens],index = ['Tokens'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
Tokens,<s>,▁2.000,▁Einwohner,n,▁an,▁der,▁Dan,zi,ger,▁Buch,...,▁Wo,i,wod,schaft,▁Po,mmer,n,▁,.,</s>


In [36]:
#'Einwohnern' is diveide into two words. We need a way to mask the sub represntation after first words. tokenized_input provides word_ids()
word_ids = tokenized_input.word_ids()
pd.DataFrame([tokens,word_ids],index = ['tokens','words_ids'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
tokens,<s>,▁2.000,▁Einwohner,n,▁an,▁der,▁Dan,zi,ger,▁Buch,...,▁Wo,i,wod,schaft,▁Po,mmer,n,▁,.,</s>
words_ids,,0,1,1,2,3,4,4,4,5,...,9,9,9,9,10,10,10,11,11,


In [37]:
prev_word_idx = None
label_ids = []
for word_idx in word_ids :
  if word_idx is None or word_idx==prev_word_idx:
    label_ids.append(-100)
  elif word_idx != prev_word_idx:
    label_ids.append(labels[word_idx])
  prev_word_idx = word_idx

labels = [index2tag[l] if l!= -100 else "IGN"for  l in label_ids]
index = ['Tokens','WOrd IDs','label ids','labels']

In [38]:
pd.DataFrame([tokens,word_ids,label_ids,labels],index=index)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
Tokens,<s>,▁2.000,▁Einwohner,n,▁an,▁der,▁Dan,zi,ger,▁Buch,...,▁Wo,i,wod,schaft,▁Po,mmer,n,▁,.,</s>
WOrd IDs,,0,1,1,2,3,4,4,4,5,...,9,9,9,9,10,10,10,11,11,
label ids,-100,0,0,-100,0,0,5,-100,-100,6,...,5,-100,-100,-100,6,-100,-100,0,-100,-100
labels,IGN,O,O,IGN,O,O,B-LOC,IGN,IGN,I-LOC,...,B-LOC,IGN,IGN,IGN,I-LOC,IGN,IGN,O,IGN,IGN


In [39]:
#why -100, pytorch cross entropy loss has an attributte called ignore_index whose value is -100, index is ignore dureing training, so these tokens are ignores

In [40]:
#for a complete data
def tokenize_and_align_labels(examples):
  tokenized_inputs = xlmr_tokenizer(examples["tokens"], truncation=True,is_split_into_words=True)
  labels = []
  for idx, label in enumerate(examples["ner_tags"]):
    word_ids = tokenized_inputs.word_ids(batch_index=idx)
    previous_word_idx = None
    label_ids = []
    for word_idx in word_ids:
      if word_idx is None or word_idx == previous_word_idx:
        label_ids.append(-100)
      else:
        label_ids.append(label[word_idx])
      previous_word_idx = word_idx
    labels.append(label_ids)
  tokenized_inputs["labels"] = labels
  return tokenized_inputs

def encode_panx_dataset(corpus):
  return corpus.map(tokenize_and_align_labels,batched=True,remove_columns = ['langs','ner_tags','tokens'])

In [41]:
panx_de_encoded = encode_panx_dataset(panx_ch['de'])

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [42]:
panx_de_encoded['train'][0]

{'input_ids': [0,
  70101,
  176581,
  19,
  142,
  122,
  2290,
  708,
  1505,
  18363,
  18,
  23,
  122,
  127474,
  15439,
  13787,
  14,
  15263,
  18917,
  663,
  6947,
  19,
  6,
  5,
  2],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'labels': [-100,
  0,
  0,
  -100,
  0,
  0,
  5,
  -100,
  -100,
  6,
  -100,
  0,
  0,
  5,
  -100,
  5,
  -100,
  -100,
  -100,
  6,
  -100,
  -100,
  0,
  -100,
  -100]}

In [43]:
import numpy as np
def align_predictions(predictions,label_ids):
  preds = np.argmax(predictions,axis=2)
  batch_size,seq_len = preds.shape
  labels_list,preds_list = [],[]
  for batch_idx in range(batch_size):
    example_labels,example_preds = [],[]
    for seq_idx in range(seq_len):
      if label_ids[batch_idx,seq_idx]!=-100:
        example_labels.append(index2tag[label_ids[batch_idx,seq_idx]])
        example_preds.append(index2tag[preds[batch_idx,seq_idx]])
  labels_list.append(example_labels)
  preds_list.append(example_preds)

  return labels_list,preds_list

In [44]:
from transformers import TrainingArguments
num_epochs = 3
batch_size = 24
logging_steps = len(panx_de_encoded["train"]) // batch_size
model_name = f"{xlmr_model_name}-finetuned-panx-de"
training_args = TrainingArguments(output_dir=model_name, log_level="error", num_train_epochs=num_epochs,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size, evaluation_strategy="epoch",
                                  save_steps=1e6, weight_decay=0.01, disable_tqdm=False,
                                  logging_steps=logging_steps,)

In [45]:
!pip install seqeval



In [46]:
from seqeval.metrics import f1_score
def compute_metrics(eval_pred):
  y_pred,y_true = align_predictions(eval_pred.predictions,eval_pred.label_ids)
  return {'f1':f1_score(y_true,y_pred)}

In [47]:
#padding each input sequence length to a largest sequence length in a batch
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(xlmr_tokenizer)


In [48]:
def model_init():
  return (XLMRobertaForTokenClassification.from_pretrained(xlmr_model_name,config =xlmr_config).to(device))

In [49]:
from transformers import Trainer
trainer = Trainer(model_init=model_init, args=training_args,
                  data_collator=data_collator, compute_metrics=compute_metrics,
                  train_dataset=panx_de_encoded["train"],
                  eval_dataset=panx_de_encoded["validation"],
                  tokenizer=xlmr_tokenizer)

In [50]:
trainer.train()

Epoch,Training Loss,Validation Loss,F1
1,0.7285,0.310838,1.0
2,0.2742,0.227314,1.0
3,0.1874,0.207024,1.0


TrainOutput(global_step=126, training_loss=0.39109017073162017, metrics={'train_runtime': 43.4231, 'train_samples_per_second': 69.088, 'train_steps_per_second': 2.902, 'total_flos': 70214428965648.0, 'train_loss': 0.39109017073162017, 'epoch': 3.0})

In [51]:
trainer.model

XLMRobertaForTokenClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
            

In [52]:
text_de = "Jeff Dean ist ein Informatiker bei Google in Kalifornien"
tag_text(text_de, tags, trainer.model, xlmr_tokenizer)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,<s>,▁Jack,▁Spar,row,▁love,s,▁New,▁York,!,</s>,,,,
1,O,B-PER,I-PER,I-PER,O,O,O,O,O,B-ORG,O,B-LOC,B-LOC,O


##Error Analysis

In [59]:
aaaaa

#we might have accidently mask too many tokens and also mask some of our original labels to get good drop in perf
#we might include zero calss or 0 entity in NER as a normal class, which will heavily skew accuracy and f1-scores since it is the majority class by a large margin


In [60]:
from torch.nn.functional import cross_entropy

In [61]:
'''
def forward_pass_with_label(valid_set):
  print(valid_set)
  print('-------valid_set-------')


  features = [dict(zip(valid_set,t))for t in zip(*valid_set.values())]

  print(features)
  print('-----------------batch---------------')

  print(t)
  print('------------------t-----------------')


  print()
  '''

"\ndef forward_pass_with_label(valid_set):\n  print(valid_set)\n  print('-------valid_set-------')\n\n  \n  features = [dict(zip(valid_set,t))for t in zip(*valid_set.values())]\n  \n  print(features)\n  print('-----------------batch---------------')\n\n  print(t)\n  print('------------------t-----------------')\n\n\n  print()\n  "

In [62]:
'''
#{inputs_ids : [[],[],[]],attention_masks: [[],[],[]],labels = [[],[],[]]}
#

'''



#valid_set = panx_de_encoded['validation'][0:2]
#forward_pass_with_label(valid_set)

'\n#{inputs_ids : [[],[],[]],attention_masks: [[],[],[]],labels = [[],[],[]]}\n#\n\n'

In [63]:
#panx_de_encoded['validation'].map(forward_pass_with_label,batched=True,batch_size=2)

In [64]:
batch = {
    'feature1': [1, 2, 3],
    'feature2': [4, 5, 6]
}

[dict(zip(batch,t))for t in zip(*batch.values())]
#zip function pairs the iterables. since from batch iterbale is key and tuple is also iterable, its zips key and tuple

[{'feature1': 1, 'feature2': 4},
 {'feature1': 2, 'feature2': 5},
 {'feature1': 3, 'feature2': 6}]

In [65]:
for t in zip(*batch.values()):
  print(batch,t)

{'feature1': [1, 2, 3], 'feature2': [4, 5, 6]} (1, 4)
{'feature1': [1, 2, 3], 'feature2': [4, 5, 6]} (2, 5)
{'feature1': [1, 2, 3], 'feature2': [4, 5, 6]} (3, 6)


In [66]:
#feat = [{'input_ids': [0, 10699, 11, 15, 16104, 1388, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1], 'labels': [-100, 3, -100, 4, 4, 4, -100]}, {'input_ids': [0, 56530, 25216, 30121, 152385, 19229, 83982, 1002, 170, 10, 434, 188, 31721, 299, 170, 57, 15263, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [-100, 0, -100, -100, -100, -100, 3, -100, -100, 4, -100, -100, -100, -100, -100, 4, -100, -100]}]
#dc = data_collator(feat) #paded with 1's
#label_ids = dc['labels']
#outputs = trainer.model(dc['input_ids'].to(device),dc['attention_mask'].to(device))
#print(outputs.logits)
#pred_label = torch.argmax(trainer.model(dc['input_ids'].to(device),dc['attention_mask'].to(device))[0],dim=-1)
#print(pred_label)



In [67]:
def forward_pass_with_label(batch):
  features = [dict(zip(batch,t))for t in zip(*batch.values())]
  batch = data_collator(features)
  input_ids = batch['input_ids'].to(device)
  attention_masks = batch['attention_mask'].to(device)
  labels =batch['labels'].to(device)
  with torch.no_grad():
    outputs = trainer.model(input_ids,attention_masks) #[batch,seq_len,classes]
    predicted_label = torch.argmax(outputs.logits,axis=-1).cpu().numpy()
  loss = cross_entropy(outputs.logits.view(-1,7),labels.view(-1),reduction='none')
  loss = loss.view(len(input_ids),-1).cpu().numpy()
  return {'loss':loss,'predicted_label':predicted_label}



In [None]:
valid_set = panx_de_encoded["validation"]
valid_set = valid_set.map(forward_pass_with_label, batched=True, batch_size=32)
df = valid_set.to_pandas()

##Cross-Lingual Transfer

In [64]:
def get_f1_score(trainer,data):
  return trainer.predict(data).metrics['test_f1']

In [65]:
f1_scores = defaultdict(dict)
f1_scores['de']['de'] = get_f1_score(trainer,panx_de_encoded['test'])
print(f"F1-score of [de] model on [de] dataset: {f1_scores['de']['de']:.3f}")

F1-score of [de] model on [de] dataset: 0.923


In [67]:
text_fr = "Jeff Dean est informaticien chez Google en Californie"
tag_text(text_fr, tags, trainer.model, xlmr_tokenizer)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,<s>,▁Jack,▁Spar,row,▁love,s,▁New,▁York,!,</s>,,,,
1,I-LOC,B-PER,I-PER,I-PER,O,O,O,O,B-ORG,O,B-LOC,B-LOC,B-LOC,I-LOC


In [68]:
def evaluate_lang_performance(lang, trainer):
  panx_ds = encode_panx_dataset(panx_ch[lang])
  return get_f1_score(trainer, panx_ds["test"])
f1_scores["de"]["fr"] = evaluate_lang_performance("fr", trainer)
print(f"F1-score of [de] model on [fr] dataset: {f1_scores['de']['fr']:.3f}")

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

F1-score of [de] model on [fr] dataset: 0.000


In [69]:
f1_scores["de"]["it"] = evaluate_lang_performance("it", trainer)
print(f"F1-score of [de] model on [it] dataset: {f1_scores['de']['it']:.3f}")

Map:   0%|          | 0/1680 [00:00<?, ? examples/s]

Map:   0%|          | 0/840 [00:00<?, ? examples/s]

Map:   0%|          | 0/840 [00:00<?, ? examples/s]

F1-score of [de] model on [it] dataset: 1.000


In [71]:
f1_scores["de"]["en"] = evaluate_lang_performance("en", trainer)
print(f"F1-score of [de] model on [en] dataset: {f1_scores['de']['en']:.3f}")


Map:   0%|          | 0/1180 [00:00<?, ? examples/s]

Map:   0%|          | 0/590 [00:00<?, ? examples/s]

Map:   0%|          | 0/590 [00:00<?, ? examples/s]

F1-score of [de] model on [en] dataset: 1.000
