<a href="https://colab.research.google.com/github/gupta24789/named-entity-recognition/blob/main/ner_bert_simple_transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q simpletransformers

In [2]:
import os
from pathlib import Path
import pandas as pd
from tqdm import tqdm
from pprint import pprint
import itertools

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from simpletransformers.ner import NERModel,NERArgs

In [3]:
# ## Doenload data & structure
# Path('data/train').mkdir(parents = True, exist_ok= True)
# Path('data/val').mkdir(parents = True, exist_ok= True)
# Path('data/test').mkdir(parents = True, exist_ok= True)

# os.system("cd data/train && wget https://raw.githubusercontent.com/gupta24789/named-entity-recognition/main/data/train/sentences.txt")
# os.system("cd data/train && wget https://raw.githubusercontent.com/gupta24789/named-entity-recognition/main/data/train/labels.txt")
# os.system("cd data/val && wget https://raw.githubusercontent.com/gupta24789/named-entity-recognition/main/data/val/sentences.txt")
# os.system("cd data/val && wget https://raw.githubusercontent.com/gupta24789/named-entity-recognition/main/data/val/labels.txt")
# os.system("cd data/test && wget https://raw.githubusercontent.com/gupta24789/named-entity-recognition/main/data/test/sentences.txt")
# os.system("cd data/test && wget https://raw.githubusercontent.com/gupta24789/named-entity-recognition/main/data/test/labels.txt")

In [4]:
train_sentences = open("data/train/sentences.txt","r").readlines()
train_labels = open("data/train/labels.txt","r").readlines()
val_sentences = open("data/val/sentences.txt","r").readlines()
val_labels = open("data/val/labels.txt","r").readlines()
test_sentences = open("data/test/sentences.txt","r").readlines()
test_labels = open("data/test/labels.txt","r").readlines()

In [5]:
train_sentences[:3]

['Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .\n',
 'Families of soldiers killed in the conflict joined the protesters who carried banners with such slogans as " Bush Number One Terrorist " and " Stop the Bombings . "\n',
 'They marched from the Houses of Parliament to a rally in Hyde Park .\n']

In [6]:
def transform_data(sents, labels):
  sent_list = []
  label_list = []
  sentence_ids = []

  for i,(sent, label) in enumerate(zip(sents, labels)):
    sent = sent.strip().split(" ")
    label = label.strip().split(" ")
    sentence_ids.append([f'Sentence : {i+1}'] * len(sent))
    sent_list.append(sent)
    label_list.append(label)
  return sentence_ids, sent_list, label_list

In [7]:
train_sent_ids, train_sents, train_tags = transform_data(train_sentences,train_labels)
val_sent_ids, val_sents, val_tags  = transform_data(val_sentences,val_labels)
test_sent_ids, test_sents, test_tags  = transform_data(val_sentences,val_labels)

In [8]:
pprint(train_sent_ids[:2], compact=True)

[['Sentence : 1', 'Sentence : 1', 'Sentence : 1', 'Sentence : 1',
  'Sentence : 1', 'Sentence : 1', 'Sentence : 1', 'Sentence : 1',
  'Sentence : 1', 'Sentence : 1', 'Sentence : 1', 'Sentence : 1',
  'Sentence : 1', 'Sentence : 1', 'Sentence : 1', 'Sentence : 1',
  'Sentence : 1', 'Sentence : 1', 'Sentence : 1', 'Sentence : 1',
  'Sentence : 1', 'Sentence : 1', 'Sentence : 1', 'Sentence : 1'],
 ['Sentence : 2', 'Sentence : 2', 'Sentence : 2', 'Sentence : 2',
  'Sentence : 2', 'Sentence : 2', 'Sentence : 2', 'Sentence : 2',
  'Sentence : 2', 'Sentence : 2', 'Sentence : 2', 'Sentence : 2',
  'Sentence : 2', 'Sentence : 2', 'Sentence : 2', 'Sentence : 2',
  'Sentence : 2', 'Sentence : 2', 'Sentence : 2', 'Sentence : 2',
  'Sentence : 2', 'Sentence : 2', 'Sentence : 2', 'Sentence : 2',
  'Sentence : 2', 'Sentence : 2', 'Sentence : 2', 'Sentence : 2',
  'Sentence : 2', 'Sentence : 2']]


In [9]:
pprint(train_sents[:2], compact=True)

[['Thousands', 'of', 'demonstrators', 'have', 'marched', 'through', 'London',
  'to', 'protest', 'the', 'war', 'in', 'Iraq', 'and', 'demand', 'the',
  'withdrawal', 'of', 'British', 'troops', 'from', 'that', 'country', '.'],
 ['Families', 'of', 'soldiers', 'killed', 'in', 'the', 'conflict', 'joined',
  'the', 'protesters', 'who', 'carried', 'banners', 'with', 'such', 'slogans',
  'as', '"', 'Bush', 'Number', 'One', 'Terrorist', '"', 'and', '"', 'Stop',
  'the', 'Bombings', '.', '"']]


In [10]:
pprint(train_tags[:2], compact=True)

[['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O',
  'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O'],
 ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
  'O', 'O', 'O', 'B-per', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
  'O']]


In [11]:
train_df = pd.DataFrame({"sentence_id": list(itertools.chain.from_iterable(train_sent_ids)),
                         "words": list(itertools.chain.from_iterable(train_sents)),
                         "labels": list(itertools.chain.from_iterable(train_tags))})

val_df = pd.DataFrame({"sentence_id": list(itertools.chain.from_iterable(val_sent_ids)),
                         "words": list(itertools.chain.from_iterable(val_sents)),
                         "labels": list(itertools.chain.from_iterable(val_tags))})

test_df = pd.DataFrame({"sentence_id": list(itertools.chain.from_iterable(test_sent_ids)),
                         "words": list(itertools.chain.from_iterable(test_sents)),
                         "labels": list(itertools.chain.from_iterable(test_tags))})

In [12]:
train_df.head()

Unnamed: 0,sentence_id,words,labels
0,Sentence : 1,Thousands,O
1,Sentence : 1,of,O
2,Sentence : 1,demonstrators,O
3,Sentence : 1,have,O
4,Sentence : 1,marched,O


In [13]:
val_df.head()

Unnamed: 0,sentence_id,words,labels
0,Sentence : 1,Russia,B-geo
1,Sentence : 1,'s,O
2,Sentence : 1,victory,O
3,Sentence : 1,put,O
4,Sentence : 1,the,O


In [14]:
train_df.sentence_id = LabelEncoder().fit_transform(train_df.sentence_id)
val_df.sentence_id = LabelEncoder().fit_transform(val_df.sentence_id)
test_df.sentence_id = LabelEncoder().fit_transform(test_df.sentence_id)

In [15]:
label = train_df["labels"].unique().tolist()
label

['O',
 'B-geo',
 'B-gpe',
 'B-per',
 'I-geo',
 'B-org',
 'I-org',
 'B-tim',
 'B-art',
 'I-art',
 'I-per',
 'I-gpe',
 'I-tim',
 'B-nat',
 'B-eve',
 'I-eve',
 'I-nat']

## Data format

## data : sentence_id,	words,	labels

In [16]:
args = NERArgs()
args.num_train_epochs = 3
args.learning_rate = 1e-4
args.overwrite_output_dir =True
args.train_batch_size = 64
args.eval_batch_size = 32
args.use_cuda = True

In [17]:
model = NERModel('bert', 'bert-base-cased',labels=label,args =args)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
model.train_model(train_df,eval_data = val_df,acc=accuracy_score)

  return [


  0%|          | 0/2 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/525 [00:00<?, ?it/s]



Running Epoch 1 of 3:   0%|          | 0/525 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/525 [00:00<?, ?it/s]

(1575, 0.11362296008992763)

In [19]:
result, model_outputs, preds_list = model.eval_model(train_df)
result

  return [


  0%|          | 0/2 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1050 [00:00<?, ?it/s]

{'eval_loss': 0.031021951520150262,
 'precision': 0.9265438373570521,
 'recall': 0.9316697969770146,
 'f1_score': 0.9290997470805966}

In [20]:
result, model_outputs, preds_list = model.eval_model(val_df)
result

  return [


  0%|          | 0/2 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/225 [00:00<?, ?it/s]

{'eval_loss': 0.09207151164611181,
 'precision': 0.8344152012274283,
 'recall': 0.842519215873205,
 'f1_score': 0.8384476266714104}

In [21]:
result, model_outputs, preds_list = model.eval_model(test_df)
result

  return [


  0%|          | 0/2 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/225 [00:00<?, ?it/s]

{'eval_loss': 0.09207151164611181,
 'precision': 0.8344152012274283,
 'recall': 0.842519215873205,
 'f1_score': 0.8384476266714104}

In [23]:
prediction, model_output = model.predict(["What is the new name of Bangalore"])
prediction

  0%|          | 0/1 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

[[{'What': 'O'},
  {'is': 'O'},
  {'the': 'O'},
  {'new': 'O'},
  {'name': 'O'},
  {'of': 'O'},
  {'Bangalore': 'B-geo'}]]

In [28]:
prediction, model_output = model.predict(["Apple is launching new iphone to India"])
prediction


  0%|          | 0/1 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

[[{'Apple': 'B-org'},
  {'is': 'O'},
  {'launching': 'O'},
  {'new': 'O'},
  {'iphone': 'O'},
  {'to': 'O'},
  {'India': 'B-geo'}]]