In [2]:
import pandas as pd

In [3]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [4]:
from simpletransformers.ner import NERModel,NERArgs

In [5]:
df = pd.read_csv('/content/ner_dataset.csv', encoding= 'latin1')
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [6]:
df.Tag.unique()

array(['O', 'B-geo', 'B-gpe', 'B-per', 'I-geo', 'B-org', 'I-org', 'B-tim',
       'B-art', 'I-art', 'I-per', 'I-gpe', 'I-tim', 'B-nat', 'B-eve',
       'I-eve', 'I-nat'], dtype=object)

In [7]:
df[df.Tag == 'B-geo'].head()

Unnamed: 0,Sentence #,Word,POS,Tag
6,,London,NNP,B-geo
12,,Iraq,NNP,B-geo
65,,Hyde,NNP,B-geo
94,,Britain,NNP,B-geo
106,,Brighton,NNP,B-geo


In [8]:
# fill missing values "Sentence #" column with fowrad fill
df =df.fillna(method ="ffill")

In [9]:
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O


In [10]:
# label encoding sentence column
df["Sentence #"] = LabelEncoder().fit_transform(df["Sentence #"] )

In [11]:
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,0,Thousands,NNS,O
1,0,of,IN,O
2,0,demonstrators,NNS,O
3,0,have,VBP,O
4,0,marched,VBN,O


In [12]:
df.rename(columns={"Sentence #":"sentence_id","Word":"words","Tag":"labels"}, inplace =True)

In [13]:
# grouping dependednt and indenpendent variables
X= df[["sentence_id","words"]]
Y =df["labels"]

In [14]:
X.head()

Unnamed: 0,sentence_id,words
0,0,Thousands
1,0,of
2,0,demonstrators
3,0,have
4,0,marched


In [15]:
# creating and spliting train and test data
x_train, x_test, y_train, y_test = train_test_split(X,Y, test_size =0.2)

In [16]:
#building up train data and test data
train_data = pd.DataFrame({"sentence_id":x_train["sentence_id"],"words":x_train["words"],"labels":y_train})
test_data = pd.DataFrame({"sentence_id":x_test["sentence_id"],"words":x_test["words"],"labels":y_test})

In [17]:
train_data.head()

Unnamed: 0,sentence_id,words,labels
585767,18653,rebate,O
293758,3839,identified,O
574813,18084,",",O
547004,16679,the,O
45028,11435,.,O


In [18]:
# training a model
NER_tags = df["labels"].unique().tolist()
NER_tags

['O',
 'B-geo',
 'B-gpe',
 'B-per',
 'I-geo',
 'B-org',
 'I-org',
 'B-tim',
 'B-art',
 'I-art',
 'I-per',
 'I-gpe',
 'I-tim',
 'B-nat',
 'B-eve',
 'I-eve',
 'I-nat']

In [19]:
# parameter for the NER BERT model 
args = NERArgs()
# number of epochs
args.num_train_epochs = 2
# learning rate
args.learning_rate = 1e-4
args.overwrite_output_dir =True
# training and validation batch size
args.train_batch_size = 32
args.eval_batch_size = 32

In [20]:
model = NERModel('bert', 'bert-base-cased',labels=NER_tags,args =args)

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/416M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

In [21]:
model.train_model(train_data,eval_data = test_data,acc=accuracy_score)

  0%|          | 0/2 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/1499 [00:00<?, ?it/s]



Running Epoch 1 of 2:   0%|          | 0/1499 [00:00<?, ?it/s]

(2998, 0.1620789350229772)

In [22]:
result, model_outputs, preds_list = model.eval_model(test_data)

  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1460 [00:00<?, ?it/s]

In [23]:
result

{'eval_loss': 0.1780587410464985,
 'f1_score': 0.7960392744217009,
 'precision': 0.8260343993921393,
 'recall': 0.768146197327852}

In [28]:
# prediction on a sample text
prediction, model_output = model.predict(["London is the capital and largest city of England and the United Kingdom."])

  0%|          | 0/1 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

In [29]:
prediction

[[{'London': 'B-geo'},
  {'is': 'O'},
  {'the': 'O'},
  {'capital': 'O'},
  {'and': 'O'},
  {'largest': 'O'},
  {'city': 'O'},
  {'of': 'O'},
  {'England': 'B-geo'},
  {'and': 'O'},
  {'the': 'O'},
  {'United': 'B-geo'},
  {'Kingdom.': 'I-geo'}]]