In [49]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import re

tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

nlp = pipeline("ner", model=model, tokenizer=tokenizer)
example = "Hello my name is Sarah I am live in Canada in 1984 years with Ahmad"
# remove all non-alphabetic characters from the text
example = re.sub(r'[^a-zA-Z ]+', '', example)
ner_results = nlp(example)

import pandas as pd

# assuming your data is stored in a variable called 'data'
df = pd.DataFrame(ner_results)

df

Unnamed: 0,entity,score,index,word,start,end
0,B-PER,0.999105,5,Sarah,17,22
1,B-LOC,0.999755,10,Canada,36,42
2,B-PER,0.998341,14,Ahmad,58,63


In [50]:
# add new column with entity after '-'
df['entity_type'] = df['entity'].apply(lambda x: x.split('-')[1])
# delete original entity column
df = df.drop(columns=['entity'])

# rename entity_type column to entity
df = df.rename(columns={'entity_type': 'entity'})
print(df)

      score  index    word  start  end entity
0  0.999105      5   Sarah     17   22    PER
1  0.999755     10  Canada     36   42    LOC
2  0.998341     14   Ahmad     58   63    PER


In [53]:
# define desired column order
new_column_order = ['entity', 'word', 'score', 'index', 'start', 'end']

# reindex columns in desired order
df = df.reindex(columns=new_column_order)
df

Unnamed: 0,entity,word,score,index,start,end
0,PER,Sarah,0.999105,5,17,22
1,LOC,Canada,0.999755,10,36,42
2,PER,Ahmad,0.998341,14,58,63


In [54]:
grouped_df = df["grouped_df"].groupby('entity').count()
grouped_df = grouped_df.rename(columns={'entity_text': 'number'}).reset_index().rename(columns={'entity_label': 'entity'})


In [55]:
grouped_df

Unnamed: 0_level_0,word,score,index,start,end
entity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LOC,1,1,1,1,1
PER,2,2,2,2,2
