In [2]:
# Mount Google Drive (Colab only) so the pickles under /content/drive/MyDrive are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
"""
Data annotation via transfer learning (zero-shot learning mechanism) using DistilBERT and RoBERTa.
"""
import pandas as pd
from transformers import DistilBertForTokenClassification
from transformers import DistilBertTokenizerFast
from transformers import RobertaForTokenClassification
from transformers import RobertaTokenizer
from transformers import pipeline

In [4]:
def load_dataset_from_pickle(**kwargs)->pd.core.frame.DataFrame:
    """Read a pickled dataset from disk.

    Keyword Args:
        file_path: location of the pickle file to read.

    Returns:
        Whatever object ``pd.read_pickle`` restores from ``file_path``.

    Raises:
        KeyError: if ``file_path`` is not supplied.

    Note:
        Unpickling can execute arbitrary code; only load files you trust.
    """
    source_path = kwargs['file_path']
    return pd.read_pickle(source_path)

In [7]:
def save_dataset_to_pickle(**kwargs)->None:
    """Write a dataset to disk as a pickle file.

    Keyword Args:
        dataset: object exposing ``to_pickle`` (e.g. a pandas DataFrame).
        file_path: destination of the pickle file.

    Raises:
        KeyError: if ``dataset`` or ``file_path`` is not supplied.
    """
    write_pickle = kwargs['dataset'].to_pickle
    write_pickle(kwargs['file_path'])

In [8]:
class NER:
    model=None
    model_pipeline=None
    tokenizer=None
    dataset:pd.core.frame.DataFrame=None
    ner_dataset:pd.core.frame.DataFrame=None
    model_choice:str=""
    def __init__(self,**kwargs)->None:
        self.dataset = kwargs['tokenised_dataset']
        self.model_choice = kwargs['model_choice']
        self.ner_dataset = pd.DataFrame(columns=list(self.dataset.columns))
    def __select_model(self)->None:
        match(self.model_choice):
            case 'DistilBert':
                self.model = DistilBertForTokenClassification.from_pretrained('distilbert-base-uncased')
                self.tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased',tokenization_strategy='word',model_max_length=512,padding=True,truncation=True)
            case 'Roberta':
                self.model = RobertaForTokenClassification.from_pretrained('roberta-base-uncased')
                self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base-uncased', tokenization_strategy='word',model_max_length=512)
            case _:
                self.model = None
    def __entity_recognition(self,**kwargs)->None:
        sentences = kwargs['data_sentences']
        self.model_pipeline = pipeline('ner',model=self.model,tokenizer=self.tokenizer)
        result:list=[]
        for sentence in sentences:
            result.append(self.model_pipeline(sentence))
        return result

    def entity_recognition(self)->None:
        self.__select_model()
        tmp_dict:dict = {i:[] for i in ('entity','word')}
        for field in self.dataset.columns:
            ner_results = self.dataset.apply(lambda row: self.__entity_recognition(data_sentences=row[field]),axis=1)
            for result in ner_results[0][0]:
                tmp_dict['entity'].append(result[0]['entity'])
                tmp_dict['word'].append(result[0]['word'])
            self.ner_dataset[field] = tmp_dict
            tmp_dict['entity'] = []
            tmp_dict['word'] = []
        save_dataset_to_pickle(dataset=self.ner_dataset,file_path=rf'/content/drive/MyDrive/herbology/ner_dataset.pkl')


In [30]:
def main()->None:
    """Load the tokenised dataset from Drive and run DistilBert entity recognition on it."""
    tokens_frame = load_dataset_from_pickle(file_path=rf'/content/drive/MyDrive/herbology/tokens.pkl')
    annotator = NER(tokenised_dataset=tokens_frame,model_choice='DistilBert')
    annotator.entity_recognition()

In [None]:
# Run the annotation end to end: reads tokens.pkl and writes ner_dataset.pkl.
main()

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  for result in ner_results[0][0]:
  for result in ner_results[0][0]:


In [5]:
# Reload the annotation frame that NER.entity_recognition pickled, for inspection.
dataset = load_dataset_from_pickle(file_path=rf'/content/drive/MyDrive/herbology/ner_dataset.pkl')

In [6]:
# Display the annotations: rows are 'entity' and 'word', with one column per source field.
dataset

Unnamed: 0,English,Siddha/Tamil,Habitat,Action,Unani,Dosage,Synonym,Family,Folk,Ayurvedic
entity,"[LABEL_1, LABEL_0, LABEL_0, LABEL_0, LABEL_0, ...","[LABEL_1, LABEL_0, LABEL_0, LABEL_0, LABEL_1, ...","[LABEL_1, LABEL_0, LABEL_0, LABEL_0, LABEL_0, ...","[LABEL_1, LABEL_0, LABEL_0, LABEL_0, LABEL_0, ...","[LABEL_1, LABEL_0, LABEL_0, LABEL_0, LABEL_0, ...","[LABEL_1, LABEL_1]","[LABEL_1, LABEL_0, LABEL_0, LABEL_0, LABEL_0, ...","[LABEL_1, LABEL_0, LABEL_0, LABEL_0, LABEL_0, ...","[LABEL_1, LABEL_0, LABEL_0, LABEL_0, LABEL_0, ...","[LABEL_1, LABEL_0, LABEL_0, LABEL_0, LABEL_0, ..."
word,"[[CLS], gum, #, ,, lady, finger, ,, ok, #, ., ...","[[CLS], ve, #, #, ., [SEP]]","[[CLS], native, to, tropical, africa, ;, culti...","[[CLS], immature, pods, (, deco, #, -, ti, #, ...","[[CLS], ba, #, #, #, ., [SEP]]","[[CLS], [SEP]]","[[CLS], hi, #, #, es, #, #, #, lin, #, ., [SEP]]","[[CLS], mal, #, #, #, ., [SEP]]","[[CLS], b, #, #, ,, ra, #, #, #, ., [SEP]]","[[CLS], b, #, #, #, ,, b, #, #, #, ,, b, #, #,..."
