In [19]:
import json 
from itertools import chain
import pandas as pd
from transformers import pipeline, BertTokenizer, BertForMaskedLM
import logging


In [20]:
from transformers import pipeline, BertTokenizer, BertForMaskedLM
import logging

In [21]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet 
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /home/shruthi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/shruthi/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [22]:
from nltk.tokenize import MWETokenizer, WordPunctTokenizer, TreebankWordTokenizer,word_tokenize
from transformers import BertTokenizer, AlbertTokenizer, GPT2Tokenizer
from transformers import WordpieceTokenizer,AutoTokenizer

In [23]:
class Lowercaser:
    """Lower-cases every string cell of a DataFrame, mutating the frame in place.

    Keyword Args:
        dataset (pd.DataFrame): The frame whose columns are rewritten.
    """
    dataset:pd.core.frame.DataFrame

    def __init__(self, **kwargs):
        self.dataset = kwargs['dataset']

    @staticmethod
    def _lower_if_text(value):
        """Return ``value`` lower-cased when it is a ``str``; otherwise return it untouched."""
        if isinstance(value, str):
            return value.lower()
        return value

    def convert_to_lowercase(self):
        """Replace each column of ``self.dataset`` with a copy whose string cells are lower-cased."""
        for column_name in self.dataset.columns:
            current_values = self.dataset[column_name]
            self.dataset[column_name] = current_values.apply(self._lower_if_text)

    def get_dataset(self)->pd.core.frame.DataFrame:
        """Return the frame held by this instance (the same object that was passed in)."""
        return self.dataset

In [24]:
class Lemmatiser:
    """Lemmatises the text cells of a DataFrame with NLTK's ``WordNetLemmatizer``.

    The frame is modified in place. Each string cell is split on whitespace,
    every resulting word is lemmatised on its own (the lemmatiser works on a
    single word, not on a sentence) and the words are re-joined with single
    spaces. Cells that are not strings (numbers, NaN, None) are left untouched.

    Keyword Args:
        dataset (pd.DataFrame): The frame to lemmatise.
    """
    dataset:pd.core.frame.DataFrame
    lemmatizer=None

    def __init__(self,**kwargs):
        self.dataset = kwargs['dataset']
        self.lemmatizer = WordNetLemmatizer()

    def __lemmatise_text(self, value):
        """Lemmatise one cell word by word; non-string values are returned unchanged."""
        if not isinstance(value, str):
            # Missing values and numeric cells cannot be lemmatised; keep them as they are.
            return value
        return " ".join(self.lemmatizer.lemmatize(word) for word in value.split())

    def __lemmatise(self,**kwargs)->None:
        """Lemmatise a single column in place and print the resulting column.

        Keyword Args:
            column (str): Name of the column to process.
        """
        field:str = kwargs['column']
        # Column-wise apply: also behaves on an empty frame, where a row-wise
        # DataFrame.apply would hand back a DataFrame instead of a Series.
        self.dataset[field] = self.dataset[field].apply(self.__lemmatise_text)
        print(self.dataset[field])

    def lemmatise(self)->None:
        """Lemmatise every column of the dataset in place."""
        for fields in self.dataset.columns:
            self.__lemmatise(column=fields)

    def get_dataset(self)->pd.core.frame.DataFrame:
        """Return the frame held by this instance."""
        return self.dataset

In [25]:
class Tokeniser:
    """Tokenises a text column of a DataFrame in place.

    With ``token_choice='BertTokeniser'`` each string cell is replaced by the
    list of BERT input ids for that text, padded/truncated to ``max_length``.
    Any other choice falls back to NLTK: the text is split with
    ``word_tokenize`` and passed through an ``MWETokenizer``, giving a list of
    string tokens.

    Keyword Args:
        dataset (pd.DataFrame): The frame to tokenise.
        token_choice (str): ``'BertTokeniser'`` or anything else for the NLTK fallback.
        max_length (int, optional): Padded sequence length for the BERT path. Defaults to 20.
    """
    dataset:pd.core.frame.DataFrame=None
    tokeniser_choice:str = ""
    tokeniser=None
    word_tokeniser=None
    tokenised_dataset:pd.core.frame.DataFrame=None
    max_length:int = 20

    def __init__(self,**kwargs):
        self.dataset = kwargs['dataset']
        self.tokeniser_choice = kwargs['token_choice']
        self.max_length = kwargs.get('max_length', 20)
        self.tokenised_dataset = pd.DataFrame(columns=self.dataset.columns)

    def __select_tokeniser(self)->None:
        """Create the tokeniser for ``tokeniser_choice`` once and reuse it afterwards."""
        if self.tokeniser is not None:
            # Already loaded: loading a pretrained tokeniser for every row is wasteful.
            return
        if self.tokeniser_choice == 'BertTokeniser':
            # NOTE(review): `tokenization_strategy` is kept from the original call;
            # confirm that BertTokenizer actually honours it.
            self.tokeniser = BertTokenizer.from_pretrained('bert-base-uncased',tokenization_strategy='word',model_max_length=512)
        else:
            # Keep a reference to the function; calling it without text raises TypeError.
            self.word_tokeniser = word_tokenize
            self.tokeniser = MWETokenizer()

    def __tokenise(self,**kwargs)->list:
        """Tokenise one cell.

        Keyword Args:
            data_sentence: The cell value. Non-string values are returned unchanged.

        Returns:
            A list of input ids (BERT path) or a list of string tokens (NLTK
            fallback); the original value when it is not a string.
        """
        sentence = kwargs['data_sentence']
        if not isinstance(sentence, str):
            # Missing or non-text cells cannot be tokenised; pass them through.
            return sentence
        self.__select_tokeniser()
        if self.tokeniser_choice != 'BertTokeniser':
            # MWETokenizer expects an already split token list and has no encode_plus.
            return self.tokeniser.tokenize(self.word_tokeniser(sentence))
        encoded = self.tokeniser.encode_plus(sentence,
                                             add_special_tokens=True,
                                             max_length=self.max_length,
                                             padding='max_length',
                                             truncation=True,
                                             return_tensors='pt')
        # Keep the whole padded id sequence, not just its first element.
        return encoded['input_ids'].flatten().tolist()

    def tokenise(self, column:str='Action')->None:
        """Tokenise ``column`` in place and print the resulting column.

        Args:
            column: Name of the column to tokenise. Defaults to ``'Action'``.
        """
        self.dataset[column] = self.dataset[column].apply(lambda cell: self.__tokenise(data_sentence=cell))
        print(self.dataset[column])

    def get_dataset(self)->pd.core.frame.DataFrame:
        """Return the frame held by this instance."""
        return self.dataset

    def save_dataset(self)->pd.core.frame.DataFrame:
        """Placeholder: not implemented, currently returns None."""
        pass

In [26]:
'''
    Global Model Configurations for Interpolation
'''
# Raise the threshold of the 'transformers.modeling_utils' logger to ERROR.
logging.getLogger("transformers.modeling_utils").setLevel(logging.ERROR)

# Masked-language model and tokeniser, both loaded from 'bert-base-uncased'.
bert_model = BertForMaskedLM.from_pretrained('bert-base-uncased')
bert_tokeniser = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    tokenization_strategy='word',
)

# Module-level fill-mask pipeline built from the model and tokeniser above.
fill_mask = pipeline(
    'fill-mask',
    model=bert_model,
    tokenizer=bert_tokeniser,
)


In [27]:
def load_json_data(**kwargs)->dict:
    """Read a UTF-8 encoded JSON file and return its parsed content.

    Keyword Args:
        file_path (str): Location of the JSON file to read.

    Returns:
        The object produced by ``json.load`` for that file.
    """
    source_path:str = kwargs['file_path']
    with open(source_path, mode='r', encoding='utf-8') as handle:
        return json.load(handle)

In [28]:
def flatten_json_obj(**kwargs)->pd.core.frame.DataFrame:
    """Turn the nested records under ``data['root'][0]`` into a DataFrame.

    Each key of ``data['root'][0]`` becomes a row label and the keys of its
    inner dictionary become columns. Columns appear in the order in which they
    are first encountered, so the layout is identical on every run; a record
    that lacks a column gets a missing value there.

    Keyword Args:
        json_unflattened (dict): Parsed JSON whose ``'root'`` entry is a list
            with the mapping ``{row_name: {column: value}}`` as first element.

    Returns:
        One row per record, one column per distinct inner key.
    """
    data = kwargs['json_unflattened']
    sub_data:dict[str,dict[str,str]] = data['root'][0]
    # A dict keeps insertion order; a set would make the column order depend
    # on string hashing, which changes from one interpreter run to the next.
    column_list:list[str] = list(dict.fromkeys(
        key for sub_dict in sub_data.values() for key in sub_dict
    ))
    print(column_list)
    dataset = pd.DataFrame.from_dict(data=sub_data,orient='index',columns=column_list)
    return dataset

In [29]:
def lowercasing(**kwargs)->pd.core.frame.DataFrame:
    """Lower-case the string cells of a frame by delegating to ``Lowercaser``.

    Keyword Args:
        dataset (pd.DataFrame): The frame to process.

    Returns:
        The frame handed back by ``Lowercaser.get_dataset``.
    """
    lowercaser = Lowercaser(dataset=kwargs['dataset'])
    lowercaser.convert_to_lowercase()
    lowered_frame = lowercaser.get_dataset()
    return lowered_frame

In [30]:
def interpolation_with_gpt(**kwargs)->str:
    """Fill a missing text value with a GPT-2 continuation of its context.

    When ``text`` is not missing it is returned as is and no model is touched.
    Otherwise the prompt ``context_before + "[MISSING]" + context_after`` is
    sent to a text-generation pipeline and the generated text, with the prompt
    removed and surrounding whitespace stripped, is returned.

    Keyword Args:
        text: The current cell value; missing is decided by ``pd.isna``.
        context_before: Text placed before the gap. Non-string values
            (e.g. NaN) are treated as an empty string.
        context_after: Text placed after the gap, same treatment.
        text_generator (optional): Callable used instead of the default GPT-2
            pipeline; called as ``text_generator(prompt, max_length=50)``.

    Returns:
        ``text`` when present, otherwise the generated replacement.
    """
    text:str = kwargs['text']
    if not pd.isna(text):
        # Nothing to fill in: skip building the prompt and loading the model.
        return text

    text_generator = kwargs.get('text_generator')
    if text_generator is None:
        # Build the GPT-2 pipeline once and keep it on the function object,
        # instead of reloading the model for every single cell.
        text_generator = getattr(interpolation_with_gpt, '_cached_generator', None)
        if text_generator is None:
            text_generator = pipeline('text-generation',model='gpt2')
            interpolation_with_gpt._cached_generator = text_generator

    context_before = kwargs['context_before']
    context_after = kwargs['context_after']
    # A missing neighbour cell arrives as NaN (a float) and cannot be concatenated.
    if not isinstance(context_before, str):
        context_before = ""
    if not isinstance(context_after, str):
        context_after = ""
    prompt:str = context_before + "[MISSING]" + context_after

    generated_text = text_generator(prompt, max_length=50)[0]['generated_text']
    return generated_text.replace(prompt,"").strip()

In [31]:
def interpolation_with_bert(**kwargs)->str:
    """Fill a missing text value with BERT's top prediction for a masked slot.

    When ``text`` is not missing it is returned unchanged. Otherwise the
    sentence ``context_before + "[MASK]" + context_after`` is passed to a
    fill-mask pipeline and the highest-ranked predicted token is returned.
    Only the predicted token is returned, not the whole completed sentence,
    so the filled cell does not repeat the neighbouring column's text.

    Keyword Args:
        text: The current cell value; missing is decided by ``pd.isna``.
        context_before: Text placed before the mask. Non-string values
            (e.g. NaN) are treated as an empty string.
        context_after: Text placed after the mask, same treatment.
        mask_filler (optional): Callable used instead of the module-level
            ``fill_mask`` pipeline; called as ``mask_filler(sentence)``.

    Returns:
        ``text`` when present, otherwise the predicted replacement token.
    """
    text:str = kwargs['text']
    if not pd.isna(text):
        return text

    context_before = kwargs['context_before']
    context_after = kwargs['context_after']
    # A missing neighbour cell arrives as NaN (a float) and cannot be concatenated.
    if not isinstance(context_before, str):
        context_before = ""
    if not isinstance(context_after, str):
        context_after = ""
    sentence = context_before + "[MASK]" + context_after

    # Look up the global pipeline only when no replacement was injected.
    filler = kwargs['mask_filler'] if 'mask_filler' in kwargs else fill_mask
    predictions = filler(sentence)
    # Predictions are ordered by score; 'token_str' is the word chosen for the
    # mask, whereas 'sequence' would also contain the surrounding context.
    return predictions[0]['token_str'].strip()

In [32]:
def interpolate_missing_values(**kwargs)->pd.core.frame.DataFrame:
    """Run ``interpolation_with_bert`` over one column, row by row, in place.

    Keyword Args:
        dataset (pd.DataFrame): The frame to update.
        col: Name of the column whose cells are passed as ``text``.
        col_before: Name of the column whose cell in the same row is passed
            as ``context_before``; ``context_after`` is always the empty string.

    Returns:
        The same frame, with ``col`` overwritten by the results.
    """
    frame = kwargs['dataset']
    target_col = kwargs['col']
    context_col = kwargs['col_before']

    def fill_cell(row):
        # One call per row: the target cell plus its neighbour as left context.
        return interpolation_with_bert(
            text=row[target_col],
            context_before=row[context_col],
            context_after="",
        )

    frame.loc[:,target_col] = frame.apply(fill_cell, axis=1)
    return frame

In [33]:
def lemmatisation(**kwargs)->None:
    """Build a ``Tokeniser`` with the ``'BertTokeniser'`` choice and run it on the dataset.

    NOTE(review): despite its name this function only tokenises; it does not
    use the ``Lemmatiser`` class. Confirm whether a lemmatisation step is missing.

    Keyword Args:
        dataset (pd.DataFrame): The frame handed to the tokeniser.
    """
    bert_backed = Tokeniser(dataset=kwargs['dataset'], token_choice='BertTokeniser')
    bert_backed.tokenise()

In [34]:
def preprocess(**kwargs)->None:
    """Run the preprocessing steps on a dataset: lowercase, interpolate, tokenise.

    Steps, in order:
      1. lower-case the frame via ``lowercasing``;
      2. for every column except the last two, call
         ``interpolate_missing_values`` with the preceding column as context;
      3. print the frame's missing-value mask;
      4. hand the frame to ``lemmatisation``.

    Keyword Args:
        dataset (pd.DataFrame): The frame to preprocess.
    """
    frame:pd.core.frame.DataFrame = kwargs['dataset']
    column_names:list[str] = list(frame.columns)

    # Step 1: lowercasing.
    frame = lowercasing(dataset=frame)

    # Step 2: interpolate missing values using the masking ability of a pre-trained BERT model.
    # NOTE(review): for the first column, `position - 1` is -1, so the LAST
    # column is used as context. Confirm that this wrap-around is intended.
    for position, column_name in enumerate(column_names[:-2]):
        previous_name = column_names[position - 1]
        frame = interpolate_missing_values(
            dataset=frame,
            col=column_name,
            col_before=previous_name,
        )
    print(frame.isna())

    # Step 3: hand the frame over to the tokenising stage.
    lemmatisation(dataset=frame)

In [35]:
def main():
    """Load the JSON data file, flatten it into a DataFrame and preprocess it."""
    raw_json = load_json_data(file_path=rf'../data/json_data1.json')
    flattened = flatten_json_obj(json_unflattened=raw_json)
    preprocess(dataset=flattened)