In [2]:
import pandas as pd

***Synonym replacement***

In [118]:
# Load the training split; the rest of this cell reads its `sentence` and `tags` columns.
df_train=pd.read_csv('final_train.csv')

import random
import nltk
from nltk.corpus import wordnet

# Function to replace entity with synonyms
def replace_entity_with_synonym(entity):
    """Return a replacement token for ``entity``.

    Numeric tokens are swapped for a random integer with the same number of
    digits. Any other token is swapped for a randomly chosen, lower-cased
    WordNet lemma name of the token. Tokens for which WordNet yields no
    lemmas are returned unchanged.

    Args:
        entity (str): A single token (the calling cell produces these with
            ``str.split()``).

    Returns:
        str: The replacement token. WordNet writes multi-word lemmas with
        underscores, so the result is still a single whitespace-free token.
    """
    if entity.isnumeric():
        # `entity * 2` on a string repeats the digits ("12" -> "1212") rather
        # than producing a random number. Draw an integer with the same digit
        # count and no leading zero instead.
        width = len(entity)
        return str(random.randint(10 ** (width - 1), 10 ** width - 1))

    synonyms = {
        lemma.name().lower()
        for syn in wordnet.synsets(entity)
        for lemma in syn.lemmas()
    }
    if not synonyms:
        return entity

    # Sort before sampling: iteration order of a set of strings can change
    # between interpreter runs, so sampling the raw set would not be
    # reproducible even after random.seed().
    return random.choice(sorted(synonyms))

# Function to perform data augmentation using synonym replacement
def augment_dataset_with_synonyms(dataset):
    """Apply ``replace_entity_with_synonym`` to every token of every sentence.

    Args:
        dataset: Iterable of ``(sentence, pairs)`` where ``pairs`` is an
            iterable of ``(token, tag)`` tuples.

    Returns:
        list: One ``(sentence, new_pairs)`` tuple per input item. ``sentence``
        is passed through untouched; ``new_pairs`` holds
        ``(replacement_token, tag)`` in the original order.
    """
    return [
        (
            text,
            [(replace_entity_with_synonym(token), label) for token, label in pairs],
        )
        for text, pairs in dataset
    ]

# Pair each token with its tag, keeping the raw sentence next to the pairs.
dataset = [
    (text, list(zip(text.split(), labels.split())))
    for text, labels in zip(df_train.sentence, df_train.tags)
]

augmented_dataset = augment_dataset_with_synonyms(dataset)

# Rebuild space-separated sentence / tag strings from the augmented pairs.
train_aug_sent = []
train_aug_tag = []
for sentence, entities in augmented_dataset:
    train_aug_sent.append(' '.join(pair[0] for pair in entities).strip())
    train_aug_tag.append(' '.join(pair[1] for pair in entities).strip())

pd1 = pd.DataFrame(
    list(zip(train_aug_sent, train_aug_tag)),
    columns=['sentence', 'tags'],
)
pd1.to_csv('final_train_sr.csv', index=False)

In [93]:
# Load the test split. It is stored under the name `df_train` because the
# code further down in this cell reads `df_train.sentence` / `df_train.tags`.
df_train=pd.read_csv('final_test.csv')

import random
import nltk
from nltk.corpus import wordnet

# Function to replace entity with synonyms
def replace_entity_with_synonym(entity):
    """Return a replacement token for ``entity``.

    Numeric tokens are swapped for a random integer with the same number of
    digits. Any other token is swapped for a randomly chosen, lower-cased
    WordNet lemma name of the token. Tokens for which WordNet yields no
    lemmas are returned unchanged.

    Args:
        entity (str): A single token (the calling cell produces these with
            ``str.split()``).

    Returns:
        str: The replacement token. WordNet writes multi-word lemmas with
        underscores, so the result is still a single whitespace-free token.
    """
    if entity.isnumeric():
        # `entity * 2` on a string repeats the digits ("12" -> "1212") rather
        # than producing a random number. Draw an integer with the same digit
        # count and no leading zero instead.
        width = len(entity)
        return str(random.randint(10 ** (width - 1), 10 ** width - 1))

    synonyms = {
        lemma.name().lower()
        for syn in wordnet.synsets(entity)
        for lemma in syn.lemmas()
    }
    if not synonyms:
        return entity

    # Sort before sampling: iteration order of a set of strings can change
    # between interpreter runs, so sampling the raw set would not be
    # reproducible even after random.seed().
    return random.choice(sorted(synonyms))

# Function to perform data augmentation using synonym replacement
def augment_dataset_with_synonyms(dataset):
    """Apply ``replace_entity_with_synonym`` to every token of every sentence.

    Args:
        dataset: Iterable of ``(sentence, pairs)`` where ``pairs`` is an
            iterable of ``(token, tag)`` tuples.

    Returns:
        list: One ``(sentence, new_pairs)`` tuple per input item. ``sentence``
        is passed through untouched; ``new_pairs`` holds
        ``(replacement_token, tag)`` in the original order.
    """
    return [
        (
            text,
            [(replace_entity_with_synonym(token), label) for token, label in pairs],
        )
        for text, pairs in dataset
    ]

# Pair each token with its tag, keeping the raw sentence next to the pairs.
dataset = [
    (text, list(zip(text.split(), labels.split())))
    for text, labels in zip(df_train.sentence, df_train.tags)
]

augmented_dataset = augment_dataset_with_synonyms(dataset)

# Rebuild space-separated sentence / tag strings from the augmented pairs.
train_aug_sent = []
train_aug_tag = []
for sentence, entities in augmented_dataset:
    train_aug_sent.append(' '.join(pair[0] for pair in entities).strip())
    train_aug_tag.append(' '.join(pair[1] for pair in entities).strip())

pd1 = pd.DataFrame(
    list(zip(train_aug_sent, train_aug_tag)),
    columns=['sentence', 'tags'],
)
pd1.to_csv('final_test_sr.csv', index=False)

***Label Wise***

In [119]:
import random
import nltk
from nltk.corpus import wordnet

# Load the training split as UTF-8; with encoding_errors='ignore', bytes that
# cannot be decoded are skipped instead of raising.
df_train=pd.read_csv('final_train.csv', encoding='utf-8', encoding_errors='ignore')

# Function to replace entity with random tokenwise
def label_wise_token_replacement_augmentation(tag, label_map):
    """Pick a random token among those stored under ``tag``.

    Args:
        tag: Key to look up in ``label_map``.
        label_map: Mapping from tag to a non-empty sequence of tokens.

    Returns:
        One element of ``label_map[tag]``, chosen with ``random.choice``.

    Raises:
        KeyError: If ``tag`` is not a key of ``label_map``.
        IndexError: If the sequence stored under ``tag`` is empty.
    """
    candidates = label_map[tag]
    return random.choice(candidates)


# Function to perform data augmentation using random token replacement
def augment_dataset(dataset):
    """Replace every token with a random token that carries the same tag.

    Replacement tokens are drawn through
    ``label_wise_token_replacement_augmentation`` from the module-level
    ``label_map``, which must be populated before this function is called.

    Args:
        dataset: Iterable of ``(sentence, pairs)`` where ``pairs`` is an
            iterable of ``(token, tag)`` tuples.

    Returns:
        list: One ``(sentence, new_pairs)`` tuple per input item. ``sentence``
        is passed through untouched; ``new_pairs`` holds
        ``(replacement_token, tag)`` in the original order.
    """
    return [
        (
            text,
            [
                (label_wise_token_replacement_augmentation(label, label_map), label)
                for _token, label in pairs
            ],
        )
        for text, pairs in dataset
    ]



# create dataset from dataframe: (sentence, [(token, tag), ...]) per row
dataset = [
    (text, list(zip(text.split(), labels.split())))
    for text, labels in zip(df_train.sentence, df_train.tags)
]

# create empty label map: one token list per known tag
label_map = {
    name: []
    for name in ("UNIT", "O", "STATE", "QUANTITY", "NAME", "SIZE", "TEMP", "DF")
}

# fill label map from dataset (a tag outside the list above raises KeyError)
for sentence, tags_pair in dataset:
    for token, label in tags_pair:
        label_map[label].append(token)

# generate augmented dataset
augmented_dataset = augment_dataset(dataset)

# Rebuild space-separated sentence / tag strings from the augmented pairs.
train_aug_sent = []
train_aug_tag = []
for sentence, entities in augmented_dataset:
    train_aug_sent.append(' '.join(pair[0] for pair in entities).strip())
    train_aug_tag.append(' '.join(pair[1] for pair in entities).strip())

pd1 = pd.DataFrame(
    list(zip(train_aug_sent, train_aug_tag)),
    columns=['sentence', 'tags'],
)
pd1.to_csv('final_train_label_wise.csv', index=False)



In [113]:
import random
import nltk
from nltk.corpus import wordnet

# Load the test split as UTF-8 (undecodable bytes are skipped). It is stored
# under the name `df_train` because the code below in this cell reads that name.
df_train=pd.read_csv('final_test.csv', encoding='utf-8', encoding_errors='ignore')

# Function to replace entity with random tokenwise
def label_wise_token_replacement_augmentation(tag, label_map):
    """Pick a random token among those stored under ``tag``.

    Args:
        tag: Key to look up in ``label_map``.
        label_map: Mapping from tag to a non-empty sequence of tokens.

    Returns:
        One element of ``label_map[tag]``, chosen with ``random.choice``.

    Raises:
        KeyError: If ``tag`` is not a key of ``label_map``.
        IndexError: If the sequence stored under ``tag`` is empty.
    """
    candidates = label_map[tag]
    return random.choice(candidates)


# Function to perform data augmentation using random token replacement
def augment_dataset(dataset):
    """Replace every token with a random token that carries the same tag.

    Replacement tokens are drawn through
    ``label_wise_token_replacement_augmentation`` from the module-level
    ``label_map``, which must be populated before this function is called.

    Args:
        dataset: Iterable of ``(sentence, pairs)`` where ``pairs`` is an
            iterable of ``(token, tag)`` tuples.

    Returns:
        list: One ``(sentence, new_pairs)`` tuple per input item. ``sentence``
        is passed through untouched; ``new_pairs`` holds
        ``(replacement_token, tag)`` in the original order.
    """
    return [
        (
            text,
            [
                (label_wise_token_replacement_augmentation(label, label_map), label)
                for _token, label in pairs
            ],
        )
        for text, pairs in dataset
    ]



# create dataset from dataframe: (sentence, [(token, tag), ...]) per row
dataset = [
    (text, list(zip(text.split(), labels.split())))
    for text, labels in zip(df_train.sentence, df_train.tags)
]

# create empty label map: one token list per known tag
label_map = {
    name: []
    for name in ("UNIT", "O", "STATE", "QUANTITY", "NAME", "SIZE", "TEMP", "DF")
}

# fill label map from dataset (a tag outside the list above raises KeyError)
for sentence, tags_pair in dataset:
    for token, label in tags_pair:
        label_map[label].append(token)

# generate augmented dataset
augmented_dataset = augment_dataset(dataset)

# Rebuild space-separated sentence / tag strings from the augmented pairs.
train_aug_sent = []
train_aug_tag = []
for sentence, entities in augmented_dataset:
    train_aug_sent.append(' '.join(pair[0] for pair in entities).strip())
    train_aug_tag.append(' '.join(pair[1] for pair in entities).strip())

pd1 = pd.DataFrame(
    list(zip(train_aug_sent, train_aug_tag)),
    columns=['sentence', 'tags'],
)
pd1.to_csv('final_test_label_wise.csv', index=False)

***Shuffle with segments***

In [218]:
df_train = pd.read_csv('final_train.csv', encoding='utf-8', encoding_errors='ignore')

# Raw sentence / tag strings, one entry per row.
train_sent = list(df_train['sentence'])
train_tag = list(df_train['tags'])

# Token-level views. These split on a single space character, whereas the
# `dataset` pairs below split on any run of whitespace.
train_sent_list = [text.split(" ") for text in train_sent]
train_tag_list = [labels.split(" ") for labels in train_tag]

# create dataset from dataframe: (sentence, [(token, tag), ...]) per row
dataset = [
    (text, list(zip(text.split(), labels.split())))
    for text, labels in zip(df_train.sentence, df_train.tags)
]

def shuffle_with_segments(entities):
    """Group tokens by tag and shuffle each group.

    Args:
        entities: Iterable of ``(token, tag)`` pairs.

    Returns:
        dict: Maps each tag to the list of tokens that carried it, in random
        order. Keys appear in order of first occurrence in ``entities``.
    """
    grouped = {}
    for token, tag in entities:
        grouped.setdefault(tag, []).append(token)
    for bucket in grouped.values():
        random.shuffle(bucket)
    return grouped

# For every sentence: tag -> shuffled list of the tokens carrying that tag.
dataset2 = [shuffle_with_segments(pairs) for sentence, pairs in dataset]

# Rebuild each sentence position by position, taking the token for a position
# from the shuffled pool of that position's tag.
#
# Fixes relative to the earlier version of this cell:
#   * The CSV was written from `train_aug_sent` / `train_aug_tag`, which this
#     cell never defines (they are leftovers from another cell), so the
#     shuffled sentences were never saved. Sentences and tags are now both
#     built here.
#   * When one token was left in a pool, the original token at that position
#     was emitted instead of the remaining one, which duplicated one token and
#     dropped another whenever a tag occurred more than once. Every position
#     now consumes exactly one token from its pool.
#   * Rows were skipped on KeyError without skipping their tags. Tokens and
#     tags now both come from the same `dataset` pairs, so each pool holds
#     exactly as many tokens as there are positions with that tag.
aug_train_set = []       # augmented token lists, one per sentence
aug_train_tag_list = []  # matching tag lists
for (sentence, pairs), pools in zip(dataset, dataset2):
    tags = [tag for token, tag in pairs]
    # Consume copies so dataset2 keeps the complete shuffled pools.
    remaining = {tag: list(tokens) for tag, tokens in pools.items()}
    aug_train_set.append([remaining[tag].pop() for tag in tags])
    aug_train_tag_list.append(tags)

aug_train_sent = [' '.join(str(elem) for elem in tokens) for tokens in aug_train_set]
aug_train_tag = [' '.join(tags) for tags in aug_train_tag_list]

pd1 = pd.DataFrame(
    list(zip(aug_train_sent, aug_train_tag)),
    columns=['sentence', 'tags'],
)
pd1.to_csv('final_train_sis.csv', index=False)

In [219]:
# Read the original training set and the three augmented variants, in the
# order they are stacked below.
final_train, final_train_label_wise, final_train_sr, final_train_sis = (
    pd.read_csv(path, encoding='utf-8', encoding_errors='ignore')
    for path in (
        'final_train.csv',
        'final_train_label_wise.csv',
        'final_train_sr.csv',
        'final_train_sis.csv',
    )
)

# Stack the four frames row-wise and save them as one training file.
frames = [final_train, final_train_label_wise, final_train_sr, final_train_sis]
final_pd = pd.concat(frames)
final_pd.to_csv('final_train_augmented.csv', index=False)