In [3]:
import pandas as pd
import torchtext.vocab as vocab
from scipy.spatial.distance import cosine
from tqdm import tqdm
from multiprocessing import Pool
import torch
from transformers import BertTokenizer, BertForMaskedLM
import numpy as np
from tqdm import tqdm

# Pretrained BERT tokenizer and masked-language model ("bert-base-uncased").
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertForMaskedLM.from_pretrained("bert-base-uncased")

# Run the model on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_model.to(device)

# 300-dimensional GloVe "6B" word vectors.
glove = vocab.GloVe(name='6B', dim=300)

mask_token_id = tokenizer.mask_token_id
substitution_percentage = 0.1

# Same vectors under the name used by augment_data. Aliasing the object
# loaded above avoids building a second copy of the embedding table.
glove_vectors = glove

def get_most_similar_glove_embedding(glove, word):
    """Return the vocabulary word whose vector is closest to ``word``'s.

    Closeness is cosine similarity between embedding rows. The query word
    itself is excluded, otherwise it would always win with similarity 1.

    Args:
        glove: Object exposing ``stoi`` (word -> row index), ``itos``
            (row index -> word) and ``vectors`` (2-D, one row per word).
            ``vectors`` must be convertible with ``np.asarray`` (a NumPy
            array, or presumably a CPU tensor -- TODO confirm for other
            backends).
        word: The word to find a neighbour for.

    Returns:
        The nearest other word, or ``None`` when ``word`` is not in the
        vocabulary, has an all-zero vector, or has no usable neighbour.
    """
    # Keep the try body to the single lookup that signals "unknown word".
    try:
        word_index = glove.stoi[word]
    except KeyError:
        return None

    vectors = np.asarray(glove.vectors)
    word_vector = vectors[word_index]

    norms = np.linalg.norm(vectors, axis=1)
    word_norm = norms[word_index]
    if word_norm == 0:
        # Cosine similarity is undefined for a zero vector.
        return None

    # scipy's ``cosine`` only accepts a pair of 1-D vectors, so the
    # similarity against every row is computed directly instead.
    dots = vectors @ word_vector
    with np.errstate(divide="ignore", invalid="ignore"):
        similarities = dots / (norms * word_norm)

    # Zero-norm rows produce nan/inf; make sure they can never be selected.
    similarities[~np.isfinite(similarities)] = -np.inf
    # Exclude the query word itself.
    similarities[word_index] = -np.inf

    most_similar_index = int(np.argmax(similarities))
    if not np.isfinite(similarities[most_similar_index]):
        return None
    return glove.itos[most_similar_index]

# Function to substitute subwords with GloVe embeddings
def substitute_subwords_with_glove(sentence, tokens, positions, glove, tokenizer):
    """Replace selected tokens in place with their nearest GloVe neighbour.

    A token at one of ``positions`` is replaced only when the tokenizer
    splits it into more than one piece and a neighbour is found for it.

    Args:
        sentence: Unused by this function.
        tokens: Mutable list of tokens; modified in place.
        positions: Indices into ``tokens`` to consider.
        glove: Embedding object passed to get_most_similar_glove_embedding.
        tokenizer: Object with a ``tokenize(str)`` method returning pieces.

    Returns:
        None.

    NOTE(review): augment_data passes tokens that are already tokenizer
    output, so re-tokenizing one may rarely give more than one piece --
    confirm this condition fires as intended.
    """
    for idx in positions:
        current = tokens[idx]
        pieces = tokenizer.tokenize(current)

        # A token that stays in one piece is left untouched.
        if len(pieces) <= 1:
            continue

        replacement = get_most_similar_glove_embedding(glove, current)
        if not replacement:
            continue
        tokens[idx] = replacement

# Function to augment data
def augment_data(sentence, percentage=0.1):
    """Return ``sentence`` with a random share of its tokens offered for substitution.

    The sentence is tokenized with the module-level ``tokenizer``;
    ``int(len(tokens) * percentage)`` distinct positions are drawn at random
    and handed to substitute_subwords_with_glove together with the
    module-level ``glove_vectors``. The tokens are then joined back into a
    string.

    Args:
        sentence: Text to augment.
        percentage: Fraction of token positions to sample.

    Returns:
        The string rebuilt from the (possibly modified) tokens.
    """
    tokens = tokenizer.tokenize(sentence)
    token_count = len(tokens)
    replace_count = int(token_count * percentage)

    # Sampling without replacement means no position is drawn twice.
    chosen_positions = np.random.choice(
        token_count, size=replace_count, replace=False
    )

    substitute_subwords_with_glove(
        sentence, tokens, chosen_positions, glove_vectors, tokenizer
    )

    return tokenizer.convert_tokens_to_string(tokens)

# Read the training set: a tab-separated file read without a header row,
# with its two columns named 'label' and 'text'.
data = pd.read_csv(
    'train_150k.txt',
    sep='\t',
    header=None,
    names=['label', 'text'],
)

# Turn every DataFrame row into a plain tuple.
data_tuples = list(map(tuple, data.to_records(index=False)))

def augment_row(row):
    """Augment the text of one ``(label, text)`` row and keep its label.

    Args:
        row: A two-item sequence: the label followed by the text.

    Returns:
        A ``(label, augmented_text)`` tuple, where the text has been passed
        through augment_data.
    """
    label, text = row
    return (label, augment_data(text))

# Create a pool of workers
# The __main__ guard stops worker processes that re-import this module (the
# "spawn" start method) from trying to create worker pools of their own.
if __name__ == "__main__":
    with Pool() as p:
        # Apply the data augmentation to each row in parallel; imap yields
        # results in input order, so rows stay aligned with their labels.
        augmented_data = list(tqdm(p.imap(augment_row, data_tuples), total=len(data)))

    # Convert the augmented data to a DataFrame
    augmented_data = pd.DataFrame(augmented_data, columns=['label', 'text'])

    # Save the augmented data back to a text file
    augmented_data.to_csv('augmented_data.txt', sep='\t', header=False, index=False)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
.vector_cache/glove.6B.zip: 862MB [02:52, 5.00MB/s]                               
100%|█████████▉| 399999/400000 [00:22<00:00, 17777.19it/s]
100%|██████████| 149985/149985 [00:05<00:00, 25922.18it/s]
