In [2]:
import pandas as pd
import numpy as np

In [3]:
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as naf

from nlpaug.util import Action

In [4]:
# Read the posts table (the first CSV column becomes the index) and keep only
# the two columns used below, then peek at five random rows.
df = pd.read_csv('../posts_info.csv', index_col=0).loc[:, ['text', 'topic']]
df.sample(5)

Unnamed: 0,text,topic
437,Safety alert as GM recalls cars\n\nThe worlds ...,business
5335,It seems whenever a mainstream film company wa...,movie
2522,Gangsters dominate gaming chart\n\nVideo games...,tech
4478,If you like silly comedies like Airplane youll...,movie
5909,This serial is interesting to watch as an MST3...,movie


In [5]:
import textwrap

MAX_CHARS = 512         # character budget handed to textwrap.shorten
PLACEHOLDER = ' [...]'  # suffix textwrap.shorten appends when it has to cut the text


def _shorten_to_sentences(text, width=MAX_CHARS):
    """Collapse whitespace in `text` and trim it to at most `width` characters.

    `textwrap.shorten(..., fix_sentence_endings=True)` separates detected
    sentence endings with two spaces, so '.  ' is used as the sentence
    delimiter and the pieces are re-joined with a single space.

    * Text that fits into `width` is returned whole (previously its last
      sentence was dropped and an extra '.' was appended).
    * Text that had to be cut loses its trailing, incomplete fragment and is
      closed with a '.'.  If there is no sentence boundary at all, the cut
      text (placeholder included) is kept and closed with a '.', as before.

    NOTE: truncation is detected by the placeholder suffix, so a short text
    that literally ends with ' [...]' is treated as truncated.
    """
    shortened = textwrap.shorten(
        text, width=width, fix_sentence_endings=True, placeholder=PLACEHOLDER
    )
    sentences = shortened.split('.  ')

    if not shortened.endswith(PLACEHOLDER):
        # Nothing was cut: only normalise the double space after periods.
        return '. '.join(sentences)

    if len(sentences) > 1:
        sentences.pop()  # the tail was cut mid-sentence
    return '. '.join(sentences) + '.'


df_ = df[['text', 'topic']].copy()

# Assign a whole new column instead of writing through `df_.text.values[i]`:
# the array returned by `.values` is not guaranteed to be a writable view of
# the frame (it is read-only under pandas Copy-on-Write).
df_['text'] = [_shorten_to_sentences(txt) for txt in df_['text']]

df_.loc[0, 'text']

'UK economy facing major risks The UK manufacturing sector will continue to face serious challenges over the next two years, the British Chamber of Commerce (BCC) has said. The groups quarterly survey of companies found exports had picked up in the last three months of 2004 to their best levels in eight years. The rise came despite exchange rates being cited as a major concern. However, the BCC found the whole UK economy still faced major risks and warned that growth is set to slow.'

In [6]:
# Settings shared by both BERT-based augmentation steps.
shared_aug_kwargs = dict(
    model_path='bert-base-multilingual-uncased',
    aug_p=.4,
    aug_max=20,
    device='cuda',
)

# Pipeline: first substitute words in context, then insert new ones.
aug = naf.Sequential([
    naw.ContextualWordEmbsAug(action=action, **shared_aug_kwargs)
    for action in ("substitute", "insert")
])

# Show the original text followed by five augmented variants of it.
first_text = df_.text[0]
print(first_text)
aug.augment(first_text, n=5)

UK economy facing major risks The UK manufacturing sector will continue to face serious challenges over the next two years, the British Chamber of Commerce (BCC) has said. The groups quarterly survey of companies found exports had picked up in the last three months of 2004 to their best levels in eight years. The rise came despite exchange rates being cited as a major concern. However, the BCC found the whole UK economy still faced major risks and warned that growth is set to slow.


['uk economy facing international risks as uk coal manufacturing sector sector will find its best face facing challenges into the next two years, the 2007 british conservative chamber of commerce ( bcc ) study has said. by the groups following quarterly survey industry companies found exports they had largely picked up in the relatively last three months of 2004 prior to their high positions in eight years. the economic rise followed from where reduced rates was cited like a central public concern. in however, industry groups it found during the 2006 economy uk economy still faced major global risks and warned that food is set to fare.',
 'uk is economy facing major risks because the uk small manufacturing sector sector is now continue being a face global financial challenges again over the next two final quarters, the small british government of commerce ( fas ) uk has then argued. the groups s quarterly survey identified companies changed consumers not already picked up in the last t

In [287]:
# Draw the five posts that get augmented below.  The draw is seeded so that a
# Restart & Run All augments the same posts every time.
sample = df_.sample(5, random_state=42)
sample

Unnamed: 0,text,topic
927,Three DJs replace Peel radio show The late Joh...,entertainment
6941,I bought the DVD of this movie because I am a ...,movie
3681,"With all this death around, I’ve found myself ...",covid
5771,The king is dead long live the King! The tria...,movie
1419,Blair backs pre-election budget Tony Blair has...,politics


In [291]:
N_AUGMENTED_PER_TEXT = 10  # augmented variants generated for every sampled post

# Build one small frame per source post and concatenate once at the end.
# Growing `newdf` with pd.concat inside the loop copies all previous rows on
# every iteration, and concatenating onto an empty frame is deprecated.
frames = []
for txt, topic in zip(sample['text'], sample['topic']):
    generated = pd.DataFrame({'text': aug.augment(txt, n=N_AUGMENTED_PER_TEXT)})
    generated['topic'] = topic  # every variant inherits the topic of its source post
    frames.append(generated)

# Each per-post frame keeps its own 0..k-1 index, exactly as before, so index
# labels repeat across posts; call .reset_index(drop=True) if a unique index
# is needed.  The fallback keeps an empty `sample` from raising in pd.concat.
newdf = pd.concat(frames) if frames else pd.DataFrame(columns=['text', 'topic'])

In [293]:
# Sanity check: 5 sampled posts x 10 augmentations each -> 50 rows, 2 columns.
newdf.shape

(50, 2)

In [5]:
# Load two CSV files from the working directory (presumably augmented posts
# written by an earlier run -- the code that writes them is not shown here).
# NOTE: this rebinds `df`, which earlier held the posts read from
# ../posts_info.csv; every later cell that uses `df` sees newdf.csv instead.
df = pd.read_csv('newdf.csv')
# NOTE(review): `df_full` is not referenced again in the visible cells -- confirm it is still needed.
df_full = pd.read_csv('newdf_full.csv')

In [6]:
# let's map text to embedding
from transformers import AutoTokenizer
# https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertModel
from transformers import BertModel


def get_model(model_name):
    """Load the tokenizer and encoder registered under a short model name.

    Parameters
    ----------
    model_name : str
        One of the keys of `checkpoint_names` below
        (currently 'bert_cased' or 'bert_uncased').

    Returns
    -------
    tuple
        `(tokenizer, model)`, both created with `from_pretrained` from the
        checkpoint that belongs to `model_name`.

    Raises
    ------
    AssertionError
        If `model_name` has no registered checkpoint.  The accepted names are
        derived from the registry itself; the previous hand-written list also
        let 'roberta' and 'distilbert' through, which then failed with a bare
        KeyError because no checkpoint was configured for them.
    """
    # To support another model, add it to BOTH dictionaries below.
    checkpoint_names = {
        'bert_cased': 'bert-base-cased',  # https://huggingface.co/bert-base-cased
        'bert_uncased': 'bert-base-uncased',  # https://huggingface.co/bert-base-uncased
    }

    # Validate before anything is downloaded or instantiated.
    assert model_name in checkpoint_names, (
        f"unsupported model_name {model_name!r}; "
        f"expected one of {sorted(checkpoint_names)}"
    )

    model_classes = {
        'bert_cased': BertModel,
        'bert_uncased': BertModel,
    }

    checkpoint = checkpoint_names[model_name]
    return (
        AutoTokenizer.from_pretrained(checkpoint),
        model_classes[model_name].from_pretrained(checkpoint)
    )


In [7]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding


class PostDataset(Dataset):
    """Dataset over a list of texts that are tokenized once, at construction.

    The tokenizer is asked for PyTorch tensors, with padding and with
    truncation to at most 512 tokens; token type ids are not requested.
    Each item is a dict holding the `input_ids` and `attention_mask` of one text.
    """

    def __init__(self, texts, tokenizer):
        super().__init__()

        encode_options = dict(
            add_special_tokens=True,
            return_token_type_ids=False,
            max_length=512,
            return_tensors='pt',
            truncation=True,
            padding=True,
        )
        # Encoded batch for the whole corpus; indexed per item in __getitem__.
        self.texts = tokenizer.batch_encode_plus(texts, **encode_options)
        self.tokenizer = tokenizer

    def __getitem__(self, idx):
        encoded = self.texts
        return {
            field: encoded[field][idx]
            for field in ('input_ids', 'attention_mask')
        }

    def __len__(self):
        # One row of input ids per text.
        return len(self.texts['input_ids'])

def reload(values, tokenizer, batch_size=32):
    """Tokenize `values` and build the dataset, collator and loader for them.

    Parameters
    ----------
    values : array-like of str
        Texts to encode.  Objects with a `.tolist()` method (NumPy arrays,
        pandas Series) are converted with it; any other iterable, such as a
        plain list, is accepted as well (previously this raised
        AttributeError).
    tokenizer
        Tokenizer passed to `PostDataset` and `DataCollatorWithPadding`.
    batch_size : int, default 32
        Number of texts per batch produced by the loader.

    Returns
    -------
    tuple
        `(dataset, data_collator, loader)`.  The loader does not shuffle, so
        batches follow the order of `values`.
    """
    texts = values.tolist() if hasattr(values, 'tolist') else list(values)
    dataset = PostDataset(texts, tokenizer)

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    loader = DataLoader(
        dataset,
        batch_size=batch_size,
        collate_fn=data_collator,
        pin_memory=True,
        shuffle=False,  # keep row order aligned with the input texts
    )

    return (
        dataset, data_collator, loader
    )


In [8]:
import torch
from tqdm import tqdm


@torch.inference_mode()
def get_embeddings_labels(model, loader, device=None, max_tokens=25):
    """Encode every batch of `loader` and return the leading token embeddings.

    Despite its name, the function returns embeddings only -- no labels.

    Parameters
    ----------
    model
        Module that is called as `model(input_ids=..., attention_mask=...)`
        and whose output exposes `'last_hidden_state'`.  It is switched to
        eval mode and moved to `device`.
    loader
        Iterable of batches; each batch must provide `'input_ids'` and
        `'attention_mask'` tensors.  Other keys are ignored.
    device : torch.device or str, optional
        Where to run the model.  Defaults to 'cuda:0' when CUDA is available
        and to 'cpu' otherwise, so the function no longer depends on a global
        `device` variable having been defined by another cell.
    max_tokens : int, default 25
        Number of leading token positions to keep from each sequence.

    Returns
    -------
    torch.Tensor
        CPU tensor with all batches concatenated along the first dimension;
        the second dimension holds at most `max_tokens` positions.
    """
    if device is None:
        device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    model.eval()
    model.to(device)

    total_embeddings = []

    for batch in tqdm(loader):
        inputs = {key: batch[key].to(device)
                  for key in ('attention_mask', 'input_ids')}

        hidden_states = model(**inputs)['last_hidden_state']

        # Slice before copying to the CPU so only the kept positions are transferred.
        # NOTE(review): positions are taken regardless of the attention mask,
        # so padding positions are included for sequences shorter than max_tokens.
        total_embeddings.append(hidden_states[:, :max_tokens, :].cpu())

    return torch.cat(total_embeddings, dim=0)


In [10]:
# Load the tokenizer/encoder pair registered as 'bert_uncased', then tokenize
# the `text` column of `df` and wrap it in a dataset, a padding collator and
# a DataLoader.
tokenizer, model = get_model('bert_uncased')

dataset, data_collator, loader = reload(df['text'].values, tokenizer)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
# Use the first GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

print(device)
# Only ask for the GPU name when a GPU was selected: on a CPU-only machine
# torch.cuda.get_device_name() raises, which used to defeat the 'cpu' fallback.
if device.type == 'cuda':
    print(torch.cuda.get_device_name())

model = model.to(device)

cuda:0
NVIDIA GeForce GTX 1060 with Max-Q Design


In [12]:
# Run the encoder over the whole loader.  The result is a CPU tensor holding
# the hidden states of the first 25 token positions of every text.
embeddings = get_embeddings_labels(model, loader)

# Layout: (number of texts, 25, hidden size).
embeddings.shape

100%|██████████| 2195/2195 [43:24<00:00,  1.19s/it]  


torch.Size([70230, 25, 768])

In [15]:
def pool_by_tokens(tensor:torch.Tensor, method:str):
    """Reduce `tensor` along dimension 1 and return the result as a NumPy array.

    Parameters
    ----------
    tensor : torch.Tensor
        Tensor to pool; dimension 1 is the one that is reduced.
    method : str
        'max_pooling' takes the element-wise maximum over dimension 1,
        'mean_pooling' the arithmetic mean.

    Raises
    ------
    AssertionError
        If `method` is neither 'max_pooling' nor 'mean_pooling'.
    """
    assert method in ['max_pooling', 'mean_pooling']

    if method == 'max_pooling':
        pooled = tensor.max(dim=1).values  # drop the argmax indices
    else:
        pooled = tensor.mean(dim=1)

    return pooled.numpy()



In [16]:
def normalization(array, method, axis=0):
    """Rescale `array` along `axis` with the chosen method.

    Parameters
    ----------
    array : array-like
        Data to rescale (e.g. a 2-D array of pooled embeddings).
    method : str
        * 'identity' -- return `array` itself, untouched.
        * 'standard' -- divide by the L2 norm taken along `axis`.
        * 'layer'    -- subtract the mean and divide by the standard
                        deviation, both taken along `axis`.
        * 'minmax'   -- map the range along `axis` onto [0, 1].
    axis : int, default 0
        Axis along which the statistics are computed.  With the default, each
        column is rescaled using statistics taken over all rows.

    Returns
    -------
    numpy.ndarray
        Rescaled array with the shape of `array` (for 'identity', the input
        object itself).

    Raises
    ------
    AssertionError
        If `method` is not one of the four names above.

    Notes
    -----
    A scale of exactly zero (all-zero norm, constant values) is replaced by 1,
    so such slices come out as zeros instead of NaN/inf from a division by zero.
    """
    assert method in ['identity', 'standard', 'layer', 'minmax']

    if method == 'identity':
        return array

    x = np.asarray(array)

    def _nonzero(scale):
        # Copy so the statistic is not modified in place; keeps the dtype.
        scale = np.array(scale)
        scale[scale == 0] = 1
        return scale

    # keepdims=True lets the statistics broadcast back for any `axis`.
    if method == 'standard':
        return x / _nonzero(np.linalg.norm(x, axis=axis, keepdims=True))

    if method == 'layer':
        centered = x - x.mean(axis, keepdims=True)
        return centered / _nonzero(x.std(axis, keepdims=True))

    lowest = x.min(axis, keepdims=True)
    return (x - lowest) / _nonzero(x.max(axis, keepdims=True) - lowest)

In [23]:
import os

# Create the output folder up front: np.save does not create missing
# directories, and failing here would waste the embedding run above.
os.makedirs('./embeddings', exist_ok=True)

# One entry per (pooling, normalization) combination, keyed '<pooling>__<normalization>'.
embdict = {}
for pooling in ['max_pooling', 'mean_pooling']:
    array = pool_by_tokens(embeddings, pooling)  # pooled once, reused for every normalization
    for norm_method in ['identity', 'standard', 'layer', 'minmax']:
        embdict[f'{pooling}__{norm_method}'] = normalization(array, norm_method)

# NOTE: np.save stores a dict as a pickled 0-d object array; read it back with
# np.load(path, allow_pickle=True).item() and only load files you trust.
np.save('./embeddings/embaug_dict.npy', embdict)
embdict.keys()

dict_keys(['max_pooling__identity', 'max_pooling__standard', 'max_pooling__layer', 'max_pooling__minmax', 'mean_pooling__identity', 'mean_pooling__standard', 'mean_pooling__layer', 'mean_pooling__minmax'])

In [22]:
import os

# Topic name -> integer code, numbered in sorted order of the topic names.
sorted_topics = df['topic'].value_counts().index.sort_values()
map_dict = {topic: code for code, topic in enumerate(sorted_topics)}

txt_labels = df.topic.values
num_labels = np.vectorize(map_dict.get)(txt_labels)

# np.save does not create missing directories, so make sure the target exists.
os.makedirs('./embeddings', exist_ok=True)
np.save('./embeddings/embaug_txt_labels.npy', txt_labels)
np.save('./embeddings/embaug_num_labels.npy', num_labels)
