# Load packages

In [1]:
import os

import numpy as np
import pandas as pd
import transformers
import torch

# Run on the first CUDA GPU when one is usable; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

## Load data

In [2]:
# Directory that holds the preprocessed train / validation / test CSV splits.
DATA_PATH = os.path.join('..', 'data', 'preprocessed_url_simple')

# Read the three splits in train -> validation -> test order.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which looks like
# an index that was written into the CSV; consider index_col=0 once it is
# confirmed that nothing downstream relies on that column.
train, validation, test = (
    pd.read_csv(os.path.join(DATA_PATH, csv_name))
    for csv_name in ('train.csv', 'validation.csv', 'test.csv')
)

# Show the first rows of the training split as the cell output.
train.head()

Unnamed: 0,screen_name,text,account.type,class_type
0,bot#9,YEA now that note GOOD,bot,others
1,human#17,Listen to This Charming Man by The Smiths <URL>,human,human
2,bot#23,wish i can i would be seeing other hoes on the...,bot,others
3,bot#1,The decade in the significantly easier schedul...,bot,others
4,bot#11,""" Theim class =\ "" alignnone size-full wp-imag...",bot,rnn


In [3]:
# Preview the first five rows of the training split (repeats the preview shown by the previous cell).
train.head(n=5)

Unnamed: 0,screen_name,text,account.type,class_type
0,bot#9,YEA now that note GOOD,bot,others
1,human#17,Listen to This Charming Man by The Smiths <URL>,human,human
2,bot#23,wish i can i would be seeing other hoes on the...,bot,others
3,bot#1,The decade in the significantly easier schedul...,bot,others
4,bot#11,""" Theim class =\ "" alignnone size-full wp-imag...",bot,rnn


In [4]:
# Pretrained checkpoint shared by the tokenizer and the encoder.
spec = "bert-base-cased"

# `model_max_length` is the tokenizer-level length limit. `max_length` is an
# argument of the encode call, not a construction option, so passing it here
# had no effect on the limit (TODO confirm against the installed transformers version).
tokenizer = transformers.BertTokenizer.from_pretrained(spec, model_max_length=512)

# The model is used for inference only: move it to the selected device and make
# evaluation mode explicit so dropout stays off and embeddings are deterministic.
model = transformers.BertModel.from_pretrained(spec).to(device).eval()

In [5]:
# Smoke test: encode the first training text and run it through the model once.
# .iloc[0] picks the first row by position, independent of the index labels.
token_test = tokenizer.encode(
    train['text'].iloc[0], truncation=True, max_length=512, return_tensors='pt'
).to(device)

# Inference only: skip autograd bookkeeping so `ans` does not keep a
# computation graph alive in (GPU) memory.
with torch.no_grad():
    ans = model(token_test)

In [6]:
# Size of the first model output; the recorded run shows (1, 9, 768) for this text.
ans[0].size()

torch.Size([1, 9, 768])

In [7]:
def add_bert_embeddings(df):
    """
    Add mean-pooled BERT embeddings to a dataframe.

    Each value of ``df['text']`` is tokenized with the module-level ``tokenizer``
    (truncated to at most 512 tokens), passed through the module-level ``model``
    on ``device``, and the first model output is averaged over the token axis,
    giving one vector per row.

    The dataframe is modified in place and also returned.

    :param df: dataframe with column 'text'
    :return: the same dataframe with additional column 'bert_embeddings'
        (one 1-D numpy array per row)
    """

    def transform(text):
        token = tokenizer.encode(text, truncation=True, max_length=512, return_tensors='pt').to(device)
        # Inference only: without no_grad every forward pass records an autograd
        # graph that is thrown away immediately, costing memory and time per row.
        with torch.no_grad():
            ans = model(token)
        # ans[0] has shape (1, n_tokens, hidden): drop the batch axis, move the
        # data to host memory, then average over every token that encode returned.
        hidden = ans[0][0].detach().cpu().numpy()
        return np.mean(hidden, axis=0)

    df['bert_embeddings'] = df['text'].apply(transform)
    return df


# Embed the training split, then preview it with the new 'bert_embeddings' column.
train = add_bert_embeddings(df=train)
train.head(n=5)

Unnamed: 0,screen_name,text,account.type,class_type,bert_embeddings
0,bot#9,YEA now that note GOOD,bot,others,"[0.10614613, 0.0023416397, 0.18387558, 0.25720..."
1,human#17,Listen to This Charming Man by The Smiths <URL>,human,human,"[-0.15180907, 0.1564969, -0.10380695, 0.157478..."
2,bot#23,wish i can i would be seeing other hoes on the...,bot,others,"[0.19033994, -0.039005734, -0.015785955, 0.235..."
3,bot#1,The decade in the significantly easier schedul...,bot,others,"[0.1858164, 0.07074168, 0.030424008, 0.2930759..."
4,bot#11,""" Theim class =\ "" alignnone size-full wp-imag...",bot,rnn,"[0.20630777, 0.35826805, 0.041690856, 0.272989..."


In [8]:
# Shape of the embedding stored under index label 0; the recorded run shows (768,).
train.loc[0, 'bert_embeddings'].shape

(768,)

In [9]:
# Embed the remaining splits, validation first and then test.
validation, test = (
    add_bert_embeddings(split_df) for split_df in (validation, test)
)

## Save data

In [10]:
# Destination directory for the embedded splits; created if it does not exist yet.
OUTPUT_PATH = os.path.join('..', 'data', 'bert_embeddings')
os.makedirs(OUTPUT_PATH, exist_ok=True)

# Pickle each split next to the others; the embedding column holds numpy arrays.
for _split_df, _pkl_name in (
    (train, 'train.pkl'),
    (validation, 'validation.pkl'),
    (test, 'test.pkl'),
):
    _split_df.to_pickle(os.path.join(OUTPUT_PATH, _pkl_name))