## Load packages

In [1]:
import os

import numpy as np
import pandas as pd
import transformers
import torch

# Run on the first CUDA device when torch reports one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

## Load data

In [2]:
# Directory holding the three CSV splits, relative to the notebook's working directory.
DATA_PATH = os.path.join('..', 'data', 'preprocessed_url_simple')

# Read the splits in a fixed order so each frame lands in its matching variable.
train, validation, test = (
    pd.read_csv(os.path.join(DATA_PATH, csv_name))
    for csv_name in ('train.csv', 'validation.csv', 'test.csv')
)

# Preview the first rows of the training split.
train.head()

Unnamed: 0,screen_name,text,account.type,class_type
0,bot#9,YEA now that note GOOD,bot,others
1,human#17,Listen to This Charming Man by The Smiths <URL>,human,human
2,bot#23,wish i can i would be seeing other hoes on the...,bot,others
3,bot#1,The decade in the significantly easier schedul...,bot,others
4,bot#11,""" Theim class =\ "" alignnone size-full wp-imag...",bot,rnn


In [3]:
# Show the first five rows of the training split (same preview as the load cell).
train.head(n=5)

Unnamed: 0,screen_name,text,account.type,class_type
0,bot#9,YEA now that note GOOD,bot,others
1,human#17,Listen to This Charming Man by The Smiths <URL>,human,human
2,bot#23,wish i can i would be seeing other hoes on the...,bot,others
3,bot#1,The decade in the significantly easier schedul...,bot,others
4,bot#11,""" Theim class =\ "" alignnone size-full wp-imag...",bot,rnn


In [4]:
# One checkpoint name drives both loaders so tokenizer and encoder always match.
spec = "bert-base-cased"

# Tokenizer is loaded first, then the encoder, both from the same checkpoint.
tokenizer, model = (
    loader.from_pretrained(spec)
    for loader in (transformers.BertTokenizer, transformers.BertModel)
)

In [5]:
# Feature-extraction pipeline built from the tokenizer and encoder loaded above,
# placed on the device chosen at the top of the notebook.
nlp = transformers.pipeline(
    task='feature-extraction',
    tokenizer=tokenizer,
    model=model,
    device=device,
)

# Smoke test: run the pipeline on the training text stored under index label 0.
ans = nlp(train.loc[0, 'text'])

In [6]:
def add_bert_embeddings(df, extractor=None, max_chars=512):
    """Add a ``bert_embeddings`` column holding one mean-pooled vector per row.

    For every value in ``df['text']`` the trailing ``max_chars`` characters are
    sent through ``extractor``; the first element of its output is converted to
    an array and averaged over axis 0, giving a single vector per text.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``text`` column of strings. It is modified in place (the
        new column is assigned on it) and also returned. Non-string values
        (e.g. NaN from an empty CSV field) are not handled and will fail on
        the slice.
    extractor : callable, optional
        Called as ``extractor(text, truncation=True)``. Defaults to the
        notebook-level ``nlp`` pipeline, looked up when the function is called.
    max_chars : int, default 512
        Number of trailing characters of each text passed to the extractor.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying the ``bert_embeddings`` column.

    Raises
    ------
    ValueError
        If ``max_chars`` is smaller than 1.
    """
    # A zero would turn ``text[-0:]`` into "keep everything", and a negative
    # value would drop characters from the front instead of limiting length.
    if max_chars < 1:
        raise ValueError("max_chars must be a positive integer")

    if extractor is None:
        # Default to the pipeline created in an earlier notebook cell.
        extractor = nlp

    def transform(text):
        # The slice limits characters, not tokens: a 512-character string can
        # still tokenize to more positions than the model accepts once word
        # pieces and special tokens are counted, so also request tokenizer
        # truncation instead of letting the forward pass fail.
        features = extractor(text[-max_chars:], truncation=True)
        # features[0] holds the vectors for the single input text; average
        # them over axis 0 to get one fixed-size vector.
        return np.mean(np.array(features[0]), axis=0)

    df['bert_embeddings'] = df['text'].apply(transform)
    return df

In [7]:
# Embed every training text, then preview the frame with its new column.
train = train.pipe(add_bert_embeddings)
train.head()



Unnamed: 0,screen_name,text,account.type,class_type,bert_embeddings
0,bot#9,YEA now that note GOOD,bot,others,"[0.10614613195260365, 0.0023416380087534585, 0..."
1,human#17,Listen to This Charming Man by The Smiths <URL>,human,human,"[-0.15180905582383275, 0.15649690048303455, -0..."
2,bot#23,wish i can i would be seeing other hoes on the...,bot,others,"[0.19033993559423834, -0.03900572611019015, -0..."
3,bot#1,The decade in the significantly easier schedul...,bot,others,"[0.185816412064014, 0.07074167917016894, 0.030..."
4,bot#11,""" Theim class =\ "" alignnone size-full wp-imag...",bot,rnn,"[0.20630772430233138, 0.35826802742667496, 0.0..."


In [8]:
# Sanity check: shape of the embedding stored for the row labelled 0.
train.loc[0, 'bert_embeddings'].shape

(768,)

In [9]:
# Embed the two remaining splits in the same order as before: validation, then test.
validation, test = (
    split.pipe(add_bert_embeddings) for split in (validation, test)
)



## Save data

In [11]:
# Destination folder for the embedded splits; created if it does not exist yet.
OUTPUT_PATH = os.path.join('..', 'data', 'bert_embeddings')
os.makedirs(OUTPUT_PATH, exist_ok=True)

# Pickle each split under its own file name (pickle keeps the array-valued column intact).
for frame, pickle_name in (
    (train, 'train.pkl'),
    (validation, 'validation.pkl'),
    (test, 'test.pkl'),
):
    frame.to_pickle(os.path.join(OUTPUT_PATH, pickle_name))