# Applying BERT Multilingual Classifier to Predict Account Suspension 

Guidance from: https://github.com/kacossio/TeamPython/blob/master/Bert%20Multilingual%20Embedding.ipynb

## 1. Load Packages

In [1]:
########## Load Packages
import warnings
warnings.simplefilter("ignore")

import importlib
import pandas as pd
import numpy as np
import re
from io import StringIO
import itertools
import os 
import time
import datetime

from io import StringIO # python3; python2: BytesIO 
import boto3

import emoji
import random 
import math

from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import math

## 2. Set parameters

In [2]:
# Set Parameters 
########## Set Parameters

# Indicate how many rows to skip before columns
# Note: Python uses zero-based indexing, so skiprow=0 begins at the first row of file,
# while skiprow=1 begins at the second row.
skiprow=0

# Indicate name of column that contains text data for analysis
text_column = "text"

filepath = "data/"

import_bucket = "joe-exotic-2020"

embedding_bucket = "modeling/embeddings"

key = 'full_clean' # already created on S3
csv_buffer = StringIO()
s3_resource = boto3.resource('s3')
s3 = boto3.client('s3')

results_bucket = 'full_clean' # already created on S3

## 3. Load in Data from S3

In [3]:
def import_data(filelist):
    '''Read in data from excel files into Pandas dataframe.
    Inputs: Filelist 
    Outputs: Pandas dataframe containing imported data
    '''

    # Read in single file
    object_key = filelist[0].split('/', 1)[1]
    csv_obj = s3.get_object(Bucket=import_bucket, Key=object_key)
    body = csv_obj['Body']
    csv_string = body.read().decode('utf-8')
    df = pd.read_csv(StringIO(csv_string), error_bad_lines=False)
    
    return df

### Load in data from S3

In [4]:
#### Load in data from S3

# Import Train and Measure Balance
# Import Flattened Data
filelist = [os.path.join(obj.bucket_name, obj.key) 
    for obj in s3_resource.Bucket(name=import_bucket).objects.all() 
    if re.findall("train_updated",obj.key)]

df_train = import_data(filelist)

df_train['suspended'] = pd.to_numeric(df_train['suspended'], errors='coerce')
df_train = df_train[df_train['suspended'].notna()]

# Import Test and Measure Balance

filelist = [os.path.join(obj.bucket_name, obj.key) 
    for obj in s3_resource.Bucket(name=import_bucket).objects.all() 
    if re.findall("test_updated",obj.key)]

df_test = import_data(filelist)

df_test['suspended'] = pd.to_numeric(df_test['suspended'], errors='coerce')
df_test = df_test[df_test['suspended'].notna()]

# Import Validation and Measure Balance
# Import Flattened Data
filelist = [os.path.join(obj.bucket_name, obj.key) 
    for obj in s3_resource.Bucket(name=import_bucket).objects.all() 
    if re.findall("valid_updated",obj.key)]

df_valid = import_data(filelist)

df_valid['suspended'] = pd.to_numeric(df_valid['suspended'], errors='coerce')
df_valid = df_valid[df_valid['suspended'].notna()]

### Supplementary Pre-Processing

#### Ensure that Target Variable is Numeric 

In [5]:
df_train['suspended'] = df_train['suspended'].astype(int)
df_valid['suspended'] = df_valid['suspended'].astype(int)
df_test['suspended'] = df_test['suspended'].astype(int)

#### Remove Duplicates 

In [6]:
df_train = df_train.drop_duplicates(subset=['id', 'created_at', 'text'])
df_valid = df_valid.drop_duplicates(subset=['id', 'created_at', 'text'])
df_test = df_test.drop_duplicates(subset=['id', 'created_at', 'text'])

#### Ensure binary possibly_sensitive vars

In [7]:
df_train['possibly_sensitive'][df_train['possibly_sensitive'].apply(lambda x: isinstance(x, str))] =np.nan
df_valid['possibly_sensitive'][df_valid['possibly_sensitive'].apply(lambda x: isinstance(x, str))] =np.nan
df_test['possibly_sensitive'][df_test['possibly_sensitive'].apply(lambda x: isinstance(x, str))] =np.nan

## 4. Extract Embeddings

In [8]:
#import packages 
from translate import Translator
import spacy
import langid
import keras_bert
import tensorflow as tf
import time
import datetime as dt
import pytz

from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import Lasso, LogisticRegression

Using TensorFlow backend.


### Prep for Tensor 

#### Convert Dates to Unix Epoch Time

In [9]:
### Function to convert dates into float (Unix Epoch Times )
def convert_dates_float(df):
    '''
    Convert key input data variables to numeric format for tensors. Uses unix epoch time in seconds. 
    '''
    # created_at (tweet)
    df['created_at'] = pd.to_datetime(df['created_at'], format='%Y-%m-%d %H:%M:%S.%f')
    my_datetime = dt.datetime(1970,1,1) 
    good_dt = pytz.timezone('UTC').localize(my_datetime)
    df['created_at'] = (df['created_at'] - good_dt).dt.total_seconds()

    # User.created_at (account)
    df['user.created_at'] = pd.to_datetime(df['user.created_at'], format='%Y-%m-%d %H:%M:%S.%f')
    my_datetime = dt.datetime(1970,1,1) 
    good_dt = pytz.timezone('UTC').localize(my_datetime)
    df['user.created_at'] = (df['user.created_at'] - good_dt).dt.total_seconds()
    
    return df

#### Convert binary and categorical variables to one-hot encoded (not sure this is best or not)

Options 

- Integer Encoding: Where each unique label is mapped to an integer.
- One Hot Encoding: Where each label is mapped to a binary vector.
- Learned Embedding: Where a distributed representation of the categories is learned.

We use one hot encoding below. 

#### We use get_dummies below instead of one_hot_encoder as get dummies knows how to deal with missingness. 

In [10]:
### One Hote Encoding (Unix Epoch Times )
def one_hot(df_train, df_valid, df_test): 
    '''
    One hot encoding requires the full dataset in order to ensure that there end up the same amount of columns for test, validation and train.
    We therefore combine train, valid, and test, fill nas with 0 where necessary, and one hot encode categorical vars. 
    '''
    df_train['split'] = "train"
    df_valid['split'] = "valid"
    df_test['split'] = "test"
    df = pd.concat([df_train, df_test, df_valid], ignore_index=True, sort=False)
    df = convert_dates_float(df)
    # Extra layer of Processing 
    df = df[df['retweet_count'] != "False"] 
    df['quoted_status.user.followers_count'] = df['quoted_status.user.followers_count'].fillna(0) 
    df['quoted_status.user.friends_count'] = df['quoted_status.user.friends_count'].fillna(0) 
    df['retweeted_status.user.followers_count'] = df['retweeted_status.user.followers_count'].fillna(0) 
    df['retweeted_status.user.friends_count'] = df['retweeted_status.user.friends_count'].fillna(0) 
    # One-hot
    df = df.drop(["user.protected.1", "user.protected.2", "user.protected.3"], axis=1)
    df = pd.get_dummies(df, columns=["source", "lang", "possibly_sensitive", "withheld_in_countries", "place.country", 
                                         "user.geo_enabled", "user.lang", "user.verified", "user.has_extended_profile",
                                        "user.lang", "user.protected", "user.time_zone", "user.verified", "user.default_profile",
                                        "is_quote_status"])
    return df
# Tp get rid of: Text, user.protected.1, user.protected.2, user.protected.3, 
# To concat (or get rid of): user.description, user.location, user.name, user.screen_name
# to potentially take out entirely - user.id (This would explain everything)


#### Split one-hot encoded df back apart into train, valid, and test

In [11]:
df = one_hot(df_train, df_valid, df_test)
df_train_f = df[df['split'] == "train"]
df_valid_f = df[df['split'] == "valid"]
df_test_f = df[df['split'] == "test"]


#### Remove other text fields (may concatanate with tweets in future iterations)

In [12]:
df_train = df_train_f.drop(['user.description', "user.location", "user.name", "user.screen_name", "split"], axis=1)
df_valid = df_valid_f.drop(['user.description', "user.location", "user.name", "user.screen_name", "split"], axis=1)
df_test = df_test_f.drop(['user.description', "user.location", "user.name", "user.screen_name", "split"], axis=1)

In [13]:
df_train.head()

Unnamed: 0,id,created_at,text,retweet_count,favorite_count,quoted_status_id,user.id,user.created_at,user.favourites_count,user.followers_count,...,user.geo_enabled_False,user.geo_enabled_True,user.verified_False,user.has_extended_profile_False,user.has_extended_profile_True,user.protected_False,user.verified_False.1,user.default_profile_True,is_quote_status_False,is_quote_status_True
1,1.304799e+18,1599923000.0,containcontrast you are as worse as nazi germ...,0.0,1.0,0.0,1.27812e+18,1593563000.0,25355.0,377.0,...,1,0,1,0,1,1,1,1,1,0
2,1.304796e+18,1599923000.0,RT bcelyj nevernever maryann CCP CCCP CCCP,1.0,0.0,0.0,1.27812e+18,1593563000.0,25355.0,377.0,...,1,0,1,0,1,1,1,1,1,0
3,1.304796e+18,1599923000.0,RT maryann CCP CCP https co EAFQGqFQ,802.0,0.0,0.0,1.27812e+18,1593563000.0,25355.0,377.0,...,1,0,1,0,1,1,1,1,1,0
4,1.304795e+18,1599922000.0,https co oBzs zO https co Si btzc U,0.0,15.0,1.0,1.27812e+18,1593563000.0,25355.0,377.0,...,1,0,1,0,1,1,1,1,0,1
5,1.304794e+18,1599922000.0,https co oBzs zO Z,0.0,10.0,1.0,1.27812e+18,1593563000.0,25355.0,377.0,...,1,0,1,0,1,1,1,1,0,1


### Define embeddings functions

In [27]:
#get tweets from dataframe
def get_tweets_list(df):
    '''
    Convert panda series of tweets into list of tweets and output id_list for tweets. 
    '''
    all_tweets = df['text'].tolist()
    id_list = df['id'].tolist()
    return all_tweets,id_list

#input list of tweets with structure as [[tweet, screen_name],[tweet, screen_name],[tweet, screen_name],...]
def get_bert_embeddings(tweet_list):
    '''
    Extracy embeddings using tf hub supplied model path and your tweet list.
    '''
    model_path = "multi_cased_L-12_H-768_A-12"
    embeddings = keras_bert.extract_embeddings(model_path, tweet_list)
    print('embeddings complete')
    return(embeddings)

#mean pool the embeddings to return 768 embeddings per sentence
def avg_pooling(embed_array):
    '''
    Mean pool the embeddings to return 768 embeddings per sentence. 
    '''
    embeddings_pooled = []
    for sentence in embed_array:
        sentence = np.expand_dims((sentence),axis = 0)
        sentence = tf.keras.layers.GlobalAveragePooling1D()(sentence)
        embeddings_pooled.append(np.squeeze(sentence))
    return(embeddings_pooled)

### Use only the below on Pytorch implementation

In [14]:
import numpy as np
import pandas as pd
import torch
import transformers as ppb # pytorch transformers
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from numba import cuda 
import gc

In [16]:
print(torch.cuda.device_count())

1


In [17]:
print(torch.cuda.current_device())

0


In [18]:
print(torch.cuda.get_device_name(torch.cuda.current_device()))

Tesla K80


In [19]:
print(torch.cuda.is_available())

True


#### Importing pre-trained BERT model and tokenizer

In [20]:
model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-multilingual-cased')

# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
#model = model_class.from_pretrained(pretrained_weights)

Train

#### Here, we’ll tokenize and process all sentences together as a batch.

In [21]:
gc.collect()
torch.cuda.empty_cache()

In [22]:
def roundup(x, pl):
    return int(math.ceil(x / pl)) * pl

In [23]:
def pytorch_bert_embeddings(df, file_save):
    full_range = range(0,roundup(len(df), 1000),1000)
    # First iteration crashed GPU memory after 45,000 (44), so restarted kernel and reran starting at 46,000 (45)
    # Had to rerun 46,000-47,000 in smaller chunks given larger tensors evidently. 
    # Started over at 46 (47,000)
    # Same thing happened when we got to 63,000
    b = range(0,len(full_range))
    for a in b:
        try: 
            model = model_class.from_pretrained(pretrained_weights)
            model = model.to('cuda')
            if a < b[-1]:
                df_train_for_merge = df[full_range[a]:full_range[a+1]]
                tokenized = df_train_for_merge['text'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True, truncation = True)))
                # Padding 
                # The dataset is currently a list (or pandas Series/DataFrame) of lists. Before BERT can process this as input, 
                # we’ll need to make all the vectors the same size by padding shorter sentences with the token id 0. 
                # You can refer to the notebook for the padding step, it’s basic python string and array manipulation.
                max_len = 0
                for i in tokenized.values:
                    if len(i) > max_len:
                        max_len = len(i)

                padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

                # After the padding, we have a matrix/tensor that is ready to be passed to BERT:
                # Mask
                attention_mask = np.where(padded != 0, 1, 0)

                # Input IDs and Mask 
                # We now create an input tensor out of the padded token matrix, and send that to BERT
                input_ids = torch.tensor(padded)  

                attention_mask = torch.tensor(attention_mask)
            else: 
                df_train_for_merge = df[full_range[a]:]
                tokenized = df_train_for_merge['text'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True, truncation = True)))

                # Padding 
                # The dataset is currently a list (or pandas Series/DataFrame) of lists. Before BERT can process this as input, 
                # we’ll need to make all the vectors the same size by padding shorter sentences with the token id 0. 
                # You can refer to the notebook for the padding step, it’s basic python string and array manipulation.
                max_len = 0
                for i in tokenized.values:
                    if len(i) > max_len:
                        max_len = len(i)

                padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

                # After the padding, we have a matrix/tensor that is ready to be passed to BERT:
                # Mask
                attention_mask = np.where(padded != 0, 1, 0)

                # Input IDs and Mask 
                # We now create an input tensor out of the padded token matrix, and send that to BERT
                input_ids = torch.tensor(padded)  

                attention_mask = torch.tensor(attention_mask)

            input_ids = input_ids.to('cuda')
            attention_mask = attention_mask.to('cuda')


            # Disabling gradient calculation is useful for inference, when you are sure that you will not call 
            # Tensor.backward(). It will reduce memory consumption for computations that would otherwise have requires_grad=True.
            with torch.no_grad():
                last_hidden_states = model(input_ids, attention_mask)

            # Are there options to set the batch size, etc. and stuff here though? 

            # Takes a total of 24 seconds with the pytorch implementation on CPU (faster than Keras bert)
            # Note if we run the same 300 sample with everythin on Cuda, it takes .035-.044 (GPU is a big deal)
            # If we put just the model on cuda and not the inputs, we get an error as they must be on the same device. 

            bert_pooled_train = last_hidden_states[0][:,0,:].cpu().numpy()

            id_tweet_train = df_train_for_merge['id'].tolist()

            #bert embedding
            bert_embeddings_df_train = pd.DataFrame(list(zip(id_tweet_train, bert_pooled_train)),columns=['id', 'Bert_embeddings'])
            bert_embeddings_df_train = pd.concat([bert_embeddings_df_train['id'], bert_embeddings_df_train['Bert_embeddings'].apply(pd.Series)], axis = 1)
            bert_embeddings_df_train = pd.merge(df_train_for_merge.drop(['text'], axis=1), bert_embeddings_df_train, left_on = ["id"], right_on = ["id"])

            bert_embeddings_df_train.to_csv(('s3://joe-exotic-2020/modeling/embeddings/' + file_save + "_" + str(full_range[a]) + ".csv"), index=False, encoding = "utf_8_sig")
            print("Embeddings Extracted for Slice" + " " + str(full_range[a]))
            del input_ids, attention_mask, model, df_train_for_merge
            gc.collect()
            torch.cuda.empty_cache()
        except: 
            c = range(full_range[a], full_range[a+1]+50, 50)
            d = range(0,len(c))
            for e in d:
                model = model_class.from_pretrained(pretrained_weights)
                model = model.to('cuda')
                try: 
                    df_train_for_merge = df[c[e]:c[e+1]]
                    tokenized = df_train_for_merge['text'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True, truncation = True)))
                    # Padding 
                    # The dataset is currently a list (or pandas Series/DataFrame) of lists. Before BERT can process this as input, 
                    # we’ll need to make all the vectors the same size by padding shorter sentences with the token id 0. 
                    # You can refer to the notebook for the padding step, it’s basic python string and array manipulation.
                    max_len = 0
                    for i in tokenized.values:
                        if len(i) > max_len:
                            max_len = len(i)

                    padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

                    # After the padding, we have a matrix/tensor that is ready to be passed to BERT:
                    # Mask
                    attention_mask = np.where(padded != 0, 1, 0)

                    # Input IDs and Mask 
                    # We now create an input tensor out of the padded token matrix, and send that to BERT
                    input_ids = torch.tensor(padded)  

                    attention_mask = torch.tensor(attention_mask)

                    input_ids = input_ids.to('cuda')
                    attention_mask = attention_mask.to('cuda')
                    # Disabling gradient calculation is useful for inference, when you are sure that you will not call 
                    # Tensor.backward(). It will reduce memory consumption for computations that would otherwise have requires_grad=True.
                    with torch.no_grad():
                        last_hidden_states = model(input_ids, attention_mask)

                    # Are there options to set the batch size, etc. and stuff here though? 

                    # Takes a total of 24 seconds with the pytorch implementation on CPU (faster than Keras bert)
                    # Note if we run the same 300 sample with everythin on Cuda, it takes .035-.044 (GPU is a big deal)
                    # If we put just the model on cuda and not the inputs, we get an error as they must be on the same device. 

                    bert_pooled_train = last_hidden_states[0][:,0,:].cpu().numpy()

                    id_tweet_train = df_train_for_merge['id'].tolist()

                    #bert embedding
                    bert_embeddings_df_train = pd.DataFrame(list(zip(id_tweet_train, bert_pooled_train)),columns=['id', 'Bert_embeddings'])
                    bert_embeddings_df_train = pd.concat([bert_embeddings_df_train['id'], bert_embeddings_df_train['Bert_embeddings'].apply(pd.Series)], axis = 1)
                    bert_embeddings_df_train = pd.merge(df_train_for_merge.drop(['text'], axis=1), bert_embeddings_df_train, left_on = ["id"], right_on = ["id"])

                    bert_embeddings_df_train.to_csv(('s3://joe-exotic-2020/modeling/embeddings/' + file_save + "_" + str(c[e]) + "_" + str(c[e+1]) + ".csv"), index=False, encoding = "utf_8_sig")
                    print("Embeddings Extracted for Slice" + " " + str(c[e]) + "_" + str(c[e+1]))
                    del input_ids, attention_mask, model, df_train_for_merge
                    gc.collect()
                    torch.cuda.empty_cache()
                except:
                    pass

In [None]:
start= time.time()
file_save =  'ml_train_embeddings_df'
pytorch_bert_embeddings(df_train, file_save)
print(time.time() - start)

In [24]:
start= time.time()
file_save =  'ml_valid_embeddings_df'
pytorch_bert_embeddings(df_valid, file_save)
print(time.time() - start)

Embeddings Extracted for Slice 32000
Embeddings Extracted for Slice 33000
Embeddings Extracted for Slice 34000_34050
Embeddings Extracted for Slice 34050_34100
Embeddings Extracted for Slice 34100_34150
Embeddings Extracted for Slice 34150_34200
Embeddings Extracted for Slice 34200_34250
Embeddings Extracted for Slice 34250_34300
Embeddings Extracted for Slice 34300_34350
Embeddings Extracted for Slice 34350_34400
Embeddings Extracted for Slice 34400_34450
Embeddings Extracted for Slice 34450_34500
Embeddings Extracted for Slice 34500_34550
Embeddings Extracted for Slice 34550_34600
Embeddings Extracted for Slice 34600_34650
Embeddings Extracted for Slice 34650_34700
Embeddings Extracted for Slice 34700_34750
Embeddings Extracted for Slice 34750_34800
Embeddings Extracted for Slice 34800_34850
Embeddings Extracted for Slice 34850_34900
Embeddings Extracted for Slice 34900_34950
Embeddings Extracted for Slice 34950_35000
Embeddings Extracted for Slice 35000
Embeddings Extracted for Slic

In [46]:
start= time.time()
file_save =  'ml_test_embeddings_df'
pytorch_bert_embeddings(df_test, file_save)
print(time.time() - start)

Embeddings Extracted for Slice 0
Embeddings Extracted for Slice 1000
Embeddings Extracted for Slice 2000
Embeddings Extracted for Slice 3000
Embeddings Extracted for Slice 4000
Embeddings Extracted for Slice 5000
Embeddings Extracted for Slice 6000
Embeddings Extracted for Slice 7000
Embeddings Extracted for Slice 8000
Embeddings Extracted for Slice 9000
Embeddings Extracted for Slice 10000
Embeddings Extracted for Slice 11000
Embeddings Extracted for Slice 12000
Embeddings Extracted for Slice 13000
Embeddings Extracted for Slice 14000
Embeddings Extracted for Slice 15000
Embeddings Extracted for Slice 16000
Embeddings Extracted for Slice 17000
Embeddings Extracted for Slice 18000
Embeddings Extracted for Slice 19000
Embeddings Extracted for Slice 20000
Embeddings Extracted for Slice 21000
Embeddings Extracted for Slice 22000
Embeddings Extracted for Slice 23000
Embeddings Extracted for Slice 24000
Embeddings Extracted for Slice 25000
Embeddings Extracted for Slice 26000
Embeddings Ext

In [28]:
############# Import Data
def import_list_data(filelist):
    '''Read in data from excel files into Pandas dataframe. Concatenates multiple files if necessary. 
    Inputs: Directory path, number of rows to skip
    Outputs: Pandas dataframe containing imported data
    '''
    dataframes = []
    # Iterate through files of the directory
    for filename in filelist:
        object_key = filename.split('/', 1)[1]
        csv_obj = s3.get_object(Bucket=import_bucket, Key=object_key)
        body = csv_obj['Body']
        csv_string = body.read().decode('utf-8')
        dataframe = pd.read_csv(StringIO(csv_string))
        dataframes.append(dataframe)
    df = pd.concat(dataframes, ignore_index=True, sort=False)

    return df

In [42]:
filelist = [os.path.join(obj.bucket_name, obj.key) 
    for obj in s3_resource.Bucket(name=import_bucket).objects.all() 
    if re.findall("ml_train_embeddings",obj.key)]
filelist

['joe-exotic-2020/modeling/embeddings/ml_train_embeddings_df_0.csv',
 'joe-exotic-2020/modeling/embeddings/ml_train_embeddings_df_1000.csv',
 'joe-exotic-2020/modeling/embeddings/ml_train_embeddings_df_10000.csv',
 'joe-exotic-2020/modeling/embeddings/ml_train_embeddings_df_100000.csv',
 'joe-exotic-2020/modeling/embeddings/ml_train_embeddings_df_101000.csv',
 'joe-exotic-2020/modeling/embeddings/ml_train_embeddings_df_102000.csv',
 'joe-exotic-2020/modeling/embeddings/ml_train_embeddings_df_103000.csv',
 'joe-exotic-2020/modeling/embeddings/ml_train_embeddings_df_104000.csv',
 'joe-exotic-2020/modeling/embeddings/ml_train_embeddings_df_105000.csv',
 'joe-exotic-2020/modeling/embeddings/ml_train_embeddings_df_106000.csv',
 'joe-exotic-2020/modeling/embeddings/ml_train_embeddings_df_107000.csv',
 'joe-exotic-2020/modeling/embeddings/ml_train_embeddings_df_11000.csv',
 'joe-exotic-2020/modeling/embeddings/ml_train_embeddings_df_12000.csv',
 'joe-exotic-2020/modeling/embeddings/ml_train_e

In [43]:
bert_embeddings_df_train = import_list_data(filelist)
len(bert_embeddings_df_train)

107374

In [44]:
bert_embeddings_df_train.head()

Unnamed: 0,id,created_at,retweet_count,favorite_count,quoted_status_id,user.id,user.created_at,user.favourites_count,user.followers_count,user.friends_count,...,758,759,760,761,762,763,764,765,766,767
0,1.304799e+18,1599923000.0,0.0,1.0,0.0,1.27812e+18,1593563000.0,25355.0,377.0,774.0,...,0.254089,-0.092161,-0.177658,-0.322701,0.016352,-0.007543,0.031331,0.462816,0.165087,-0.072143
1,1.304796e+18,1599923000.0,1.0,0.0,0.0,1.27812e+18,1593563000.0,25355.0,377.0,774.0,...,0.114728,-0.077202,-0.269712,-0.151349,0.03366,0.276272,0.065734,0.664379,-0.040229,-0.356698
2,1.304796e+18,1599923000.0,802.0,0.0,0.0,1.27812e+18,1593563000.0,25355.0,377.0,774.0,...,0.062325,-0.218971,-0.567088,-0.617004,0.170469,0.163738,0.165489,0.253347,0.1058,-0.174792
3,1.304795e+18,1599922000.0,0.0,15.0,1.0,1.27812e+18,1593563000.0,25355.0,377.0,774.0,...,0.333985,-0.193174,-0.613174,-0.672665,0.01219,0.253517,0.135646,0.110566,0.26942,-0.121226
4,1.304794e+18,1599922000.0,0.0,10.0,1.0,1.27812e+18,1593563000.0,25355.0,377.0,774.0,...,0.215525,-0.281756,-0.346513,-0.911152,-0.005457,0.121276,0.14528,0.193568,0.342988,0.069493


In [29]:
filelist = [os.path.join(obj.bucket_name, obj.key) 
    for obj in s3_resource.Bucket(name=import_bucket).objects.all() 
    if re.findall("ml_valid_embeddings",obj.key)]
filelist

['joe-exotic-2020/modeling/embeddings/ml_valid_embeddings_df_0.csv',
 'joe-exotic-2020/modeling/embeddings/ml_valid_embeddings_df_1000.csv',
 'joe-exotic-2020/modeling/embeddings/ml_valid_embeddings_df_10000.csv',
 'joe-exotic-2020/modeling/embeddings/ml_valid_embeddings_df_11000.csv',
 'joe-exotic-2020/modeling/embeddings/ml_valid_embeddings_df_12000.csv',
 'joe-exotic-2020/modeling/embeddings/ml_valid_embeddings_df_13000.csv',
 'joe-exotic-2020/modeling/embeddings/ml_valid_embeddings_df_14000.csv',
 'joe-exotic-2020/modeling/embeddings/ml_valid_embeddings_df_15000.csv',
 'joe-exotic-2020/modeling/embeddings/ml_valid_embeddings_df_16000.csv',
 'joe-exotic-2020/modeling/embeddings/ml_valid_embeddings_df_17000.csv',
 'joe-exotic-2020/modeling/embeddings/ml_valid_embeddings_df_18000.csv',
 'joe-exotic-2020/modeling/embeddings/ml_valid_embeddings_df_19000.csv',
 'joe-exotic-2020/modeling/embeddings/ml_valid_embeddings_df_2000.csv',
 'joe-exotic-2020/modeling/embeddings/ml_valid_embeddings

In [30]:
bert_embeddings_df_valid = import_list_data(filelist)
len(bert_embeddings_df_valid)

39537

In [31]:
bert_embeddings_df_valid.head()

Unnamed: 0,id,created_at,retweet_count,favorite_count,quoted_status_id,user.id,user.created_at,user.favourites_count,user.followers_count,user.friends_count,...,758,759,760,761,762,763,764,765,766,767
0,1.279757e+18,1593953000.0,0.0,0.0,0.0,1.278122e+18,1593563000.0,0.0,2.0,53.0,...,0.177418,0.05465,-0.278031,-0.18553,0.475544,0.522608,0.254073,0.107183,-0.213097,-0.048582
1,1.278125e+18,1593564000.0,0.0,1.0,0.0,1.278122e+18,1593563000.0,0.0,2.0,53.0,...,-0.265097,-0.048022,-0.359503,-0.175341,0.115412,0.233461,0.088037,0.116152,0.329816,0.027202
2,1.278845e+18,1593735000.0,0.0,1.0,0.0,1.278129e+18,1593565000.0,41.0,1.0,47.0,...,0.279357,-0.046791,-0.050131,-0.540188,0.453747,0.14329,-0.02608,0.072237,0.114676,-0.176587
3,1.278851e+18,1593737000.0,0.0,0.0,0.0,1.278129e+18,1593565000.0,41.0,1.0,47.0,...,0.409402,0.247715,-0.528003,-0.368062,0.361968,0.259629,0.344309,0.421999,0.068539,-0.127544
4,1.278847e+18,1593736000.0,0.0,0.0,0.0,1.278129e+18,1593565000.0,41.0,1.0,47.0,...,0.067151,0.025943,-0.470204,-0.401065,0.429876,0.340666,0.063684,0.142442,-0.099972,-0.136322


In [35]:
filelist = [os.path.join(obj.bucket_name, obj.key) 
    for obj in s3_resource.Bucket(name=import_bucket).objects.all() 
    if re.findall("ml_test_embeddings",obj.key)]
filelist

['joe-exotic-2020/modeling/embeddings/ml_test_embeddings_df_0.csv',
 'joe-exotic-2020/modeling/embeddings/ml_test_embeddings_df_1000.csv',
 'joe-exotic-2020/modeling/embeddings/ml_test_embeddings_df_10000.csv',
 'joe-exotic-2020/modeling/embeddings/ml_test_embeddings_df_11000.csv',
 'joe-exotic-2020/modeling/embeddings/ml_test_embeddings_df_12000.csv',
 'joe-exotic-2020/modeling/embeddings/ml_test_embeddings_df_13000.csv',
 'joe-exotic-2020/modeling/embeddings/ml_test_embeddings_df_14000.csv',
 'joe-exotic-2020/modeling/embeddings/ml_test_embeddings_df_15000.csv',
 'joe-exotic-2020/modeling/embeddings/ml_test_embeddings_df_16000.csv',
 'joe-exotic-2020/modeling/embeddings/ml_test_embeddings_df_17000.csv',
 'joe-exotic-2020/modeling/embeddings/ml_test_embeddings_df_18000.csv',
 'joe-exotic-2020/modeling/embeddings/ml_test_embeddings_df_19000.csv',
 'joe-exotic-2020/modeling/embeddings/ml_test_embeddings_df_2000.csv',
 'joe-exotic-2020/modeling/embeddings/ml_test_embeddings_df_20000.csv'

In [36]:
bert_embeddings_df_test = import_list_data(filelist)
len(bert_embeddings_df_test)

29139

In [37]:
bert_embeddings_df_test.head()

Unnamed: 0,id,created_at,retweet_count,favorite_count,quoted_status_id,user.id,user.created_at,user.favourites_count,user.followers_count,user.friends_count,...,758,759,760,761,762,763,764,765,766,767
0,1.278734e+18,1593709000.0,0.0,0.0,0.0,1.27813e+18,1593565000.0,5.0,1.0,60.0,...,0.20077,0.205723,-0.42135,-0.228305,0.457495,0.068868,0.081225,0.269605,-0.175302,-0.022077
1,1.292049e+18,1596884000.0,1.0,3.0,0.0,1.278156e+18,1593571000.0,21.0,0.0,47.0,...,0.270942,0.135556,-0.294425,-0.134652,0.138386,-0.151911,0.204863,0.453594,0.257261,-0.188632
2,1.278256e+18,1593595000.0,76.0,0.0,0.0,1.278212e+18,1593584000.0,133.0,0.0,16.0,...,0.211961,-0.093005,-0.457276,-0.883988,0.138549,0.11575,-0.008013,0.371775,0.046258,-0.095041
3,1.278295e+18,1593604000.0,0.0,0.0,0.0,1.278212e+18,1593584000.0,133.0,0.0,16.0,...,-0.265097,-0.048022,-0.359503,-0.175341,0.115412,0.233461,0.088037,0.116152,0.329816,0.027202
4,1.278346e+18,1593616000.0,0.0,0.0,0.0,1.278212e+18,1593584000.0,133.0,0.0,16.0,...,0.064108,-0.40524,-0.403547,-0.27813,0.169948,0.180723,0.204909,0.457883,0.07463,-0.301944


### Save Multilngual Embeddings 

In [48]:
bert_embeddings_df_train.to_csv('s3://joe-exotic-2020/modeling/embeddings/ml_train_embeddings_df_full.csv', index=False, encoding = "utf_8_sig")

In [38]:
bert_embeddings_df_valid.to_csv('s3://joe-exotic-2020/modeling/embeddings/ml_valid_embeddings_df_full.csv', index=False, encoding = "utf_8_sig")

In [None]:
bert_embeddings_df_test.to_csv('s3://joe-exotic-2020/modeling/embeddings/ml_test_embeddings_df_full.csv', index=False, encoding = "utf_8_sig")

### LABSE: Extracting the new BERT embeddings that learn across languages

https://towardsdatascience.com/labse-language-agnostic-bert-sentence-embedding-by-google-ai-531f677d775f

In [16]:
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub

In [17]:
def get_model(model_url, max_seq_length):
    labse_layer = hub.KerasLayer(model_url, trainable=True)

    # Define input.
    input_word_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                         name="input_word_ids")
    input_mask = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                     name="input_mask")
    segment_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                      name="segment_ids")
        
    # LaBSE layer.
    pooled_output,  _ = labse_layer([input_word_ids, input_mask, segment_ids])
    
    # The embedding is l2 normalized.
    pooled_output = tf.keras.layers.Lambda(
        lambda x: tf.nn.l2_normalize(x, axis=1))(pooled_output)

    # Define model.
    return tf.keras.Model(
        inputs=[input_word_ids, input_mask, segment_ids],
        outputs=pooled_output), labse_layer

max_seq_length = 64
labse_model, labse_layer = get_model(
    model_url="https://tfhub.dev/google/LaBSE/1", max_seq_length=max_seq_length)

In [18]:
import bert
from bert import tokenization

In [82]:
# !pip install bert-for-tf2

vocab_file = labse_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = labse_layer.resolved_object.do_lower_case.numpy()
tokenizer = bert.tokenization.FullTokenizer(vocab_file, do_lower_case)

def create_input(input_strings, tokenizer, max_seq_length):
    '''
    Set up function to create inputs needed for word embeddings extraction. 
    '''
    input_ids_all, input_mask_all, segment_ids_all = [], [], []
    for input_string in input_strings:
        # Tokenize input.
        input_tokens = ["[CLS]"] + tokenizer.tokenize(input_string) + ["[SEP]"]
        input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
        sequence_length = min(len(input_ids), max_seq_length)

    # Padding or truncation.
    if len(input_ids) >= max_seq_length:
        input_ids = input_ids[:max_seq_length]
    else:
        input_ids = input_ids + [0] * (max_seq_length - len(input_ids))

    input_mask = [1] * sequence_length + [0] * (max_seq_length - sequence_length)

    input_ids_all.append(input_ids)
    input_mask_all.append(input_mask)
    segment_ids_all.append([0] * max_seq_length)

    return np.array(input_ids_all), np.array(input_mask_all), np.array(segment_ids_all)

def encode(input_text):
    '''
    Function to extract word embedding for each sentence. 
    '''
    input_ids, input_mask, segment_ids = create_input(
        input_text, tokenizer, max_seq_length)
    return labse_model([input_ids, input_mask, segment_ids])


#input list of tweets with structure as [[tweet, screen_name],[tweet, screen_name],[tweet, screen_name],...]
def get_bert_embeddings_labse(tweet_list):
    embedded_tweets = list()
    for sent in tweet_list:
        a = np.array(encode(sent))
        embedded_tweets.append(a[0])
        if len(embedded_tweets) % 1000 == 0:
            print(len(embedded_tweets))    
    return embedded_tweets

# https://tfhub.dev/google/LaBSE/1

#### Train

In [64]:
#extracted first 10 tweets and check the time.
start= time.time()
list_of_tweets_train, id_tweet_train = get_tweets_list(df_train)
print(time.time() - start)

0.008150100708007812


In [65]:
#extracted first 10 tweets and check the time.
start= time.time()
bert_pooled_train_labse = get_bert_embeddings_labse(list_of_tweets_train)
print(time.time() - start)

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000
70000
71000
72000
73000
74000
75000
76000
77000
78000
79000
80000
81000
82000
83000
84000
85000
86000
87000
88000
89000
90000
91000
92000
93000
94000
95000
96000
97000
98000
99000
100000
101000
102000
103000
104000
105000
106000
107000
2125.650132417679


#### Valid

In [83]:
#extracted first 10 tweets and check the time.
start= time.time()
list_of_tweets_valid, id_tweet_valid = get_tweets_list(df_valid)
bert_pooled_valid_labse = get_bert_embeddings_labse(list_of_tweets_valid)
print(time.time() - start)

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
764.0787312984467


#### Test

In [84]:
#extracted first 10 tweets and check the time.
start= time.time()
list_of_tweets_test, id_tweet_test = get_tweets_list(df_test)
bert_pooled_test_labse = get_bert_embeddings_labse(list_of_tweets_test)
print(time.time() - start)

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
580.1269092559814


#### Combine the text of each screen name in order to classify per screen name. An alternative that we'll have to try to incorporate is to fit in the time series element. 

#### Train

In [67]:
#bert embedding
bert_embeddings_df_train_labse = pd.DataFrame(list(zip(id_tweet_train, bert_pooled_train_labse)),columns=['id', 'Bert_embeddings'])
bert_embeddings_df_train_labse = pd.concat([bert_embeddings_df_train_labse['id'], bert_embeddings_df_train_labse['Bert_embeddings'].apply(pd.Series)], axis = 1)
bert_embeddings_df_train_labse = pd.merge(df_train.drop(['text'], axis=1), bert_embeddings_df_train_labse, left_on = ["id"], right_on = ["id"])
bert_embeddings_df_train_labse.head()

Unnamed: 0,id,created_at,retweet_count,favorite_count,quoted_status_id,user.id,user.created_at,user.favourites_count,user.followers_count,user.friends_count,...,758,759,760,761,762,763,764,765,766,767
0,1.304799e+18,1599923000.0,0.0,1.0,0.0,1.27812e+18,1593563000.0,25355.0,377.0,774.0,...,0.004597,0.045674,0.016957,0.003439,0.055194,-0.044291,0.034094,-0.038885,-0.036568,-0.032016
1,1.304796e+18,1599923000.0,1.0,0.0,0.0,1.27812e+18,1593563000.0,25355.0,377.0,774.0,...,-0.044256,-0.027914,-0.017669,0.02655,0.047473,0.013402,-0.009568,0.000791,0.00225,-0.052819
2,1.304796e+18,1599923000.0,802.0,0.0,0.0,1.27812e+18,1593563000.0,25355.0,377.0,774.0,...,0.020261,0.026414,0.023692,0.006574,0.007752,0.001251,0.003044,-0.043307,-0.004868,0.001397
3,1.304795e+18,1599922000.0,0.0,15.0,1.0,1.27812e+18,1593563000.0,25355.0,377.0,774.0,...,0.009558,0.030677,0.000332,0.065556,0.0059,0.003982,-0.000807,-0.029937,-0.075138,-0.037401
4,1.304794e+18,1599922000.0,0.0,10.0,1.0,1.27812e+18,1593563000.0,25355.0,377.0,774.0,...,0.071624,0.034412,0.007865,0.056166,-0.014121,-0.036367,0.041306,0.005384,-0.051179,-0.029693


In [68]:
len(bert_embeddings_df_train_labse)

107374

In [69]:
bert_embeddings_df_train_labse['suspended'].unique()

array([1, 0])

#### Valid

In [85]:
#bert embedding
bert_embeddings_df_valid_labse = pd.DataFrame(list(zip(id_tweet_valid, bert_pooled_valid_labse)),columns=['id', 'Bert_embeddings'])
bert_embeddings_df_valid_labse = pd.concat([bert_embeddings_df_valid_labse['id'], bert_embeddings_df_valid_labse['Bert_embeddings'].apply(pd.Series)], axis = 1)
bert_embeddings_df_valid_labse = pd.merge(df_valid.drop(['text'], axis=1), bert_embeddings_df_valid_labse, left_on = ["id"], right_on = ["id"])
bert_embeddings_df_valid_labse.head()

Unnamed: 0,id,created_at,retweet_count,favorite_count,quoted_status_id,user.id,user.created_at,user.favourites_count,user.followers_count,user.friends_count,...,758,759,760,761,762,763,764,765,766,767
0,1.279757e+18,1593953000.0,0.0,0.0,0.0,1.278122e+18,1593563000.0,0.0,2.0,53.0,...,-0.044256,-0.027914,-0.017669,0.02655,0.047473,0.013402,-0.009568,0.000791,0.00225,-0.052819
1,1.278125e+18,1593564000.0,0.0,1.0,0.0,1.278122e+18,1593563000.0,0.0,2.0,53.0,...,-0.044256,-0.027914,-0.017669,0.02655,0.047473,0.013402,-0.009568,0.000791,0.00225,-0.052819
2,1.278845e+18,1593735000.0,0.0,1.0,0.0,1.278129e+18,1593565000.0,41.0,1.0,47.0,...,-0.044256,-0.027914,-0.017669,0.02655,0.047473,0.013402,-0.009568,0.000791,0.00225,-0.052819
3,1.278851e+18,1593737000.0,0.0,0.0,0.0,1.278129e+18,1593565000.0,41.0,1.0,47.0,...,-0.044256,-0.027914,-0.017669,0.02655,0.047473,0.013402,-0.009568,0.000791,0.00225,-0.052819
4,1.278847e+18,1593736000.0,0.0,0.0,0.0,1.278129e+18,1593565000.0,41.0,1.0,47.0,...,-0.044256,-0.027914,-0.017669,0.02655,0.047473,0.013402,-0.009568,0.000791,0.00225,-0.052819


In [86]:
len(bert_embeddings_df_valid_labse)

38537

In [87]:
bert_embeddings_df_valid_labse['suspended'].unique()

array([0, 1])

#### Test

In [88]:
#bert embedding
bert_embeddings_df_test_labse = pd.DataFrame(list(zip(id_tweet_test, bert_pooled_test_labse)),columns=['id', 'Bert_embeddings'])
bert_embeddings_df_test_labse = pd.concat([bert_embeddings_df_test_labse['id'], bert_embeddings_df_test_labse['Bert_embeddings'].apply(pd.Series)], axis = 1)
bert_embeddings_df_test_labse = pd.merge(df_test.drop(['text'], axis=1), bert_embeddings_df_test_labse, left_on = ["id"], right_on = ["id"])
bert_embeddings_df_test_labse.head()

Unnamed: 0,id,created_at,retweet_count,favorite_count,quoted_status_id,user.id,user.created_at,user.favourites_count,user.followers_count,user.friends_count,...,758,759,760,761,762,763,764,765,766,767
0,1.278734e+18,1593709000.0,0.0,0.0,0.0,1.27813e+18,1593565000.0,5.0,1.0,60.0,...,-0.007161,0.039254,-0.029584,0.067115,0.046009,-0.025005,-0.002067,-0.001654,-0.080371,-0.02491
1,1.292049e+18,1596884000.0,1.0,3.0,0.0,1.278156e+18,1593571000.0,21.0,0.0,47.0,...,-0.044256,-0.027914,-0.017669,0.02655,0.047473,0.013402,-0.009568,0.000791,0.00225,-0.052819
2,1.278256e+18,1593595000.0,76.0,0.0,0.0,1.278212e+18,1593584000.0,133.0,0.0,16.0,...,-0.044256,-0.027914,-0.017669,0.02655,0.047473,0.013402,-0.009568,0.000791,0.00225,-0.052819
3,1.278295e+18,1593604000.0,0.0,0.0,0.0,1.278212e+18,1593584000.0,133.0,0.0,16.0,...,-0.044256,-0.027914,-0.017669,0.02655,0.047473,0.013402,-0.009568,0.000791,0.00225,-0.052819
4,1.278346e+18,1593616000.0,0.0,0.0,0.0,1.278212e+18,1593584000.0,133.0,0.0,16.0,...,-0.044256,-0.027914,-0.017669,0.02655,0.047473,0.013402,-0.009568,0.000791,0.00225,-0.052819


In [89]:
len(bert_embeddings_df_test_labse)

29139

In [90]:
bert_embeddings_df_test_labse['suspended'].unique()

array([0, 1])

### Save LabSE Embeddings

In [70]:
bert_embeddings_df_train_labse.to_csv('s3://joe-exotic-2020/modeling/embeddings/labse_train_embeddings_df.csv', index=False, encoding = "utf_8_sig")

In [91]:
bert_embeddings_df_valid_labse.to_csv('s3://joe-exotic-2020/modeling/embeddings/labse_valid_embeddings_df.csv', index=False, encoding = "utf_8_sig")

In [92]:
bert_embeddings_df_test_labse.to_csv('s3://joe-exotic-2020/modeling/embeddings/labse_test_embeddings_df.csv', index=False, encoding = "utf_8_sig")