# Applying BERT Multilingual Classifier to Predict Account Suspension 

Phase 2

Guidance from: https://github.com/kacossio/TeamPython/blob/master/Bert%20Multilingual%20Embedding.ipynb

## 1. Load Packages

In [1]:
########## Load Packages
import warnings
warnings.simplefilter("ignore")

import importlib
import pandas as pd
import numpy as np
import re
from io import StringIO
import itertools
import os 
import time
import datetime

from io import StringIO # python3; python2: BytesIO 
import boto3

import emoji
import random 
import math

from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import math

## 2. Set parameters

In [9]:
# ---------- Notebook parameters ----------

# Number of leading rows to skip before the column header.
# Counting is zero-based: skiprow = 0 starts at the first row of the file,
# skiprow = 1 starts at the second row.
skiprow = 0

# Name of the column that holds the text to analyse.
text_column = 'text'

# Directory holding the pre-split train / validation / test CSV files.
import_bucket = 'data/s3_data/processed_splits/'

# Location for embeddings (going by the name; its use is not shown in this part
# of the notebook).
embedding_bucket = 'data/s3_data/embeddings/iterate_embeds'

# Directory that receives the cleaned, one-hot-encoded splits.
results_bucket = "data/s3_data/full_clean_2/"

## 3. Load in Data from S3

### Load in data from S3

In [3]:
#### Load in data from S3

def _load_split(filename):
    """Read one split CSV and keep only the rows with a numeric label.

    Parameters
    ----------
    filename : str
        File name relative to ``import_bucket``.

    Returns
    -------
    pandas.DataFrame
        The parsed file with ``suspended`` coerced to a numeric dtype and
        every row whose label could not be parsed removed.
    """
    path = import_bucket + filename
    try:
        # pandas >= 1.3: "warn" reports and skips malformed rows, which is what
        # error_bad_lines=False did together with the default warn_bad_lines.
        frame = pd.read_csv(path, on_bad_lines="warn", encoding="utf-8")
    except TypeError:
        # Older pandas does not accept `on_bad_lines`; use the legacy keyword.
        frame = pd.read_csv(path, error_bad_lines=False, encoding="utf-8")

    # Unparseable labels become NaN and are then filtered out.
    frame["suspended"] = pd.to_numeric(frame["suspended"], errors="coerce")
    return frame[frame["suspended"].notna()]


# Train, test and validation splits all go through the same cleaning step.
df_train = _load_split("x_train.csv")
df_test = _load_split("x_test.csv")
df_valid = _load_split("x_validation.csv")

### Supplementary Pre-Processing

#### Ensure that Target Variable is Numeric 

In [8]:
# Cast the label column to plain integers in every split (in place).
for _split_frame in (df_train, df_valid, df_test):
    _split_frame['suspended'] = _split_frame['suspended'].astype(int)
del _split_frame

In [9]:
# Report how many rows each split holds at this point.
print(f"Train: {len(df_train)} Valid: {len(df_valid)} Test {len(df_test)}")

Train: 211283 Valid: 64625 Test 64921


#### Remove Duplicates 

In [10]:
# A row is a duplicate when id, creation timestamp and text all match.
df_train, df_valid, df_test = (
    split_frame.drop_duplicates(subset=['id', 'created_at', 'text'])
    for split_frame in (df_train, df_valid, df_test)
)

#### Ensure binary possibly_sensitive vars

In [11]:
# Any string found in `possibly_sensitive` is replaced with NaN so that only
# non-string values remain in the column.
# The write goes through .loc with a boolean mask: the chained form
# df[col][mask] = ... may assign to a temporary object and silently does
# nothing when pandas copy-on-write is active.
for _split_frame in (df_train, df_valid, df_test):
    _is_text = _split_frame['possibly_sensitive'].apply(lambda x: isinstance(x, str))
    _split_frame.loc[_is_text, 'possibly_sensitive'] = np.nan
del _split_frame, _is_text

## 4. Extract Embeddings

In [12]:
#import packages 
from translate import Translator
import spacy
import langid
import keras_bert
import tensorflow as tf
import time
import datetime as dt
import pytz

from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import Lasso, LogisticRegression

Using TensorFlow backend.


### Prep for Tensor 

#### Convert Dates to Unix Epoch Time

In [13]:
### Function to convert dates into float (Unix Epoch Times )
def convert_dates_float(df):
    '''
    Convert the tweet and account creation timestamps to Unix epoch seconds.

    The ``created_at`` (tweet) and ``user.created_at`` (account) columns are
    parsed as timestamps and overwritten with float seconds since
    1970-01-01 00:00:00 UTC, so they can be fed to tensors as plain numbers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both columns as strings in
        ``%Y-%m-%d %H:%M:%S.%f`` form. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with both columns replaced by floats.
    '''
    timestamp_format = '%Y-%m-%d %H:%M:%S.%f'
    unix_epoch = pd.Timestamp('1970-01-01', tz='UTC')

    for column in ('created_at', 'user.created_at'):
        # utc=True always yields tz-aware UTC values, so subtracting the
        # tz-aware epoch is valid whether or not the strings carry an offset
        # (naive values are interpreted as UTC instead of raising).
        parsed = pd.to_datetime(df[column], format=timestamp_format, utc=True)
        df[column] = (parsed - unix_epoch).dt.total_seconds()

    return df

#### Convert binary and categorical variables to one-hot encoding (it is unclear whether this is the best option)

Options 

- Integer Encoding: Where each unique label is mapped to an integer.
- One Hot Encoding: Where each label is mapped to a binary vector.
- Learned Embedding: Where a distributed representation of the categories is learned.

We use one hot encoding below. 

#### We use `get_dummies` below instead of scikit-learn's `OneHotEncoder` because `get_dummies` can handle missing values.

In [16]:
### One-Hot Encoding (also converts the date columns to Unix epoch time)
def one_hot(df_train, df_valid, df_test): 
    '''
    One-hot encode the categorical columns of all three splits together.

    Encoding the splits separately could give each one a different set of
    indicator columns, so the frames are tagged, concatenated, cleaned and
    encoded as a single frame. Callers separate them again with the
    ``split`` column.

    Parameters
    ----------
    df_train, df_valid, df_test : pandas.DataFrame
        The three splits. Each receives a ``split`` column in place
        ("train", "valid" or "test").

    Returns
    -------
    pandas.DataFrame
        Combined frame (row order: train, test, valid) with the date columns
        converted to epoch seconds, selected count columns NaN-filled with 0,
        ``user.protected.1`` dropped and the categorical columns replaced by
        indicator columns.
    '''
    # Tag every row with its origin so the splits can be recovered afterwards.
    df_train['split'] = "train"
    df_valid['split'] = "valid"
    df_test['split'] = "test"
    df = pd.concat([df_train, df_test, df_valid], ignore_index=True, sort=False)
    df = convert_dates_float(df)

    # Extra layer of processing: discard rows whose retweet_count holds the
    # literal string "False" (presumably misaligned rows -- TODO confirm).
    df = df[df['retweet_count'] != "False"]

    # Missing counts for quoted / retweeted users are treated as zero.
    for count_column in (
        'quoted_status.user.followers_count',
        'quoted_status.user.friends_count',
        'retweeted_status.user.followers_count',
        'retweeted_status.user.friends_count',
    ):
        df[count_column] = df[count_column].fillna(0)

    df = df.drop(["user.protected.1"], axis=1)

    # Each column is listed exactly once: a name repeated in `columns` makes
    # get_dummies emit that column's indicator columns twice.
    categorical_columns = [
        "source", "lang", "possibly_sensitive", "withheld_in_countries", "place.country",
        "user.geo_enabled", "user.lang", "user.verified", "user.has_extended_profile",
        "user.protected", "user.time_zone", "user.default_profile",
        "is_quote_status",
    ]
    return pd.get_dummies(df, columns=categorical_columns)
# To get rid of: Text, user.protected.1, user.protected.2, user.protected.3, 
# To concat (or get rid of): user.description, user.location, user.name, user.screen_name
# to potentially take out entirely - user.id (This would explain everything)


#### Split one-hot encoded df back apart into train, valid, and test

In [17]:
# Encode the three splits together, then separate them again using the
# 'split' tag that one_hot attached to every row.
df = one_hot(df_train, df_valid, df_test)
df_train_f = df.loc[df['split'].eq("train")]
df_valid_f = df.loc[df['split'].eq("valid")]
df_test_f = df.loc[df['split'].eq("test")]


In [18]:
# Keep only the first row for each id within every split.
df_train_f, df_valid_f, df_test_f = (
    split_frame.drop_duplicates(subset=['id'])
    for split_frame in (df_train_f, df_valid_f, df_test_f)
)

In [20]:
# For every split, show the row count next to the number of distinct ids;
# the two should match after de-duplication.
print(" ".join(
    f"full {name}: {len(frame)} unique ids {name}: {len(frame['id'].unique())}"
    for name, frame in (("train", df_train_f), ("valid", df_valid_f), ("test", df_test_f))
))

full train: 115766 unique ids train: 115766 full valid: 35120 unique ids valid: 35120 full test: 34839 unique ids test: 34839


In [30]:
# Persist each cleaned split as a CSV (UTF-8 with BOM, no index column).
for _split_frame, _csv_name in (
    (df_train_f, 'df_train_full_cleaned_pe.csv'),
    (df_test_f, 'df_test_full_cleaned_pe.csv'),
    (df_valid_f, 'df_valid_full_cleaned_pe.csv'),
):
    _split_frame.to_csv(results_bucket + _csv_name, index=False, encoding="utf_8_sig")
del _split_frame, _csv_name