# Applying BERT Multilingual Classifier to Predict Account Suspension 

Phase 2

Guidance from: https://github.com/kacossio/TeamPython/blob/master/Bert%20Multilingual%20Embedding.ipynb

## 1. Load Packages

In [1]:
########## Load Packages
import warnings
warnings.simplefilter("ignore")

import importlib
import pandas as pd
import numpy as np
import re
from io import StringIO
import itertools
import os 
import time
import datetime

from io import StringIO # python3; python2: BytesIO 
import boto3

import emoji
import random 
import math

from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import math

## 2. Set parameters

In [2]:
########## Set Parameters

# Name of the column that contains the text data for analysis.
text_column = "text"

# Number of rows to skip before the column header row.
# Zero-based: skiprow=0 begins at the first row of the file,
# skiprow=1 begins at the second row.
skiprow = 0

# Folder holding the cleaned train / valid / test CSV files read below.
import_bucket = "data/s3_data/full_clean_2/"

# Folder for the final text-classifier embeddings
# (not referenced by the cells shown in this part of the notebook).
embedding_bucket = "data/s3_data/embeddings/final_embeddings_textclass/"

# Folder the downsampled CSV files are written to (already created on S3).
results_bucket = "data/s3_data/"

## 3. Load in Data

### Load in data from S3 - Original Data

In [3]:
#### Load in data from S3

# pandas 1.3 replaced `error_bad_lines` with `on_bad_lines`, and pandas 2.0
# removed the old keyword entirely (passing it raises TypeError). Pick the
# keyword this pandas version accepts. `error_bad_lines=False` with the default
# `warn_bad_lines=True` corresponds to `on_bad_lines="warn"`: skip malformed
# rows and report them.
_pandas_major_minor = tuple(int(part) for part in pd.__version__.split(".")[:2])
if _pandas_major_minor >= (1, 3):
    _bad_line_kwargs = {"on_bad_lines": "warn"}
else:
    _bad_line_kwargs = {"error_bad_lines": False}


def _load_labelled_split(filename):
    """Read one split from `import_bucket` and keep only rows with a numeric label.

    Parameters
    ----------
    filename : str
        CSV file name, appended to `import_bucket`.

    Returns
    -------
    pandas.DataFrame
        The file's rows with `suspended` converted to a numeric dtype; rows
        whose `suspended` value cannot be parsed as a number are dropped.
    """
    frame = pd.read_csv(import_bucket + filename, encoding='utf-8', **_bad_line_kwargs)
    # Unparseable labels become NaN here and are filtered out on the next line.
    frame['suspended'] = pd.to_numeric(frame['suspended'], errors='coerce')
    return frame[frame['suspended'].notna()]


# Train split
df_train = _load_labelled_split("df_train_full_cleaned_pe.csv")

# Test split
df_test = _load_labelled_split("df_test_full_cleaned_pe.csv")

# Validation split
df_valid = _load_labelled_split("df_valid_full_cleaned_pe.csv")

In [4]:
# Keep only the first row for each tweet 'id' within every split.
df_train, df_valid, df_test = (
    split.drop_duplicates(subset=['id'])
    for split in (df_train, df_valid, df_test)
)

### Downsampling train and validation 

#### Group by Accounts

Train

In [5]:
# Collapse training tweets to one row per account:
#   'id'        -> number of tweets from the account
#   'suspended' -> mean of the account's tweet-level labels
train_account_keys = ['user.id', 'user.screen_name']
train_account_stats = {'id': 'count', 'suspended': 'mean'}

df_train_accounts = df_train.groupby(train_account_keys, as_index=False).agg(train_account_stats)
df_train_accounts.head(3)

Unnamed: 0,user.id,user.screen_name,id,suspended
0,1.27813e+18,MMunir50647063,1,0
1,1.278132e+18,3qex2,63,0
2,1.278136e+18,SueleWui,1,0


Valid

In [6]:
# Collapse validation tweets to one row per account:
#   'id'        -> number of tweets from the account
#   'suspended' -> mean of the account's tweet-level labels
valid_account_keys = ['user.id', 'user.screen_name']
valid_account_stats = {'id': 'count', 'suspended': 'mean'}

df_valid_accounts = df_valid.groupby(valid_account_keys, as_index=False).agg(valid_account_stats)
df_valid_accounts.head(3)

Unnamed: 0,user.id,user.screen_name,id,suspended
0,1.27812e+18,SFAC_TFA_CN,1038,1
1,1.278152e+18,85IFYYgf0GGo5Gb,28,0
2,1.278155e+18,SamuelHPWong1,2,0


### Check current proportions - Tweets

Train

In [7]:
# Tweet-level class balance of the training split.
# Rows whose label is neither 0 nor 1 are left out of both counts and the total.
labels = df_train['suspended']
nonsus = int((labels == 0).sum())
sus = int((labels == 1).sum())
labelled_total = nonsus + sus

a = nonsus / labelled_total  # share of non-suspended tweets (a fraction, not a percent)
b = sus / labelled_total     # share of suspended tweets

print("Non-suspended #:", nonsus , "Suspended #:", sus)
print("Non-suspended %:", a, "Suspended %:", b)

Non-suspended #: 106870 Suspended #: 8896
Non-suspended %: 0.9231553305806541 Suspended %: 0.07684466941934592


Valid

In [8]:
# Tweet-level class balance of the validation split.
# Rows whose label is neither 0 nor 1 are left out of both counts and the total.
labels = df_valid['suspended']
nonsus = int((labels == 0).sum())
sus = int((labels == 1).sum())
labelled_total = nonsus + sus

a = nonsus / labelled_total  # share of non-suspended tweets (a fraction, not a percent)
b = sus / labelled_total     # share of suspended tweets

print("Non-suspended #:", nonsus , "Suspended #:", sus)
print("Non-suspended %:", a, "Suspended %:", b)

Non-suspended #: 28923 Suspended #: 6197
Non-suspended %: 0.8235478359908884 Suspended %: 0.17645216400911162


### Check current proportions - Accounts

Train

In [9]:
# Account-level class balance of the training split.
# 'suspended' is the per-account mean of tweet labels, so an account whose
# tweets carry mixed labels matches neither 0 nor 1 and is not counted.
labels = df_train_accounts['suspended']
nonsus = int((labels == 0).sum())
sus = int((labels == 1).sum())
labelled_total = nonsus + sus

a = nonsus / labelled_total  # share of non-suspended accounts (a fraction, not a percent)
b = sus / labelled_total     # share of suspended accounts

print("Non-suspended #:", nonsus , "Suspended #:", sus)
print("Non-suspended %:", a, "Suspended %:", b)

Non-suspended #: 1450 Suspended #: 103
Non-suspended %: 0.9336767546683837 Suspended %: 0.06632324533161622


Valid

In [10]:
# Account-level class balance of the validation split.
# 'suspended' is the per-account mean of tweet labels, so an account whose
# tweets carry mixed labels matches neither 0 nor 1 and is not counted.
labels = df_valid_accounts['suspended']
nonsus = int((labels == 0).sum())
sus = int((labels == 1).sum())
labelled_total = nonsus + sus

a = nonsus / labelled_total  # share of non-suspended accounts (a fraction, not a percent)
b = sus / labelled_total     # share of suspended accounts

print("Non-suspended #:", nonsus , "Suspended #:", sus)
print("Non-suspended %:", a, "Suspended %:", b)

Non-suspended #: 509 Suspended #: 58
Non-suspended %: 0.8977072310405644 Suspended %: 0.10229276895943562


### Sample by Accounts 

Train

In [11]:
from sklearn.utils import resample

# Separate majority and minority classes at account level.
# 'suspended' is the per-account mean label, so ==0 / ==1 select accounts whose
# tweets are all non-suspended / all suspended.
df_majority = df_train_accounts[df_train_accounts.suspended==0]
df_minority = df_train_accounts[df_train_accounts.suspended==1]

# Size the majority sample from the training minority class itself rather than
# from the global `sus`, which holds whatever the last balance-check cell left
# behind. The sample is deliberately larger than the minority class because
# suspended accounts tweet more. 105 extra accounts reproduces the 208
# non-suspended accounts recorded in this cell's output: on a top-to-bottom run
# `sus` was the validation account count (58), so `sus+150` evaluated to 208.
TRAIN_EXTRA_MAJORITY_ACCOUNTS = 105
n_majority_train = min(
    len(df_majority),  # sampling without replacement cannot exceed the pool
    len(df_minority) + TRAIN_EXTRA_MAJORITY_ACCOUNTS,
)

# Downsample majority class
df_majority_downsampled = resample(df_majority,
                                 replace=False,    # sample without replacement
                                 n_samples=n_majority_train,
                                 random_state=123) # reproducible results

# Combine minority class with downsampled majority class
df_downsampled_train = pd.concat([df_majority_downsampled, df_minority])

# Display new class counts
df_downsampled_train.suspended.value_counts()

0    208
1    103
Name: suspended, dtype: int64

Valid

In [12]:
from sklearn.utils import resample

# Separate majority and minority classes at account level.
# 'suspended' is the per-account mean label, so ==0 / ==1 select accounts whose
# tweets are all non-suspended / all suspended.
df_majority = df_valid_accounts[df_valid_accounts.suspended==0]
df_minority = df_valid_accounts[df_valid_accounts.suspended==1]

# Size the majority sample from the validation minority class itself rather
# than from the global `sus`, which holds whatever the last balance-check cell
# left behind. The sample is deliberately larger than the minority class
# because suspended accounts tweet more.
VALID_EXTRA_MAJORITY_ACCOUNTS = 45
n_majority_valid = min(
    len(df_majority),  # sampling without replacement cannot exceed the pool
    len(df_minority) + VALID_EXTRA_MAJORITY_ACCOUNTS,
)

# Downsample majority class
df_majority_downsampled = resample(df_majority,
                                 replace=False,    # sample without replacement
                                 n_samples=n_majority_valid,
                                 random_state=123) # reproducible results

# Combine minority class with downsampled majority class
df_downsampled_valid = pd.concat([df_majority_downsampled, df_minority])

# Display new class counts
df_downsampled_valid.suspended.value_counts()

0    103
1     58
Name: suspended, dtype: int64

### Check new Datasets

Train

In [13]:
# Preview the first five sampled training accounts.
df_downsampled_train.iloc[:5]

Unnamed: 0,user.id,user.screen_name,id,suspended
855,1.282102e+18,Geoff58974426,14,0
1405,1.285779e+18,36MDIPu0Q8RTaIf,2,0
1103,1.283425e+18,xiems33,3,0
556,1.28054e+18,Hamza92062210,1,0
177,1.278758e+18,chib_saqib,3,0


Valid

In [14]:
# Preview the first five sampled validation accounts.
df_downsampled_valid.iloc[:5]

Unnamed: 0,user.id,user.screen_name,id,suspended
6,1.278219e+18,ayikzhi,9,0
298,1.281543e+18,frrsj1lgGxTnV5B,289,0
93,1.279014e+18,zhangta50319667,17,0
293,1.281517e+18,armanmani4211,1,0
346,1.282462e+18,u3XewM7yE9s6JYV,4,0


### Subset out by accounts 

Train

In [15]:
# Keep every training tweet written by one of the sampled accounts.
train_tweet_is_sampled = df_train['user.id'].isin(df_downsampled_train['user.id'])
df_train_downsampled = df_train[train_tweet_is_sampled]

In [16]:
# Number of training tweets left after account-level downsampling.
df_train_downsampled.shape[0]

26663

Valid

In [17]:
# Keep every validation tweet written by one of the sampled accounts.
valid_tweet_is_sampled = df_valid['user.id'].isin(df_downsampled_valid['user.id'])
df_valid_downsampled = df_valid[valid_tweet_is_sampled]

In [18]:
# Number of validation tweets left after account-level downsampling.
df_valid_downsampled.shape[0]

10472

### Check new proportions - Tweets

Train

In [19]:
# Tweet-level class balance of the downsampled training tweets.
# Rows whose label is neither 0 nor 1 are left out of both counts and the total.
labels = df_train_downsampled['suspended']
nonsus = int((labels == 0).sum())
sus = int((labels == 1).sum())
labelled_total = nonsus + sus

a = nonsus / labelled_total  # share of non-suspended tweets (a fraction, not a percent)
b = sus / labelled_total     # share of suspended tweets

print("Non-suspended #:", nonsus , "Suspended #:", sus)
print("Non-suspended %:", a, "Suspended %:", b)

Non-suspended #: 17767 Suspended #: 8896
Non-suspended %: 0.6663541236920076 Suspended %: 0.33364587630799236


Valid

In [20]:
# Tweet-level class balance of the downsampled validation tweets.
# Rows whose label is neither 0 nor 1 are left out of both counts and the total.
labels = df_valid_downsampled['suspended']
nonsus = int((labels == 0).sum())
sus = int((labels == 1).sum())
labelled_total = nonsus + sus

a = nonsus / labelled_total  # share of non-suspended tweets (a fraction, not a percent)
b = sus / labelled_total     # share of suspended tweets

print("Non-suspended #:", nonsus , "Suspended #:", sus)
print("Non-suspended %:", a, "Suspended %:", b)

Non-suspended #: 4275 Suspended #: 6197
Non-suspended %: 0.408231474407945 Suspended %: 0.591768525592055


In [21]:
# From here on df_train / df_valid refer to the downsampled tweets.
# Copies are taken so they are independent of the *_downsampled frames.
df_train, df_valid = (
    sampled.copy() for sampled in (df_train_downsampled, df_valid_downsampled)
)

In [22]:
# Check Lengths and Proportions

In [23]:
# Tweets

In [24]:
# Tweet count per split and each split's share of all tweets.
tweet_counts = [
    ("train", len(df_train)),
    ("valid", len(df_valid)),
    ("test", len(df_test)),
]
full = sum(n_tweets for split_name, n_tweets in tweet_counts)

# Build the arguments for one print call so the report stays on a single line.
report = []
for split_name, n_tweets in tweet_counts:
    report.extend([
        "full " + split_name + ":", n_tweets,
        "percent " + split_name + ":", n_tweets / full,
    ])
print(*report)

full train: 26663 percent train: 0.37045321921805097 full valid: 10472 percent valid: 0.14549698502236919 full test: 34839 percent test: 0.48404979575957985


In [25]:
# Accounts

In [26]:
# Rebuild the per-account tables from the current splits:
#   'id'        -> number of tweets from the account
#   'suspended' -> mean of the account's tweet-level labels
account_keys = ['user.id', 'user.screen_name']
account_stats = {'id': 'count', 'suspended': 'mean'}

df_train_accounts = df_train.groupby(account_keys, as_index=False).agg(account_stats)
df_valid_accounts = df_valid.groupby(account_keys, as_index=False).agg(account_stats)
df_test_accounts = df_test.groupby(account_keys, as_index=False).agg(account_stats)

# Account count per split and each split's share of all accounts.
account_counts = [
    ("train", len(df_train_accounts)),
    ("valid", len(df_valid_accounts)),
    ("test", len(df_test_accounts)),
]
full = sum(n_accounts for split_name, n_accounts in account_counts)

# Build the arguments for one print call so the report stays on a single line.
report = []
for split_name, n_accounts in account_counts:
    report.extend([
        "full " + split_name + ":", n_accounts,
        "percent " + split_name + ":", n_accounts / full,
    ])
print(*report)

full train: 311 percent train: 0.31573604060913707 full valid: 161 percent valid: 0.1634517766497462 full test: 513 percent test: 0.5208121827411167


### Check Balance Proportions

#### Tweets

Train

In [27]:
# Tweet-level class balance of the training split.
# Rows whose label is neither 0 nor 1 are left out of both counts and the total.
labels = df_train['suspended']
nonsus = int((labels == 0).sum())
sus = int((labels == 1).sum())
labelled_total = nonsus + sus

a = nonsus / labelled_total  # share of non-suspended tweets (a fraction, not a percent)
b = sus / labelled_total     # share of suspended tweets

print("Non-suspended #:", nonsus , "Suspended #:", sus)
print("Non-suspended %:", a, "Suspended %:", b)

Non-suspended #: 17767 Suspended #: 8896
Non-suspended %: 0.6663541236920076 Suspended %: 0.33364587630799236


Valid

In [28]:
# Tweet-level class balance of the validation split.
# Rows whose label is neither 0 nor 1 are left out of both counts and the total.
labels = df_valid['suspended']
nonsus = int((labels == 0).sum())
sus = int((labels == 1).sum())
labelled_total = nonsus + sus

a = nonsus / labelled_total  # share of non-suspended tweets (a fraction, not a percent)
b = sus / labelled_total     # share of suspended tweets

print("Non-suspended #:", nonsus , "Suspended #:", sus)
print("Non-suspended %:", a, "Suspended %:", b)

Non-suspended #: 4275 Suspended #: 6197
Non-suspended %: 0.408231474407945 Suspended %: 0.591768525592055


Test

In [29]:
# Tweet-level class balance of the test split.
# Rows whose label is neither 0 nor 1 are left out of both counts and the total.
labels = df_test['suspended']
nonsus = int((labels == 0).sum())
sus = int((labels == 1).sum())
labelled_total = nonsus + sus

a = nonsus / labelled_total  # share of non-suspended tweets (a fraction, not a percent)
b = sus / labelled_total     # share of suspended tweets

print("Non-suspended #:", nonsus , "Suspended #:", sus)
print("Non-suspended %:", a, "Suspended %:", b)

Non-suspended #: 31795 Suspended #: 3044
Non-suspended %: 0.9126266540371423 Suspended %: 0.08737334596285772


#### Accounts

Train

In [30]:
# Account-level class balance of the training split.
# 'suspended' is the per-account mean of tweet labels, so an account whose
# tweets carry mixed labels matches neither 0 nor 1 and is not counted.
labels = df_train_accounts['suspended']
nonsus = int((labels == 0).sum())
sus = int((labels == 1).sum())
labelled_total = nonsus + sus

a = nonsus / labelled_total  # share of non-suspended accounts (a fraction, not a percent)
b = sus / labelled_total     # share of suspended accounts

print("Non-suspended #:", nonsus , "Suspended #:", sus)
print("Non-suspended %:", a, "Suspended %:", b)

Non-suspended #: 208 Suspended #: 103
Non-suspended %: 0.6688102893890675 Suspended %: 0.3311897106109325


Valid

In [31]:
# Account-level class balance of the validation split.
# 'suspended' is the per-account mean of tweet labels, so an account whose
# tweets carry mixed labels matches neither 0 nor 1 and is not counted.
labels = df_valid_accounts['suspended']
nonsus = int((labels == 0).sum())
sus = int((labels == 1).sum())
labelled_total = nonsus + sus

a = nonsus / labelled_total  # share of non-suspended accounts (a fraction, not a percent)
b = sus / labelled_total     # share of suspended accounts

print("Non-suspended #:", nonsus , "Suspended #:", sus)
print("Non-suspended %:", a, "Suspended %:", b)

Non-suspended #: 103 Suspended #: 58
Non-suspended %: 0.639751552795031 Suspended %: 0.36024844720496896


Test

In [32]:
# Account-level class balance of the test split.
# 'suspended' is the per-account mean of tweet labels, so an account whose
# tweets carry mixed labels matches neither 0 nor 1 and is not counted.
labels = df_test_accounts['suspended']
nonsus = int((labels == 0).sum())
sus = int((labels == 1).sum())
labelled_total = nonsus + sus

a = nonsus / labelled_total  # share of non-suspended accounts (a fraction, not a percent)
b = sus / labelled_total     # share of suspended accounts

print("Non-suspended #:", nonsus , "Suspended #:", sus)
print("Non-suspended %:", a, "Suspended %:", b)

Non-suspended #: 479 Suspended #: 34
Non-suspended %: 0.9337231968810916 Suspended %: 0.06627680311890838


In [204]:
## Save Downsampled Output

In [34]:
# Write the three splits to results_bucket as UTF-8 CSVs with a BOM ("utf_8_sig"),
# without the index column. Only train and valid were downsampled above; the
# test split is written under the same 'downsamp_' prefix but was only de-duplicated.
downsampled_outputs = [
    (df_train, 'downsamp_df_train_full_cleaned_pe.csv'),
    (df_test, 'downsamp_df_test_full_cleaned_pe.csv'),
    (df_valid, 'downsamp_df_valid_full_cleaned_pe.csv'),
]
for split_frame, csv_name in downsampled_outputs:
    split_frame.to_csv(results_bucket + csv_name, index=False, encoding = "utf_8_sig")