# Dive into Abusive Language with Snorkel

Author: BingYune Chen 
<br>
Updated: 2021-08-02

----------

### Time to Predict New Labels

We just completed the following steps to work with our BERT model:

1. Fine-tuned BERT model using Sentiment140 to generalize on Twitter data
2. Trained BERT model using **benchmark labels** for X_train to predict abusive language

**We will now apply the fine-tuned and trained BERT model to predict labels for our unlabeled data.**

In [None]:
# Imports and setup for Google Colab
# This cell imports a Colab-specific module, mounts Google Drive and installs
# the third-party packages that the later cells import.

# Mount Google Drive
from google.colab import drive ## Colab module used to reach Google Drive from Python
drive.mount('/content/drive') ## mount Drive at /content/drive to access its contents

# Install python libraries (each line is a shell command run via the `!` escape)
# NOTE(review): only tensorboard is version-pinned; tensorflow, snorkel and
# transformers install whatever is current, so a rerun may resolve differently.
# NOTE(review): pinning tensorboard==1.15.0 next to `--upgrade tensorflow`
# looks like it may conflict -- confirm the intended versions.
! pip install --upgrade tensorflow --quiet
! pip install snorkel --quiet
! pip install tensorboard==1.15.0 --quiet
! pip install transformers --quiet

In [None]:
# Imports for data and plotting
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
%matplotlib inline 
import seaborn as sns

import pickle
import os
import re
import csv
from tqdm import tqdm

# Imports for snorkel analysis and multi-task learning
from snorkel.labeling.model import LabelModel
from snorkel.labeling import filter_unlabeled_dataframe

# Imports for bert language model
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn import metrics

import transformers

import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data import RandomSampler, SequentialSampler

import time
import datetime
import random

In [None]:
# Notebook configuration: directories and file names, relative to the notebook

# --- Inputs -----------------------------------------------------------------
LOAD_MODEL = '../models/'          ## directory holding saved model weights
LOAD_DATA = '../data/processed/'   ## directory holding the cleaned tweets

INPUT_FILE = 'clean_20201103.txt' ## update
BERT_PRE = 'model_bert_df_train_dict_v1.pt' ## update without snorkel labels

# --- Outputs ----------------------------------------------------------------
SAVE_MODEL = '../models/'
SAVE_DATA = '../data/published/'
SAVE_FIG = '../assets/'

COUNT_FILE = 'abusivelanguage2020_v1_counts.csv'  ## per-chunk label counts
FINAL_FILE = 'abusivelanguage2020_v1.txt'         ## combined final labels

In [None]:
# Build the tokenizer for the original BERT (110M parameters)
# The BERT tokenizer can handle punctuation, smileys, etc.
# Mentions and urls were previously replaced with special tokens
# (#has_url, #has_mention)

bert_token = transformers.BertTokenizerFast.from_pretrained(
    'bert-base-uncased', do_lower_case=True
)

# Create helper function for text parsing

def bert_encode(tweet_df, tokenizer, max_length=30, text_col='tweet'):
    """Tokenize one text column into fixed-length tensors for BERT.

    Parameters
    ----------
    tweet_df : pandas.DataFrame
        Frame holding the raw text in column ``text_col``.
    tokenizer : object
        Tokenizer exposing ``batch_encode_plus``; its result must provide
        the keys 'input_ids', 'attention_mask' and 'token_type_ids'.
    max_length : int, default 30
        Length every sequence is padded / truncated to.
    text_col : str, default 'tweet'
        Name of the column that contains the text to encode.

    Returns
    -------
    dict
        Keys 'input_word_ids', 'input_masks' and 'input_type_ids', each a
        ``torch.Tensor`` built from the corresponding tokenizer output.
    """
    ## the tokenizer is asked to pad every text to `max_length` and to
    ## truncate longer ones, so all rows end up with the same length
    ## (special tokens such as '[CLS]', '[SEP]' and '[PAD]' are added by
    ## the tokenizer itself)
    bert_tokens = tokenizer.batch_encode_plus(
        tweet_df[text_col].to_list(),
        padding='max_length',
        truncation=True,
        max_length=max_length
        )

    ## convert the equal-length lists to tensors
    inputs = {
        'input_word_ids': torch.tensor(bert_tokens['input_ids']),
        'input_masks': torch.tensor(bert_tokens['attention_mask']),
        'input_type_ids': torch.tensor(bert_tokens['token_type_ids'])
        }

    return inputs

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




In [None]:
# Redefine BERT model for additional fine-tuning
nlp_bert = transformers.BertForSequenceClassification.from_pretrained(
    'bert-base-uncased', ## use 12-layer BERT model, uncased vocab
    num_labels=2, ## binary classification
    output_attentions=False, ## do NOT return attention weights
    output_hidden_states=False, ## do NOT return all hidden-states
    )

# Move the model to the device chosen at import time (`device` is 'cuda'
# when a GPU is available, otherwise 'cpu') instead of hard-coding .cuda()
nlp_bert.to(device)

# Load saved BERT model
# map_location remaps the stored tensors onto `device`, so a checkpoint
# saved from a GPU session can still be loaded on a CPU-only runtime
nlp_bert.load_state_dict(
    torch.load(os.path.join(LOAD_MODEL, BERT_PRE), map_location=device))

# Put model in evaluation mode
nlp_bert.eval() ## IMPORTANT STEP

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [None]:
# Predict labels
# Streams INPUT_FILE in chunks; for every chunk it
#   1. tokenizes the tweets and pickles the token tensors,
#   2. runs the model batch by batch without gradients,
#   3. writes '<label> <tweet>' rows to a per-chunk label file,
#   4. appends the per-label counts of the chunk to COUNT_FILE.
batch_size = 32
chunksize = 100_000
file_tag = INPUT_FILE[6:-4] ## 'clean_20201103.txt' -> '20201103'
label_ids = [0, 1] ## fixed column order for the counts (model has num_labels=2)

chunk_reader = pd.read_csv(os.path.join(LOAD_DATA, INPUT_FILE),
                           chunksize=chunksize)

for n, chunk in enumerate(tqdm(chunk_reader), start=1):

    print('')
    print('Encoding Chunk...{}'.format(n))

    ## encode the chunk and keep a pickled copy of the token tensors
    X_test = bert_encode(chunk, bert_token)
    TOKEN_FILE = 'bert_tokens_{}_c{}.pkl'.format(file_tag, n)

    with open(os.path.join(SAVE_MODEL, TOKEN_FILE), 'wb') as file:
        pickle.dump(X_test, file)

    print('')
    print('Predicting labels for {:,} tweets...'.format(
        len(X_test['input_word_ids'])))

    ## wrap tensors for test
    X_test_data = TensorDataset(
        X_test['input_word_ids'],
        X_test['input_masks']
        )

    ## sequential sampling keeps predictions aligned with the chunk's rows
    X_test_dataloader = DataLoader(
        X_test_data,
        sampler=SequentialSampler(X_test_data),
        batch_size=batch_size
        )

    ## predicted class id for every tweet of the chunk, in row order
    predictions = []

    ## predict
    for batch in X_test_dataloader:
        ## move the batch to the same device as the model
        b_input_ids, b_input_mask = (t.to(device) for t in batch)

        ## tell the model not to compute or store gradients,
        ## saving memory and speeding up prediction
        with torch.no_grad():
            ## start forward pass, calculate logit predictions
            outputs = nlp_bert(b_input_ids, token_type_ids=None,
                               attention_mask=b_input_mask)

        ## move logits to CPU and keep only the arg-max class per row
        logits = outputs[0].detach().cpu().numpy()
        predictions.extend(np.argmax(logits, axis=1).tolist())

    print('')
    print('Saving new labels...')

    LABEL_FILE = 'label_{}_c{}.txt'.format(file_tag, n)

    chunk['label'] = predictions

    chunk.to_csv(os.path.join(SAVE_DATA, LABEL_FILE),
                 columns=['label', 'tweet'],
                 index=False,
                 sep=' ')

    ## value_counts() orders by frequency and omits labels that never
    ## occur, so reindex to always write [count of 0, count of 1]
    label_counts = chunk['label'].value_counts().reindex(
        label_ids, fill_value=0)

    ## NOTE: the file is opened in append mode, so re-running this cell
    ## adds a second set of rows; newline='' is what the csv module expects
    with open(os.path.join(SAVE_DATA, COUNT_FILE), 'a', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=",")
        writer.writerow([LABEL_FILE[:-4]] + label_counts.tolist())


0it [00:00, ?it/s][A


Encoding Chunk...1

Predicting labels for 100,000 tweets...

Saving new labels...



1it [01:50, 110.33s/it][A


Encoding Chunk...2

Predicting labels for 100,000 tweets...

Saving new labels...



2it [03:40, 110.26s/it][A


Encoding Chunk...3

Predicting labels for 100,000 tweets...

Saving new labels...



3it [05:30, 110.17s/it][A


Encoding Chunk...4

Predicting labels for 100,000 tweets...

Saving new labels...



4it [07:20, 110.08s/it][A


Encoding Chunk...5

Predicting labels for 100,000 tweets...

Saving new labels...



5it [09:10, 110.10s/it][A


Encoding Chunk...6

Predicting labels for 100,000 tweets...

Saving new labels...



6it [11:00, 109.97s/it][A


Encoding Chunk...7

Predicting labels for 100,000 tweets...

Saving new labels...



7it [12:49, 109.91s/it][A


Encoding Chunk...8

Predicting labels for 100,000 tweets...

Saving new labels...



8it [14:39, 109.91s/it][A


Encoding Chunk...9

Predicting labels for 100,000 tweets...

Saving new labels...



9it [16:29, 109.84s/it][A


Encoding Chunk...10

Predicting labels for 100,000 tweets...

Saving new labels...



10it [18:19, 109.87s/it][A


Encoding Chunk...11

Predicting labels for 100,000 tweets...

Saving new labels...



11it [20:09, 109.82s/it][A


Encoding Chunk...12

Predicting labels for 100,000 tweets...

Saving new labels...



12it [21:58, 109.78s/it][A


Encoding Chunk...13

Predicting labels for 100,000 tweets...

Saving new labels...



13it [23:48, 109.79s/it][A


Encoding Chunk...14

Predicting labels for 100,000 tweets...

Saving new labels...



14it [25:38, 109.78s/it][A


Encoding Chunk...15

Predicting labels for 100,000 tweets...

Saving new labels...



15it [27:28, 109.87s/it][A


Encoding Chunk...16

Predicting labels for 58,637 tweets...

Saving new labels...



16it [28:32, 107.06s/it]


In [None]:
# Inspect the first rows of the last chunk processed by the prediction loop
# (column width is widened so full tweets stay readable)
pd.options.display.max_colwidth = 500
chunk.iloc[:50]

Unnamed: 0,tweet,label
1500000,#has_retweet #has_mention #has_url John James has won the Senate Seat in Michigan and flipped a Seat Red!!,0
1500001,#has_retweet #has_mention Very cool.. #Teaserต้องไป,0
1500002,#has_url VIX but for PredictIt,0
1500003,"#has_retweet #has_mention In Malaysia, Biden ni macam PKR-DAP. Trump ni macam UMNO-BERSATU-PAS.",1
1500004,#has_mention #has_mention What was the gist of it?,0
1500005,#has_retweet #has_mention so i heard some tr*mp supporters might hack some accounts so if i seem to say anything in support of him or anything rac #has_truncate,0
1500006,#has_retweet #has_mention Follow everyone who retweets and likes this 💐,1
1500007,"#has_mention #has_mention This is democracy. that is how a democratic system works, I'm sorry if that bothers you.",1
1500008,#has_retweet #has_mention EVERYONE RETWEET DJT TWEET THAT WAS CENSORED! #has_url,0
1500009,"#has_retweet #has_mention He who finds a wife finds a good thing I cannot wait to congratulate you properly Mercy Show us the way, Shake our legs wi #has_truncate",1


In [None]:
# Combine all chunks into a single file
# Paths are joined explicitly instead of calling os.chdir(SAVE_DATA), so the
# kernel's working directory is left alone and the cell can be re-run.

def _natural_key(fname):
    """Sort key that compares digit runs as integers ('_c2' before '_c10')."""
    return [int(part) if part.isdigit() else part
            for part in re.split(r'(\d+)', fname)]

## collect the txt files to merge; FINAL_FILE is skipped so that a re-run
## never reads the file it is currently writing
sorted_files = sorted(
    (fname for fname in os.listdir(SAVE_DATA)
     if fname.endswith('.txt') and fname != FINAL_FILE),
    key=_natural_key)

# Open new abusivelanguage file in write mode
## utf-8 is stated explicitly because the tweets contain non-ASCII text
with open(os.path.join(SAVE_DATA, FINAL_FILE), 'w', encoding='utf-8') as outfile:
    ## add header
    outfile.write("label tweet\n")
    ## iterate through list
    for fname in sorted_files:
        with open(os.path.join(SAVE_DATA, fname), 'r', encoding='utf-8') as infile:
            next(infile, None) ## remove header from each file
            last_line = ''
            for line in infile:
                outfile.write(line) ## read then write to file
                last_line = line
        ## only add '\n' when the file did not end with one, so no blank
        ## rows are inserted between chunks
        if last_line and not last_line.endswith('\n'):
            outfile.write("\n")