# Dive into Abusive Language with Snorkel

Author: BingYune Chen 
<br>
Updated: 2021-08-02

----------

### Data Preparation

We applied an extensive set of pre-processing steps to decrease the size of the feature set, making it more suitable for learning algorithms.

* Remove HTML entities such as `&lt;` or `&amp;`
* Identify general tweet elements such as retweets, URLs, and mentions
* Expand contractions

In [None]:
# Imports and setup for Google Colab

# Mount Google Drive
import os, sys # interact with Google Drive's operating system
from google.colab import drive ## module to use Google Drive with Python
drive.mount('/content/drive') ## mount to access contents

# Install python libraries
! pip install twarc --quiet

In [None]:
# Load standard libraries
import numpy as np
import pandas as pd

import re
import itertools 

from sklearn.model_selection import train_test_split

In [None]:
# Load combined labeled dataset
# NOTE(review): if the preview shows an 'Unnamed: 0' column, the CSV was
# written with its index; consider index_col=0 here -- confirm against the file.
df = pd.read_csv('../data/interim/labeled_combined_data.csv')

In [None]:
# Confirm data loaded correctly with 'label' and 'tweet' columns
# (displays the first rows of the frame as the cell output)
df.head()

Unnamed: 0,label,tweet
0,0,~~Ruffled | Ntac Eileen Dahlia - Beautiful col...
1,0,RT @ing3nu: Packed house for #WIG2015 http://t...
2,0,I need to stop frantically typing up responses...
3,0,For the first time in my months of monitoring ...
4,0,@holinka ARE YOU AT GDC


In [None]:
# Replace contractions 
# Code adapted from https://towardsdatascience.com/twitter-sentiment-analysis-using-fasttext-9ccd04465597
# Contractions source https://en.wikipedia.org/wiki/Contraction_%28grammar%29
def load_dict_contractions():
    """Return a fresh mapping from contraction / slang token to its expansion.

    Keys are written as they appear in the source list (a few contain
    capital letters, e.g. "I'm" and "Whatcha"); values are the expanded
    phrases. A new dict is built on every call, so callers may mutate the
    result without affecting later calls.
    """
    contractions = {
        "ain't": "is not",
        "amn't": "am not",
        "aren't": "are not",
        "can't": "cannot",
        "'cause": "because",
        "couldn't": "could not",
        "couldn't've": "could not have",
        "could've": "could have",
        "daren't": "dare not",
        "daresn't": "dare not",
        "dasn't": "dare not",
        "didn't": "did not",
        "doesn't": "does not",
        "don't": "do not",
        "e'er": "ever",
        "em": "them",
        "everyone's": "everyone is",
        "finna": "fixing to",
        "gimme": "give me",
        "gonna": "going to",
        "gon't": "go not",
        "gotta": "got to",
        "hadn't": "had not",
        "hasn't": "has not",
        "haven't": "have not",
        "he'd": "he would",
        "he'll": "he will",
        "he's": "he is",
        "he've": "he have",
        "how'd": "how would",
        "how'll": "how will",
        "how're": "how are",
        "how's": "how is",
        "I'd": "I would",
        "I'll": "I will",
        "I'm": "I am",
        "I'm'a": "I am about to",
        "I'm'o": "I am going to",
        "isn't": "is not",
        "it'd": "it would",
        "it'll": "it will",
        "it's": "it is",
        "I've": "I have",
        "kinda": "kind of",
        "let's": "let us",
        "mayn't": "may not",
        "may've": "may have",
        "mightn't": "might not",
        "might've": "might have",
        "mustn't": "must not",
        "mustn't've": "must not have",
        "must've": "must have",
        "needn't": "need not",
        "ne'er": "never",
        "o'": "of",
        "o'er": "over",
        "ol'": "old",
        "oughtn't": "ought not",
        "shalln't": "shall not",
        "shan't": "shall not",
        "she'd": "she would",
        "she'll": "she will",
        "she's": "she is",
        "shouldn't": "should not",
        "shouldn't've": "should not have",
        "should've": "should have",
        "somebody's": "somebody is",
        "someone's": "someone is",
        "something's": "something is",
        "that'd": "that would",
        "that'll": "that will",
        "that're": "that are",
        "that's": "that is",
        "there'd": "there would",
        "there'll": "there will",
        "there're": "there are",
        "there's": "there is",
        "these're": "these are",
        "they'd": "they would",
        "they'll": "they will",
        "they're": "they are",
        "they've": "they have",
        "this's": "this is",
        "those're": "those are",
        "'tis": "it is",
        "'twas": "it was",
        "wanna": "want to",
        "wasn't": "was not",
        "we'd": "we would",
        "we'd've": "we would have",
        "we'll": "we will",
        "we're": "we are",
        "weren't": "were not",
        "we've": "we have",
        "what'd": "what did",
        "what'll": "what will",
        "what're": "what are",
        "what's": "what is",
        "what've": "what have",
        "when's": "when is",
        "where'd": "where did",
        "where're": "where are",
        "where's": "where is",
        "where've": "where have",
        "which's": "which is",
        "who'd": "who would",
        "who'd've": "who would have",
        "who'll": "who will",
        "who're": "who are",
        "who's": "who is",
        "who've": "who have",
        "why'd": "why did",
        "why're": "why are",
        "why's": "why is",
        "won't": "will not",
        "wouldn't": "would not",
        "would've": "would have",
        "y'all": "you all",
        "you'd": "you would",
        "you'll": "you will",
        "you're": "you are",
        "you've": "you have",
        "Whatcha": "What are you",
        "luv": "love",
        "sux": "sucks",
    }
    return contractions

In [None]:
# Clean tweet text to remove mentions, retweets, urls
def _replace_and_squeeze(pattern, placeholder, text):
    """Swap every regex match for `placeholder`, then collapse whitespace runs."""
    return ' '.join(re.sub(pattern, placeholder, text).split())


def clean_tweet_txt(tweet_txt):
    """Normalize a single tweet into tagged, contraction-free text.

    Steps, applied in this order:
      1. Replace @mentions with ``#has_mention`` (hashtags are kept).
      2. Replace the retweet marker ``RT`` with ``#has_retweet``.
      3. Replace a trailing ``...``, any ``…`` and any leftover ``@`` with
         ``#has_truncate``.
      4. Replace URLs, and bare ``http:`` / ``https:`` / ``ftp:`` fragments
         left behind by truncation, with ``#has_url``.
      5. Expand contractions listed by ``load_dict_contractions()``.
      6. Shorten runs of three or more identical characters to two.

    Args:
        tweet_txt (str): raw tweet text.

    Returns:
        str: the cleaned tweet, tokens separated by single spaces.
    """
    ## remove mentions, but keep hashtags
    tweet_txt = _replace_and_squeeze(
        r'(@[A-Za-z0-9_]+:)|(@[A-Za-z0-9_.]+)',
        ' #has_mention ',
        tweet_txt
    )

    ## remove retweets
    ## \b keeps words that merely end in "RT" (START, HEART, ...) intact
    tweet_txt = _replace_and_squeeze(
        r'\b(?:RT: |RT:|RT : |RT )',
        ' #has_retweet ',
        tweet_txt
    )

    ## remove punctuation not needed for VADER sentiment
    tweet_txt = _replace_and_squeeze(
        r'\.\.\.$|[@…]',
        ' #has_truncate ',
        tweet_txt
    )

    ## remove urls
    ## only real URL schemes count as a bare "scheme:" fragment, so ordinary
    ## text such as "Note:" or "10:30" is no longer tagged as a URL
    tweet_txt = _replace_and_squeeze(
        r'\w+://\S+|\b(?:https?|ftp):',
        ' #has_url ',
        tweet_txt
    )

    ## expand contractions
    ## keys are lower-cased because tokens are compared in lower case;
    ## otherwise entries such as "I'm" or "Whatcha" could never match
    contractions = {
        key.lower(): expansion
        for key, expansion in load_dict_contractions().items()
    }
    tweet_txt = tweet_txt.replace("’", "'")
    ## whole whitespace-delimited tokens are looked up, so a token carrying
    ## punctuation (e.g. "don't,") is left unchanged
    tweet_txt = " ".join(
        contractions.get(word.lower(), word) for word in tweet_txt.split()
    )

    ## fix simple misspelled words (character repeats more than 2x)
    ## digit runs are left alone so that numbers such as 1000 keep their value
    squeezed = []
    for char, run in itertools.groupby(tweet_txt):
        run_text = ''.join(run)
        squeezed.append(run_text if char.isdigit() else run_text[:2])
    return ''.join(squeezed)

# Overwrite the 'tweet' column with the cleaned text, one tweet at a time
df['tweet'] = df['tweet'].apply(clean_tweet_txt)

In [None]:
# Check cleaned labeled dataset
# (displays the first rows of the frame after the 'tweet' column was cleaned)
df.head()

Unnamed: 0,label,tweet
0,0,~~Ruffled | Ntac Eileen Dahlia - Beautiful col...
1,0,#has_retweet #has_mention Packed house for #WI...
2,0,I need to stop frantically typing up responses...
3,0,For the first time in my months of monitoring ...
4,0,#has_mention ARE YOU AT GDC


In [None]:
# Save cleaned labeled dataset
# index=False keeps the DataFrame index out of the written file
df.to_csv('../data/interim/labeled_cleaned_data.csv', index=False)

In [None]:
# Create four subsets for training, validation, testing, development

# Training set is the dataset to be labeled by snorkel
# Testing set is used to evaluate the classifier scores
# Evaluate trained classifier on data neither used for labeling nor training
df_train, df_test = train_test_split(df.copy(),
                                     test_size=0.2,
                                     stratify=df['label'],
                                     random_state=42)

# Development set is used to evaluate and optimize labeling functions
# Show accuracy of the labeling functions via LFAnalysis
# Sample 100 rows per label by iterating over the groups: unlike
# groupby().apply(), this does not depend on the grouping column being passed
# to the applied function (deprecated in recent pandas), and it selects the
# same rows because each group is sampled with the same seed.
df_dev = pd.concat(
    [label_rows.sample(100, random_state=42)
     for _, label_rows in df_train.groupby('label')]
)
# Reassign instead of inplace=True so re-running the cell stays predictable
df_train = df_train.drop(index=df_dev.index)

# Validation set is used to evaluate the label model's predictions
# Show accuracy of generative approach via LabelModel
df_valid = df_test.sample(frac=0.1, random_state=42)
df_test = df_test.drop(index=df_valid.index)

# The four subsets must partition the dataset; a mismatch means that dropping
# by index label removed more rows than intended (e.g. duplicate index labels).
assert len(df_train) + len(df_dev) + len(df_valid) + len(df_test) == len(df), \
    'train/dev/valid/test sizes do not add up to the full dataset'

print('Train:', len(df_train), 
      '\t Dev:', len(df_dev), 
      '\t', 'Valid:', len(df_valid),
      '\t Test:', len(df_test)
     ) # Train: 27960, Dev: 200, Valid: 704, Test: 6337

Train: 27960 	 Dev: 200 	 Valid: 704 	 Test: 6337


In [None]:
# Write the cleaned subsets to pickle files (one file per subset)
df_train.to_pickle('../data/processed/df_train.pkl')
df_dev.to_pickle('../data/processed/df_dev.pkl')
df_valid.to_pickle('../data/processed/df_valid.pkl')
df_test.to_pickle('../data/processed/df_test.pkl')

In [None]:
# Preprocess Twitter Sentiment140 dataset for transfer learning

# Column names are supplied through `names`, so the first line of the file is
# read as data rather than as a header.
# NOTE(review): the Sentiment140 CSV is commonly distributed in Latin-1
# (ISO-8859-1); reading it as UTF-8 either fails or silently replaces
# characters depending on the pandas version -- confirm for this copy.
df_twitter140_full = pd.read_csv(
    '../data/external/training.1600000.processed.noemoticon.csv',
    engine='python',
    encoding='latin-1',
    names=['target', 'id', 'date', 'flag', 'user', 'text']
    )

# Clean the raw text with the same function used for the labeled dataset
df_twitter140_full['tweet'] = df_twitter140_full['text'].apply(clean_tweet_txt)
# Keep only the sentiment target and the cleaned tweet
df_twitter = df_twitter140_full.loc[:, ['target', 'tweet']]

df_twitter.to_pickle('../data/processed/twitter_sentiment140.pkl')