# Dive into Abusive Language with Snorkel

Author: BingYune Chen 
<br>
Updated: 2021-08-02

----------

### Labeling Functions

**Common Types of Labeling Functions:**

* Hard-coded heuristics using regular expressions (regexes)
* Syntactic analysis using spaCy's dependency trees
* Distant supervision based on external knowledge bases (expert labels)
* Crowdsourcing noisy manual labels (amateur labels)

**We will now build other common types of labeling functions.**

In [None]:
# Imports and setup for Google Colab
# This cell relies on the `google.colab` package and on IPython `!` shell
# magics, so it is only meaningful when run inside a notebook environment.

# Mount Google Drive
from google.colab import drive ## module to use Google Drive with Python
drive.mount('/content/drive') ## mount to access contents

# Install python libraries
# NOTE(review): tensorflow is upgraded to the latest release while tensorboard
# is pinned to 1.15.0 -- these pins look inconsistent; confirm the intended
# versions before relying on this cell.
#! pip install tensorflow-gpu==1.15
! pip install --upgrade tensorflow --quiet
! pip install snorkel --quiet
! pip install tensorboard==1.15.0 --quiet
# Download the small English spaCy pipeline loaded later in the notebook
! python -m spacy download en_core_web_sm --quiet

In [None]:
# Imports for data and plotting
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
%matplotlib inline 
import seaborn as sns

import pickle
import os
import re

In [None]:
# Imports for sentiment analysis
# Valence Aware Dictionary and sEntiment Reasoner
# VADER was designed with a focus on social media texts
import nltk
nltk.download('stopwords')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
from nltk.tokenize import regexp_tokenize

In [None]:
# Imports for spaCy preprocessing
import spacy
# Select the GPU (when one is available) *before* loading any pipeline;
# spaCy only honours the preference for pipelines loaded afterwards.
spacy.prefer_gpu()
# Load once up front so a missing `en_core_web_sm` install fails early
spacy.load('en_core_web_sm')
from spacy.tokenizer import _get_regex_pattern
from spacy.lang.en import English
from spacy.matcher import Matcher

# Imports for tensorflow/keras preprocessing
from keras.models import model_from_json
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [None]:
# Imports for snorkel analysis and multi-task learning
from snorkel.preprocess import preprocessor
from snorkel.preprocess.nlp import SpacyPreprocessor
from snorkel.labeling import labeling_function, PandasLFApplier, LFAnalysis, model
from snorkel.analysis import get_label_buckets

In [None]:
# Load the four labeled splits and renumber their rows from 0.
# NOTE: pickle files can execute code when loaded; only read trusted files.
df_train = pd.read_pickle('../data/processed/df_train.pkl').reset_index(drop=True)
df_dev = pd.read_pickle('../data/processed/df_dev.pkl').reset_index(drop=True)
df_valid = pd.read_pickle('../data/processed/df_valid.pkl').reset_index(drop=True)
df_test = pd.read_pickle('../data/processed/df_test.pkl').reset_index(drop=True)

# Preview the first rows of the training split
df_train.head()

Unnamed: 0,label,tweet
0,1,#has_mention ee you got the hoes
1,1,Most of you hoes copy and paste but there has ...
2,1,#has_mention only when ur around me 😋 got to k...
3,1,"#has_retweet #has_mention *hits blunt* ""bruh i..."
4,1,"#has_url Alexander Skarsgard, my roommate has ..."


In [None]:
# Vote values returned by every labeling function:
#   ABSTAIN  -> the function gives no opinion on the row
#   NO_ABUSE -> the row is voted non-abusive
#   ABUSE    -> the row is voted abusive
ABSTAIN, NO_ABUSE, ABUSE = -1, 0, 1

In [None]:
# Snorkel preprocessor that parses the `tweet` field with spaCy and stores
# the result on the row as `doc`; parses are memoized
spacyp_options = dict(
    text_field="tweet",
    doc_field="doc",
    memoize=True,
    gpu=True,
)
spacyp = SpacyPreprocessor(**spacyp_options)

# Stopword list: nltk's English stopwords followed by the placeholder tags
# used in the tweets, with and without the leading '#'
hashed_tags = ['#has_mention', '#has_url', '#has_retweet', '#has_truncate']
plain_tags = ['has_mention', 'has_url', 'has_retweet', 'has_truncate']
stops = [
    *nltk.corpus.stopwords.words('english'),
    *hashed_tags,
    *plain_tags,
]

# Additional preprocess step to identify lemmas
def spacyp_lemmatize(doc):
    """Return the lowercased lemmas of the alphabetic, non-stopword tokens.

    A token is kept only when `is_alpha` is true and its lowercased text is
    not in the module-level `stops` list. Token order is preserved.
    """
    lemmas = []
    for token in doc:
        if not token.is_alpha:
            continue
        if token.text.lower() in stops:
            continue
        lemmas.append(str(token.lemma_).lower())
    return lemmas

In [None]:
# LFs with Trained Classifiers

# Directory holding the pickled Tf-Idf vectorizer and classifier
_TFIDF_BOW_MODEL_DIR = '../models/'
# Filled on first use so the artifacts are unpickled once, not once per row
_TFIDF_BOW_ARTIFACTS = {}


def _load_tfidf_bow_artifacts():
    """Return `(vectorizer, classifier)`, reading both pickles on first call.

    Paths are built with `os.path.join`, so the working directory is left
    untouched (the labeling function used to `os.chdir` on every row).
    """
    if not _TFIDF_BOW_ARTIFACTS:
        vec_path = os.path.join(_TFIDF_BOW_MODEL_DIR, 'vectorizer_tfidf.pkl')
        clf_path = os.path.join(_TFIDF_BOW_MODEL_DIR, 'model_bow_nb.pkl')
        # NOTE: pickle can execute code on load; only read trusted files
        with open(vec_path, 'rb') as file:
            tfidf_vec = pickle.load(file)
        with open(clf_path, 'rb') as file:
            model_bow_nb = pickle.load(file)
        # Store both only after both loads succeeded
        _TFIDF_BOW_ARTIFACTS['vectorizer'] = tfidf_vec
        _TFIDF_BOW_ARTIFACTS['classifier'] = model_bow_nb
    return (_TFIDF_BOW_ARTIFACTS['vectorizer'],
            _TFIDF_BOW_ARTIFACTS['classifier'])


# Use trained classifier for Tf-Idf bag of words
@labeling_function()
def lf_clf_tfidf_bow(df_row):
    """Vote with the pre-trained Tf-Idf bag-of-words classifier.

    Args:
        df_row: row exposing the raw text as `df_row.tweet`.

    Returns:
        The classifier's prediction for this single tweet.
    """
    tfidf_vec, model_bow_nb = _load_tfidf_bow_artifacts()

    ## apply vectorization (the vectorizer expects an iterable of documents)
    X_test_vec = tfidf_vec.transform([df_row.tweet])
    ## make prediction
    score = model_bow_nb.predict(X_test_vec)

    return score[0]

# Use trained classifier for word embedding

# Directory holding the phrase detectors, keras tokenizer and model files
_WORDEMBED_MODEL_DIR = '../models/'
# Filled on first use so the model is loaded and compiled once, not per row
_WORDEMBED_ARTIFACTS = {}


def _load_wordembed_artifacts():
    """Return the cached word-embedding artifacts, loading them on first call.

    The returned dict has the keys `detector_bigram`, `detector_trigram`,
    `k_token` and `model`. Paths are built with `os.path.join`, so the
    working directory is not changed.
    """
    if _WORDEMBED_ARTIFACTS:
        return _WORDEMBED_ARTIFACTS

    def _unpickle(file_name):
        # NOTE: pickle can execute code on load; only read trusted files
        with open(os.path.join(_WORDEMBED_MODEL_DIR, file_name), 'rb') as file:
            return pickle.load(file)

    detector_bigram = _unpickle('detector_bigram.pkl')
    detector_trigram = _unpickle('detector_trigram.pkl')
    k_token = _unpickle('token_keras.pkl')

    json_path = os.path.join(_WORDEMBED_MODEL_DIR, 'model_wordembed_main.json')
    with open(json_path, 'r') as json_file:
        loaded_model = model_from_json(json_file.read())
    loaded_model.load_weights(
        os.path.join(_WORDEMBED_MODEL_DIR, "model_wordembed_weights.h5")
        )
    loaded_model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy']
        )

    # Publish everything at once so a failed load never leaves a partial cache
    _WORDEMBED_ARTIFACTS.update(
        detector_bigram=detector_bigram,
        detector_trigram=detector_trigram,
        k_token=k_token,
        model=loaded_model,
        )
    return _WORDEMBED_ARTIFACTS


@labeling_function(pre=[spacyp])
def lf_clf_wordembed_nlp(df_row):
    """Vote with the pre-trained word-embedding model.

    Args:
        df_row: row carrying the spaCy parse as `df_row.doc` (added by the
            `spacyp` preprocessor).

    Returns:
        Index of the highest model score; 1 when the model call fails.
    """
    artifacts = _load_wordembed_artifacts()

    ## detect common bigrams and trigrams
    X_test_ug = spacyp_lemmatize(df_row.doc)
    X_test_bg = list(artifacts['detector_bigram'][X_test_ug])
    X_test_tg = list(artifacts['detector_trigram'][X_test_bg])
    ## create sequence
    lst_txt2seq = artifacts['k_token'].texts_to_sequences(
        [" ".join(X_test_tg)]
        )
    ## pad sequence
    X_test_pad = pad_sequences(
        lst_txt2seq,
        maxlen=15,
        padding="post",
        truncating="post"
        )

    ## make prediction
    try:
        score = artifacts['model'](X_test_pad)
    except Exception:
        # Only ordinary errors are caught, so KeyboardInterrupt/SystemExit
        # still propagate (a `return` inside `finally` used to swallow them)
        score = [0, 1] ## default to abuse for oov, more common in dataset
    return np.argmax(score)

In [None]:
# LFs with Third-Party Models and Heuristics

# Use pre trained model for sentiment analysis
# VADER focuses on punctuation, capitalization, degree modifiers, 
# conjunctions, preceding tri-grams
# VADER also accounts for emojis, slang, and emoticons

# Build the analyzer once; it used to be re-created for every row
_VADER_ANALYZER = SentimentIntensityAnalyzer()

# Positive sentiment, less likely abusive language
@labeling_function()
def lf_vader_sentiment(df_row):
    """Vote from the VADER compound polarity score of `df_row.tweet`.

    Returns:
        NO_ABUSE when the compound score is >= 0.05, ABUSE when it is
        <= -0.05, and ABSTAIN for scores in between.
    """
    vs = _VADER_ANALYZER.polarity_scores(df_row.tweet)['compound']
    if vs >= 0.05:
        return NO_ABUSE
    if vs <= -0.05:
        return ABUSE
    return ABSTAIN

# Load emoji sentiment dictionary
emoji_df = pd.read_csv('../data/external/emoji_sentiment_data_v1.csv')

# Label each emoji with whichever of the three columns holds its largest value
sentiment_columns = emoji_df[['Positive', 'Neutral', 'Negative']]
emoji_df['sentiment'] = sentiment_columns.idxmax(axis=1)

# Split out the emoji whose dominant column is Positive / Negative
is_positive = emoji_df['sentiment'] == 'Positive'
is_negative = emoji_df['sentiment'] == 'Negative'
pos_emoji = emoji_df.loc[is_positive, 'Emoji'] ## positive emoji
neg_emoji = emoji_df.loc[is_negative, 'Emoji'] ## negative emoji

# Positive emoji, less likely abusive language
# Apply sentiment ranking based on http://kt.ijs.si/data/Emoji_sentiment_ranking/

# Build the emoji matcher once; it used to be rebuilt (one pattern per emoji)
# for every row
_EMOJI_VOCAB = English().vocab
_EMOJI_MATCHER = Matcher(_EMOJI_VOCAB)
_EMOJI_MATCHER.add(
    "HAPPY", [[{"ORTH": emoji}] for emoji in pos_emoji]
    )  ## add positive pattern
_EMOJI_MATCHER.add(
    "SAD", [[{"ORTH": emoji}] for emoji in neg_emoji]
    )  ## add negative pattern


@labeling_function(pre=[spacyp])
def lf_emoji_sentiment_nlp(df_row):
    """Vote from the majority sentiment of the emoji found in the tweet.

    Args:
        df_row: row carrying the spaCy parse as `df_row.doc`.

    Returns:
        NO_ABUSE when positive emoji matches outnumber negative ones, ABUSE
        when negative matches outnumber positive ones, and ABSTAIN when
        there is no match or the two counts tie.
    """
    happy_count = 0
    sad_count = 0
    for match_id, _start, _end in _EMOJI_MATCHER(df_row.doc):
        if _EMOJI_VOCAB.strings[match_id] == "HAPPY":
            happy_count += 1
        else:
            sad_count += 1

    # The votes used to reference undefined names (NO_HATE / HATE), and a
    # 'None' sentinel tied with a single match, so the result depended on
    # set iteration order. Comparing explicit counts is deterministic.
    if happy_count > sad_count:
        return NO_ABUSE
    if sad_count > happy_count:
        return ABUSE
    return ABSTAIN

In [None]:
# LFs with Complex Preprocessor spaCy and Pattern Matching

# Shorter comment and focus on a person, more likely abusive language
@labeling_function(pre=[spacyp])
def lf_has_person_nlp(df_row):
    """Vote ABUSE for docs under 20 tokens that contain a PERSON entity.

    Returns ABSTAIN for longer docs and for docs without a PERSON entity.
    """
    doc = df_row.doc
    if len(doc) >= 20:
        return ABSTAIN
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            return ABUSE
    return ABSTAIN

# Mentions of titles for books, songs, or works of art, more likely abuse 
@labeling_function(pre=[spacyp])
def lf_has_work_art_nlp(df_row):
    """Vote ABUSE when any entity in the doc is labeled WORK_OF_ART."""
    entity_labels = {ent.label_ for ent in df_row.doc.ents}
    return ABUSE if "WORK_OF_ART" in entity_labels else ABSTAIN

# Mentions of at least 3 named entities, more likely abusive language 
@labeling_function(pre=[spacyp])
def lf_has_3plus_entity_nlp(df_row):
    """Vote ABUSE when the doc has more than two entities of the listed types.

    Only entities labeled PERSON, GPE, LOC, ORG, LAW or LANGUAGE are counted.
    The previous version took `len()` of a list of booleans, which counted
    every entity regardless of its label.
    """
    counted_labels = ("PERSON", "GPE", "LOC", "ORG", "LAW", "LANGUAGE")
    num_entities = sum(
        1 for ent in df_row.doc.ents if ent.label_ in counted_labels
        )
    return ABUSE if num_entities > 2 else ABSTAIN

# Mentions of "please stop" phrases, sarcastic, more likely abusive language
# Usually directed at those making abusive language comments or other issues

# Build the matcher once; it used to be rebuilt, together with a fresh
# English() pipeline, for every row
_PLEASE_STOP_MATCHER = Matcher(English().vocab)
_PLEASE_STOP_MATCHER.add("p1", [[{"LEMMA": "do"}, {"LEMMA": "not"}]])
_PLEASE_STOP_MATCHER.add("p2", [[{"LEMMA": "stop"}]])


@labeling_function(pre=[spacyp])
def lf_has_please_stop_nlp(df_row):
    """Vote ABUSE when the doc contains the lemmas "do not" or "stop".

    Args:
        df_row: row carrying the spaCy parse as `df_row.doc`.

    Returns:
        ABUSE when at least one pattern matches, otherwise ABSTAIN.
    """
    matches = _PLEASE_STOP_MATCHER(df_row.doc)
    return ABUSE if len(matches) > 0 else ABSTAIN

# Higher stopword ratio, more likely abusive language
@labeling_function(pre=[spacyp])
def lf_has_stopwords_nlp(df_row):
    """Vote ABUSE for docs over 10 tokens where most tokens are stopwords.

    Returns:
        ABUSE when the doc has more than 10 tokens and more than half of
        them (lowercased) are in `stops`; otherwise ABSTAIN.
    """
    doc = df_row.doc
    num_tokens = len(doc)
    # Check the length first: an empty doc used to raise ZeroDivisionError
    # because the ratio was computed before this test.
    if num_tokens <= 10:
        return ABSTAIN
    num_stopwords = sum(1 for token in doc if token.lower_ in stops)
    ratio = num_stopwords / num_tokens
    return ABUSE if ratio > 0.5 else ABSTAIN

# Mentions of "about harass xyz" phrases, less likely abusive language

# Build the matcher once; it used to be rebuilt, together with a fresh
# English() pipeline, for every row
# NOTE(review): the lemma spaCy assigns to pronouns such as "me" depends on
# the model version, so pattern p1 may never match -- verify.
_HARASSMENT_MATCHER = Matcher(English().vocab)
_HARASSMENT_MATCHER.add("p1", [[{"LEMMA": "harass"}, {"LEMMA": "me"}]])
_HARASSMENT_MATCHER.add("p2", [[{"LEMMA": "not"}, {"LEMMA": "harass"}]])
_HARASSMENT_MATCHER.add("p3", [[{"LEMMA": "be"}, {"LEMMA": "harass"}]])
_HARASSMENT_MATCHER.add("p4", [[{"LEMMA": "about"}, {"LEMMA": "harass"}]])
_HARASSMENT_MATCHER.add("p5", [[{"LEMMA": "get"}, {"LEMMA": "harass"}]])


@labeling_function(pre=[spacyp])
def lf_has_harassment_nlp(df_row):
    """Vote NO_ABUSE when the doc matches one of the "harass" lemma pairs.

    Args:
        df_row: row carrying the spaCy parse as `df_row.doc`.

    Returns:
        NO_ABUSE when at least one pattern matches, otherwise ABSTAIN.
    """
    matches = _HARASSMENT_MATCHER(df_row.doc)
    return NO_ABUSE if len(matches) > 0 else ABSTAIN

# Mentions of "report you" phrases, less likely abusive language 

# Build the matcher once; it used to be rebuilt, together with a fresh
# English() pipeline, for every row
# NOTE(review): the lemma spaCy assigns to "you" depends on the model
# version -- verify that this pattern can match.
_REPORT_YOU_MATCHER = Matcher(English().vocab)
_REPORT_YOU_MATCHER.add("p1", [[{"LEMMA": "report"}, {"LEMMA": "you"}]])


@labeling_function(pre=[spacyp])
def lf_has_report_you_nlp(df_row):
    """Vote NO_ABUSE when the doc contains the lemma pair "report you".

    Args:
        df_row: row carrying the spaCy parse as `df_row.doc`.

    Returns:
        NO_ABUSE when the pattern matches, otherwise ABSTAIN.
    """
    matches = _REPORT_YOU_MATCHER(df_row.doc)
    return NO_ABUSE if len(matches) > 0 else ABSTAIN

# Mentions of "please read" phrases, less likely abusive language

# Build the matcher once; it used to be rebuilt, together with a fresh
# English() pipeline, for every row
_PLEASE_READ_MATCHER = Matcher(English().vocab)
_PLEASE_READ_MATCHER.add(
    "p1",
    [[{"LEMMA": "please"},
      {"LEMMA": "read"},
      {"LEMMA": "the", "OP": "?"},   # optional "the"
      {"LEMMA": "this", "OP": "?"}]]  # optional "this"
    )


@labeling_function(pre=[spacyp])
def lf_has_please_read_nlp(df_row):
    """Vote NO_ABUSE when the doc contains the lemmas "please read".

    Args:
        df_row: row carrying the spaCy parse as `df_row.doc`.

    Returns:
        NO_ABUSE when the pattern matches, otherwise ABSTAIN.
    """
    matches = _PLEASE_READ_MATCHER(df_row.doc)
    return NO_ABUSE if len(matches) > 0 else ABSTAIN

In [None]:
# Make unique, offensive word lists from multiple bad word sources
# Each source below is reduced to the words not already provided by the
# sources before it, and the reduced list is written next to the input file.

# https://www.kaggle.com/chadapamettapun/hatespeechdetection?select=badWords.csv
# First source: read it, drop fully duplicated rows (original index labels are
# kept because ignore_index=False) and save the de-duplicated copy.
# NOTE(review): unlike the later sources, `word` is not lowercased here.
bw1 = pd.read_csv('../data/external/kaggle_hatespeech_detection_badwords.txt')
bw1.drop_duplicates(ignore_index=False, inplace=True)
bw1.to_csv('../data/external/kaggle_hatespeech_detection_badwords2.txt', 
           index=False
           )

# https://www.kaggle.com/nicapotato/bad-bad-words
# Second source: sort by word, drop duplicated rows, then lowercase.
# NOTE(review): duplicates are dropped before lowercasing, so entries that
# differ only by case are not collapsed.
bw2 = pd.read_csv('../data/external/kaggle_bad_bad_words.txt') ## add word as column name
bw2.sort_values(by='word', ignore_index=False, inplace=True)
bw2.drop_duplicates(ignore_index=False, inplace=True)
bw2['word'] = bw2['word'].str.lower()
# Right merge keeps every bw2 word; rows with a null `type` are then selected.
# Presumably `type` is a bw1 column that is never null there, so a null marks
# a word absent from bw1 -- TODO confirm against the bw1 file.
bw2a = bw1.merge(bw2, how='right', left_on='word', right_on='word')
bw2a = bw2a.loc[bw2a['type'].isnull(), 'word'].to_frame()
bw2a.to_csv('../data/external/kaggle_bad_bad_words2.txt', index=False)

# https://code.google.com/archive/p/badwordslist/downloads
# Third source: sort by word, drop duplicated rows, then lowercase.
bw3 = pd.read_csv('../data/external/badwordslist_badwords.txt') ## add word as column name
bw3.sort_values(by='word', ignore_index=False, inplace=True)
bw3.drop_duplicates(ignore_index=False, inplace=True)
bw3['word'] = bw3['word'].str.lower()
# Stack the words collected so far (bw1 plus the new bw2 words)
bw1bw2a = pd.concat([bw1[['word']], bw2a], ignore_index=False)
# Right merge with indicator=True adds a `_merge` column; 'right_only' rows
# are bw3 words that do not appear in the stacked list.
bw3a = bw1bw2a.merge(bw3, 
                     how='right', 
                     left_on='word', 
                     right_on='word', 
                     indicator=True)
bw3a = bw3a.loc[bw3a['_merge'] == 'right_only', 'word'].to_frame()
bw3a.to_csv('../data/external/badwordslist_badwords2.txt', index=False)

# https://github.com/areebbeigh/profanityfilter/blob/master/profanityfilter/data/badwords.txt
# Fourth source: sort by word, drop duplicated rows, then lowercase.
bw4 = pd.read_csv('../data/external/profanityfilter_badwords.txt') ## add word as column name
bw4.sort_values(by='word', ignore_index=False, inplace=True)
bw4.drop_duplicates(ignore_index=False, inplace=True)
bw4['word'] = bw4['word'].str.lower()
# Stack the words collected so far (bw1 plus the new bw2 and bw3 words)
bw1bw2abw3a = pd.concat([bw1[['word']], bw2a, bw3a], ignore_index=False)
# Keep only the bw4 words flagged 'right_only', i.e. not in the stacked list
bw4a = bw1bw2abw3a.merge(bw4, 
                         how='right', 
                         left_on='word', 
                         right_on='word', 
                         indicator=True)
bw4a = bw4a.loc[bw4a['_merge'] == 'right_only', 'word'].to_frame()
bw4a.to_csv('../data/external/profanityfilter_badwords2.txt', index=False)

# https://github.com/snguyenthanh/better_profanity/blob/master/better_profanity/profanity_wordlist.txt
# Fifth source: sort by word, drop duplicated rows, then lowercase.
bw5 = pd.read_csv('../data/external/better_profanity_wordlist.txt') ## add word as column name
bw5.sort_values(by='word', ignore_index=False, inplace=True)
bw5.drop_duplicates(ignore_index=False, inplace=True)
bw5['word'] = bw5['word'].str.lower()
# Stack the words collected so far (bw1 plus the new bw2, bw3 and bw4 words)
bw1bw2abw3abw4a = pd.concat(
    [bw1[['word']], bw2a, bw3a, bw4a], 
    ignore_index=False
    )
# Keep only the bw5 words flagged 'right_only', i.e. not in the stacked list
bw5a = bw1bw2abw3abw4a.merge(bw5, 
                             how='right', 
                             left_on='word', 
                             right_on='word', 
                             indicator=True
                             )
bw5a = bw5a.loc[bw5a['_merge'] == 'right_only', 'word'].to_frame()
bw5a.to_csv('../data/external/better_profanity_wordlist2.txt', index=False)

# Additional hashtags and words from various news articles
# Sixth source: sort by word, drop duplicated rows, then lowercase.
bw6 = pd.read_csv('../data/external/solo_badwords_map.txt') ## add word as column name
bw6.sort_values(by='word', ignore_index=False, inplace=True)
bw6.drop_duplicates(ignore_index=False, inplace=True)
bw6['word'] = bw6['word'].str.lower()
# Stack the words collected so far (bw1 plus the new bw2..bw5 words)
bw1bw2abw3abw4abw5a = pd.concat(
    [bw1[['word']], bw2a, bw3a, bw4a, bw5a], 
    ignore_index=False
    )
# Keep only the bw6 words flagged 'right_only', i.e. not in the stacked list
bw6a = bw1bw2abw3abw4abw5a.merge(bw6, 
                                 how='right', 
                                 left_on='word', 
                                 right_on='word', 
                                 indicator=True
                                 )
bw6a = bw6a.loc[bw6a['_merge'] == 'right_only', 'word'].to_frame() ## no duplicates
bw6a.to_csv('../data/external/solo_badwords_map2.txt', index=False)

In [None]:
# LFs with Keywords for offensive words and leetspeak versions

# Generalize keyword lookup 

# Tokens are runs of word characters/apostrophes, optionally prefixed by '#'
# so that placeholder tags such as "#has_url" stay in one piece. A raw string
# avoids the invalid-escape warning that "\w" triggers in a normal string.
_KEYWORD_TOKEN_RE = re.compile(r"[\w']+|#[\w']+")


def keyword_lookup(df_row, keywords, label):
    """Return `label` when the tweet contains any of `keywords`.

    Matching is case-insensitive and works on whole tokens. A keyword made
    of several whitespace-separated words (e.g. 'thank you') matches when
    those words appear as consecutive tokens; such keywords could never
    match before, because they were compared against single tokens.

    Args:
        df_row: row exposing the raw text as `df_row.tweet`.
        keywords: iterable of keyword strings (list, Series, ...).
        label: value returned on a match.

    Returns:
        `label` on the first matching keyword, otherwise ABSTAIN.
    """
    tokens = [token.lower() for token
              in _KEYWORD_TOKEN_RE.findall(df_row.tweet)]
    if not tokens:
        return ABSTAIN

    token_set = set(tokens)  # O(1) membership for single-word keywords
    padded_text = None       # built lazily, only if a phrase keyword shows up
    for keyword in keywords:
        parts = keyword.lower().split()
        if len(parts) == 1:
            if parts[0] in token_set:
                return label
        elif parts:
            if padded_text is None:
                padded_text = " " + " ".join(tokens) + " "
            # Padding with spaces keeps the match aligned to token boundaries
            if " " + " ".join(parts) + " " in padded_text:
                return label
    return ABSTAIN

# Use of profanity, racial/ethnic slurs, gender insults, political slurs 
@labeling_function()
def lf_has_bad_words1(df_row):
    """Vote ABUSE when `keyword_lookup` finds a `bw1` word in the tweet."""
    bad_words = bw1['word']
    return keyword_lookup(df_row, bad_words, ABUSE)

@labeling_function()
def lf_has_bad_words2(df_row):
    """Vote ABUSE when `keyword_lookup` finds a `bw2a` word in the tweet."""
    bad_words = bw2a['word']
    return keyword_lookup(df_row, bad_words, ABUSE)

@labeling_function()
def lf_has_bad_words3(df_row):
    """Vote ABUSE when `keyword_lookup` finds a `bw3a` word in the tweet."""
    bad_words = bw3a['word']
    return keyword_lookup(df_row, bad_words, ABUSE)

@labeling_function()
def lf_has_bad_words4(df_row):
    """Vote ABUSE when `keyword_lookup` finds a `bw4a` word in the tweet."""
    bad_words = bw4a['word']
    return keyword_lookup(df_row, bad_words, ABUSE)

@labeling_function()
def lf_has_bad_words5(df_row):
    """Vote ABUSE when `keyword_lookup` finds a `bw5a` word in the tweet."""
    bad_words = bw5a['word']
    return keyword_lookup(df_row, bad_words, ABUSE)

@labeling_function()
def lf_has_bad_words6(df_row):
    """Vote ABUSE when `keyword_lookup` finds a `bw6a` word in the tweet."""
    bad_words = bw6a['word']
    return keyword_lookup(df_row, bad_words, ABUSE)

# Use of please (and variations), sarcastic, more likely abusive language
@labeling_function()
def lf_has_please(df_row):
    """Vote ABUSE when the tweet contains "please" or one of its short forms."""
    please_variants = ['please', 'plz', 'pls', 'pl']
    return keyword_lookup(df_row, please_variants, ABUSE)

# Use of thank you (and variations), less likely abusive language
@labeling_function()
def lf_has_thankyou(df_row):
    """Vote NO_ABUSE when `keyword_lookup` finds a thank-you keyword.

    NOTE(review): whether the two-word entry 'thank you' can match depends
    on how `keyword_lookup` tokenizes the tweet -- verify.
    """
    thanks_variants = ['thank you', 'thanks', 'thx', 'tx']
    return keyword_lookup(df_row, thanks_variants, NO_ABUSE)

In [None]:
# LFs with Keywords for specific elements

# Use of all CAPS, less likely abusive language
@labeling_function()
def lf_all_capslock(df_row):
    """Vote NO_ABUSE when the tweet's own text is written entirely in caps.

    The lowercase placeholder tags (#has_mention, #has_url, #has_retweet,
    #has_truncate) are removed before the check; otherwise any tagged tweet
    could never count as all caps. `str.isupper()` also requires at least
    one letter, so tweets made only of digits, emoji or punctuation (or an
    empty tweet) now abstain instead of voting NO_ABUSE.
    """
    text = re.sub(
        r"#has_(?:mention|url|retweet|truncate)\b", "", df_row.tweet
        )
    if text.isupper():
        return NO_ABUSE
    return ABSTAIN

# Has angry punctuations
@labeling_function()
def lf_has_angry_punctuations(df_row):
    """Vote ABUSE when the raw tweet contains '!!', '??' or '**'.

    The check runs on the raw text: the word tokenizer behind
    `keyword_lookup` only keeps word characters, apostrophes and a leading
    '#', so these punctuation runs could never match through it.
    """
    angry_marks = ('!!', '??', '**')
    if any(mark in df_row.tweet for mark in angry_marks):
        return ABUSE
    return ABSTAIN

# Includes url, more likely abusive language
@labeling_function()
def lf_has_url(df_row):
    """Vote ABUSE when the tweet carries the '#has_url' placeholder tag."""
    url_tag = ['#has_url']
    return keyword_lookup(df_row, url_tag, ABUSE)

# Includes truncate, longer text, more likely abusive language
@labeling_function()
def lf_has_truncate(df_row):
    """Vote ABUSE when the tweet carries the '#has_truncate' placeholder tag."""
    truncate_tag = ['#has_truncate']
    return keyword_lookup(df_row, truncate_tag, ABUSE)

# Includes mention, more likely abusive language
@labeling_function()
def lf_has_mention(df_row):
    """Vote ABUSE when the tweet carries the '#has_mention' placeholder tag."""
    mention_tag = ['#has_mention']
    return keyword_lookup(df_row, mention_tag, ABUSE)

# Includes retweet, more likely abusive language
@labeling_function()
def lf_has_retweet(df_row):
    """Vote ABUSE when the tweet carries the '#has_retweet' placeholder tag."""
    retweet_tag = ['#has_retweet']
    return keyword_lookup(df_row, retweet_tag, ABUSE)

### Evaluate LF performance

We will calculate the coverage of these labeling functions (LFs). Snorkel provides tooling for common LF analyses using the `LFAnalysis` utility.

Table Column Meanings:

* **Polarity:** The set of unique labels this labeling function outputs (excluding abstains)
* **Coverage:** The fraction of the dataset the label function labels
* **Overlaps:** The fraction of the dataset where this labeling function and at least one other labeling function both label (whether or not they agree)
* **Conflicts:** The fraction of the dataset where this labeling function and at least one other labeling function label and disagree
* **Correct:** The number of data points this labeling function labels correctly (does not include abstain)
* **Incorrect:** The number of data points this labeling function labels incorrectly (does not include abstain)
* **Empirical Accuracy:** The empirical accuracy of this labeling function (does not include abstain)

The overall goal is to increase coverage without negatively impacting empirical accuracy.


In [None]:
# Define list of labeling functions
# The order matters: position in `lfs` is the column index of each labeling
# function in the label matrices built by the applier.
classifier_lfs = [
    lf_clf_tfidf_bow,
    lf_clf_wordembed_nlp, ## slower preprocessing time because of tensors
]
sentiment_lfs = [
    lf_vader_sentiment,
    lf_emoji_sentiment_nlp,
]
spacy_lfs = [
    lf_has_person_nlp,
    lf_has_work_art_nlp,
    lf_has_3plus_entity_nlp,
    lf_has_please_stop_nlp,
    lf_has_stopwords_nlp,
    lf_has_harassment_nlp, ## low coverage and empirical accuracy 0%
    lf_has_report_you_nlp, ## low coverage and empirical accuracy 0%
    lf_has_please_read_nlp, ## low coverage and empirical accuracy 0%
]
keyword_lfs = [
    lf_has_bad_words1,
    lf_has_bad_words2,
    lf_has_bad_words3,
    lf_has_bad_words4,
    lf_has_bad_words5,
    lf_has_bad_words6,
    lf_has_please,
    lf_has_thankyou,
]
element_lfs = [
    lf_all_capslock,
    lf_has_angry_punctuations, ## low coverage and empirical accuracy 0%
    lf_has_url,
    lf_has_truncate,
    lf_has_mention,
    lf_has_retweet,
]
lfs = [
    *classifier_lfs,
    *sentiment_lfs,
    *spacy_lfs,
    *keyword_lfs,
    *element_lfs,
]

# Setup tooling to analyze labeling functions
applier = PandasLFApplier(lfs)

In [None]:
# Create labeling matrix and evaluate results
# Applying every labeling function is slow, so reuse `l_dev` when an earlier
# run left it in the session; otherwise build it. The apply call used to be
# commented out, which made this cell fail with NameError on a fresh kernel.
if 'l_dev' not in globals():
    l_dev = applier.apply(df_dev)

# Ignore all future warnings
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

LFAnalysis(L=l_dev, lfs=lfs).lf_summary(Y=df_dev.label.values)

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
lf_clf_tfidf_bow,0,"[0, 1]",1.0,1.0,0.68,154,46,0.77
lf_clf_wordembed_nlp,1,"[0, 1]",1.0,1.0,0.68,156,44,0.78
lf_vader_sentiment,2,"[0, 1]",0.735,0.735,0.505,97,50,0.659864
lf_emoji_sentiment_nlp,3,"[0, 1]",0.07,0.07,0.055,5,9,0.357143
lf_has_person_nlp,4,[1],0.195,0.195,0.135,21,18,0.538462
lf_has_work_art_nlp,5,[1],0.01,0.01,0.005,1,1,0.5
lf_has_3plus_entity_nlp,6,[1],0.085,0.085,0.075,3,14,0.176471
lf_has_please_stop_nlp,7,[1],0.09,0.09,0.075,7,11,0.388889
lf_has_stopwords_nlp,8,[1],0.115,0.115,0.09,12,11,0.521739
lf_has_harassment_nlp,9,[],0.0,0.0,0.0,0,0,0.0


In [None]:
# Save the dev-set label matrix so it can be reloaded later
SAVE_PATH = '../models/'

# Set save directory for the label matrix
# NOTE(review): this changes the working directory for the rest of the
# session; later relative paths resolve from the models directory.
os.chdir(SAVE_PATH)
# The file name used to be commented out, so `model_name` was either
# undefined or left over from another cell (risking an overwrite).
model_name = 'lf_dev_final.pkl'
with open(model_name, 'wb') as file:
    pickle.dump(l_dev, file)

In [None]:
# Create labeling matrix and evaluate results
# Applying every labeling function is slow, so reuse `l_valid` when an
# earlier run left it in the session; otherwise build it. The apply call used
# to be commented out, which made this cell fail with NameError on a fresh
# kernel.
if 'l_valid' not in globals():
    l_valid = applier.apply(df_valid)

# Ignore all future warnings
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

LFAnalysis(L=l_valid, lfs=lfs).lf_summary(Y=df_valid.label.values)

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
lf_clf_tfidf_bow,0,"[0, 1]",1.0,1.0,0.599432,585,119,0.830966
lf_clf_wordembed_nlp,1,"[0, 1]",1.0,1.0,0.599432,533,171,0.757102
lf_vader_sentiment,2,"[0, 1]",0.794034,0.794034,0.461648,404,155,0.722719
lf_emoji_sentiment_nlp,3,"[0, 1]",0.09517,0.09517,0.080966,17,50,0.253731
lf_has_person_nlp,4,[1],0.213068,0.213068,0.139205,89,61,0.593333
lf_has_work_art_nlp,5,[1],0.008523,0.008523,0.004261,4,2,0.666667
lf_has_3plus_entity_nlp,6,[1],0.051136,0.051136,0.041193,15,21,0.416667
lf_has_please_stop_nlp,7,[1],0.105114,0.105114,0.0625,51,23,0.689189
lf_has_stopwords_nlp,8,[1],0.115057,0.115057,0.068182,58,23,0.716049
lf_has_harassment_nlp,9,[],0.0,0.0,0.0,0,0,0.0


In [None]:
# Save the validation-set label matrix so it can be reloaded later
SAVE_PATH = '../models/'

# Set save directory for the label matrix
# NOTE(review): this changes the working directory for the rest of the
# session; later relative paths resolve from the models directory.
os.chdir(SAVE_PATH)
# The file name used to be commented out, so `model_name` was either
# undefined or left over from another cell (risking an overwrite).
model_name = 'lf_valid_final.pkl'
with open(model_name, 'wb') as file:
    pickle.dump(l_valid, file)

In [None]:
# Create labeling matrix and evaluate results
# Applying every labeling function is slow, so reuse `l_test` when an earlier
# run left it in the session; otherwise build it. The apply call used to be
# commented out, which made this cell fail with NameError on a fresh kernel.
if 'l_test' not in globals():
    l_test = applier.apply(df_test)

# Ignore all future warnings
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

LFAnalysis(L=l_test, lfs=lfs).lf_summary(Y=df_test.label.values)

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
lf_clf_tfidf_bow,0,"[0, 1]",1.0,1.0,0.618431,5212,1125,0.822471
lf_clf_wordembed_nlp,1,"[0, 1]",1.0,1.0,0.618431,4862,1475,0.76724
lf_vader_sentiment,2,"[0, 1]",0.782231,0.782231,0.477671,3458,1499,0.697599
lf_emoji_sentiment_nlp,3,"[0, 1]",0.107306,0.107306,0.094209,181,499,0.266176
lf_has_person_nlp,4,[1],0.187786,0.187786,0.123244,799,391,0.671429
lf_has_work_art_nlp,5,[1],0.009468,0.009468,0.005681,41,19,0.683333
lf_has_3plus_entity_nlp,6,[1],0.07038,0.07038,0.055547,241,205,0.540359
lf_has_please_stop_nlp,7,[1],0.088685,0.088685,0.064384,405,157,0.720641
lf_has_stopwords_nlp,8,[1],0.105728,0.105728,0.067224,491,179,0.732836
lf_has_harassment_nlp,9,[],0.0,0.0,0.0,0,0,0.0


In [None]:
# Save the test-set label matrix so it can be reloaded later
SAVE_PATH = '../models/'

# Set save directory for the label matrix
# NOTE(review): this changes the working directory for the rest of the
# session; later relative paths resolve from the models directory.
os.chdir(SAVE_PATH)
# The file name used to be commented out, so `model_name` was either
# undefined or left over from another cell (risking an overwrite).
model_name = 'lf_test_final.pkl'
with open(model_name, 'wb') as file:
    pickle.dump(l_test, file)

In [None]:
# Create labeling matrix and evaluate results
# Applying every labeling function is slow, so reuse `l_train` when an
# earlier run left it in the session; otherwise build it. The apply call used
# to be commented out, which made this cell fail with NameError on a fresh
# kernel.
if 'l_train' not in globals():
    l_train = applier.apply(df_train)

# Ignore all future warnings
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

LFAnalysis(L=l_train, lfs=lfs).lf_summary(Y=df_train.label.values)

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
lf_clf_tfidf_bow,0,"[0, 1]",1.0,1.0,0.583798,23451,4509,0.838734
lf_clf_wordembed_nlp,1,"[0, 1]",1.0,1.0,0.583798,21645,6315,0.774142
lf_vader_sentiment,2,"[0, 1]",0.782582,0.782582,0.448748,15385,6496,0.703121
lf_emoji_sentiment_nlp,3,"[0, 1]",0.053863,0.053863,0.046924,348,1158,0.231076
lf_has_person_nlp,4,[1],0.190558,0.190558,0.122568,3516,1812,0.65991
lf_has_work_art_nlp,5,[1],0.006867,0.006867,0.004614,126,66,0.65625
lf_has_3plus_entity_nlp,6,[1],0.073605,0.073605,0.055472,1095,963,0.53207
lf_has_please_stop_nlp,7,[1],0.094671,0.094671,0.064878,1978,669,0.747261
lf_has_stopwords_nlp,8,[1],0.100143,0.100143,0.060801,2073,727,0.740357
lf_has_harassment_nlp,9,[0],0.000465,0.000465,0.000429,13,0,1.0


In [None]:
# Save the training-set label matrix so it can be reloaded later
SAVE_PATH = '../models/'

# Set save directory for the label matrix
# NOTE(review): this changes the working directory for the rest of the
# session; later relative paths resolve from the models directory.
os.chdir(SAVE_PATH)
# The file name used to be commented out, so `model_name` was either
# undefined or left over from another cell (risking an overwrite).
model_name = 'lf_train_final.pkl'
with open(model_name, 'wb') as file:
    pickle.dump(l_train, file)

In [None]:
# Explore incorrect labels: sample dev tweets that the labeling function in
# column 1 of the label matrix (lf_clf_wordembed_nlp) voted ABUSE, to compare
# against the true `label` column. The filter used to reference the
# undefined name HATE.
pd.set_option('display.max_colwidth', None)
df_dev.iloc[l_dev[:,1] == ABUSE].sample(20, random_state=42)

Unnamed: 0,label,tweet
113,1,hahaha dumb bitch
142,1,Fake ass people. that is why I do not have no respect for em. Frontin ass bitches too
10,0,"I was planning on stopping by Portland again, but I do not think I'm going to have the time for that. Too much to do when I get back."
123,1,#has_mention your a big yute to be having slits in your brows my darg lol drop it out
116,1,& I MOST DEFIANTLY WILL NOT let no bitch who FW a nigga I ALREADY HAD get the best out of me 😂😭 Fuccin wit my Hand-Me-Downs & shit
146,1,you bitches love yall some corny nigga
96,0,NO NO NO NO NO #has_url STOP. Stop advocating that people pay for abuse mitigation.
38,0,We hope those gozlemes are worth the wait girls! #mkr #hungrycampers
33,0,My notifications are full of man tears! I thought I was going to have to wait till Christmas! Thank you baby Jesus!!
119,1,"#has_retweet #has_mention but naaww though, you all hoes is fu do not fucking play w/me."


In [None]:
# Summarize the dev split: size, labeled share and share of positive labels
dev_labels = df_dev.label
num_labeled = len(dev_labels)
num_positive = sum(1 for val in dev_labels if val == 1)
labeled_ratio = round(num_labeled / len(df_dev), 2)
positive_ratio = round(num_positive / num_labeled, 2)
print('Number of labeled examples:', num_labeled,
      '\nRatio of tweets labeled:', labeled_ratio,
      '\nRatio positive:', positive_ratio
      )

Number of labeled examples: 200 
Ratio of tweets labeled: 1.0 
Ratio positive: 0.5


In [None]:
# Share of dev rows that received at least one non-abstain vote.
# The message used to say "training set" although `l_dev` is the dev matrix.
dev_coverage = LFAnalysis(L=l_dev, lfs=lfs).label_coverage()
print("Overall, we reach a coverage ratio of", round(dev_coverage, 2),
      "over the whole dev set.")

Overall, we reach a coverage ratio of 1.0 over the whole training set.
