# Dive into Abusive Language with Snorkel

Author: BingYune Chen 
<br>
Updated: 2021-08-02


----------

### Labeling Functions

**Common Types of Labeling Functions:**

* Hard-coded heuristics using regular expressions (regexes)
* Syntactic analysis using spaCy's dependency trees
* Distant supervision based on external knowledge bases (expert labels)
* Crowdsourcing noisy manual labels (amateur labels)

**We will now apply all of our labeling functions.**

In [None]:
# Imports and setup for Google Colab

# Mount Google Drive
from google.colab import drive ## module to use Google Drive with Python
drive.mount('/content/drive') ## mount to access contents

# Install python libraries
! pip install --upgrade tensorflow --quiet
! pip install snorkel --quiet
! pip install tensorboard==1.15.0 --quiet
! python -m spacy download en_core_web_sm --quiet

In [None]:
# Imports for data and plotting
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
%matplotlib inline 
import seaborn as sns

import pickle
import os
import re
import itertools 

In [None]:
# Imports for sentiment analysis
# Valence Aware Dictionary and sEntiment Reasoner
# VADER was designed with a focus on social media texts
import nltk
nltk.download('stopwords')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
from nltk.tokenize import regexp_tokenize

In [None]:
# Imports for spaCy preprocessing
import spacy
spacy.load('en_core_web_sm')
spacy.prefer_gpu()
from spacy.tokenizer import _get_regex_pattern
from spacy.lang.en import English
from spacy.matcher import Matcher

# Imports for tensorflow/keras preprocessing
from keras.models import model_from_json
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [None]:
# Imports for snorkel analysis and multi-task learning
from snorkel.preprocess import preprocessor
from snorkel.preprocess.nlp import SpacyPreprocessor
from snorkel.labeling import labeling_function, PandasLFApplier, LFAnalysis, model
from snorkel.analysis import get_label_buckets

### Clean Unlabeled Data

In [None]:
# Replace contractions 
# Code adapted from https://towardsdatascience.com/twitter-sentiment-analysis-using-fasttext-9ccd04465597
# Contractions source https://en.wikipedia.org/wiki/Contraction_%28grammar%29
def load_dict_contractions():
    """Return the lookup table of contractions/slang and their expansions.

    Keys are kept exactly as authored (some are capitalised, e.g. "I'm");
    callers that need case-insensitive matching should lowercase the keys.

    Returns:
        dict mapping a contraction or slang token to its expanded form.
    """
    return {
        "ain't": "is not",
        "amn't": "am not",
        "aren't": "are not",
        "can't": "cannot",
        "'cause": "because",
        "couldn't": "could not",
        "couldn't've": "could not have",
        "could've": "could have",
        "daren't": "dare not",
        "daresn't": "dare not",
        "dasn't": "dare not",
        "didn't": "did not",
        "doesn't": "does not",
        "don't": "do not",
        "e'er": "ever",
        "em": "them",
        "everyone's": "everyone is",
        "finna": "fixing to",
        "gimme": "give me",
        "gonna": "going to",
        "gon't": "go not",
        "gotta": "got to",
        "hadn't": "had not",
        "hasn't": "has not",
        "haven't": "have not",
        "he'd": "he would",
        "he'll": "he will",
        "he's": "he is",
        "he've": "he have",
        "how'd": "how would",
        "how'll": "how will",
        "how're": "how are",
        "how's": "how is",
        "I'd": "I would",
        "I'll": "I will",
        "I'm": "I am",
        "I'm'a": "I am about to",
        "I'm'o": "I am going to",
        "isn't": "is not",
        "it'd": "it would",
        "it'll": "it will",
        "it's": "it is",
        "I've": "I have",
        "kinda": "kind of",
        "let's": "let us",
        "mayn't": "may not",
        "may've": "may have",
        "mightn't": "might not",
        "might've": "might have",
        "mustn't": "must not",
        "mustn't've": "must not have",
        "must've": "must have",
        "needn't": "need not",
        "ne'er": "never",
        "o'": "of",
        "o'er": "over",
        "ol'": "old",
        "oughtn't": "ought not",
        "shalln't": "shall not",
        "shan't": "shall not",
        "she'd": "she would",
        "she'll": "she will",
        "she's": "she is",
        "shouldn't": "should not",
        "shouldn't've": "should not have",
        "should've": "should have",
        "somebody's": "somebody is",
        "someone's": "someone is",
        "something's": "something is",
        "that'd": "that would",
        "that'll": "that will",
        "that're": "that are",
        "that's": "that is",
        "there'd": "there would",
        "there'll": "there will",
        "there're": "there are",
        "there's": "there is",
        "these're": "these are",
        "they'd": "they would",
        "they'll": "they will",
        "they're": "they are",
        "they've": "they have",
        "this's": "this is",
        "those're": "those are",
        "'tis": "it is",
        "'twas": "it was",
        "wanna": "want to",
        "wasn't": "was not",
        "we'd": "we would",
        "we'd've": "we would have",
        "we'll": "we will",
        "we're": "we are",
        "weren't": "were not",
        "we've": "we have",
        "what'd": "what did",
        "what'll": "what will",
        "what're": "what are",
        "what's": "what is",
        "what've": "what have",
        "when's": "when is",
        "where'd": "where did",
        "where're": "where are",
        "where's": "where is",
        "where've": "where have",
        "which's": "which is",
        "who'd": "who would",
        "who'd've": "who would have",
        "who'll": "who will",
        "who're": "who are",
        "who's": "who is",
        "who've": "who have",
        "why'd": "why did",
        "why're": "why are",
        "why's": "why is",
        "won't": "will not",
        "wouldn't": "would not",
        "would've": "would have",
        "y'all": "you all",
        "you'd": "you would",
        "you'll": "you will",
        "you're": "you are",
        "you've": "you have",
        "Whatcha": "What are you",
        "luv": "love",
        "sux": "sucks",
    }


# Case-insensitive view of the contraction table, built once. The cleaner
# looks words up by their lowercase form, so capitalised keys such as "I'm"
# or "Whatcha" could never match the raw table.
_CONTRACTIONS_LOWER = {
    key.lower(): value for key, value in load_dict_contractions().items()
}

# Patterns are compiled once and written as raw strings so that regex
# escapes are not interpreted (and warned about) as string escapes.
_MENTION_RE = re.compile(r'(@[A-Za-z0-9_]+:)|(@[A-Za-z0-9_.]+)')
# \b keeps "RT " from firing inside ordinary words such as "START " or "ART ".
_RETWEET_RE = re.compile(r'\b(?:RT: |RT:|RT : |RT )')
_TRUNCATE_RE = re.compile(r'\.\.\.$|[@…]')
# NOTE(review): the second alternative tags ANY "word:" as a url (e.g.
# "Note:" or "10:30"); kept as in the original, confirm this is intended.
_URL_RE = re.compile(r'(\w+://\S+)|(\w+:)')


def _replace_with_tag(pattern, tag, text):
    """Replace every match of *pattern* with *tag* and collapse whitespace."""
    return ' '.join(pattern.sub(' {} '.format(tag), text).split())


def _squeeze_repeats(text):
    """Cap runs of one repeated character at two, leaving digit runs intact."""
    pieces = []
    for char, run in itertools.groupby(text):
        run = ''.join(run)
        # Digits are not misspellings: "1000" must not become "100".
        pieces.append(run if char.isdigit() else run[:2])
    return ''.join(pieces)


# Clean tweet text to remove mentions, retweets, urls (update to remove \n text)
def clean_tweet_txt(tweet_txt):
    """Normalise a raw tweet into the tagged form used by the labeling functions.

    Steps, in order: collapse all whitespace (including newlines), replace
    mentions, retweet markers, truncation marks and urls with the
    ``#has_mention`` / ``#has_retweet`` / ``#has_truncate`` / ``#has_url``
    tags, expand contractions case-insensitively, and cap runs of a repeated
    non-digit character at two.

    Args:
        tweet_txt: raw tweet text (str).

    Returns:
        The cleaned tweet text (str).
    """
    # str.split() already treats newlines as whitespace
    tweet_txt = ' '.join(tweet_txt.split())

    # tag mentions, but keep hashtags
    tweet_txt = _replace_with_tag(_MENTION_RE, '#has_mention', tweet_txt)

    # tag retweet markers
    tweet_txt = _replace_with_tag(_RETWEET_RE, '#has_retweet', tweet_txt)

    # tag punctuation not needed for VADER sentiment
    tweet_txt = _replace_with_tag(_TRUNCATE_RE, '#has_truncate', tweet_txt)

    # tag urls
    tweet_txt = _replace_with_tag(_URL_RE, '#has_url', tweet_txt)

    # expand contractions (typographic apostrophes are normalised first)
    tweet_txt = tweet_txt.replace("’", "'")
    tweet_txt = " ".join(
        _CONTRACTIONS_LOWER.get(word.lower(), word)
        for word in tweet_txt.split()
    )

    # fix simple misspelled words (character repeats more than 2x)
    return _squeeze_repeats(tweet_txt)

In [None]:
# Apply cleaning function

# NOTE(review): the base directory was redacted from the notebook ("# directory");
# prepend the real location to both paths before running this cell.
DAY_DIRECTORY = '20201103_english_vf/'
OPEN_PATH = (# directory 
            + DAY_DIRECTORY
            )

CLEAN_DIRECTORY = '20201103_clean/'
SAVE_CLEAN_PATH = (# directory
                   + CLEAN_DIRECTORY
                   )

files = os.listdir(OPEN_PATH)
sorted_files = sorted(files)

# Clean every raw .txt file and write a "clean_<name>" copy holding only the
# cleaned 'tweet' column. Files are addressed by full path rather than by
# changing directory inside the loop: a chdir after the first file would make
# a relative OPEN_PATH unreadable for every following file.
for filename in sorted_files:
    if not filename.endswith('.txt'):
        continue

    print('Opening file...{}'.format(filename))
    temp_df = pd.read_csv(os.path.join(OPEN_PATH, filename), engine='python')
    print('Cleaning file...{}'.format(filename))
    temp_df['tweet'] = temp_df['text'].apply(clean_tweet_txt)

    clean_path = os.path.join(SAVE_CLEAN_PATH, 'clean_{}'.format(filename))
    temp_df['tweet'].to_csv(clean_path, index=False)
    print('Saved clean file...{}\n'.format(filename))

# The original cell left the working directory at SAVE_CLEAN_PATH.
# NOTE(review): kept in case later cells rely on it; confirm and remove if not.
os.chdir(SAVE_CLEAN_PATH)

In [None]:
# Combine all txt files into a single dataframe

CLEAN_DIRECTORY = '20201103_clean/'
OPEN_PATH = (# directory
             + CLEAN_DIRECTORY
            )

FULL_FILE_NAME = '20201103'
SAVE_CLEAN_PATH = # directory

files = os.listdir(OPEN_PATH)
sorted_files = sorted(files)

main_df = pd.DataFrame(columns=['tweet'])

for filename in sorted_files:
    if filename.endswith('.txt'):
        print('Opening file...{}'.format(filename))
        temp_df = pd.read_csv(OPEN_PATH + filename, engine='python')
        print('Adding file...{}'.format(filename))
        main_df = main_df.append(temp_df, ignore_index=True)
    else:
        continue

os.chdir(SAVE_CLEAN_PATH)
main_df.to_csv('clean_{}.txt'.format(FULL_FILE_NAME), index=False)

# Number of tweets by day
# 11/6 (1254209, 1)
# 11/5 (1545942, 1)
# 11/4 (1598379, 1) 
# 11/3 (1558637, 1)
# 11/2 
# 11/1 (1326398, 1)
# 10/31 (1198690, 1)

In [None]:
# Create split for initial data publication
# Draw a reproducible 200,000-row sample of the combined tweets and save it.
FILE_NAME = '20201103_clean_p1.txt'
first_split = main_df.sample(n=200000, random_state=42)
first_split.to_csv(FILE_NAME)

# Create splits for LabelModel processing
# chunksize was commented out in the original, so the loop below raised a
# NameError; 25000 is the value from that commented-out line.
chunksize = 25000
n = 1
for df_chunk in pd.read_csv(FILE_NAME, chunksize=chunksize):
    # chunk files are named <FILE_NAME without ".txt">_<n>.txt, n starting at 1
    df_chunk.to_csv('{}_{}.txt'.format(FILE_NAME[:-4], n))
    n += 1

### Apply Labeling Functions

In [None]:
# Set voting values to be used in labeling functions
# Votes a labeling function can cast for a tweet.
NO_ABUSE = 0   # tweet looks non-abusive
ABUSE = 1      # tweet looks abusive
ABSTAIN = -1   # labeling function has no opinion

In [None]:
# Make custom preprocessing pipeline for spaCy
# Shared Snorkel preprocessor: reads each row's "tweet" field and stores the
# result under "doc", which is what the *_nlp labeling functions read as
# df_row.doc. memoize/gpu are passed straight through to SpacyPreprocessor.
spacyp = SpacyPreprocessor(
    text_field="tweet", 
    doc_field="doc", 
    memoize=True,
    gpu=True)

# Load nltk's English stopwords and add custom tags
# nltk's English stopwords plus the placeholder tags inserted during cleaning
# (with and without the leading '#'). Stored as a frozenset because it is only
# used for membership tests, which run once per token of every tweet.
stops = frozenset(
    nltk.corpus.stopwords.words('english')
    + ['#has_mention', '#has_url', '#has_retweet', '#has_truncate']
    + ['has_mention', 'has_url', 'has_retweet', 'has_truncate']
)

# Additional preprocess step to identify lemmas
def spacyp_lemmatize(doc):
    """Return the lowercased lemmas of the alphabetic, non-stopword tokens.

    Args:
        doc: iterable of tokens exposing ``is_alpha``, ``text`` and ``lemma_``.

    Returns:
        list of str, in token order.
    """
    lemmas = []
    for token in doc:
        if not token.is_alpha:
            continue
        if token.text.lower() in stops:
            continue
        lemmas.append(str(token.lemma_).lower())
    return lemmas

In [None]:
# LFs with Trained Classifiers

# Use trained classifier for Tf-Idf bag of words
# Cache for the unpickled vectorizer/classifier so they are read from disk
# once instead of once per labelled row.
_TFIDF_BOW_ARTIFACTS = {}


def _load_tfidf_bow_artifacts():
    """Load (once) and return the Tf-Idf vectorizer and bag-of-words model."""
    if not _TFIDF_BOW_ARTIFACTS:
        MODEL_PATH = '../models/'
        # NOTE(review): changing the working directory is preserved from the
        # original (now once instead of per row) in case later cells rely on
        # it; prefer absolute paths and drop the chdir once confirmed.
        os.chdir(MODEL_PATH)
        # SECURITY: pickle.load executes arbitrary code; only load model
        # files that come from a trusted source.
        with open('vectorizer_tfidf.pkl', 'rb') as file:
            _TFIDF_BOW_ARTIFACTS['vectorizer'] = pickle.load(file)
        with open('model_bow_nb.pkl', 'rb') as file:
            _TFIDF_BOW_ARTIFACTS['model'] = pickle.load(file)
    return _TFIDF_BOW_ARTIFACTS['vectorizer'], _TFIDF_BOW_ARTIFACTS['model']


@labeling_function()
def lf_clf_tfidf_bow(df_row):
    """Vote with the pre-trained Tf-Idf bag-of-words classifier.

    Args:
        df_row: row exposing the cleaned text as ``df_row.tweet``.

    Returns:
        The classifier's prediction for the tweet (first element of
        ``predict``'s output).
    """
    tfidf_vec, model_bow_nb = _load_tfidf_bow_artifacts()

    ## apply vectorization
    X_test_vec = tfidf_vec.transform([df_row.tweet])
    ## make prediction
    score = model_bow_nb.predict(X_test_vec)

    return score[0]

# Use trained classifier for word embedding
# Cache for the n-gram detectors, Keras tokenizer and compiled network so they
# are loaded once instead of once per labelled row.
_WORDEMBED_ARTIFACTS = {}


def _load_wordembed_artifacts():
    """Load (once) and return the word-embedding model and its preprocessors."""
    if not _WORDEMBED_ARTIFACTS:
        MODEL_PATH = '../models/'
        # NOTE(review): changing the working directory is preserved from the
        # original (now once instead of per row) in case later cells rely on
        # it; prefer absolute paths and drop the chdir once confirmed.
        os.chdir(MODEL_PATH)
        # SECURITY: pickle.load executes arbitrary code; only load model
        # files that come from a trusted source.
        with open('detector_bigram.pkl', 'rb') as file:
            detector_bigram = pickle.load(file)
        with open('detector_trigram.pkl', 'rb') as file:
            detector_trigram = pickle.load(file)
        with open('token_keras.pkl', 'rb') as file:
            k_token = pickle.load(file)

        with open('model_wordembed_main.json', 'r') as json_file:
            loaded_model = model_from_json(json_file.read())
        loaded_model.load_weights("model_wordembed_weights.h5")
        loaded_model.compile(
            loss='sparse_categorical_crossentropy',
            optimizer='adam',
            metrics=['accuracy']
        )

        _WORDEMBED_ARTIFACTS.update(
            detector_bigram=detector_bigram,
            detector_trigram=detector_trigram,
            k_token=k_token,
            loaded_model=loaded_model,
        )
    return _WORDEMBED_ARTIFACTS


# Use trained classifier for word embedding
@labeling_function(pre=[spacyp])
def lf_clf_wordembed_nlp(df_row):
    """Vote with the pre-trained word-embedding network.

    The row's spaCy doc is lemmatized, passed through the bigram and trigram
    detectors, converted to a sequence padded/truncated to length 15, and
    scored by the network.

    Args:
        df_row: row exposing the spaCy doc as ``df_row.doc``.

    Returns:
        ``np.argmax`` of the network output; 1 when scoring fails.
    """
    artifacts = _load_wordembed_artifacts()

    ## detect common bigrams and trigrams
    X_test_ug = spacyp_lemmatize(df_row.doc)
    X_test_bg = list(artifacts['detector_bigram'][X_test_ug])
    X_test_tg = list(artifacts['detector_trigram'][X_test_bg])
    ## create sequence
    lst_txt2seq = artifacts['k_token'].texts_to_sequences(
        [" ".join(X_test_tg)]
    )
    ## pad sequence
    X_test_pad = pad_sequences(
        lst_txt2seq,
        maxlen=15,
        padding="post",
        truncating="post"
    )

    ## make prediction
    try:
        score = artifacts['loaded_model'](X_test_pad)
    except Exception:
        # The original used a bare except with `return` inside `finally`,
        # which also swallowed KeyboardInterrupt/SystemExit.
        score = [0, 1]  ## default to abuse for oov, more common in dataset
    return np.argmax(score)

In [None]:
# LFs with Third-Party Models and Heuristics

# Use pre trained model for sentiment analysis
# VADER focuses on punctuation, capitalization, degree modifiers, 
# conjunctions, preceding tri-grams
# VADER also accounts for emojis, slang, and emoticons

# Positive sentiment, less likely abusive language
# Analyzer shared across rows; it was previously rebuilt for every tweet.
_VADER_ANALYZER = None


@labeling_function()
def lf_vader_sentiment(df_row):
    """Vote from VADER's compound sentiment score of the tweet.

    Args:
        df_row: row exposing the cleaned text as ``df_row.tweet``.

    Returns:
        NO_ABUSE when the compound score is >= 0.05, ABUSE when it is
        <= -0.05, ABSTAIN in between.
    """
    global _VADER_ANALYZER
    if _VADER_ANALYZER is None:
        _VADER_ANALYZER = SentimentIntensityAnalyzer()

    compound = _VADER_ANALYZER.polarity_scores(df_row.tweet)['compound']
    if compound >= 0.05:
        return NO_ABUSE
    if compound <= -0.05:
        return ABUSE
    return ABSTAIN

# Load emoji sentiment dictionary
# Emoji sentiment table: each emoji is assigned whichever of the
# Positive / Neutral / Negative columns holds its largest value.
emoji_df = pd.read_csv('../data/external/emoji_sentiment_data_v1.csv')
_sentiment_columns = ['Positive', 'Neutral', 'Negative']
emoji_df['sentiment'] = emoji_df[_sentiment_columns].idxmax(axis=1)

_is_positive = emoji_df['sentiment'] == 'Positive'
_is_negative = emoji_df['sentiment'] == 'Negative'
pos_emoji = emoji_df.loc[_is_positive, 'Emoji']  ## positive emoji
neg_emoji = emoji_df.loc[_is_negative, 'Emoji']  ## negative emoji

# Positive emoji, less likely abusive language
# Apply sentiment ranking based on http://kt.ijs.si/data/Emoji_sentiment_ranking/
# Matcher shared across rows; it is built from the first doc's vocab because a
# spaCy Matcher must share its vocab with the docs it is applied to.
_EMOJI_MATCHER = None


@labeling_function(pre=[spacyp])
def lf_emoji_sentiment_nlp(df_row):
    """Vote from the balance of positive and negative emoji in the tweet.

    Args:
        df_row: row exposing the spaCy doc as ``df_row.doc``.

    Returns:
        NO_ABUSE when positive emoji outnumber negative ones, ABUSE when
        negative emoji outnumber positive ones, ABSTAIN otherwise (no emoji
        or a tie).
    """
    global _EMOJI_MATCHER
    doc = df_row.doc
    if _EMOJI_MATCHER is None:
        matcher = Matcher(doc.vocab)
        matcher.add("HAPPY", [[{"ORTH": emoji}] for emoji in pos_emoji])
        matcher.add("SAD", [[{"ORTH": emoji}] for emoji in neg_emoji])
        _EMOJI_MATCHER = matcher

    # Count matches explicitly. The original took max() over a set with an
    # extra 'None' vote, so ties were broken by set iteration order and a
    # single emoji could randomly vote or abstain.
    votes = {"HAPPY": 0, "SAD": 0}
    for match_id, _start, _end in _EMOJI_MATCHER(doc):
        votes[doc.vocab.strings[match_id]] += 1

    # The original returned the undefined names NO_HATE / HATE (NameError);
    # the notebook's vote constants are NO_ABUSE / ABUSE.
    if votes["HAPPY"] > votes["SAD"]:
        return NO_ABUSE
    if votes["SAD"] > votes["HAPPY"]:
        return ABUSE
    return ABSTAIN

In [None]:
# LFs with Complex Preprocessor spaCy and Pattern Matching

# Shorter comment and focus on a person, more likely abusive language
@labeling_function(pre=[spacyp])
def lf_has_person_nlp(df_row):
    """Vote ABUSE for a short doc (< 20 tokens) containing a PERSON entity.

    Returns ABSTAIN for longer docs or docs without a PERSON entity.
    """
    doc = df_row.doc
    if len(doc) >= 20:
        return ABSTAIN
    entity_labels = {ent.label_ for ent in doc.ents}
    return ABUSE if "PERSON" in entity_labels else ABSTAIN

# Mentions of titles for books, songs, or works of art, more likely abuse 
@labeling_function(pre=[spacyp])
def lf_has_work_art_nlp(df_row):
    """Vote ABUSE when the doc contains a WORK_OF_ART entity, else ABSTAIN."""
    for ent in df_row.doc.ents:
        if ent.label_ == "WORK_OF_ART":
            return ABUSE
    return ABSTAIN

# Mentions of at least 3 named entities, more likely abusive language 
@labeling_function(pre=[spacyp])
def lf_has_3plus_entity_nlp(df_row):
    """Vote ABUSE when the doc has at least three entities of the listed types.

    Only PERSON, GPE, LOC, ORG, LAW and LANGUAGE entities are counted.
    Returns ABSTAIN otherwise.
    """
    counted_labels = {"PERSON", "GPE", "LOC", "ORG", "LAW", "LANGUAGE"}
    # The original took len() of a list of booleans, which counted every
    # entity regardless of its label.
    n_entities = sum(
        1 for ent in df_row.doc.ents if ent.label_ in counted_labels
    )
    return ABUSE if n_entities > 2 else ABSTAIN

# Mentions of "please stop" phrases, sarcastic, more likely abusive language
# Usually directed at those making abusive language comments or other issues
# Matcher shared across rows; previously a blank English pipeline and a new
# Matcher were built for every tweet.
_PLEASE_STOP_MATCHER = None


@labeling_function(pre=[spacyp])
def lf_has_please_stop_nlp(df_row):
    """Vote ABUSE when the doc contains "do not" or "stop" (by lemma).

    Returns ABSTAIN when neither pattern matches.
    """
    global _PLEASE_STOP_MATCHER
    doc = df_row.doc
    if _PLEASE_STOP_MATCHER is None:
        # built from the doc's own vocab, as the spaCy Matcher API requires
        matcher = Matcher(doc.vocab)
        matcher.add("p1", [[{"LEMMA": "do"}, {"LEMMA": "not"}]])
        matcher.add("p2", [[{"LEMMA": "stop"}]])
        _PLEASE_STOP_MATCHER = matcher
    return ABUSE if _PLEASE_STOP_MATCHER(doc) else ABSTAIN

# Higher stopword ratio, more likely abusive language
@labeling_function(pre=[spacyp])
def lf_has_stopwords_nlp(df_row):
    """Vote ABUSE for docs longer than 10 tokens that are mostly stopwords.

    Returns ABUSE when more than half of the tokens are in ``stops``,
    ABSTAIN otherwise.
    """
    doc = df_row.doc
    n_tokens = len(doc)
    # Checking the length first also avoids dividing by zero on an empty doc.
    if n_tokens <= 10:
        return ABSTAIN
    num_stopwords = sum(1 for token in doc if token.lower_ in stops)
    return ABUSE if num_stopwords / n_tokens > 0.5 else ABSTAIN

# Mentions of "about harass xyz" phrases, less likely abusive language
# Matcher shared across rows; previously a blank English pipeline and a new
# Matcher were built for every tweet.
_HARASSMENT_MATCHER = None


@labeling_function(pre=[spacyp])
def lf_has_harassment_nlp(df_row):
    """Vote NO_ABUSE when the doc talks about harassment.

    Matches "harass me" and "not/be/about/get" followed by "harass"
    (by lemma). Returns ABSTAIN when nothing matches.
    """
    global _HARASSMENT_MATCHER
    doc = df_row.doc
    if _HARASSMENT_MATCHER is None:
        # built from the doc's own vocab, as the spaCy Matcher API requires
        matcher = Matcher(doc.vocab)
        # "me" is matched on its surface form: pronoun lemmas depend on the
        # spaCy version (e.g. "-PRON-" in spaCy 2.x), so LEMMA "me" may
        # never match.
        matcher.add("p1", [[{"LEMMA": "harass"}, {"LOWER": "me"}]])
        matcher.add("p2", [[{"LEMMA": "not"}, {"LEMMA": "harass"}]])
        matcher.add("p3", [[{"LEMMA": "be"}, {"LEMMA": "harass"}]])
        matcher.add("p4", [[{"LEMMA": "about"}, {"LEMMA": "harass"}]])
        matcher.add("p5", [[{"LEMMA": "get"}, {"LEMMA": "harass"}]])
        _HARASSMENT_MATCHER = matcher
    return NO_ABUSE if _HARASSMENT_MATCHER(doc) else ABSTAIN

# Mentions of "report you" phrases, less likely abusive language 
# Matcher shared across rows; previously a blank English pipeline and a new
# Matcher were built for every tweet.
_REPORT_YOU_MATCHER = None


@labeling_function(pre=[spacyp])
def lf_has_report_you_nlp(df_row):
    """Vote NO_ABUSE when the doc contains "report you", else ABSTAIN."""
    global _REPORT_YOU_MATCHER
    doc = df_row.doc
    if _REPORT_YOU_MATCHER is None:
        # built from the doc's own vocab, as the spaCy Matcher API requires
        matcher = Matcher(doc.vocab)
        # "you" is matched on its surface form: pronoun lemmas depend on the
        # spaCy version (e.g. "-PRON-" in spaCy 2.x), so LEMMA "you" may
        # never match.
        matcher.add("p1", [[{"LEMMA": "report"}, {"LOWER": "you"}]])
        _REPORT_YOU_MATCHER = matcher
    return NO_ABUSE if _REPORT_YOU_MATCHER(doc) else ABSTAIN

# Mentions of "please read" phrases, less likely abusive language
# Matcher shared across rows; previously a blank English pipeline and a new
# Matcher were built for every tweet.
_PLEASE_READ_MATCHER = None


@labeling_function(pre=[spacyp])
def lf_has_please_read_nlp(df_row):
    """Vote NO_ABUSE when the doc contains "please read", else ABSTAIN.

    The pattern optionally extends over a following "the" and/or "this".
    """
    global _PLEASE_READ_MATCHER
    doc = df_row.doc
    if _PLEASE_READ_MATCHER is None:
        # built from the doc's own vocab, as the spaCy Matcher API requires
        matcher = Matcher(doc.vocab)
        pattern = [{"LEMMA": "please"},
                   {"LEMMA": "read"},
                   {"LEMMA": "the", "OP": "?"},
                   {"LEMMA": "this", "OP": "?"}]
        matcher.add("p1", [pattern])
        _PLEASE_READ_MATCHER = matcher
    return NO_ABUSE if _PLEASE_READ_MATCHER(doc) else ABSTAIN

In [None]:
# Reload unique, offensive word lists from multiple bad word sources

# One frame per bad-word source; the keyword labeling functions read the
# 'word' column of each frame.
# NOTE(review): pd.read_csv turns entries such as "null" or "nan" into NaN by
# default — confirm none of the word lists contain them.
bw1 = pd.read_csv('../data/interim/kaggle_hatespeech_detection_badwords2.txt')
bw2a = pd.read_csv('../data/interim/kaggle_bad_bad_words2.txt')
bw3a = pd.read_csv('../data/interim/badwordslist_badwords2.txt')
bw4a = pd.read_csv('../data/interim/profanityfilter_badwords2.txt')
bw5a = pd.read_csv('../data/interim/better_profanity_wordlist2.txt')
bw6a = pd.read_csv('../data/interim/solo_badwords_map2.txt')

In [None]:
# LFs with Keywords for offensive words and leetspeak versions

# Generalize keyword lookup 
# Word tokens (letters, digits, underscores, apostrophes) and '#'-prefixed
# tags. Same pattern the original passed to nltk's regexp_tokenize, compiled
# once as a raw string.
_KEYWORD_TOKEN_RE = re.compile(r"[\w']+|#[\w']+")


def keyword_lookup(df_row, keywords, label):
    """Return *label* when the tweet contains any of *keywords*.

    Matching is case-insensitive. A keyword that is itself a single token
    (e.g. ``'please'``, ``'#has_url'``) must equal a whole token of the tweet.
    A keyword that is not (a phrase such as ``'thank you'`` or punctuation
    such as ``'!!'``) is matched as a substring of the tweet, because such
    keywords can never equal a single token.

    Args:
        df_row: row exposing the cleaned text as ``df_row.tweet``.
        keywords: iterable of keyword strings; empty or non-string entries
            are skipped.
        label: value to return on a match.

    Returns:
        *label* on the first matching keyword, otherwise ABSTAIN.
    """
    tweet = df_row.tweet
    tokens = {token.lower() for token in _KEYWORD_TOKEN_RE.findall(tweet)}
    tweet_lower = tweet.lower()

    for word in keywords:
        if not isinstance(word, str) or not word:
            continue
        word = word.lower()
        if _KEYWORD_TOKEN_RE.fullmatch(word):
            if word in tokens:
                return label
        elif word in tweet_lower:
            return label
    return ABSTAIN

# Use of profanity, racial/ethnic slurs, gender insults, political slurs 
@labeling_function()
def lf_has_bad_words1(df_row):
    """Vote ABUSE when the tweet contains a word from the ``bw1`` list."""
    bad_words = bw1['word']
    return keyword_lookup(df_row, bad_words, ABUSE)

@labeling_function()
def lf_has_bad_words2(df_row):
    """Vote ABUSE when the tweet contains a word from the ``bw2a`` list."""
    bad_words = bw2a['word']
    return keyword_lookup(df_row, bad_words, ABUSE)

@labeling_function()
def lf_has_bad_words3(df_row):
    """Vote ABUSE when the tweet contains a word from the ``bw3a`` list."""
    bad_words = bw3a['word']
    return keyword_lookup(df_row, bad_words, ABUSE)

@labeling_function()
def lf_has_bad_words4(df_row):
    """Vote ABUSE when the tweet contains a word from the ``bw4a`` list."""
    bad_words = bw4a['word']
    return keyword_lookup(df_row, bad_words, ABUSE)

@labeling_function()
def lf_has_bad_words5(df_row):
    """Vote ABUSE when the tweet contains a word from the ``bw5a`` list."""
    bad_words = bw5a['word']
    return keyword_lookup(df_row, bad_words, ABUSE)

@labeling_function()
def lf_has_bad_words6(df_row):
    """Vote ABUSE when the tweet contains a word from the ``bw6a`` list."""
    bad_words = bw6a['word']
    return keyword_lookup(df_row, bad_words, ABUSE)

# Use of please (and variations), sarcastic, more likely abusive language
@labeling_function()
def lf_has_please(df_row):
    """Vote ABUSE when the tweet contains "please" or one of its short forms."""
    please_variants = ['please', 'plz', 'pls', 'pl']
    return keyword_lookup(df_row, please_variants, ABUSE)

# Use of thank you (and variations), less likely abusive language
@labeling_function()
def lf_has_thankyou(df_row):
    """Vote NO_ABUSE when the tweet contains a thank-you keyword."""
    thanks_variants = ['thank you', 'thanks', 'thx', 'tx']
    return keyword_lookup(df_row, thanks_variants, NO_ABUSE)

In [None]:
# LFs with Keywords for specific elements

# Use of all CAPS, less likely abusive language
@labeling_function()
def lf_all_capslock(df_row):
    """Vote NO_ABUSE when the tweet is written entirely in capital letters.

    The lowercase ``#has_mention`` / ``#has_retweet`` / ``#has_truncate`` /
    ``#has_url`` tags are ignored, since comparing them against their
    uppercase form would rule out every tagged tweet. ``str.isupper`` also
    requires at least one letter, so tweets made only of digits, emoji or
    punctuation no longer count as all caps.

    Returns:
        NO_ABUSE for an all-caps tweet, ABSTAIN otherwise.
    """
    text = re.sub(
        r'#has_(?:mention|retweet|truncate|url)\b', ' ', df_row.tweet
    )
    return NO_ABUSE if text.isupper() else ABSTAIN

# Has angry punctuations
@labeling_function()
def lf_has_angry_punctuations(df_row):
    """Vote ABUSE when keyword_lookup finds '!!', '??' or '**' in the tweet."""
    angry_marks = ['!!', '??', '**']
    return keyword_lookup(df_row, angry_marks, ABUSE)

# Includes url, more likely abusive language
@labeling_function()
def lf_has_url(df_row):
    """Vote ABUSE when the tweet carries the ``#has_url`` tag."""
    url_tag = ['#has_url']
    return keyword_lookup(df_row, url_tag, ABUSE)

# Includes truncate, longer text, more likely abusive language
@labeling_function()
def lf_has_truncate(df_row):
    """Vote ABUSE when the tweet carries the ``#has_truncate`` tag."""
    truncate_tag = ['#has_truncate']
    return keyword_lookup(df_row, truncate_tag, ABUSE)

# Includes mention, more likely abusive language
@labeling_function()
def lf_has_mention(df_row):
    """Vote ABUSE when the tweet carries the ``#has_mention`` tag."""
    mention_tag = ['#has_mention']
    return keyword_lookup(df_row, mention_tag, ABUSE)

# Includes retweet, more likely abusive language
@labeling_function()
def lf_has_retweet(df_row):
    """Vote ABUSE when the tweet carries the ``#has_retweet`` tag."""
    retweet_tag = ['#has_retweet']
    return keyword_lookup(df_row, retweet_tag, ABUSE)

In [None]:
# Define list of labeling functions
# Every labeling function applied to the data. The list order fixes each
# function's index "j" (and column) in the LFAnalysis summaries.
lfs = [
    lf_clf_tfidf_bow,
    lf_clf_wordembed_nlp, ## slow word embedding
    lf_vader_sentiment,
    lf_emoji_sentiment_nlp,
    lf_has_person_nlp,
    lf_has_work_art_nlp,
    lf_has_3plus_entity_nlp,
    lf_has_please_stop_nlp,
    lf_has_stopwords_nlp,
    lf_has_harassment_nlp, ## low coverage and empirical accuracy
    lf_has_report_you_nlp, ## low coverage and empirical accuracy of 0%
    lf_has_please_read_nlp, ## low coverage and empirical accuracy of 0%
    lf_has_bad_words1,
    lf_has_bad_words2,
    lf_has_bad_words3,
    lf_has_bad_words4,
    lf_has_bad_words5,
    lf_has_bad_words6,
    lf_has_please,
    lf_has_thankyou,
    lf_all_capslock,
    lf_has_angry_punctuations, ## low coverage and empirical accuracy of 0%
    lf_has_url,
    lf_has_truncate,
    lf_has_mention,
    lf_has_retweet
    ]

# Setup tooling to analyze labeling functions
# The applier runs every function in lfs against each row of a DataFrame.
applier = PandasLFApplier(lfs)

In [None]:
# Iterate through each chunk of cleaned data
# NOTE(review): the directory was redacted from the notebook ("##"); set it
# before running this cell.
OPEN_PATH = ##
FILE_NAME = '20201103_clean_p1_1.txt'

# Load chunk 1 of the sample
temp_df = pd.read_csv(OPEN_PATH + FILE_NAME)

# Apply every labeling function to the chunk
temp_lfs = applier.apply(temp_df)
# The string block below is disabled code: it would pickle the applier output
# to lfs_<first 8 chars of FILE_NAME>_<chars -8..-4 of FILE_NAME>.pkl
'''
model_name = 'lfs_{}_{}.pkl'.format(FILE_NAME[:8], FILE_NAME[-8:-4])
with open(model_name, 'wb') as file:
    pickle.dump(temp_lfs, file)
'''
# Per-function summary table for this chunk
LFAnalysis(L=temp_lfs, lfs=lfs).lf_summary()

  from pandas import Panel
100%|██████████| 25000/25000 [14:08:45<00:00,  2.04s/it]


Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
lf_clf_tfidf_bow,0,"[0, 1]",1.0,1.0,0.91364
lf_clf_wordembed_nlp,1,"[0, 1]",1.0,1.0,0.91364
lf_vader_sentiment,2,"[0, 1]",0.5846,0.5846,0.53992
lf_emoji_sentiment_nlp,3,"[0, 1]",0.06828,0.06828,0.06528
lf_has_person_nlp,4,[1],0.14776,0.14776,0.13744
lf_has_work_art_nlp,5,[1],0.01168,0.01168,0.01048
lf_has_3plus_entity_nlp,6,[1],0.17872,0.17872,0.16436
lf_has_please_stop_nlp,7,[1],0.06088,0.06088,0.05664
lf_has_stopwords_nlp,8,[1],0.11972,0.11972,0.11096
lf_has_harassment_nlp,9,[],0.0,0.0,0.0


In [None]:
# Iterate through each chunk of cleaned data
# NOTE(review): the directory was redacted from the notebook ("##"); set it
# before running this cell.
OPEN_PATH = ##
FILE_NAME = '20201103_clean_p1_4.txt'

# Load chunk 4 of the sample
temp_df = pd.read_csv(OPEN_PATH + FILE_NAME)

# Apply every labeling function to the chunk
temp_lfs = applier.apply(temp_df)
# The string block below is disabled code: it would pickle the applier output
# to lfs_<first 8 chars of FILE_NAME>_<chars -8..-4 of FILE_NAME>.pkl
'''
model_name = 'lfs_{}_{}.pkl'.format(FILE_NAME[:8], FILE_NAME[-8:-4])
with open(model_name, 'wb') as file:
    pickle.dump(temp_lfs, file)
'''
# Per-function summary table for this chunk
LFAnalysis(L=temp_lfs, lfs=lfs).lf_summary()

  from pandas import Panel
100%|██████████| 25000/25000 [13:55:16<00:00,  2.00s/it]


Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
lf_clf_tfidf_bow,0,"[0, 1]",1.0,1.0,0.91548
lf_clf_wordembed_nlp,1,"[0, 1]",1.0,1.0,0.91548
lf_vader_sentiment,2,"[0, 1]",0.58532,0.58532,0.54136
lf_emoji_sentiment_nlp,3,"[0, 1]",0.05324,0.05324,0.0524
lf_has_person_nlp,4,[1],0.15276,0.15276,0.14224
lf_has_work_art_nlp,5,[1],0.01208,0.01208,0.01096
lf_has_3plus_entity_nlp,6,[1],0.17804,0.17804,0.16496
lf_has_please_stop_nlp,7,[1],0.06016,0.06016,0.05648
lf_has_stopwords_nlp,8,[1],0.11948,0.11948,0.1102
lf_has_harassment_nlp,9,[0],4e-05,4e-05,4e-05


In [None]:
# Iterate through each chunk of cleaned data
# NOTE(review): the directory was redacted from the notebook ("##"); set it
# before running this cell.
OPEN_PATH = ##
FILE_NAME = '20201103_clean_p1_6.txt'

# Load chunk 6 of the sample
temp_df = pd.read_csv(OPEN_PATH + FILE_NAME)

# Apply every labeling function to the chunk
temp_lfs = applier.apply(temp_df)
# The string block below is disabled code: it would pickle the applier output
# to lfs_<first 8 chars of FILE_NAME>_<chars -8..-4 of FILE_NAME>.pkl
'''
model_name = 'lfs_{}_{}.pkl'.format(FILE_NAME[:8], FILE_NAME[-8:-4])
with open(model_name, 'wb') as file:
    pickle.dump(temp_lfs, file)
'''
# Per-function summary table for this chunk
LFAnalysis(L=temp_lfs, lfs=lfs).lf_summary()

  from pandas import Panel
100%|██████████| 25000/25000 [17:27:42<00:00,  2.51s/it]


Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
lf_clf_tfidf_bow,0,"[0, 1]",1.0,1.0,0.91404
lf_clf_wordembed_nlp,1,"[0, 1]",1.0,1.0,0.91404
lf_vader_sentiment,2,"[0, 1]",0.58852,0.58852,0.5416
lf_emoji_sentiment_nlp,3,"[0, 1]",0.07012,0.07012,0.06684
lf_has_person_nlp,4,[1],0.15472,0.15472,0.1446
lf_has_work_art_nlp,5,[1],0.01312,0.01312,0.01192
lf_has_3plus_entity_nlp,6,[1],0.18024,0.18024,0.1662
lf_has_please_stop_nlp,7,[1],0.06156,0.06156,0.05724
lf_has_stopwords_nlp,8,[1],0.11832,0.11832,0.1078
lf_has_harassment_nlp,9,[0],0.00012,0.00012,0.00012


In [None]:
# Iterate through each chunk of cleaned data
# NOTE(review): the directory was redacted from the notebook ("##"); set it
# before running this cell.
OPEN_PATH = ##
FILE_NAME = '20201103_clean_p1_8.txt'

# Load chunk 8 of the sample
temp_df = pd.read_csv(OPEN_PATH + FILE_NAME)

# Apply every labeling function to the chunk
temp_lfs = applier.apply(temp_df)

# Pickle the applier output to
# lfs_<first 8 chars of FILE_NAME>_<chars -8..-4 of FILE_NAME>.pkl in the
# current working directory.
# NOTE(review): the [-8:-4] slice assumes a single-digit chunk number.
model_name = 'lfs_{}_{}.pkl'.format(FILE_NAME[:8], FILE_NAME[-8:-4])
with open(model_name, 'wb') as file:
    pickle.dump(temp_lfs, file)

# Per-function summary table for this chunk
LFAnalysis(L=temp_lfs, lfs=lfs).lf_summary()

In [None]:
# NOTE(review): tempdf1 is not defined anywhere in the visible notebook;
# presumably a previously saved applier output — confirm where it is loaded.
LFAnalysis(L=tempdf1, lfs=lfs).lf_summary()

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
lf_clf_tfidf_bow,0,"[0, 1]",1.0,1.0,0.91404
lf_clf_wordembed_nlp,1,"[0, 1]",1.0,1.0,0.91404
lf_vader_sentiment,2,"[0, 1]",0.58852,0.58852,0.5416
lf_emoji_sentiment_nlp,3,"[0, 1]",0.07012,0.07012,0.06684
lf_has_person_nlp,4,[1],0.15472,0.15472,0.1446
lf_has_work_art_nlp,5,[1],0.01312,0.01312,0.01192
lf_has_3plus_entity_nlp,6,[1],0.18024,0.18024,0.1662
lf_has_please_stop_nlp,7,[1],0.06156,0.06156,0.05724
lf_has_stopwords_nlp,8,[1],0.11832,0.11832,0.1078
lf_has_harassment_nlp,9,[0],0.00012,0.00012,0.00012
