# Dive into Abusive Language with Snorkel

Author: BingYune Chen 
<br>
Updated: 2021-08-02
<br><br>
Labeled Data is provided by [University of Sheffield](https://github.com/ziqizhang/data) 

----------

### Collection of Labeled Data

With the help of the Twitter API, we can recreate (or "rehydrate") the original text of each tweet from the tweet IDs provided in the labeled datasets.

* Create a Twitter developer account and register an app
* Use the app's consumer key and access token to access the Twitter API

We define abusive language to include tweets that contain any text labeled as 'sexism,' 'racism,' 'hate speech,' or 'offensive' language by the original researchers.

In [None]:
# Imports and setup for Google Colab
# This cell only works inside Google Colab: it mounts Google Drive and then
# installs the twarc package used by the hydration cell.

# Mount Google Drive
import os, sys ## standard-library modules; not used in the visible cells
from google.colab import drive ## module to use Google Drive with Python
drive.mount('/content/drive') ## mount to access contents

# Install python libraries
# NOTE(review): `nb_path` is not defined in any visible cell, so `$nb_path`
# presumably expands to nothing here -- define it (and add it to sys.path)
# before running this line; TODO confirm the intended install directory
! pip install --target=$nb_path twarc

In [None]:
# Enter Twitter API Access Information
# Fill in the four values below before running the hydration cell, which
# reads exactly these names. The empty strings are placeholders that keep
# the cell valid Python; do not commit real credentials with the notebook.
CONSUMERKEY = '' ## API key (username)
CONSUMERSECRET = '' ## API Secret Key (password)
ACCESSTOKEN = '' ## access token
ACCESSTOKENSECRET = '' ## access token secret

In [None]:
# Use tweet IDs and Twitter API to hydrate tweets for labeled training data
# Recreate original dataset minus tweets deleted or removed from public view
# Twitter's Terms of Service allows users to only publish tweet IDs

# Code adapted from Deen Freelon @UNC-Chapel-Hill 
# >>> http://dfreelon.org/2017/01/03/beyond-the-hashtags-twitter-data/

from twarc import Twarc 
import json 

consumer_key = CONSUMERKEY ## API key (username)
consumer_secret = CONSUMERSECRET ## API Secret Key (password)
access_token = ACCESSTOKEN ## access token
access_token_secret = ACCESSTOKENSECRET ## access token secret 

t = Twarc(consumer_key, consumer_secret, access_token, access_token_secret)

# Hydrate inside a `with` block so the ID file is closed once every tweet
# has been fetched; each hydrated tweet is stored as one JSON string.
# NOTE(review): this input path ('./data/raw/wz-l/...') does not match the
# '../data/raw/wz/...' layout used when the labels are loaded later --
# confirm which file holds the tweet IDs.
with open('./data/raw/wz-l/labeled_data.csv') as id_file: ## training data
    data = [json.dumps(tweet) for tweet in t.hydrate(id_file)]

# Write one JSON document per line (JSON Lines) for pandas to read back
with open('../data/interim/naacl_srw_2016_hydrate.json','w') as outfile:
    outfile.write("\n".join(data) + '\n')

In [None]:
# Import standard libraries
import numpy as np
import pandas as pd

import html

In [None]:
# Read the tweet-id/class labels and the hydrated tweets, then join them
# on the tweet id
label_df = pd.read_csv(
    '../data/raw/wz/NAACL_SRW_2016.csv',
    names=['tweet_id', 'class'],
)
hydrate_df = pd.read_json(
    '../data/interim/naacl_srw_2016_hydrate.json',
    lines=True,
)
merge_df = pd.merge(label_df, hydrate_df, left_on='tweet_id', right_on='id')

# Collapse the classes into binary labels, 0 = no hate, 1 = hate
# (a class value missing from the mapping becomes NaN)
merge_df['label'] = merge_df['class'].map(
    {'none': 0, 'sexism': 1, 'racism': 1}
)
merge_df = merge_df.rename(columns={'full_text': 'tweet'})

# Read the second dataset, which stores hate / offensive counts per tweet
twoclass_df = pd.read_csv(
    '../data/raw/dt/labeled_data_all_2classes_only.csv',
    usecols=['tweet', 'hate_speech', 'offensive_language'],
)
# Total the hate_speech and offensive_language columns for each tweet
twoclass_df['hate_count'] = twoclass_df['hate_speech'].add(
    twoclass_df['offensive_language']
)
# Any positive count becomes label 1; every other value is kept unchanged
twoclass_df['label'] = twoclass_df['hate_count'].mask(
    twoclass_df['hate_count'] > 0, 1
)

In [None]:
# Stack the label/tweet columns of both datasets into one dataframe
pd.set_option('display.max_colwidth', None) ## do not truncate displayed text
frames = [
    frame[['label', 'tweet']] for frame in (merge_df, twoclass_df)
]
df = pd.concat(frames, ignore_index=True)
df['label'].value_counts() # 1: 24684, 0: 10637

1    24684
0    10637
Name: label, dtype: int64

In [None]:
# Drop repeated tweet texts, keeping a single row per text
# Rows are sorted by label (ascending) first, so when the same text appears
# with both labels the label-0 row is the one that is kept
df = df.sort_values(by='label').drop_duplicates(subset='tweet', keep='first')
df['label'].value_counts() # 1: 24604, 0: 10597

1    24604
0    10597
Name: label, dtype: int64

In [None]:
# Renumber the rows from 0 and discard the old index
df = df.reset_index(drop=True)

# Unescape HTML elements
def unescape_html(tweet_txt):
    """Convert HTML character references in a tweet to plain characters.

    Args:
        tweet_txt: Tweet text; for example ``'&amp;'`` becomes ``'&'``.

    Returns:
        The unescaped text. A value that is not a string (such as the NaN
        pandas uses for a missing tweet) is returned unchanged, because
        ``html.unescape`` would raise ``TypeError`` on it.
    """
    if not isinstance(tweet_txt, str):
        return tweet_txt
    return html.unescape(tweet_txt)

# Run the HTML unescaping over every tweet text
df['tweet'] = df['tweet'].map(unescape_html)

In [None]:
# Write the combined dataframe to csv without the index column
df.to_csv(path_or_buf='../data/interim/labeled_combined_data.csv', index=False)
# NOTE(review): rows 25571 and 25572 of the written file presumably contain
# an output error that has to be removed afterwards -- TODO confirm