In [1]:
import re
import string

import numpy as np
import pandas as pd

from fastText import train_supervised, load_model

from sklearn import metrics
from sklearn import model_selection
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

In [2]:
# Translation table that deletes every ASCII punctuation character
translator = str.maketrans(dict.fromkeys(string.punctuation))

# Preprocessing

In [3]:
# The patterns are raw strings so that regex escapes such as \( , \w and \s are
# not parsed as (invalid) string escapes, and they are compiled once up front.
_URL_RE = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
_SCREEN_NAME_RE = re.compile(r'(?<![\w.-])@[A-Za-z][\w-]+')
_HASHTAG_RE = re.compile(r'(?:^|\s)[＃#]{1}(\w+)')


def preprocess_tweets(texts, punctuation=string.punctuation):
    """
    Clean raw tweets for classification.

    For every tweet, in this order: drop the retweet marker, remove urls,
    screen names and hashtags, lowercase, delete punctuation characters and
    strip whitespace from both ends.

    Parameters
    ----------
    texts : iterable of str
        Raw tweet texts.
    punctuation : str, optional
        Characters to delete from each tweet. Defaults to
        ``string.punctuation``.

    Returns
    -------
    list of str
        One cleaned text per input text, in the same order.
    """
    # Built locally so the function does not depend on notebook-level state
    punctuation_table = str.maketrans('', '', punctuation)

    preprocessed_texts = []
    for text in texts:
        if text.startswith('RT @'):
            # Every "RT @" is rewritten, so nested retweet markers are dropped
            # too; the remaining "@name" is removed by the screen-name pattern.
            text = text.replace("RT @", "@")
        text = _URL_RE.sub('', text)
        text = _SCREEN_NAME_RE.sub('', text)
        # The hashtag pattern also consumes the whitespace in front of the tag
        text = _HASHTAG_RE.sub('', text)
        text = text.lower()
        text = text.translate(punctuation_table)
        text = text.strip()
        preprocessed_texts.append(text)
    return preprocessed_texts

In [4]:
def print_metrics(test_y, predicted_y, tags, aver):
    """
    Compute precision, recall and F1 for a set of predictions.

    Despite its name, nothing is printed: the scores are returned as a dict
    with the keys "Precision", "Recall" and "F1". `tags` is forwarded as the
    `labels` argument and `aver` as the `average` argument of each scorer.
    """
    scorers = (
        ("Precision", metrics.precision_score),
        ("Recall", metrics.recall_score),
        ("F1", metrics.f1_score),
    )
    return {
        metric_name: scorer(test_y, predicted_y, labels=tags, average=aver)
        for metric_name, scorer in scorers
    }

In [5]:
# Load the labelled tweets; the path is relative to the notebook's working directory
df = pd.read_csv("data/hate_speech_2016.csv")

In [6]:
# (rows, columns) of the dataset as loaded
df.shape

(13851, 2)

In [7]:
# Peek at the first rows to check the label / tweet columns
df.head()

Unnamed: 0,label,tweet
0,racism,So Drasko just said he was impressed the girls...
1,racism,Drasko they didn't cook half a bird you idiot ...
2,racism,Hopefully someone cooks Drasko in the next ep ...
3,racism,of course you were born in serbia...you're as ...
4,racism,These girls are the equivalent of the irritati...


In [8]:
# Keep only the rows whose label is one of the three expected classes
df = df[df['label'].isin(['sexism', 'racism', 'none'])]
df['label'].value_counts()

none      8523
sexism    4233
racism    1093
Name: label, dtype: int64

In [9]:
# Hold out 10% of the rows as the test set; the fixed seed keeps the split repeatable
train_df, test_df = model_selection.train_test_split(
    df,
    test_size=0.1,
    random_state=42,
)

In [10]:
# Features: cleaned tweet texts
train_X = preprocess_tweets(train_df['tweet'].tolist())
test_X = preprocess_tweets(test_df['tweet'].tolist())

# Targets: label strings as plain Python lists
train_y = train_df['label'].tolist()
test_y = test_df['label'].tolist()

# Reproducing the paper

Paper: https://www.aclweb.org/anthology/N16-2013

Results reported in the paper using character n-grams only: Precision 72.87%, Recall 77.75%, F1 score 73.89%.

In [11]:
# Character 1- to 4-gram counts (within word boundaries) fed to a
# one-vs-rest logistic regression
model = Pipeline(steps=[
    ('vectorizer', CountVectorizer(analyzer='char_wb', ngram_range=(1, 4))),
    ('classifier', LogisticRegression(solver='liblinear', multi_class='ovr', C=0.2)),
])

In [12]:
# Train on the training split, then label the held-out tweets
model = model.fit(X=train_X, y=train_y)
predicted_y = model.predict(X=test_X)

In [13]:
# Macro-averaged scores over the labels present in the training split
print_metrics(
    test_y=test_y,
    predicted_y=predicted_y,
    tags=np.unique(train_df['label']),
    aver='macro',
)

{'Precision': 0.7886690943494444,
 'Recall': 0.7799537647637927,
 'F1': 0.781568236089799}

In [14]:
# Derive the row/column captions from the label array itself so they can never
# drift out of sync with the label order passed to confusion_matrix.
class_labels = np.unique(train_df['label'])
print(pd.DataFrame(metrics.confusion_matrix(test_y, predicted_y, labels=class_labels),
                   index=['true:' + str(label) for label in class_labels],
                   columns=['pred:' + str(label) for label in class_labels]))

             pred:none  pred:racism  pred:sexism
true:none          775           21           48
true:racism         23           78            8
true:sexism        109           18          305


In [15]:
vectorizer = model.named_steps['vectorizer']
classifier = model.named_steps['classifier']

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# the installed version has it and fall back to the old name otherwise.
get_ngrams = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names

# Only the first row of coef_ is inspected, i.e. the weights for one class
# (presumably classifier.classes_[0] - verify before interpreting the ranking).
model_coefs = pd.DataFrame({'coef_score': classifier.coef_[0],
                            'ngram': get_ngrams()})

In [17]:
# Rank the n-grams by coefficient, largest first, and show the top ten
model_coefs_sorted = model_coefs.sort_values(by="coef_score", ascending=False)
model_coefs_sorted.head(n=10)

Unnamed: 0,coef_score,ngram
2390,0.829362,p
28505,0.659558,xism
1204,0.647038,exi
17298,0.551652,ment
8711,0.539023,dh
23190,0.528422,rou
13598,0.512266,ian
15907,0.50896,lan
11729,0.48539,for
23223,0.484783,rp


# Fasttext classification

In [19]:
def prepare_for_fasttext(texts, labels, filepath):
    """
    Write a labelled dataset in the format read by the python fasttext
    implementation: one example per line, "__label__<label> <text>".

    Parameters
    ----------
    texts : iterable of str
        Example texts. Line breaks inside a text are replaced by a single
        space so that every example stays on exactly one line.
    labels : iterable
        One label per text; each is converted with ``str``.
    filepath : str
        Destination file. It is overwritten if it already exists.
    """
    # Explicit UTF-8 so non-ASCII tweets are written identically on every
    # platform instead of depending on the locale's default encoding.
    with open(filepath, 'w', encoding='utf-8') as outfile:
        for text, label in zip(texts, labels):
            # A raw line break would split the example in two and leave the
            # second half without a label.
            single_line = " ".join(text.splitlines())
            outfile.write("__label__" + str(label) + " " + single_line + "\n")

In [20]:
# Dump the training split to disk in fastText's input format
prepare_for_fasttext(texts=train_X, labels=train_y, filepath='hatespeech_fasttext.txt')

In [21]:
# Supervised fastText classifier trained on the file written above.
# NOTE(review): training runs on 10 threads, so results may vary slightly
# between runs - confirm if exact reproducibility is required.
fasttext_model = train_supervised(
    input='hatespeech_fasttext.txt',
    lr=0.1,
    dim=100,
    ws=5,
    epoch=30,
    minCount=5,
    loss='softmax',
    thread=10,
)

In [24]:
# Each text is put on a single line before prediction. Line breaks are replaced
# by a space (not deleted) so the words on either side are not glued together
# into one unknown token.
test_X_for_fasttext = [" ".join(text.splitlines()) for text in test_X]

In [25]:
# Take the first label returned for each text and drop the "__label__" prefix
# so the predictions are comparable with test_y
predicted_y = [
    fasttext_model.predict(text)[0][0].replace("__label__", "")
    for text in test_X_for_fasttext
]

In [26]:
# Macro-averaged scores for the fastText predictions
print_metrics(
    test_y=test_y,
    predicted_y=predicted_y,
    tags=np.unique(train_df['label']),
    aver='macro',
)

{'Precision': 0.7373877399561605,
 'Recall': 0.7360351885938142,
 'F1': 0.7366371563729935}

In [27]:
# Derive the row/column captions from the label array itself so they can never
# drift out of sync with the label order passed to confusion_matrix.
class_labels = np.unique(train_df['label'])
print(pd.DataFrame(metrics.confusion_matrix(test_y, predicted_y, labels=class_labels),
                   index=['true:' + str(label) for label in class_labels],
                   columns=['pred:' + str(label) for label in class_labels]))

             pred:none  pred:racism  pred:sexism
true:none          718           23          103
true:racism         27           71           11
true:sexism        111           16          305
