# Online Harassment: data cleaning and first pass

In [39]:
import pandas as pd
import numpy as np
import re


from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import svm, preprocessing, linear_model
from sklearn.metrics import accuracy_score, f1_score, classification_report

The original corpus needed some processing: removing all the tabs in the actual tweets for easy loading as tsv.

In [3]:
start_regex = re.compile("(^\d+\t\w\t)")
with open('../data/onlineHarassmentDataset.tdf', 'r', encoding='latin-1') as infile, open ('../data/corrected.tdf', 'w') as outfile:
    next(infile)
    outfile.write("id\tcode\ttweet\n")
    faulty = 0
    for line in infile:
        l = line.rstrip()
        try:
            start = start_regex.search(l).group(1)
            (_, end_pos) = start_regex.search(l).span()
            outfile.write(start + line[end_pos:].replace('\t', ' '))
        except:
            faulty += 1
print('unparsed data: %d' % faulty)

unparsed data: 81


In [4]:
df = pd.read_csv("../data/corrected.tdf", sep='\t', dtype={'id': 'int64', 'code': 'category', 'tweet': 'str'})

In [5]:
df.code.value_counts()

N    15067
H     5276
Name: code, dtype: int64

In [6]:
df[df.code == 'H'].sample(5)

Unnamed: 0,id,code,tweet
20196,20214,H,RT @VOICEOFCHID: The enemies of the Jews are t...
3866,3867,H,"\""Old ass bitch... ain't been cute since the 7..."
19531,19548,H,#whitelivesmatter #backtheblue dissolution of ...
19673,19690,H,ISIS releases video in Hebrew with a message t...
20200,20218,H,RT @WDFx2EU: Anaheim California Grand Dragon o...


In [7]:
df[df.code == 'N'].sample(5)

Unnamed: 0,id,code,tweet
2241,2242,N,RT @WhittierPal: Jewish agenda demands that Eu...
416,417,N,@mccosha @bfraser747 @nazi @Loser George Soros...
18265,18276,N,"The phrase \""rabid feminist,\"" used in any con..."
6777,6785,N,@BERTIE72 @KatzLara @dorothyofisrael or is it ...
900,901,N,I bet a Jew made this picture cause if you loo...


In [8]:
from sklearn.model_selection import train_test_split
X = df['tweet']
y = df['code'].values.tolist()

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

## Char n-grams as features

Using character n-grams like in  for a first pass as mentioned in https://github.com/GreenParachute/hate-speech-popularity from https://arxiv.org/pdf/1805.04661.pdf.
Try:
* logistic regression
* svm (linear, rbf)

In [8]:
tfidf = TfidfVectorizer(analyzer = "char", ngram_range=(2,4))
X_ngrams = tfidf.fit_transform(X_train)

In [9]:
clf = linear_model.LogisticRegression(C=1.0)

In [10]:
clf.fit(X_ngrams,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [11]:
X_test_ngrams = tfidf.transform(X_test)

In [12]:
y_pred = clf.predict(X_test_ngrams)

In [14]:
print(classification_report(y_test, y_pred, labels=['H', 'N'],target_names=['harassment', 'Non harassment']))

                precision    recall  f1-score   support

    harassment       0.74      0.15      0.25      1315
Non harassment       0.77      0.98      0.86      3771

   avg / total       0.76      0.77      0.70      5086



Recall is pretty bad on the harassment class.

In [15]:
clf2 = svm.SVC(kernel="linear", C=1.0)

In [16]:
clf2.fit(X_ngrams,y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [17]:
y_pred = clf2.predict(X_test_ngrams)

In [18]:
print(classification_report(y_test, y_pred, labels=['H', 'N'],target_names=['harassment', 'Non harassment']))

                precision    recall  f1-score   support

    harassment       0.72      0.19      0.30      1315
Non harassment       0.78      0.97      0.86      3771

   avg / total       0.76      0.77      0.72      5086



Linear kernel svm:

In [19]:
clf3 = svm.SVC(kernel="rbf", C=1.0)

In [20]:
clf3.fit(X_ngrams,y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [21]:
y_pred = clf3.predict(X_test_ngrams)

In [22]:
print(classification_report(y_test, y_pred, labels=['H', 'N'],target_names=['harassment', 'Non harassment']))

                precision    recall  f1-score   support

    harassment       0.00      0.00      0.00      1315
Non harassment       0.74      1.00      0.85      3771

   avg / total       0.55      0.74      0.63      5086



  'precision', 'predicted', average, warn_for)


**TODO**: that didn't work at all. Need to understand why.

## Naive Bayes with word tokens
Using tf-idf vectorizer as above, but for words.

In [9]:
tfidf = TfidfVectorizer(analyzer = "word", ngram_range=(1,3), stop_words={'english'})

In [10]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)

In [11]:
X_tfidf_words = tfidf.fit_transform(X_train)

In [32]:
X_tfidf_words

<15257x351625 sparse matrix of type '<class 'numpy.float64'>'
	with 699165 stored elements in Compressed Sparse Row format>

In [34]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline

In [35]:
bayes_clf = Pipeline([('tfidf', TfidfVectorizer(analyzer = "word", ngram_range=(1,3), stop_words={'english'})), ('clf', MultinomialNB())])

In [36]:
bayes_clf = bayes_clf.fit(X_train, y_train)

In [37]:
y_pred = bayes_clf.predict(X_test)

In [40]:
print(classification_report(y_test, y_pred, labels=['H', 'N'],target_names=['harassment', 'Non harassment']))

                precision    recall  f1-score   support

    harassment       0.82      0.02      0.05      1315
Non harassment       0.75      1.00      0.85      3771

   avg / total       0.76      0.75      0.64      5086



Even worse recall ... best performer is still linear kernel SVM.

Let's try several iterations with different parameters for the tfidf features.

In [46]:
def bayesian_classifier(vectorizer):
    clf = Pipeline([('tfidf', vectorizer), ('clf', MultinomialNB())])
    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(classification_report(y_test, y_pred, labels=['H', 'N'],target_names=['harassment', 'Non harassment']))

In [47]:
tfidf2 = TfidfVectorizer(ngram_range=(1,3))
tfidf3 = TfidfVectorizer()
tfidf4 = TfidfVectorizer(analyzer = "char", ngram_range=(2,4))
tfidf5 = TfidfVectorizer(analyzer = "word", ngram_range=(1,2))

In [48]:
bayesian_classifier(tfidf2)

                precision    recall  f1-score   support

    harassment       0.82      0.02      0.05      1315
Non harassment       0.75      1.00      0.85      3771

   avg / total       0.76      0.75      0.64      5086



In [49]:
bayesian_classifier(tfidf3)

                precision    recall  f1-score   support

    harassment       0.81      0.02      0.03      1315
Non harassment       0.74      1.00      0.85      3771

   avg / total       0.76      0.74      0.64      5086



In [50]:
bayesian_classifier(tfidf4)

                precision    recall  f1-score   support

    harassment       1.00      0.00      0.00      1315
Non harassment       0.74      1.00      0.85      3771

   avg / total       0.81      0.74      0.63      5086



In [51]:
bayesian_classifier(tfidf5)

                precision    recall  f1-score   support

    harassment       0.95      0.01      0.03      1315
Non harassment       0.74      1.00      0.85      3771

   avg / total       0.80      0.74      0.64      5086



not much improvement for these features ...