In [167]:
import re
import ndjson
import pandas as pd
import numpy as np
from textblob import TextBlob
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem import SnowballStemmer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
import gensim
from sklearn.model_selection import GridSearchCV

# Loading the data

In [None]:
with open("..\\found_tweets.txt", "r") as fobj:
    df = pd.DataFrame(ndjson.load(fobj))
labels = pd.read_csv("../data/hatespeech_labels.csv").set_index("tweet_id")

In [None]:
df["full_text"].show()

## Detour - TextBlob sentiment

It seems that textblob came already with a pretrained sentiment analysis.
This might be useful, an we might want to retain on it. However, it seems
that the values are not very useful for our usecase.

In [None]:
sentiments = df.set_index("id")["full_text"].apply(lambda x: TextBlob(x).sentiment.polarity)
pd.concat([sentiments, labels], axis=1).set_index("label", append=True)["full_text"].unstack().hist()


In [15]:
pd.concat([sentiments, labels], axis=1).set_index("label", append=True)["full_text"].unstack().describe()

label,abusive,hateful,normal,spam
count,11810.0,2469.0,38476.0,8567.0
mean,-0.572286,-0.358625,0.117893,0.15978
std,0.283211,0.439207,0.323918,0.319509
min,-1.0,-1.0,-1.0,-1.0
25%,-0.7,-0.741667,0.0,0.0
50%,-0.6,-0.55625,0.0,0.033333
75%,-0.6,0.0,0.3125,0.36697
max,1.0,1.0,1.0,1.0


# Training A Basic Model

In [164]:
class Tokenizer(object):
    def __init__(self):
        self.url_re = re.compile("^\w+://")
        self.stemmer = SnowballStemmer("english", ignore_stopwords=True)
        self.tokenizer = TweetTokenizer(strip_handles=True)
        
    def __call__(self, doc):
        return [
            self.stemmer.stem(token) 
            for token in self.tokenizer.tokenize(doc)
            if not self.url_re.match(token)
        ]

from sklearn.preprocessing import LabelBinarizer

model = Pipeline([
    ("preprocess_text", ColumnTransformer([
        ("encode_text", CountVectorizer(
                tokenizer=Tokenizer(),
                stop_words=stopwords.words("english")
        ), "full_text")
    ])),
    ("classify", MultinomialNB())
])

In [165]:
X = df.set_index("id")
y = labels.loc[X.index, 'label'].replace({"hateful": "abusive"})

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y)

In [None]:
clf = GridSearchCV(model, param_grid={
    "preprocess_text__encode_text__ngram_range": [(1,1),(1,2),(2,2),(1,3),(2,3),(3,3),(1,4),(1,5),(1,6)],
    "classify__alpha": np.exp(np.arange(-2,2,0.1)),
    "classify__fit_prior": [True, False],
}, verbose=10, n_jobs=4)
clf.fit(Xtrain, ytrain)
pred = clf.predict(Xtest)


Fitting 5 folds for each of 720 candidates, totalling 3600 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:  1.7min
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:  2.7min
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:  4.2min
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:  5.7min
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:  7.7min
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed: 11.7min
[Parallel(n_jobs=4)]: Done  53 tasks      | elapsed: 15.7min
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed: 18.8min
[Parallel(n_jobs=4)]: Done  77 tasks      | elapsed: 22.4min
[Parallel(n_jobs=4)]: Done  90 tasks      | elapsed: 26.9min
[Parallel(n_jobs=4)]: Done 105 tasks      | elapsed: 30.8min
[Parallel(n_jobs=4)]: Done 120 tasks      | elapsed: 35.5min
[Parallel(n_jobs=4)]: Done 137 tasks      | elapsed: 41.9min
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed: 46.7min
[Parallel(n_jobs=4)]: Done 173 tasks      | elapsed: 52.8min
[Parallel(

In [179]:
confusion_matrix(ytest, pred, labels=['abusive', 'normal', 'spam', 'hateful'])

array([[2841,  736,   17,    0],
       [ 340, 8882,  379,    0],
       [  56, 1487,  593,    0],
       [   0,    0,    0,    0]], dtype=int64)

In [177]:
a=1

In [178]:
a=3