## Import data

In [86]:
import pandas as pd

In [87]:
# the data is taken from https://data.world/crowdflower/hate-speech-identification
df = pd.read_csv('twitter-hate-speech-classifier.csv')

In [88]:
print(df.head())
print(df.ftypes)

   Unnamed: 0  count  hate_speech  offensive_language  neither  class  \
0           0      3            0                   0        3      2   
1           1      3            0                   3        0      1   
2           2      3            0                   3        0      1   
3           3      3            0                   2        1      1   
4           4      6            0                   6        0      1   

                                               tweet  
0  !!! RT @mayasolovely: As a woman you shouldn't...  
1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...  
2  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...  
3  !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...  
4  !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...  
Unnamed: 0             int64:dense
count                  int64:dense
hate_speech            int64:dense
offensive_language     int64:dense
neither                int64:dense
class                  int64:dense
tweet                

In [89]:
# replace the class column with string:
df['class'].replace(0, "Hate speech", inplace=True)
df['class'].replace(1, "Offensive language", inplace=True)
df['class'].replace(2, "Neither", inplace=True)

print(df['class'].unique())


['Neither' 'Offensive language' 'Hate speech']


In [91]:
data = df[['class', 'tweet']]

In [92]:
print(data['class'].unique())

['Neither' 'Offensive language' 'Hate speech']


In [132]:
# data.loc[data['class'] == 'Hate speech']['tweet'].head(10).tolist()

## Predict the labels

### Classification

The data should be classified into 3 different clusters - therefore we should use **multi class** techniques

In [20]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [21]:
nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])

In [22]:
X = data['tweet']
y = data['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 42)

In [23]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])
nb.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [24]:

from sklearn.metrics import classification_report
y_pred = nb.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))


accuracy 0.785743106926698


In [25]:
my_tags = ['Neither','Offensive language', 'Hate speach']
print(classification_report(y_test, y_pred,target_names=my_tags))


                    precision    recall  f1-score   support

           Neither       0.00      0.00      0.00       427
Offensive language       0.98      0.08      0.14      1261
       Hate speach       0.78      1.00      0.88      5747

         micro avg       0.79      0.79      0.79      7435
         macro avg       0.59      0.36      0.34      7435
      weighted avg       0.77      0.79      0.70      7435



  'precision', 'predicted', average, warn_for)


## Test on real Twitter tweets:

#### Load data from file

In [199]:
tweets_path = 'tweets.txt'

In [200]:
import json

def read_tweets_from_file():
    texts = []
    with open(tweets_path, 'r') as input_file:
        for line in input_file.readlines():
            data = json.loads(line)
            texts.append(data['text'])
    return {'tweets':texts}

In [201]:
tweets = read_tweets_from_file()

In [202]:
tweets_df = pd.DataFrame.from_dict(tweets)

#### Run classification of twitter data

In [203]:
res = nb.predict(tweets_df['tweets'])


In [205]:
from collections import Counter 
print(Counter(res))

Counter({'Offensive language': 1998, 'Neither': 2})
