## Problem Statement
The objective of this task is to detect hate speech in tweets. For the sake of simplicity, we say a tweet contains hate speech if it has a racist or sexist sentiment associated with it. So, the task is to classify racist or sexist tweets from other tweets.

Formally, given a training sample of tweets and labels, where label ‘1’ denotes the tweet is racist/sexist and label ‘0’ denotes the tweet is not racist/sexist, your objective is to predict the labels on the test dataset.

https://www.analyticsvidhya.com/blog/2018/04/a-comprehensive-guide-to-understand-and-implement-text-classification-in-python/

In [110]:
import sys
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, f1_score, accuracy_score
from sklearn.neural_network import MLPClassifier

from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

import spacy

# from keras import layers, models, optimizers
# from keras.preprocessing import text, sequence

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


## Preprocessing

In [2]:
with open('corpus') as f:
    corpus = f.read()

In [3]:
corpus_list = []
for line in corpus.split('\n'):
    corpus_list.append(line)

In [4]:
label_length = len('__label__2')
y_list = []
X_list = []
for document in corpus_list:
    y_list.append(document[:label_length])
    X_list.append(document[label_length+1:])

In [5]:
processed_y_list = []
for i in y_list:
    if i == '__label__2':
        processed_y_list.append(2)
    else:
        processed_y_list.append(1)
y_list = processed_y_list

In [6]:
y_list = np.array(y_list)

y_list = y_list.reshape(-1, 1)

y_list = y_list.ravel()

## Feature Engineering

In [7]:
X_train, X_validation, y_train, y_validation = train_test_split(X_list, y_list, test_size = 0.2, random_state = 1)

### TF-IDF Vectors As Features

In [8]:
vectorizer = TfidfVectorizer(stop_words='english')
# X_vectorized = vectorizer.fit_transform(X_list)
vectorizer.fit(X_list)


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [9]:
tfidf_X_train = vectorizer.transform(X_train)
tfidf_X_validation = vectorizer.transform(X_validation)

### Word Embedding As Features

In [68]:
# loading pretrained model from Spacy

import en_core_web_sm

nlp = en_core_web_sm.load()

word2vec_X_train = [nlp(sentence) for sentence in X_train]

word2vec_X_validation = [nlp(sentence) for sentence in X_validation]


In [72]:
word2vec_X_train = [sentence.vector for sentence in word2vec_X_train]

word2vec_X_validation = [sentence.vector for sentence in word2vec_X_validation]

In [79]:
word2vec_X_train = np.array(word2vec_X_train)

word2vec_X_validation = np.array(word2vec_X_validation)


## Models

### Statistical Models

In [33]:
def training(classifier, X_train, X_validation, y_train, y_validation):
    model = classifier
    model.fit(X_train, y_train)
    y_validation_predict = model.predict(X_validation)
    report = classification_report(y_validation_predict, y_validation)
    accuracy = round(accuracy_score(y_validation_predict, y_validation),2)
    print(report)
    print('accuracy is', accuracy)
    return model

In [85]:
training(LogisticRegression(), tfidf_X_train, tfidf_X_validation, y_train, y_validation)

              precision    recall  f1-score   support

           1       0.85      0.84      0.85      1006
           2       0.84      0.86      0.85       994

   micro avg       0.85      0.85      0.85      2000
   macro avg       0.85      0.85      0.85      2000
weighted avg       0.85      0.85      0.85      2000

accuracy is 0.85


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [86]:
training(LogisticRegression(), word2vec_X_train, word2vec_X_validation, y_train, y_validation)

              precision    recall  f1-score   support

           1       0.71      0.67      0.69      1057
           2       0.65      0.69      0.67       943

   micro avg       0.68      0.68      0.68      2000
   macro avg       0.68      0.68      0.68      2000
weighted avg       0.68      0.68      0.68      2000

accuracy is 0.68


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [87]:
training(RandomForestClassifier(), tfidf_X_train, tfidf_X_validation, y_train, y_validation)

              precision    recall  f1-score   support

           1       0.85      0.73      0.79      1154
           2       0.70      0.83      0.76       846

   micro avg       0.78      0.78      0.78      2000
   macro avg       0.78      0.78      0.77      2000
weighted avg       0.79      0.78      0.78      2000

accuracy is 0.78


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [88]:
training(RandomForestClassifier(), word2vec_X_train, word2vec_X_validation, y_train, y_validation)

              precision    recall  f1-score   support

           1       0.73      0.58      0.65      1249
           2       0.48      0.64      0.55       751

   micro avg       0.60      0.60      0.60      2000
   macro avg       0.61      0.61      0.60      2000
weighted avg       0.64      0.60      0.61      2000

accuracy is 0.6


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [112]:
training(GradientBoostingClassifier(), tfidf_X_train, tfidf_X_validation, y_train, y_validation)

              precision    recall  f1-score   support

           1       0.83      0.76      0.79      1079
           2       0.74      0.81      0.78       921

   micro avg       0.78      0.78      0.78      2000
   macro avg       0.78      0.79      0.78      2000
weighted avg       0.79      0.78      0.78      2000

accuracy is 0.78


GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [83]:
training(GradientBoostingClassifier(), word2vec_X_train, word2vec_X_validation, y_train, y_validation)

              precision    recall  f1-score   support

           1       0.72      0.66      0.69      1091
           2       0.63      0.70      0.66       909

   micro avg       0.68      0.68      0.68      2000
   macro avg       0.68      0.68      0.68      2000
weighted avg       0.68      0.68      0.68      2000

accuracy is 0.68


GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [None]:
# if you want to do a gridsearch on gradient boosting classifier:

# parameters = {
#     "loss":["deviance"],
#     "learning_rate": [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
#     "min_samples_split": np.linspace(0.1, 0.5, 12),
#     "min_samples_leaf": np.linspace(0.1, 0.5, 12),
#     "max_depth":[3,5,8],
#     "max_features":["log2","sqrt"],
#     "criterion": ["friedman_mse",  "mae"],
#     "subsample":[0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0],
#     "n_estimators":[10]
#     }

# clf = GridSearchCV(GradientBoostingClassifier(), parameters, cv=10, n_jobs=-1)

# clf.fit(trainX, trainY)
# print(clf.score(trainX, trainY))
# print(clf.best_params_)

In [111]:
np.logspace(-4, 4, 20)

array([1.00000000e-04, 2.63665090e-04, 6.95192796e-04, 1.83298071e-03,
       4.83293024e-03, 1.27427499e-02, 3.35981829e-02, 8.85866790e-02,
       2.33572147e-01, 6.15848211e-01, 1.62377674e+00, 4.28133240e+00,
       1.12883789e+01, 2.97635144e+01, 7.84759970e+01, 2.06913808e+02,
       5.45559478e+02, 1.43844989e+03, 3.79269019e+03, 1.00000000e+04])

### Neural Networks

In [None]:
tfidf_X_train

In [91]:

training(MLPClassifier(), tfidf_X_train, tfidf_X_validation, y_train, y_validation)

              precision    recall  f1-score   support

           1       0.85      0.83      0.84      1013
           2       0.83      0.84      0.84       987

   micro avg       0.84      0.84      0.84      2000
   macro avg       0.84      0.84      0.84      2000
weighted avg       0.84      0.84      0.84      2000

accuracy is 0.84




MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [92]:
training(MLPClassifier(), word2vec_X_train, word2vec_X_validation, y_train, y_validation)

              precision    recall  f1-score   support

           1       0.62      0.67      0.64       916
           2       0.70      0.65      0.67      1084

   micro avg       0.66      0.66      0.66      2000
   macro avg       0.66      0.66      0.66      2000
weighted avg       0.66      0.66      0.66      2000

accuracy is 0.66




MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [108]:
# Since Multi-layer Perceptron is sensitive to feature scaling, so it is highly recommended to scale your data.

from sklearn.preprocessing import StandardScaler  

scaler = StandardScaler(with_mean=False)  

scaler.fit(tfidf_X_train) 

mlp_tfidf_X_train = scaler.transform(tfidf_X_train)  

mlp_tfidf_X_validation = scaler.transform(tfidf_X_validation)  

In [109]:
training(MLPClassifier(), mlp_tfidf_X_train, mlp_tfidf_X_validation, y_train, y_validation)

              precision    recall  f1-score   support

           1       0.80      0.77      0.78      1030
           2       0.76      0.79      0.78       970

   micro avg       0.78      0.78      0.78      2000
   macro avg       0.78      0.78      0.78      2000
weighted avg       0.78      0.78      0.78      2000

accuracy is 0.78


MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)