In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from gensim.models import Word2Vec
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [4]:
data = pd.read_csv(r'C:\Users\Dell\Documents\projects\hate speech detection\hate-speech-and-offensive-language-master\data\labeled_data.csv')

#### Import dataset

In [5]:
data

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
...,...,...,...,...,...,...,...
24778,25291,3,0,2,1,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...
24779,25292,3,0,1,2,2,"you've gone and broke the wrong heart baby, an..."
24780,25294,3,0,3,0,1,young buck wanna eat!!.. dat nigguh like I ain...
24781,25295,6,0,6,0,1,youu got wild bitches tellin you lies


In [6]:
data['class'].value_counts()

1    19190
2     4163
0     1430
Name: class, dtype: int64

### Counting class values for using weights

In [7]:
X_train, X_test, y_train, y_test = train_test_split(data['tweet'], data['class'], test_size=0.2, random_state=42)


In [8]:
stop_words = set(stopwords.words('english'))
def preprocess(text):
    text = text.lower()
    text = ''.join([word for word in text if word not in string.punctuation])
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

X_train = X_train.apply(preprocess)
X_test = X_test.apply(preprocess)

### Basic text preprocessing done here. We lower case the text,tokenize it, remove punctuation and stop words.  

In [9]:
sentences = [sentence.split() for sentence in X_train]
w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=5, workers=4)

### We're developing the vocabulary for our data in word2vec. we're using word2vec with vector size of 100, window size 5, minimum count of 5 words and 4 workers for optimum performance as mine is just a quad core. 

In [12]:
def vectorize(sentence):
    words = sentence.split()
    words_vecs = [w2v_model.wv[word] for word in words if word in w2v_model.wv]
    if len(words_vecs) == 0:
        return np.zeros(100)
    words_vecs = np.array(words_vecs)
    return words_vecs.mean(axis=0)

X_train = np.array([vectorize(sentence) for sentence in X_train])
X_test = np.array([vectorize(sentence) for sentence in X_test])

#### We're getting the embeddings for all the training and testing data. For each of the training/testing instance we'll get the average of all the word embeddings or a '0' array.

In [46]:
weights={0:13.41,1:4.61,2:1}

In [None]:
### Weights

In [47]:
y_train

15272    0
9351     2
20323    1
3638     1
20579    1
        ..
21575    2
5390     1
860      1
15795    1
23654    1
Name: class, Length: 19826, dtype: int64

In [54]:
clf = LogisticRegression()
clf.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Fitting the embeddings on a logistic regression for classification.

In [55]:
y_pred = clf.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred,average='weighted' ))
print('Recall:', recall_score(y_test, y_pred,average='weighted' ))
print('F1 score:', f1_score(y_test, y_pred,average='weighted' ))

Accuracy: 0.8448658462779908
Precision: 0.7895402630418822
Recall: 0.8448658462779908
F1 score: 0.813582930307574


  _warn_prf(average, modifier, msg_start, len(result))


#### Got a decent accuracy

# Using a deep layer

In [23]:
from keras.models import Sequential
from keras.layers import Dense
import keras
from sklearn.preprocessing import OneHotEncoder

In [24]:
X_test, val_x,  y_test, val_y = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [25]:
enc = OneHotEncoder(handle_unknown = 'ignore')
y_train=pd.DataFrame(y_train)
enc.fit(y_train)

In [26]:
y_train=enc.transform(y_train).toarray()
y_test=pd.DataFrame(y_test)
enc.fit(y_test)
y_test=enc.transform(y_test).toarray()
val_y=pd.DataFrame(val_y)
enc.fit(val_y)
val_y=enc.transform(val_y).toarray()

### Converting the target variable into one hot encoded form.

In [27]:
val_y

array([[0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       ...,
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.]])

In [26]:
y_test.shape

(2478, 3)

In [30]:
model = Sequential()

n_cols = X_train.shape[1]

model.add(Dense(100, activation='relu', input_shape=(n_cols,)))
model.add(Dense(60, activation='relu'))
model.add(Dense(3,activation='softmax'))
model.compile(optimizer=keras.optimizers.Adam(0.001),
    loss="categorical_crossentropy",
    metrics=[keras.metrics.CategoricalAccuracy()]
)
model.fit(X_train, y_train, validation_data=(val_x, val_y),epochs=100,class_weight=weights)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100

KeyboardInterrupt: 

#### We simply used 3 layers narrowing down from 100 to 3 (we did try different combinations though manually but can't do hyperparameter tuning). 

In [28]:
model.evaluate(X_test,y_test)



[0.5071485042572021, 0.8018563389778137]

In [None]:
### Final results!! logistic regression got better results.

In [None]:
model.predict(vectorize(preprocess('')))

# checking if the data imbalance is the cause of less accuracy

In [130]:
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks
from imblearn.under_sampling import NearMiss

In [124]:
resample=SMOTETomek(tomek=TomekLinks(sampling_strategy='majority'))
X, y = resample.fit_resample(X_train, y_train)

In [125]:
y

0        0
1        2
2        1
3        1
4        1
        ..
46039    2
46040    2
46041    2
46042    2
46043    2
Name: class, Length: 46044, dtype: int64

In [141]:
undersample = NearMiss(version=1, n_neighbors=3)
X, y = undersample.fit_resample(X_train, y_train)

#### this was tested and that's why we applied weights at the starting.