In [1]:
import numpy as np 
import pandas as pd 

import pandas as pd
from sklearn import preprocessing
from sklearn.utils import class_weight as cw
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

# Deep Learning Model - Keras
from keras.models import Model
from keras.models import Sequential

# Deep Learning Model - Keras - RNN
from keras.layers import Embedding, LSTM, Bidirectional, SpatialDropout1D, SimpleRNN, Bidirectional, MaxPooling1D, Conv1D

# Deep Learning Model - Keras - General
from keras.layers import Input, Add, concatenate, Dense, Activation, BatchNormalization, Dropout, Flatten
from keras.layers import LeakyReLU, PReLU, Lambda, Multiply

# Deep Learning Parameters - Keras
from tensorflow.keras.optimizers import RMSprop, Adam

In [2]:
df=pd.read_csv(r'new_cleaned_data.csv', index_col=0)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 184354 entries, 0 to 184353
Data columns (total 2 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   is_offensive  184354 non-null  int64 
 1   new_text      184274 non-null  object
dtypes: int64(1), object(1)
memory usage: 4.2+ MB


In [4]:
df.head()

Unnamed: 0,is_offensive,new_text
0,0,go village pump suggest change language rfc set
1,1,anti greek nationalis wikipedia hi alexikoua y...
2,1,dis hoe wasnt dis violent lottery ticket
3,0,better atabay helping banned vandals pushing pov
4,0,camelcase sicko camelcase camelcase rule r bal...


In [5]:
df.tail()

Unnamed: 0,is_offensive,new_text
184349,0,template uw vandalism talk
184350,1,regrets pussies shit happens deal
184351,0,could possibly origin popular game series halo
184352,0,article submission declined wikipedia talk art...
184353,0,editors move articles except inside user space...


In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [7]:
texts = df['new_text'].astype(str)
y = df['is_offensive']

In [8]:
vectorizer = CountVectorizer(stop_words='english', min_df=0.0001)
X = vectorizer.fit_transform(texts)

In [9]:
import numpy as np
from sklearn.model_selection import train_test_split
texts_train, texts_test, y_train, y_test=train_test_split(X, y, test_size=0.3, random_state=1)

### SVM Model training

In [10]:
model = LinearSVC(class_weight="balanced", dual=False, tol=1e-2, max_iter=1e5)
model.fit(texts_train, y_train) 

LinearSVC(class_weight='balanced', dual=False, max_iter=100000.0, tol=0.01)

### Predictions of SVM 

In [11]:
pred=model.predict(texts_test) 

### Confusion Matrix for SVM

In [12]:
print(confusion_matrix(y_test, pred))

[[42732  1560]
 [ 1175  9840]]


### Accuracy of SVM Model

In [13]:
accuracy_score(y_test, pred)

0.9505487551304537

### Precision, Recall and F1-Score for SVM

In [14]:
targets=['class 0', 'class 1']
print(classification_report(y_test, pred, target_names=targets))


              precision    recall  f1-score   support

     class 0       0.97      0.96      0.97     44292
     class 1       0.86      0.89      0.88     11015

    accuracy                           0.95     55307
   macro avg       0.92      0.93      0.92     55307
weighted avg       0.95      0.95      0.95     55307



In [18]:
from joblib import Parallel, delayed
import joblib
  
  
# Save the model as a pickle in a file
joblib.dump(model, 'svm.pkl')
  


['svm.pkl']

### Training of Multinomial Naive Bayes Model

In [19]:
from sklearn.naive_bayes import MultinomialNB
model1=MultinomialNB()
model1.fit(texts_train, y_train)

MultinomialNB()

### Predictions for Naive Bayes

In [20]:
pred1=model1.predict(texts_test)

### Confusion Matrix for Naive Bayes

In [21]:
print(confusion_matrix(y_test, pred1))

[[41700  2592]
 [ 1254  9761]]


### Accuracy for Naive Bayes

In [22]:
accuracy_score(y_test, pred1)

0.9304608819860054

### Precision, recall and F1-Score for Naive Bayes

In [23]:
targets=['class 0', 'class 1']
print(classification_report(y_test, pred1, target_names=targets))


              precision    recall  f1-score   support

     class 0       0.97      0.94      0.96     44292
     class 1       0.79      0.89      0.84     11015

    accuracy                           0.93     55307
   macro avg       0.88      0.91      0.90     55307
weighted avg       0.93      0.93      0.93     55307



### Training of K Nearest Neighbours Model

In [24]:
from sklearn.neighbors import KNeighborsClassifier
model2=KNeighborsClassifier(n_neighbors=9, algorithm='auto')
model2.fit(texts_train, y_train)


KNeighborsClassifier(n_neighbors=9)

### Predictions for KNN

In [25]:
pred2=model2.predict(texts_test)

### Confusion Matrix for KNN

In [26]:
print(confusion_matrix(y_test, pred2))

[[43774   518]
 [ 3898  7117]]


### Accuracy for KNN

In [27]:
accuracy_score(y_test, pred2)

0.92015477245195

### Precision, Recall and F1-Score for KNN

In [28]:
targets=['class 0', 'class 1']
print(classification_report(y_test, pred2, target_names=targets))

              precision    recall  f1-score   support

     class 0       0.92      0.99      0.95     44292
     class 1       0.93      0.65      0.76     11015

    accuracy                           0.92     55307
   macro avg       0.93      0.82      0.86     55307
weighted avg       0.92      0.92      0.91     55307



### Training of Adaboost Classifier

In [29]:
from sklearn.ensemble import AdaBoostClassifier 
model3=AdaBoostClassifier(n_estimators=200)
model3.fit(texts_train, y_train)

AdaBoostClassifier(n_estimators=200)

### Predictions for Adaboost

In [30]:
pred3=model3.predict(texts_test)

### Confusion Matrix of Adaboost

In [31]:
print(confusion_matrix(y_test, pred3))

[[43894   398]
 [ 2539  8476]]


### Accuracy for Adaboost

In [32]:
accuracy_score(y_test, pred3)

0.9468964145587357

### Precision, Recall and F1-Score for Adaboost

In [33]:
targets=['class 0', 'class 1']
print(classification_report(y_test, pred3, target_names=targets))

              precision    recall  f1-score   support

     class 0       0.95      0.99      0.97     44292
     class 1       0.96      0.77      0.85     11015

    accuracy                           0.95     55307
   macro avg       0.95      0.88      0.91     55307
weighted avg       0.95      0.95      0.94     55307



### Training of Random Forest Classifier

In [34]:
from sklearn.ensemble import RandomForestClassifier
model4=RandomForestClassifier(n_estimators=200)
model4.fit(texts_train, y_train)


RandomForestClassifier(n_estimators=200)

### Predictions for Random Forest

In [35]:
pred4=model4.predict(texts_test)

### Confusion Matrix of Random Forest

In [36]:
print(confusion_matrix(y_test, pred4))

[[43592   700]
 [ 1934  9081]]


### Accuracy of Random Forest

In [37]:
accuracy_score(y_test, pred4)

0.9523749254163126

### Precision, Recall and F1-Score of Random Forest

In [38]:
targets=['class 0', 'class 1']
print(classification_report(y_test, pred4, target_names=targets))

              precision    recall  f1-score   support

     class 0       0.96      0.98      0.97     44292
     class 1       0.93      0.82      0.87     11015

    accuracy                           0.95     55307
   macro avg       0.94      0.90      0.92     55307
weighted avg       0.95      0.95      0.95     55307



In [39]:
texts2 = df['new_text'].astype(str)
y2 = df['is_offensive']

In [40]:
label_encoder = LabelEncoder()
Y = label_encoder.fit_transform(y2)
print(Y)
Y = to_categorical(Y)
print(Y)

[0 1 1 ... 0 0 0]
[[1. 0.]
 [0. 1.]
 [0. 1.]
 ...
 [1. 0.]
 [1. 0.]
 [1. 0.]]


In [41]:
X_train, X_test, Y_train, Y_test = train_test_split(texts2, Y, test_size=0.15)

In [42]:
X_train

82808     clarify identities major sides controversy rel...
54663     think terribly big deal think x seems pretty m...
95604     suntem forta way look might discover something...
128637    niggas like see side bitch cashin another nigg...
130697    thanks attention filled information link provi...
                                ...                        
40885     apparently manchester trafford boundaries foll...
43024     yes course edit romanian footballer past prese...
75751     cameron ward dj deleted page intrest people no...
157141    probably know understand new warned npa actual...
93424     substantive proposal agree significant coverag...
Name: new_text, Length: 156700, dtype: object

In [43]:
Y_train

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32)

In [44]:
max_words = 10000
max_len = 150

In [45]:
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

In [46]:
from keras_preprocessing.sequence import pad_sequences
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_train_seq = pad_sequences(X_train_seq, maxlen=max_len)

In [47]:
X_train_seq

array([[   0,    0,    0, ..., 7619, 1167, 2162],
       [   0,    0,    0, ...,  739,  209,  622],
       [   0,    0,    0, ..., 8810,    4,  347],
       ...,
       [   0,    0,    0, ...,  234, 7245,  967],
       [   0,    0,    0, ...,   23,  114, 1425],
       [   0,    0,    0, ...,    1,  120,  345]])

In [48]:

embedding_vector_length = 32


embed_dim = 128
lstm_out = 196
model = Sequential()
model.add(Embedding(max_words, 128, input_length=max_len))
model.add(SpatialDropout1D(0.5))
model.add(LSTM(196, dropout = 0.3, recurrent_dropout = 0.3 ))
model.add(Dropout(0.2))
model.add(Dense(100, activation = 'relu'))
model.add(Dropout(0.4))
model.add(Dense(2, activation = 'softmax'))

In [49]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 150, 128)          1280000   
                                                                 
 spatial_dropout1d (SpatialD  (None, 150, 128)         0         
 ropout1D)                                                       
                                                                 
 lstm (LSTM)                 (None, 196)               254800    
                                                                 
 dropout (Dropout)           (None, 196)               0         
                                                                 
 dense (Dense)               (None, 100)               19700     
                                                                 
 dropout_1 (Dropout)         (None, 100)               0         
                                                        

In [50]:
loss = 'categorical_crossentropy'
metrics = ['accuracy']
learning_rate = 0.001
# optimizer = Adam(learning_rate)
from tensorflow.keras.optimizers import Adadelta
optimizer = Adadelta(learning_rate=1.0, rho=0.95)

In [51]:
# model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
model.compile(loss='binary_crossentropy',optimizer='adam', 
                            metrics=['accuracy'])


from keras.utils.vis_utils import plot_model
plot_model(model, to_file='model.png', show_shapes=True)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model/model_to_dot to work.


In [52]:
verbose = 1
epochs = 5
batch_size = 128
validation_split = 0.1

In [53]:
history = model.fit(
X_train_seq,
Y_train,
batch_size=batch_size,
epochs=epochs,
verbose=verbose,
validation_split=validation_split
)

Epoch 1/5
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [54]:
X_test_seq = tokenizer.texts_to_sequences(X_test)
X_test_seq = pad_sequences(X_test_seq, maxlen=max_len)

In [55]:
pred=model.predict(X_test_seq)

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'


In [56]:
model.save("lstm.h5")

In [57]:
pred = np.argmax(model.predict(X_test_seq),axis=1)

In [58]:
y_test_arg=np.argmax(Y_test,axis=1)

In [59]:
from sklearn.metrics import confusion_matrix
accuracy_score(y_test_arg, pred)

0.9604397193896

In [60]:
targets=['0', '1']
print(classification_report(y_test_arg, pred, target_names=targets))

              precision    recall  f1-score   support

           0       0.97      0.99      0.98     22114
           1       0.94      0.86      0.90      5540

    accuracy                           0.96     27654
   macro avg       0.95      0.92      0.94     27654
weighted avg       0.96      0.96      0.96     27654



In [61]:
history.history

{'loss': [0.16410532593727112,
  0.10976911336183548,
  0.0991462990641594,
  0.09328145533800125,
  0.08528274297714233],
 'accuracy': [0.9413599967956543,
  0.961128830909729,
  0.963709831237793,
  0.9654399752616882,
  0.9682620763778687],
 'val_loss': [0.12957754731178284,
  0.11374588310718536,
  0.11774929612874985,
  0.117925263941288,
  0.11885814368724823],
 'val_accuracy': [0.9569878578186035,
  0.958774745464325,
  0.9599234461784363,
  0.9598596096038818,
  0.9594767093658447]}

In [62]:
v=['anti greek nationalis wikipedia hi alexikoua yous vandalise wikipedia editing false information may chauvinist edit false information wikipedia nowhere percent greeks albania real information census vandalise wikipedia militant ideas']
test_X_seq = tokenizer.texts_to_sequences(v)
test_X_seq = pad_sequences(test_X_seq, maxlen=150)
test_X_seq

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,  414,  732,    3,   81, 3483,    3,   43,
         470,   26,   18,   14,  470,   26,    3, 2395, 4139, 2583, 4004,
         185,   26, 2406, 3483,    3, 

In [63]:
ypreds = np.argmax(model.predict(test_X_seq),axis=1)
ypreds

array([0], dtype=int64)

In [64]:
if ypreds==1:
    print("hate")
else:
    print("NORMAL")

NORMAL


In [65]:
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D, MaxPooling1D

In [66]:

f_model = Sequential()
f_model.add(Embedding(max_words, 128, input_length=max_len,
                    trainable=False))

f_model.add(Dropout(0.2))

f_model.add(Conv1D(64,2,padding='valid',activation='relu'))
f_model.add(MaxPooling1D())
f_model.add(Conv1D(64,2,padding='valid',activation='relu'))
f_model.add(MaxPooling1D())

f_model.add(Conv1D(32,2,padding='valid',activation='relu'))
f_model.add(MaxPooling1D())
f_model.add(Conv1D(32,2,padding='valid',activation='relu'))
f_model.add(GlobalMaxPooling1D())

f_model.add(Dense(16, activation='relu'))
f_model.add(Dense(16, activation='relu'))
f_model.add(Dropout(0.2))

f_model.add(Dense(2, activation='softmax'))
f_model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 150, 128)          1280000   
                                                                 
 dropout_2 (Dropout)         (None, 150, 128)          0         
                                                                 
 conv1d (Conv1D)             (None, 149, 64)           16448     
                                                                 
 max_pooling1d (MaxPooling1D  (None, 74, 64)           0         
 )                                                               
                                                                 
 conv1d_1 (Conv1D)           (None, 73, 64)            8256      
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 36, 64)           0         
 1D)                                                  

In [67]:
f_model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

In [68]:
history2 = f_model.fit(X_train_seq, Y_train, batch_size= 64, validation_split=0.2, epochs=10)

Epoch 1/10
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [69]:
history2.history

{'loss': [0.4292601943016052,
  0.37614789605140686,
  0.3603665828704834,
  0.3533201217651367,
  0.3482048213481903,
  0.34419628977775574,
  0.34072455763816833,
  0.33839714527130127,
  0.3352087736129761,
  0.3338567912578583],
 'accuracy': [0.8060466051101685,
  0.8257578015327454,
  0.8312779068946838,
  0.8339502215385437,
  0.8353940844535828,
  0.8368299007415771,
  0.8374840617179871,
  0.8382418751716614,
  0.8393825888633728,
  0.8394783139228821],
 'val_loss': [0.3908167779445648,
  0.36601167917251587,
  0.3627864122390747,
  0.3600771129131317,
  0.3639492988586426,
  0.3515869081020355,
  0.354796826839447,
  0.3541978895664215,
  0.34959831833839417,
  0.351121723651886],
 'val_accuracy': [0.8255903124809265,
  0.8305360674858093,
  0.8327057957649231,
  0.8333439826965332,
  0.8306955695152283,
  0.8346521854400635,
  0.8334397077560425,
  0.8346841335296631,
  0.8347479104995728,
  0.8353860974311829]}

In [70]:
f_model.save("nn.h5")