### Import Libraries

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import nltk
import re
import string
import keras
import tensorflow as tf
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

### Load Imbalanced Data

In [None]:
imb_data=pd.read_csv('imbalanced_data.csv')
imb_data.head()


## EDA

0-> no Hate

1-> Hate

Here 0 represents no hate and 1 represents hate, and we can see that the data is completely imbalanced.

In [None]:
sns.countplot(x='label',data=imb_data);

In [None]:
imb_data.shape

In [None]:
imb_data.isnull().sum()
#hence there is no null entry in our data

In [None]:
imb_data.drop('id',axis=1,inplace=True)

In [None]:
imb_data.head()

### Load Raw Data

In [None]:
raw_data=pd.read_csv('raw_data.csv')
raw_data.head()

In [None]:
raw_data.columns

In [None]:
cols=['Unnamed: 0', 'count', 'hate_speech', 'offensive_language', 'neither']

In [None]:
raw_data.shape

In [None]:
raw_data.isnull().sum()

In [None]:
raw_data.drop(columns=cols,axis=1,inplace=True)

In [None]:
raw_data.head()

In [None]:
raw_data['class'].unique()
#let's look at the unique class values in our dataset

In [None]:
sns.countplot(x='class',data=raw_data);

class 0 -> Hate

class 1 -> Abusive

class 2 -> no Hate

In [None]:
#Merge class 0 (Hate) into class 1 (Abusive) because both have the same nature:
#every row currently labelled 0 is relabelled as 1.
#.loc is used so the assignment writes into raw_data itself; a chained
#selection followed by '==' only compares values and changes nothing.
raw_data.loc[raw_data['class']==0,'class']=1

In [None]:
#Relabel class 0 as 1. The result is assigned back to the column instead of
#calling replace(inplace=True) on the selected column: that chained in-place
#form is deprecated in pandas and does not update raw_data under Copy-on-Write.
raw_data['class']=raw_data['class'].replace({0:1})

In [None]:
#successfully merged the Hate and Abusive classes, so now we have only two
#classes: hate and no hate
raw_data['class'].unique()

In [None]:
#Relabel class 2 (no Hate) as 0. The replacement is restricted to the 'class'
#column so that a value of 2 in any other column is left untouched.
raw_data['class']=raw_data['class'].replace({2:0})

In [None]:
raw_data['class'].unique()
#1 means hate (Hate and Abusive merged) and 0 means no hate

In [None]:
imb_data.head()

In [None]:
raw_data.rename(columns={'class':'label'},inplace=True)

In [None]:
raw_data

In [None]:
#For Merging two DataFrames in Pandas they must have same columns names
#We will merge them row wise not column wise

### Merging DataFrames

In [None]:
#Stack the two datasets row-wise. ignore_index=True gives the result a fresh
#0..n-1 index; without it each frame keeps its own row labels, every label
#shared by both frames appears twice, and a label lookup such as df.loc[1]
#returns several rows instead of one.
frame=[imb_data,raw_data]
df=pd.concat(frame,ignore_index=True)

In [None]:
df.head()

In [None]:
sns.countplot(x='label',data=df);

### Preprocessing

In [None]:
from nltk.corpus import stopwords
nltk.download('stopwords')

In [None]:
#lets apply stemming and stopwords on data
stemmer=nltk.SnowballStemmer('english')
stop_words=set(stopwords.words('english'))

In [None]:
#Display the set of English stop words built above (the bare name `stopwords`
#is the NLTK corpus reader object, not the word list)
stop_words

### Data Cleaning

In [None]:
#lets apply Regex and Start Data Cleaning
import re
def data_cleaning(text):
  """Normalise a raw tweet into a string of stemmed, lower-case tokens.

  Steps, in order:
    1. lower-case the text;
    2. drop HTML tags and URLs -- this must happen before punctuation is
       stripped, because the patterns need '<', '>', ':', '/' and '.';
    3. delete every character that is not a-z, 0-9 or whitespace (this also
       removes punctuation, emoji and other symbols);
    4. drop English stop words, testing each word individually;
    5. stem the remaining words.

  Relies on the module-level `stop_words` set and `stemmer` object.

  Args:
    text: the raw text; it is passed through str(), so non-string values
      are accepted.

  Returns:
    The cleaned, stemmed words joined by single spaces ('' if nothing is left).
  """
  text=str(text).lower()
  text=re.sub(r'<.*?>', '',text)
  text=re.sub(r'https?://\S+|www\.\S+', '',text)
  text=re.sub(r'[^a-z0-9\s]', '',text)
  #split() with no argument discards every run of whitespace, so no separate
  #whitespace-collapsing pass is needed
  words=[stemmer.stem(word) for word in text.split() if word not in stop_words]
  return " ".join(words)

In [None]:
#Lets apply the data cleaning function on tweet column in our DataFrame
df['tweet']=df['tweet'].apply(data_cleaning)

In [None]:
#Look at the second cleaned tweet. Positional access returns a single value
#regardless of how the index is labelled.
df['tweet'].iloc[1]

In [None]:
x=df['tweet']
y=df['label']

In [None]:
#Show both types at once; only the last expression of a cell is displayed,
#so two separate statements would hide the first result.
type(x),type(y)

### Data Splitting

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
#Split features and labels into train and test sets. Shuffling is enabled and
#random_state=42 makes the split reproducible. No test_size is passed, so
#scikit-learn's default split ratio applies.
#NOTE(review): the split is not stratified; consider stratify=y if the label
#distribution should be the same in both sets.
X_train,X_test,y_train,y_test=train_test_split(x,y,random_state=42,shuffle=True)

In [None]:
print(len(X_train),len(y_train),len(X_test),len(y_test))

## Feature Engineering

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
#Vocabulary cap handed to the Tokenizer, and the length every sequence is padded to
max_words=50000
max_len=300
#Tokenization: the vocabulary is fitted on the training texts only
tokenizer=Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)
#Padding: turn each training text into a list of token ids, then bring every
#list to length max_len so the result is one 2-D matrix
sequences_train=tokenizer.texts_to_sequences(X_train)
sequence_matrix=pad_sequences(sequences_train,maxlen=max_len)

In [None]:
sequence_matrix

In [None]:
from keras.models import Sequential
from keras.layers import Dense,Dropout,Embedding,LSTM,SpatialDropout1D,Activation,Input
from keras.optimizers import RMSprop

In [None]:
#creating Model Architecture: Embedding -> SpatialDropout1D -> LSTM -> Dense
model=Sequential()
#Embedding Layer: vocabulary of max_words ids, 100-dimensional vectors,
#input sequences of length max_len
#NOTE(review): newer Keras releases deprecate input_length -- confirm against
#the installed version
model.add(Embedding(max_words,100,input_length=max_len))
#Dropout with rate 0.2 applied to the embedded sequence
model.add(SpatialDropout1D(0.2))
#LSTM Layer: 100 units, with dropout on both the inputs and the recurrent state
model.add(LSTM(100,dropout=0.2,recurrent_dropout=0.2))
#Dense Layer: a single sigmoid unit, matching the binary label
model.add(Dense(1,activation='sigmoid'))
model.summary()

In [None]:
model.compile(loss='binary_crossentropy',optimizer=RMSprop(),metrics=['accuracy'])

In [None]:
history=model.fit(sequence_matrix,y_train,batch_size=128,epochs=1,validation_split=0.2)

In [None]:
test_sequences=tokenizer.texts_to_sequences(X_test)
test_sequences_matrix=pad_sequences(test_sequences,maxlen=max_len)

## Evaluation

In [None]:
#Model Evaluation
accr=model.evaluate(test_sequences_matrix,y_test)

In [None]:
lstm_prediction=model.predict(test_sequences_matrix)

In [None]:
#Turn every predicted score into a hard label using a 0.5 threshold:
#scores below 0.5 become 0, everything else becomes 1
res=[0 if row[0] < 0.5 else 1 for row in lstm_prediction]

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
print(confusion_matrix(y_test,res))

### Saving Model

In [None]:
import pickle
#Persist the fitted tokenizer so the same vocabulary can be reloaded later
with open('tokenizer.pickle','wb') as out_file: #'wb' = write in binary mode
  #HIGHEST_PROTOCOL picks the newest pickle protocol this Python supports
  pickle.dump(tokenizer,out_file,protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
#Let's Save the Model
model.save('model.h5')

### Load your Model

In [None]:
#Reload the saved model and the pickled tokenizer from disk
load_model=keras.models.load_model('model.h5')
#NOTE: pickle.load can execute arbitrary code -- only open pickle files you trust
with open('tokenizer.pickle','rb') as handle:
  load_tokenizer=pickle.load(handle)

### Final Prediction

In [None]:
def End_to_End_pipeline(text):
  """Classify a raw piece of text as 'No Hate' or 'Hate and Abusive'.

  The text goes through the same steps as the training data: data_cleaning,
  conversion to token ids with the reloaded tokenizer, and padding to max_len.
  The already-loaded module-level `load_model` is used for the prediction, so
  model.h5 is not read from disk again on every call.

  Args:
    text: the raw input text.

  Returns:
    'No Hate' if the predicted score is below 0.5, otherwise 'Hate and Abusive'.
  """
  #Data cleaning
  cleaned=data_cleaning(text)
  #Tokenizer: texts_to_sequences expects a list of texts, hence the one-item list
  seq=load_tokenizer.texts_to_sequences([cleaned])
  #Padding
  pad_seq=pad_sequences(seq,maxlen=max_len)
  #Prediction: one input row, one sigmoid output
  pred=load_model.predict(pad_seq)
  if pred[0][0] < 0.5:
    return 'No Hate'
  return 'Hate and Abusive'

In [None]:
text="Hey Mother Fucker I will Fuck you"
End_to_End_pipeline(text)