In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import tensorflow as tf
import matplotlib.pyplot as plt

In [None]:
from sklearn.model_selection import train_test_split 
# Load the first 300,000 rows of the MeTooHate tweet dataset, decoded as Latin-1
# (canonical codec spelling; the previous 'latin -1' only worked through
# Python's codec-name normalisation).
X = pd.read_csv(
    '../input/hatred-on-twitter-during-metoo-movement/MeTooHate.csv',
    nrows=300000,
    encoding='latin-1',
)

# Columns not used for modelling.
cols_to_drop = ['status_id', 'created_at', 'location', 'favorite_count', 'retweet_count',
                'followers_count', 'friends_count', 'statuses_count']

# Drop rows that lack the tweet text or its label, then drop the unused columns.
# Written as one chained expression instead of two inplace=True mutations.
X = (
    X
    .dropna(axis=0, subset=['text', 'category'])
    .drop(columns=cols_to_drop)
)

# The train/test split used for modelling is performed further down, once the
# text has been cleaned.

In [None]:
# Build a DataFrame-level split for inspection. Previously this cell used
# x_train / x_test before any cell defined them, so it failed on a fresh kernel.
# Splitting X itself keeps both the 'text' and 'category' columns available.
# Note: x_train / x_test are rebound to NumPy arrays by the modelling split
# further down in the notebook.
x_train, x_test = train_test_split(
    X,
    test_size=0.30,
    stratify=X['category'],   # keep the class proportions equal in both parts
    random_state=42,          # reproducible preview
)
x_train = x_train.reset_index(drop=True)
x_test = x_test.reset_index(drop=True)
x_test_stats = x_test.copy()

print("Total training samples:", len(x_train))
print("Total test samples:", len(x_test))

x_train.head(10)

In [None]:
# Number of rows per 'category' value in x_train (class balance check).
x_train['category'].value_counts()

In [None]:
import re

In [None]:
def text_cleaning(text):
    """Strip Twitter-specific noise from a single tweet.

    Removes, in this order: hyperlinks, @mentions, the '#' sign (the hashtag
    word itself is kept), the retweet marker 'RT', and HTML entities such as
    '&gt;' or '&amp;'.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The tweet with the noise removed. Whitespace is not normalised, so
        removed tokens may leave double spaces behind.
    """
    # URLs go first so that any '@' or '#' inside a link disappears with it.
    # (The old pattern was missing the ':' and therefore never matched a URL.)
    text = re.sub(r'https?://\S+', '', text)
    # Handles may contain underscores; the old patterns left '_suffix' behind.
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)
    # Kept from the original: '@' followed by '-' / ')' characters.
    text = re.sub(r'@[-)]+', '', text)
    text = re.sub(r'#', '', text)
    # \b stops this from eating 'RT' at the end of a word such as "SMART people".
    text = re.sub(r'\bRT\s+', '', text)
    # Require the closing ';' so only the entity is removed, not the text after it.
    text = re.sub(r'&#?\w+;', '', text)

    return text

In [None]:
# Run text_cleaning over every tweet and overwrite the 'text' column of X with
# the result, then show the first ten rows.
X['text'] = X['text'].apply(text_cleaning)
X.head(10)

In [None]:
# Split the cleaned text and its labels 70/30 for modelling.
# stratify keeps the hate/non-hate proportion the same in both parts, and
# random_state makes the split (and everything trained on it) reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X['text'].values,
    X['category'].values,
    test_size=0.30,
    stratify=X['category'].values,
    random_state=42,
)

In [None]:
# Show the first training text together with its label.
print('sentiment Text: ', x_train[0])
print('sentiment: ', y_train[0])


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences 

In [None]:
# NOTE(review): 20,000,000 is presumably far larger than the real vocabulary,
# so this cap is likely never reached — confirm against the vocab size printed
# in the next cell.
max_vocab = 20000000
tokenizer = Tokenizer(num_words=max_vocab)
# The tokenizer is fitted on the training texts only.
tokenizer.fit_on_texts(x_train)

In [None]:
# Word-to-index mapping built by the fitted tokenizer.
wordidx = tokenizer.word_index
# V (number of distinct words) is reused later to size the embedding layer.
V = len(wordidx)
print('The size of dataset vocab is: ', V)

In [None]:
# Convert both splits with the tokenizer that was fitted on x_train, then print
# the first converted example of each as a sanity check.
train_seq = tokenizer.texts_to_sequences(x_train)
test_seq = tokenizer.texts_to_sequences(x_test)
print('Training sequence: ', train_seq[0])
print('Testing sequence: ', test_seq[0])

In [None]:
# Pad the training sequences into one 2-D array. T (its second dimension) is
# reused below as the model's input length and as maxlen for other inputs.
pad_train = pad_sequences(train_seq)
T = pad_train.shape[1]
print('The length of training sequence is: ', T)

In [None]:
# Pad the test sequences with maxlen=T so they share the training width.
pad_test = pad_sequences(test_seq, maxlen=T)
print('The length of testing sequence is: ', pad_test.shape[1])

In [None]:
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, GlobalMaxPooling1D
from tensorflow.keras.models import Model


In [None]:
# Layer graph: Embedding -> LSTM -> global max pool -> Dense(32) -> Dense(1).
D = 20  # second argument to Embedding below (embedding dimension)
M = 15  # first argument to LSTM below (number of units)

i = Input (shape=(T, ))   # one padded sequence of length T per sample
x = Embedding(V+1, D)(i)    # V+1 because the indexing of the words in vocab (V) start from 1 not 0
# return_sequences=True so the pooling layer below receives the LSTM output
# for every position, not just the last one.
x = LSTM(M, return_sequences=True)(x)
x = GlobalMaxPooling1D()(x)
x = Dense(32, activation='relu')(x)
# Single sigmoid unit: one value in [0, 1] per sample.
x = Dense(1, activation='sigmoid')(x)


In [None]:
# Wrap the layer graph (input tensor i, output tensor x) into a Keras Model.
model = Model(i,x)


In [None]:
# Binary cross-entropy pairs with the single sigmoid output unit; accuracy is
# tracked under the history key 'accuracy' used by the plots further down.
model.compile(optimizer='adam', 
              loss='binary_crossentropy', 
              metrics=['accuracy'])

In [None]:
# ACCURACY_THRESHOLD = 0.93
# class myCallback(tf.keras.callbacks.Callback):
    
# 	def on_epoch_end(self, epoch, logs={}):
        
# 		if(logs.get('acc') is not None and logs.get('acc') > ACCURACY_THRESHOLD):
            
# 			print("\nReached %2.2f%% accuracy, so stopping training!!" %(ACCURACY_THRESHOLD*100))
            
# 			self.model.stop_training = True

In [None]:
# (Interactive `model.fit?` help lookup removed: it only opens the docstring
# pager and does no work when the notebook is run top to bottom.)

In [None]:
# Train for 10 epochs; r.history feeds the loss/accuracy plots below.
# NOTE(review): the test split is also used as validation data here, so the
# reported val_* numbers are not from an untouched hold-out set.
r = model.fit(pad_train, y_train, validation_data=(pad_test, y_test), epochs=10 , batch_size = 32)

In [None]:
# Training vs. validation loss per epoch, on an explicit, labelled axes so the
# figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(r.history['loss'], label='loss')
ax.plot(r.history['val_loss'], label='val_loss')
ax.set(title='Loss per epoch', xlabel='Epoch', ylabel='Binary cross-entropy')
ax.legend();

In [None]:
# Training vs. validation accuracy per epoch, on an explicit, labelled axes so
# the figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(r.history['accuracy'], label='accuracy')
ax.plot(r.history['val_accuracy'], label='val_accuracy')
ax.set(title='Accuracy per epoch', xlabel='Epoch', ylabel='Accuracy')
ax.legend();

In [None]:
def predict_sentiment(text, threshold=0.5):
    """Predict the binary class of one or more texts with the trained model.

    Uses the notebook-level ``tokenizer``, ``T`` (padded sequence length) and
    ``model``. The input is tokenised as given; ``text_cleaning`` is not
    applied here.

    Parameters
    ----------
    text : str or iterable of str
        Text(s) to classify. A bare string is treated as ONE document;
        previously it was iterated character by character, yielding one
        prediction per character.
    threshold : float, default 0.5
        Sigmoid outputs strictly above this value are labelled 1. For outputs
        in [0, 1] the default gives the same labels as the former ``.round()``.

    Returns
    -------
    numpy.ndarray of int, shape (n_texts, 1)
        Predicted label (0 or 1) for each text.
    """
    if isinstance(text, str):
        text = [text]

    # Same preprocessing as the training data: tokenise, then pad to length T.
    text_seq = tokenizer.texts_to_sequences(text)
    text_pad = pad_sequences(text_seq, maxlen=T)

    # Threshold the sigmoid outputs to obtain hard class labels.
    probabilities = model.predict(text_pad)
    return (probabilities > threshold).astype(int)

In [None]:
# Number of elements in x_test.
x_test.size

In [None]:

# Predicted labels for every test text.
y_pred = predict_sentiment(x_test)

In [None]:
# Smoke test on a one-element list; displays the type returned by predict_sentiment.
text = ['i feel happy']
type(predict_sentiment(text))

In [None]:
import keras
# keras.metrics provides no confusion_matrix function, so the previous call
# raised AttributeError. TensorFlow's own implementation is used instead;
# y_pred is flattened from (n, 1) to (n,) to line up with the 1-D labels.
tf.math.confusion_matrix(y_test, y_pred.ravel())

In [None]:
# Display the true test labels.
y_test

In [None]:
!pip install mlxtend

In [None]:
from mlxtend.plotting import plot_confusion_matrix

In [None]:
from sklearn.metrics import confusion_matrix


In [None]:
# Confusion matrix of true vs. predicted test labels (scikit-learn), rendered
# with mlxtend's plotting helper.
mat = confusion_matrix(y_test,y_pred)
plot_confusion_matrix(conf_mat = mat)