In [None]:
import nltk
from nltk.corpus import stopwords
import re
from tensorflow.keras.preprocessing.text import Tokenizer
import gensim
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout, Bidirectional, Conv1D, MaxPooling1D, GlobalMaxPooling1D
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import pandas as pd
import numpy as np

# Load the training tweets and the separate test file from disk.
df = pd.read_csv('data/twitter_trainingdata.csv')
dftest = pd.read_csv('data/twitter_testdata.csv')

# Map the numeric polarity codes to readable class names.  The result is
# assigned back to the column instead of calling
# ``df.Polarity.replace(..., inplace=True)``: that form is chained
# assignment on an intermediate object, which pandas warns about and which
# no longer updates the frame once copy-on-write is the default (pandas 3.0).
polarity_names = {0: 'Negative', 2: 'Neutral', 4: 'Positive'}
df['Polarity'] = df['Polarity'].replace(polarity_names)
dftest['Polarity'] = dftest['Polarity'].replace(polarity_names)

# English stop words to drop from every tweet, except "not", which is kept
# so that negations survive the filtering.  ``discard`` (unlike ``remove``)
# does not raise if the word is missing from the list.
stop_words = set(stopwords.words('english'))
stop_words.discard('not')

# Matches @mentions, URLs and any run of non-alphanumeric characters.
# Written as a raw string so ``\S`` reaches the regex engine instead of being
# parsed by Python as an invalid string escape; compiled once because it is
# applied to every tweet.
# NOTE(review): the third alternative (``http?:\S``) consumes only a single
# character after the colon and looks redundant with the one before it; it is
# kept unchanged so the cleaned text stays the same -- confirm intent.
noise_pattern = re.compile(r'@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+')


def _clean_tweet(text):
    """Return *text* with noise replaced by spaces, lower-cased, stop words removed."""
    tokens = noise_pattern.sub(' ', text).lower().split()
    return ' '.join(token for token in tokens if token not in stop_words)


# Draw the 70,000-row working sample first.  With no weights, the rows picked
# by ``sample`` depend only on the frame length and ``random_state``, not on
# the tweet text, so this selects the same rows as sampling after cleaning
# while cleaning only the rows that are actually used.
df = df.sample(n=70000, random_state=42)

# Iterate over the column values directly rather than indexing by label with
# ``range(len(df))``, which only works for a default integer index.
corpus = [_clean_tweet(tweet) for tweet in df['Tweet']]
df['Tweet'] = corpus

# Hold out 20% of the sampled tweets for evaluation (fixed seed), then
# report the shape of each partition.
train_df, test_df = train_test_split(df, test_size=0.20, random_state=123)

for partition in (train_df, test_df):
    print(partition.shape)

# Word2Vec consumes each document as a list of tokens, so split every
# training tweet on whitespace.
documents = list(map(str.split, train_df.Tweet))

# 200-dimensional vectors, context window of 5, ignoring words that occur
# fewer than 10 times; trained with 4 worker threads.
w2v_params = dict(vector_size=200, window=5, min_count=10, workers=4)
w2v_model = gensim.models.word2vec.Word2Vec(**w2v_params)

# Build the vocabulary before training and report how many words it keeps.
w2v_model.build_vocab(documents)
words = w2v_model.wv.index_to_key
vocab_size = len(words)
print("Vocab size: ", vocab_size)

# Train for 30 passes over the training documents.
w2v_model.train(
    documents,
    total_examples=len(documents),
    epochs=30,
)

# Fit the Keras tokenizer on the training tweets only, so the word index is
# built without looking at the held-out split.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_df.Tweet)

# One more than the number of indexed words: the embedding table needs a row
# for every index up to and including the largest one in ``word_index``.
vocab_size = len(tokenizer.word_index) + 1

# Every tweet is converted to a sequence of word indices and padded or
# truncated to the same fixed length.  The length is named once so the train
# and test encodings cannot drift apart.
max_len = 20
x_train = pad_sequences(tokenizer.texts_to_sequences(train_df.Tweet), maxlen=max_len)
x_test = pad_sequences(tokenizer.texts_to_sequences(test_df.Tweet), maxlen=max_len)

# Raw (string) polarity labels for each split.
y_train = train_df.Polarity
y_test = test_df.Polarity

# Encode the string class labels as integers.  The encoder is fitted on the
# training labels only and then *applied* to the test labels, so both splits
# share one label-to-integer mapping.  Re-fitting on the test labels (the
# previous ``fit_transform``) could assign different integers whenever the
# set of classes in the test split differs from the training split; with
# ``transform`` an unseen test label raises ``ValueError`` instead of being
# silently mis-encoded.
labelencoder = LabelEncoder()
y_train = labelencoder.fit_transform(y_train)
y_test = labelencoder.transform(y_test)

# Initial weights for the embedding layer: one 200-dimensional row per
# tokenizer index.  Rows whose word has a Word2Vec vector are filled with
# it; every other row stays all-zero.
word_vectors = w2v_model.wv
embedding_matrix = np.zeros(shape=(vocab_size, 200))

known_words = (
    (row, token)
    for token, row in tokenizer.word_index.items()
    if token in word_vectors
)
for row, token in known_words:
    embedding_matrix[row] = word_vectors[token]

print(embedding_matrix.shape)

# Embedding layer initialised from the Word2Vec matrix and left trainable.
# The deprecated ``input_length`` argument is intentionally not passed:
# Keras 3 warns about it, and the sequence length is already fixed by the
# ``model.build(input_shape=(None, 20))`` call below.
embedding_layer = Embedding(vocab_size, 200, weights=[embedding_matrix],
                            trainable=True)

# Conv1D -> BiLSTM -> max-pool -> BiLSTM -> global max-pool -> softmax.
model = Sequential()
model.add(embedding_layer)
model.add(Conv1D(filters=128, kernel_size=3, activation="relu"))
# return_sequences=True keeps the time axis so the pooling layer and the
# second recurrent layer still receive a sequence.
model.add(Bidirectional(LSTM(96, dropout=0.2, recurrent_dropout=0.2, return_sequences=True)))
model.add(Dropout(0.2))
model.add(MaxPooling1D(pool_size=3, strides=1))
model.add(Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2, return_sequences=True)))
# Collapse the time axis to one feature vector per tweet.
model.add(GlobalMaxPooling1D())
model.add(Dropout(0.2))
# Three output units with softmax: one probability per polarity class name
# (Negative / Neutral / Positive).
model.add(Dense(3, activation='softmax'))

# Build with the padded sequence length (20 tokens) so summary() can report
# concrete output shapes and parameter counts.
model.build(input_shape=(None, 20))
model.summary()

# The labels are integer class ids (LabelEncoder output), hence the sparse
# variant of categorical cross-entropy.
model.compile(
    optimizer="adam",
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Stop after 3 epochs without validation-accuracy improvement and roll back
# to the best weights seen.
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    patience=3,
    restore_best_weights=True,
)

# Multiply the learning rate by 0.2 after 2 stagnant epochs, never going
# below 1e-4.
reduce_lr = ReduceLROnPlateau(
    monitor='val_accuracy',
    factor=0.2,
    patience=2,
    min_lr=0.0001,
)

# Train for up to 20 epochs, holding out 10% of the training data for
# validation.
fit_options = dict(
    batch_size=512,
    epochs=20,
    validation_split=0.1,
    verbose=1,
    callbacks=[reduce_lr, early_stopping],
)
model_history = model.fit(x_train, y_train, **fit_options)

# Evaluate on the held-out split; ``score`` holds the loss first and the
# compiled accuracy metric second.
score = model.evaluate(x_test, y_test, verbose=0)
test_loss, test_accuracy = score[0], score[1]
print('Test loss:', test_loss)
print('Test accuracy:', test_accuracy)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.Polarity.replace({0:'Negative', 2:'Neutral', 4:'Positive'}, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dftest.Polarity.replace({0:'Negative', 2:'Neutral', 4:'Positive'}, inplace=True)


(56000, 6)
(14000, 6)
Vocab size:  4226
Vocab size:  4226
(39338, 200)
(39338, 200)




Epoch 1/20
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 161ms/step - accuracy: 0.7797 - loss: 0.5034 - val_accuracy: 0.8180 - val_loss: 0.4064 - learning_rate: 0.0010
Epoch 2/20
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 161ms/step - accuracy: 0.7797 - loss: 0.5034 - val_accuracy: 0.8180 - val_loss: 0.4064 - learning_rate: 0.0010
Epoch 2/20
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 154ms/step - accuracy: 0.8298 - loss: 0.3922 - val_accuracy: 0.8186 - val_loss: 0.3967 - learning_rate: 0.0010
Epoch 3/20
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 154ms/step - accuracy: 0.8298 - loss: 0.3922 - val_accuracy: 0.8186 - val_loss: 0.3967 - learning_rate: 0.0010
Epoch 3/20
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 163ms/step - accuracy: 0.8614 - loss: 0.3365 - val_accuracy: 0.8323 - val_loss: 0.3862 - learning_rate: 0.0010
Epoch 4/20
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 