In [1]:
import nltk
from nltk.corpus import stopwords
import re
from keras.preprocessing.text import Tokenizer
import gensim
from keras_preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from keras.callbacks import ReduceLROnPlateau
from keras.layers import Embedding
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, Bidirectional, Conv1D, MaxPooling1D, GlobalMaxPooling1D
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import pandas as pd
import numpy as np

df = pd.read_csv('twitter_trainingdata.csv')
dftest = pd.read_csv('twitter_testdata.csv')
df.Polarity.replace({0:'Negative', 2:'Neutral', 4:'Positive'}, inplace=True)
dftest.Polarity.replace({0:'Negative', 2:'Neutral', 4:'Positive'}, inplace=True)

stop_words=set(stopwords.words('english'))
stop_words.remove('not')

corpus = []
for i in range(0, len(df)):
    review=re.sub('@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+',' ',df['Tweet'][i])
    review=review.lower()
    review=review.split()
    review=[word for word in review if not word in stop_words]
    review=' '.join(review)
    corpus.append(review)
df.Tweet=corpus

df = df.sample(n = 70000, random_state = 42)

train_df,test_df=train_test_split(df,test_size=0.20,random_state=123)

print(train_df.shape)
print(test_df.shape)

documents = [text.split() for text in train_df.Tweet]

w2v_model = gensim.models.word2vec.Word2Vec(vector_size=200,
                                            window= 5,
                                            min_count=10,
                                            workers=4)
w2v_model.build_vocab(documents)
words = w2v_model.wv.index_to_key
vocab_size = len(words)
print("Vocab size: ", vocab_size)

w2v_model.train(documents, total_examples=len(documents), epochs=30)

tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_df.Tweet)
tokenizer.word_index

vocab_size=len(tokenizer.word_index)+1

x_train = pad_sequences(tokenizer.texts_to_sequences(train_df.Tweet), maxlen=20)
x_test = pad_sequences(tokenizer.texts_to_sequences(test_df.Tweet), maxlen=20)
y_train = train_df.Polarity
y_test = test_df.Polarity

labelencoder = LabelEncoder()
y_train = labelencoder.fit_transform(y_train)
y_test= labelencoder.fit_transform(y_test)

embedding_matrix = np.zeros((vocab_size, 200))
for word, i in tokenizer.word_index.items():
    if word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]
print(embedding_matrix.shape)

embedding_layer = Embedding(vocab_size, 200, weights=[embedding_matrix], 
                            input_length=20, trainable=False)
model = Sequential()
model.add(embedding_layer)
model.add(Conv1D(filters = 128, kernel_size = 3, activation = "relu"))
model.add(Bidirectional(LSTM(96, dropout=0.2, recurrent_dropout=0.2, return_sequences = True)))
model.add(Dropout(0.2))
model.add(MaxPooling1D(pool_size=3, strides = 1))
model.add(Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2, return_sequences = True)))
model.add(GlobalMaxPooling1D())
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.summary()

model.compile(loss='binary_crossentropy',
              optimizer="adam",
              metrics=['accuracy'])

reduce_lr = ReduceLROnPlateau(monitor='val_accuracy', factor=0.2,
                              patience=5, min_lr=0.001)

model_history=model.fit(x_train, y_train, batch_size=1024, epochs=10,
                        validation_split=0.1, verbose=1, callbacks=[reduce_lr])

score = model.evaluate(x_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

ModuleNotFoundError: No module named 'keras_preprocessing'

In [4]:
!pip install tensorflow


Collecting tensorflow
  Downloading tensorflow-2.12.0-cp39-cp39-win_amd64.whl (1.9 kB)
Collecting tensorflow-intel==2.12.0
  Downloading tensorflow_intel-2.12.0-cp39-cp39-win_amd64.whl (272.8 MB)
     -------------------------------------- 272.8/272.8 MB 1.8 MB/s eta 0:00:00
Collecting protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3
  Downloading protobuf-4.22.1-cp39-cp39-win_amd64.whl (420 kB)
     -------------------------------------- 420.6/420.6 kB 2.2 MB/s eta 0:00:00
Collecting absl-py>=1.0.0
  Downloading absl_py-1.4.0-py3-none-any.whl (126 kB)
     -------------------------------------- 126.5/126.5 kB 3.8 MB/s eta 0:00:00
Collecting flatbuffers>=2.0
  Downloading flatbuffers-23.3.3-py2.py3-none-any.whl (26 kB)
Collecting opt-einsum>=2.3.2
  Downloading opt_einsum-3.3.0-py3-none-any.whl (65 kB)
     ---------------------------------------- 65.5/65.5 kB 3.7 MB/s eta 0:00:00
Collecting tensorboard<2.13,>=2.12
  Downloading tensorboard-2.12.0-py3-no

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
daal4py 2021.6.0 requires daal==2021.4.0, which is not installed.
numba 0.55.1 requires numpy<1.22,>=1.18, but you have numpy 1.23.5 which is incompatible.
