In [3]:
import warnings
warnings.filterwarnings('ignore')
from datasets import load_dataset
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Conv1D, MaxPool1D,Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from keras.callbacks import ReduceLROnPlateau, EarlyStopping
import gensim

In [4]:
# Ask TensorFlow to grow GPU memory on demand rather than reserving it up front.
# Iterating over the device list (instead of indexing [0]) keeps this cell from
# raising IndexError on a CPU-only machine and applies the same setting to every
# visible GPU.
physical_devices = tf.config.list_physical_devices('GPU')
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, True)

In [5]:
# Load the 'Ade_corpus_v2_classification' configuration of 'ade_corpus_v2' and
# turn its 'train' split into a pandas DataFrame.
dataset = load_dataset(path='ade_corpus_v2', name='Ade_corpus_v2_classification')
df = dataset['train'].to_pandas()

Downloading builder script:   0%|          | 0.00/11.7k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/11.9k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/9.84k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/307k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/18.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/868k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/23516 [00:00<?, ? examples/s]

In [6]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the split
# repeatable across runs.
df_train, df_test = train_test_split(
    df,
    test_size=0.3,
    random_state=42,
)

In [7]:
# NLTK resources needed by the preprocessing cell: the English stop-word list and
# the Punkt sentence-tokenizer data used by nltk.sent_tokenize.
# Recent NLTK releases load 'punkt_tab' in place of the pickled 'punkt' models,
# so both are requested to keep the notebook runnable on old and new versions.
# nltk.download() reports an unknown package and returns False instead of raising.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [8]:
# Build one list of cleaned tokens per training document; `X` is the corpus that
# Word2Vec is trained on.
y = df_train['label']
stop_words = set(nltk.corpus.stopwords.words("english"))
# Named `word_tokenizer` (not `tokenizer`) so that re-running this cell cannot
# clobber the Keras Tokenizer that the notebook later binds to `tokenizer`.
word_tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')

X = []
for par in df_train["text"].values:
    doc_tokens = []
    for sent in nltk.sent_tokenize(par):
        # Lower-case first so the stop-word lookup is case-insensitive.
        # Tokens matched by \w+ never contain whitespace, so no strip() is needed.
        doc_tokens.extend(
            w for w in word_tokenizer.tokenize(sent.lower())
            if w not in stop_words and len(w) > 1
        )
    X.append(doc_tokens)

In [9]:
# Train 300-dimensional Word2Vec embeddings on the cleaned training tokens.
# min_count=1 keeps every token, including those seen only once.
w2v_model = gensim.models.Word2Vec(
    sentences=X,
    vector_size=300,
    window=5,
    min_count=1,
)

In [10]:
# Sanity check: the ten nearest neighbours of "man" in the learned embedding space.
w2v_model.wv.most_similar(positive=["man"], topn=10)

[('woman', 0.9991235733032227),
 ('year', 0.9982935786247253),
 ('yr', 0.9966797232627869),
 ('59', 0.9960654377937317),
 ('male', 0.9958118796348572),
 ('61', 0.9956179857254028),
 ('female', 0.9954177737236023),
 ('68', 0.9945355653762817),
 ('47', 0.9943128824234009),
 ('girl', 0.9940639138221741)]

In [11]:
# Fit the Keras tokenizer on the raw training text to obtain the word -> index map.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_train.text)

# One more than the number of distinct words; the extra slot accounts for
# index 0, which word_index does not assign to any word.
vocab_size = 1 + len(tokenizer.word_index)
print("Total words", vocab_size)

Total words 15435


In [12]:
def _encode_and_pad(texts):
    """Map texts to integer id sequences and bring each to a length of 300."""
    return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=300)


x_train = _encode_and_pad(df_train["text"])
x_test = _encode_and_pad(df_test["text"])

In [13]:
# Labels as column vectors of shape (n_samples, 1).
y_train = df_train["label"].values.reshape(-1, 1)
y_test = df_test["label"].values.reshape(-1, 1)

In [14]:
# Report feature/label array shapes for both splits, separated by a blank line.
for split_name, features, labels in (("train", x_train, y_train), ("test", x_test, y_test)):
    if split_name == "test":
        print()
    print("x_" + split_name, features.shape)
    print("y_" + split_name, labels.shape)

x_train (16461, 300)
y_train (16461, 1)

x_test (7055, 300)
y_test (7055, 1)


In [15]:
# Row r holds the Word2Vec vector of the word whose tokenizer index is r.
# Words missing from the Word2Vec vocabulary keep their all-zero row.
embedding_matrix = np.zeros((vocab_size, 300))
known_vectors = w2v_model.wv
for token, row in tokenizer.word_index.items():
    if token in known_vectors:
        embedding_matrix[row] = known_vectors[token]
print(embedding_matrix.shape)

(15435, 300)


In [16]:
# Frozen (non-trainable) embedding layer initialised from the Word2Vec matrix.
embedding_layer = Embedding(
    input_dim=vocab_size,
    output_dim=300,
    weights=[embedding_matrix],
    input_length=300,
    trainable=False,
)

In [17]:
# Architecture: pretrained embeddings -> dropout -> one LSTM -> single sigmoid unit.
# NOTE(review): a non-zero recurrent_dropout may prevent the fused cuDNN LSTM
# kernel from being used on GPU -- confirm against the Keras LSTM documentation.
model = Sequential([
    embedding_layer,
    Dropout(0.5),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])

model.summary()



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 300, 300)          4630500   
                                                                 
 dropout (Dropout)           (None, 300, 300)          0         
                                                                 
 lstm (LSTM)                 (None, 100)               160400    
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 4791001 (18.28 MB)
Trainable params: 160501 (626.96 KB)
Non-trainable params: 4630500 (17.66 MB)
_________________________________________________________________


In [21]:
# Binary cross-entropy loss with the Adam optimizer; accuracy is tracked as the
# reporting metric.
model.compile(
    optimizer="adam",
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [22]:
# Training callbacks:
#  * ReduceLROnPlateau lowers the learning rate when val_loss stops improving.
#  * EarlyStopping halts training when validation accuracy stops improving.
# The model is compiled with metrics=['accuracy'], so the validation metric is
# logged under 'val_accuracy'. Monitoring the non-existent 'val_acc' key left
# EarlyStopping unable to trigger (all 20 epochs ran), so the key is corrected
# and the direction of improvement is stated explicitly.
callbacks = [
    ReduceLROnPlateau(monitor='val_loss', patience=5, cooldown=0),
    EarlyStopping(monitor='val_accuracy', mode='max', min_delta=1e-4, patience=5),
]

In [23]:
# Fit on the first GPU, holding out 10% of the training data for validation.
with tf.device('/GPU:0'):
    history = model.fit(
        x=x_train,
        y=y_train,
        epochs=20,
        batch_size=2000,
        validation_split=0.1,
        callbacks=callbacks,
        verbose=1,
    )

Epoch 1/20



Epoch 2/20



Epoch 3/20



Epoch 4/20



Epoch 5/20



Epoch 6/20



Epoch 7/20



Epoch 8/20



Epoch 9/20



Epoch 10/20



Epoch 11/20



Epoch 12/20



Epoch 13/20



Epoch 14/20



Epoch 15/20



Epoch 16/20



Epoch 17/20



Epoch 18/20



Epoch 19/20



Epoch 20/20





In [26]:
# Evaluate on the held-out test set; `score` is [loss, accuracy] in that order.
score = model.evaluate(x_test, y_test, batch_size=1024)
test_loss, test_accuracy = score
print()
print("ACCURACY:", test_accuracy)
print("LOSS:", test_loss)


ACCURACY: 0.7491140961647034
LOSS: 0.5065761208534241
