# Importing the Required Modules

You'll need to pip install the following libraries:

In [None]:
!pip install swifter
!pip install rouge
!pip install clean-text
!pip install Unidecode
!pip install torch
!pip install transformers
# !pip install datasets 
!pip install tensorflow
!pip install keras

Once that's done, let's import all the modules we'll be using:

In [None]:
# Standard Libraries
import re
import pickle

# Data Libraries
import pandas as pd
import numpy as np
import swifter

# Data Preprocessing
import nltk
from nltk.util import ngrams
from cleantext import clean

# Metrics
from rouge import Rouge 

# Data Visualisation
import matplotlib.pyplot as plt
from IPython.core.display import display, HTML

# Functionality
from typing import List, Dict, Union

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras import layers 
from keras.layers import Dropout, Dense,Input,Embedding,Flatten, MaxPooling1D, Conv1D
from keras.models import Sequential,Model
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn import metrics
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.datasets import fetch_20newsgroups
from keras.layers.merge import Concatenate
from sklearn.metrics import classification_report 
from sklearn.metrics import confusion_matrix
from keras.utils.np_utils import to_categorical



---



# Loading datasets

In [None]:
# Load the processed sarcastic/non-sarcastic news CSV into a DataFrame.
# NOTE(review): PATH is not defined in any visible cell; it must be set before this runs.
# NOTE(review): the following cell rebinds `data_news_all` to a different CSV — confirm which one is intended.
data_news_all = pd.read_csv(f"{PATH}/Research/Combined/sarcastic_nonsarcastic_news_processed-19702-20746.csv")  #35%

In [None]:
# Load the tab-separated sarcasm dataset and extract the 'text' and 'sarcastic'
# columns as NumPy arrays.
# NOTE(review): this overwrites the frame loaded in the previous cell, and later cells
# read an 'article_processed' column instead of 'text' — confirm which file/column is intended.
data_news_all = pd.read_csv(f"{PATH}/Research/Combined/sarcasm_dataset_1000_1000_1000.csv", delimiter='\t') 
article = data_news_all['text'].values
label = data_news_all['sarcastic'].values

In [None]:
# Show column names, non-null counts and dtypes of the loaded frame.
data_news_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28447 entries, 0 to 28446
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          28447 non-null  object
 1   sarcastic   28447 non-null  object
 2   topic       28447 non-null  object
 3   url         28447 non-null  object
 4   image_path  28447 non-null  object
 5   text        28447 non-null  object
dtypes: object(6)
memory usage: 1.3+ MB


In [None]:
# Model inputs and targets as NumPy arrays.
# NOTE(review): 'article_processed' is not among the columns listed by the `.info()` output
# recorded above — presumably this cell expects the first CSV to be loaded; confirm.
article = data_news_all['article_processed'].values
label = data_news_all['sarcastic'].values

In [None]:
# 80% of the rows go to training; the remaining 20% is halved into validation and test.
# A fixed random_state makes both splits reproducible.
article_train, article_rem, label_train, label_rem = train_test_split(article, label, train_size=0.8, random_state=42) 
article_valid, article_test, label_valid, label_test = train_test_split(article_rem, label_rem, test_size=0.5, random_state=42) 

In [None]:
# Report the feature and label array shapes of every split, in train/val/test order.
for _header, _articles, _labels in (
    ('Shape of training data: ', article_train, label_train),
    ('Shape of val data: ', article_valid, label_valid),
    ('Shape of test data: ', article_test, label_test),
):
    print(_header)
    print(_articles.shape)
    print(_labels.shape)

Shape of training data: 
(32312,)
(32312,)
Shape of val data: 
(4039,)
(4039,)
Shape of test data: 
(4040,)
(4040,)


## RNN  

In [None]:
from keras.layers import Dropout, Dense, GRU, Embedding
from keras.models import Sequential
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn import metrics
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import model_from_json

In [None]:
# Model inputs and targets as NumPy arrays.
# NOTE(review): this repeats the extraction already done in the "Loading datasets" section.
article = data_news_all['article_processed'].values
label = data_news_all['sarcastic'].values

In [None]:
# 80/10/10 train/validation/test split with a fixed seed.
# NOTE(review): identical to the split performed in the "Loading datasets" section.
article_train, article_rem, label_train, label_rem = train_test_split(article, label, train_size=0.8, random_state=42) 
article_valid, article_test, label_valid, label_test = train_test_split(article_rem, label_rem, test_size=0.5, random_state=42) 

In [None]:
# Report the feature and label array shapes of every split, in train/val/test order.
for _header, _articles, _labels in (
    ('Shape of training data: ', article_train, label_train),
    ('Shape of val data: ', article_valid, label_valid),
    ('Shape of test data: ', article_test, label_test),
):
    print(_header)
    print(_articles.shape)
    print(_labels.shape)

Shape of training data: 
(32312,)
(32312,)
Shape of val data: 
(4039,)
(4039,)
Shape of test data: 
(4040,)
(4040,)


### CREATE MODEL

In [None]:
def loadData_Tokenizer(X_train, X_test, MAX_NUM_WORDS=75000, MAX_SEQUENCE_LENGTH=500):
  """Tokenize and pad the train/test texts and load the pre-trained word vectors.

  Args:
    X_train: array of training texts.
    X_test: array of test texts.
    MAX_NUM_WORDS: vocabulary cap handed to the Keras ``Tokenizer``.
    MAX_SEQUENCE_LENGTH: length every sequence is padded/truncated to.

  Returns:
    ``(X_train, X_test, word_index, embeddings_index)`` where the first two are
    the padded integer sequences for each split, ``word_index`` is the
    tokenizer's word -> id mapping and ``embeddings_index`` maps a word to its
    float32 vector read from the embeddings file.

  NOTE(review): the tokenizer is fitted on train + test together, so the
  vocabulary is influenced by the test texts. Kept as-is to preserve the
  existing word ids.
  """
  # Seeds NumPy's global RNG (kept from the original; later random
  # initialisation in this notebook depends on it).
  np.random.seed(7)

  n_train = len(X_train)
  text = np.concatenate((X_train, X_test), axis=0)

  tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)  # keeps the most frequent words
  tokenizer.fit_on_texts(text)
  sequences = tokenizer.texts_to_sequences(text)
  word_index = tokenizer.word_index

  text = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
  print('Found %s unique tokens.' % len(word_index))
  print(text.shape)

  # The rows keep their concatenation order, so the splits can be sliced back out.
  X_train = text[:n_train]
  X_test = text[n_train:]

  embeddings_index = {}
  # `with` closes the file even if parsing fails; the encoding is explicit so
  # non-ASCII words do not depend on the platform default.
  with open(f"{PATH}/Research/Embeddings/corola.300.20.vec", encoding="utf8") as f:
      for line_no, line in enumerate(f):
          values = line.split()
          if len(values) < 2:
              continue  # blank line or a word without a vector
          if line_no == 0 and len(values) == 2 and values[0].isdigit() and values[1].isdigit():
              continue  # "<vocab size> <dim>" header line of the .vec text format
          try:
              coefs = np.asarray(values[1:], dtype='float32')
          except ValueError:
              # Malformed line: skip it rather than store the previous word's vector.
              continue
          embeddings_index[values[0]] = coefs

  return X_train, X_test, word_index, embeddings_index

In [None]:
def _build_embedding_matrix(word_index, embeddings_index, EMBEDDING_DIM):
    """Return a (len(word_index) + 1, EMBEDDING_DIM) matrix of initial embedding weights."""
    # Rows start out uniformly random; words without a pre-trained vector keep
    # that random initialisation (they are NOT zeroed).
    embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is None:
            continue
        if len(embedding_vector) != EMBEDDING_DIM:
            # Raise instead of exit(1): exiting would shut the notebook kernel down.
            raise ValueError(
                "Embedding vector for %r has length %d but EMBEDDING_DIM is %d; "
                "make sure EMBEDDING_DIM matches the embeddings file."
                % (word, len(embedding_vector), EMBEDDING_DIM)
            )
        embedding_matrix[i] = embedding_vector
    return embedding_matrix


def Build_Model_RNN_Text(word_index, embeddings_index, nclasses,  MAX_SEQUENCE_LENGTH=500, EMBEDDING_DIM=300, dropout=0.5,
                         hidden_layer=3, gru_node=32):
    """Build and compile a stacked-GRU text classifier.

    Args:
        word_index: word -> integer id mapping produced by the tokenizer.
        embeddings_index: word -> pre-trained vector mapping.
        nclasses: number of output classes (size of the softmax layer).
        MAX_SEQUENCE_LENGTH: length of the padded input sequences.
        EMBEDDING_DIM: dimensionality of the embedding vectors.
        dropout: dropout rate applied after every GRU layer.
        hidden_layer: number of sequence-returning GRU layers stacked before
            the final GRU layer.
        gru_node: number of units in each GRU layer.

    Returns:
        A compiled Keras ``Sequential`` model (sparse categorical cross-entropy
        loss, Adam optimizer, accuracy metric).

    Raises:
        ValueError: if a pre-trained vector's length differs from EMBEDDING_DIM.
    """
    # Validate and assemble the initial weights first so that a dimension
    # mismatch is reported before any layers are created.
    embedding_matrix = _build_embedding_matrix(word_index, embeddings_index, EMBEDDING_DIM)

    model = Sequential()
    model.add(Embedding(len(word_index) + 1,
                        EMBEDDING_DIM,
                        weights=[embedding_matrix],
                        input_length=MAX_SEQUENCE_LENGTH,
                        trainable=True))

    print(gru_node)
    for _ in range(hidden_layer):
        model.add(GRU(gru_node, return_sequences=True, recurrent_dropout=0.2))
        model.add(Dropout(dropout))
    # The last recurrent layer returns only its final state for the dense head.
    model.add(GRU(gru_node, recurrent_dropout=0.2))
    model.add(Dropout(dropout))
    model.add(Dense(256, activation='relu'))
    model.add(Dense(nclasses, activation='softmax'))

    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model

In [None]:
# article_train, article_test, label_train, label_test = train_test_split(article, label, test_size=0.25, random_state=42) 

In [None]:
# Start from the raw text splits, then replace X_train/X_test with their padded
# integer sequences; also obtain the tokenizer vocabulary and the pre-trained vectors.
X_train = article_train
X_test = article_test
y_train = label_train
y_test = label_test

X_train,X_test, word_index,embeddings_index = loadData_Tokenizer(X_train,X_test)

Found 163493 unique tokens.
(36352, 500)


In [None]:
# Sarcasm detection is a two-class problem (labels 0 and 1), so the softmax
# layer needs 2 outputs rather than 20.
model_RNN = Build_Model_RNN_Text(word_index, embeddings_index, 2)

32


In [None]:
# Train for 2 epochs in batches of 128 without progress output.
# NOTE(review): the test split is passed as validation data although a separate
# validation split (article_valid/label_valid) exists — confirm this is intended.
model_RNN.fit(X_train, y_train,
                              validation_data=(X_test, y_test),
                              epochs=2,
                              batch_size=128,
                              verbose=0)

In [None]:
# Per-class scores for every test sequence (softmax output of the model).
predicted = model_RNN.predict(X_test)

In [None]:
# Turn the per-class scores into a predicted class id per row, then report
# precision/recall/F1 against the true test labels.
predicted = np.argmax(predicted, axis=1)

print(metrics.classification_report(y_test, predicted))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      6029
           1       0.99      0.98      0.99      4933

    accuracy                           0.99     10962
   macro avg       0.99      0.99      0.99     10962
weighted avg       0.99      0.99      0.99     10962



In [None]:
# evaluate() returns values in the order of model.metrics_names; index 1 is
# printed here as a percentage.
scores = model_RNN.evaluate(X_test, y_test, verbose=0)
print("%s: %.2f%%" % (model_RNN.metrics_names[1], scores[1]*100))

accuracy: 98.77%


In [None]:
# Pair the first metric name with its own value (index 0), not the accuracy at
# index 1; the loss is a raw value, so it is not formatted as a percentage.
print("%s: %.4f" % (model_RNN.metrics_names[0], scores[0]))

loss: 98.77%


Reference for saving and loading Keras models (JSON architecture + HDF5 weights): https://machinelearningmastery.com/save-load-keras-deep-learning-models/

In [None]:
# serialize model to JSON
model_json = model_CNN.to_json()
with open(f"{PATH}/Research/RNN/model_RNN.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model_CNN.save_weights(f"{PATH}/Research/RNN/model_RNN.h5")
print("Saved model to disk")

Saved model to disk


### LOAD MODEL

In [None]:
def loadData_Tokenizer(X_train, X_valid, X_test, MAX_NUM_WORDS=75000, MAX_SEQUENCE_LENGTH=1000):
  """Tokenize and pad the train/validation/test texts and load the word vectors.

  NOTE(review): this shares its name with the two-split variant defined earlier
  in the notebook and replaces it once this cell has run.

  Args:
    X_train: array of training texts.
    X_valid: array of validation texts.
    X_test: array of test texts.
    MAX_NUM_WORDS: vocabulary cap handed to the Keras ``Tokenizer``.
    MAX_SEQUENCE_LENGTH: length every sequence is padded/truncated to.

  Returns:
    ``(X_train, X_valid, X_test, word_index, embeddings_index)`` where the
    first three are the padded integer sequences for each split,
    ``word_index`` is the tokenizer's word -> id mapping and
    ``embeddings_index`` maps a word to its float32 vector read from the
    embeddings file.

  NOTE(review): the tokenizer is fitted on all three splits together, so the
  vocabulary is influenced by validation and test texts. Kept as-is to
  preserve the existing word ids.
  """
  # Seeds NumPy's global RNG (kept from the original).
  np.random.seed(7)

  n_train = len(X_train)
  n_valid = len(X_valid)
  text = np.concatenate((X_train, X_valid, X_test), axis=0)

  tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
  tokenizer.fit_on_texts(text)
  sequences = tokenizer.texts_to_sequences(text)
  word_index = tokenizer.word_index

  text = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
  print('Found %s unique tokens.' % len(word_index))
  print(text.shape)

  # The rows keep their concatenation order, so the splits can be sliced back out.
  X_train = text[:n_train]
  X_valid = text[n_train:n_train + n_valid]
  X_test = text[n_train + n_valid:]

  embeddings_index = {}
  # `with` closes the file even if parsing fails; the encoding is explicit so
  # non-ASCII words do not depend on the platform default.
  with open(f"{PATH}/Research/Embeddings/corola.300.20.vec", encoding="utf8") as f:
      for line_no, line in enumerate(f):
          values = line.split()
          if len(values) < 2:
              continue  # blank line or a word without a vector
          if line_no == 0 and len(values) == 2 and values[0].isdigit() and values[1].isdigit():
              continue  # "<vocab size> <dim>" header line of the .vec text format
          try:
              coefs = np.asarray(values[1:], dtype='float32')
          except ValueError:
              # Malformed line: skip it rather than store the previous word's vector.
              continue
          embeddings_index[values[0]] = coefs

  return X_train, X_valid, X_test, word_index, embeddings_index

In [None]:
# load json and create model
json_file = open(f"{PATH}/Research/RNN-LSTM/model_RNN-LSTM-2.json", 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights(f"{PATH}/Research/RNN-LSTM/model_RNN-LSTM-2.h5")
print("Loaded model from disk")

Loaded model from disk


In [None]:
# Compile the loaded model with the same loss, optimizer and metric that
# Build_Model_RNN_Text uses, so it can be evaluated below.
loaded_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Start from the raw text splits, then replace X_train/X_valid/X_test with their
# padded integer sequences (three-split variant of loadData_Tokenizer).
# NOTE(review): the tokenizer is re-fitted here; presumably its word ids must match
# those used when the loaded model was trained — TODO confirm.
X_train = article_train
X_test = article_test
y_train = label_train
y_test = label_test
X_valid = article_valid
y_valid = label_valid
X_train, X_valid, X_test, word_index,embeddings_index = loadData_Tokenizer(X_train,X_valid,X_test)

Found 171595 unique tokens.
(40391, 1000)


In [None]:
# Peek at the true label of the first test example.
label_test[0]

1

In [None]:
# Evaluate the loaded model on the padded test sequences (progress bar enabled).
score = loaded_model.evaluate(X_test, y_test, verbose=1)



In [None]:
# The loss is a raw value, not a percentage; only the accuracy is scaled to percent.
print("%s: %.4f" % (loaded_model.metrics_names[0], score[0]))
print("%s: %.2f%%" % (loaded_model.metrics_names[1], score[1]*100))

loss: 66.09%
accuracy: 68.00%


In [None]:
# Shape of the padded test matrix: (number of test texts, sequence length).
print(X_test.shape)

(4040, 1000)


In [None]:
# Predict for a single test example; expand_dims adds the leading batch axis
# so the input is 2-D, as in the batch evaluation above.
pred = loaded_model.predict(np.expand_dims(X_test[0], 0))
pred

array([[0.22549923, 0.7745008 ]], dtype=float32)

In [None]:
# Shape of one example after adding the batch axis.
np.expand_dims(X_test[0],0).shape

(1, 1000)

In [None]:
# Shape of one example without the batch axis.
X_test[0].shape

(1000,)

In [None]:
# Fit a module-level tokenizer on all three raw-text splits, mirroring what
# loadData_Tokenizer does internally (same 75000-word cap), so new text can be
# encoded outside that function.
text = np.concatenate((article_train, article_valid, article_test), axis=0)
text = np.array(text)

tokenizer = Tokenizer(num_words=75000)
tokenizer.fit_on_texts(text)


In [None]:
# Encode one hand-written sample article with the fitted tokenizer and pad it to
# the 1000-token length used for the test matrix above.
# Note: this rebinds `text`, which previously held the concatenated corpus.
text = np.array(["desi nu s-a auzit nicio bubuitura foarte puternica iar fumul din zona este la fel ca de obicei , mai multi bucuresteni sustin ca statia spatiala chinezeasca s-a prabusit in cartierul colentina . acestia isi sustin afirmatiile cu faptul ca peste tot e plin de chinezi si de cratere , exact ca pe luna pe vremuri , ultima oara cand m-am uitat in jur , cartierul asta arata foarte frumos . chiar tin minte cand s-a infiintat prima data spitalul aici , pe vremea lui grigore ghica al doilea"])
sequences = tokenizer.texts_to_sequences(text)
sequences = pad_sequences(sequences, maxlen=1000)
print(sequences.shape)

(1, 1000)


In [None]:
# Convert `sequences` to a NumPy array and show its shape.
sequences = np.array(sequences)
print(sequences.shape)

(1, 5)


In [None]:
# Left-pad the token ids with zeros to length 1000 and keep the leading batch
# axis so the result can be passed straight to `predict`.
# (`list.extend` returns None, so wrapping its result in np.array gave a 0-d array.)
tokens = list(sequences[0])[-1000:]  # keep at most the last 1000 ids
n = len(tokens)
sequences = np.array([[0] * (1000 - n) + tokens])
print(sequences.shape)

()


In [None]:
# Class scores of the loaded model for the encoded sample text.
pred = loaded_model.predict(sequences)