# Classifier to distinguish fake reviews from real reviews. <br>
**Files Used**<br>
Fake Reviews: create a list of reviews generated by the generators <br>
Real Reviews: https://www.kaggle.com/yelp-dataset/yelp-dataset?select=yelp_academic_dataset_review.json<br>


In [None]:
#mounting the drive
import os
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth, drive
from oauth2client.client import GoogleCredentials

# Mount Google Drive inside the Colab runtime at /content/drive.
drive.mount('/content/drive')

In [None]:
#importing the libraries
import tensorflow as tf
from tensorflow.keras import layers
from keras.preprocessing.sequence import pad_sequences
import numpy as np
import pandas as pd
import pickle
import random
from random import choice
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_curve,precision_score
from keras.preprocessing.sequence import pad_sequences
from matplotlib import pyplot


In [None]:
import nltk
# Download the NLTK "punkt" and "stopwords" data packages.
nltk.download("punkt")
nltk.download("stopwords")


In [None]:
from IPython.display import HTML, display

def set_css():
  """ Display a CSS rule that makes <pre> text in the output wrap """
  wrap_rule = '''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''
  display(HTML(wrap_rule))

# register set_css as a 'pre_run_cell' callback, i.e. it is invoked before each cell is run
get_ipython().events.register('pre_run_cell', set_css)

In [None]:
#defining the constants
# vocabulary size passed to the Keras Tokenizer as num_words
VOCAB_SIZE = 100000
# length every tokenized review is padded/truncated to; also the Embedding layer's input_length
INPUT_LEN = 150

# Loading the data

In [None]:
# Load the pickled list of fake (generated) reviews.
# NOTE: pickle.load can execute arbitrary code - only unpickle files from a trusted source.
with open("fake_reviews.pkl", "rb") as f:
  fake_reviews_list = pickle.load(f)

In [None]:
# Load the pickled list of real reviews.
# NOTE: pickle.load can execute arbitrary code - only unpickle files from a trusted source.
with open("real_reviews.pkl", "rb") as f:
  real_reviews_list = pickle.load(f)
# keep at most as many real reviews as there are fake reviews
real_reviews_list = real_reviews_list[:len(fake_reviews_list)]

In [None]:
#removing the "\n" characters from every review in real_reviews_list (in place)
#NOTE(review): the original comment described cleaning the *fake* reviews produced from the
#VAE_50K.txt file (tokens <GO>, "\n"), but this loop only touches real_reviews_list and does
#not remove <GO> - confirm which list was meant to be cleaned.
for counter in range(len(real_reviews_list)):
  real_reviews_list[counter] = real_reviews_list[counter].replace("\n","")

In [None]:
#sanity check: report how many reviews each class contains
print("Number of real reviews: ", len(real_reviews_list))
print("Number of fake reviews: ",len(fake_reviews_list))

In [None]:
#creating the complete reviews list: all fake reviews first, followed by all real reviews
#(fake_reviews_list itself is left unmodified)

reviews_list = list(fake_reviews_list)
reviews_list += real_reviews_list

print("Total number of reviews: ", len(reviews_list))

In [None]:
#preparing the labels: 0 marks a fake review, 1 marks a real review.
#the order mirrors reviews_list (every fake review first, then every real review)
labels = [0]*len(fake_reviews_list) + [1]*len(real_reviews_list)

# Data Pre-processing

In [None]:
from nltk.tokenize import word_tokenize
from string import punctuation
from nltk.corpus import stopwords
import re
# English stopwords held in a set for fast membership tests inside clean_text
stop_words = set(stopwords.words('english'))

In [None]:
def clean_text(reviews_list: list) -> list:
  """
  A function to pre-process text.

  For every review, each non-word character is replaced by a space, the text is tokenized
  with nltk's word_tokenize, only purely alphabetic tokens are kept (this drops numbers),
  the tokens are lower-cased and the English stopwords in the module-level stop_words set
  are removed.

  No review is ever dropped: the result has exactly one entry per input review (possibly an
  empty list), so it stays aligned position-by-position with a parallel list of labels.

  Args:
    reviews_list: A list of review strings.
  Returns:
    A list of lists where each inner list holds the cleaned words of the corresponding review.

  """
  # raw string: "\W" in a normal string literal is an invalid escape sequence
  non_word = re.compile(r"\W")

  cleaned_reviews = []
  for line in reviews_list:
    # punctuation and other non-word characters become spaces so they cannot glue words together
    line = non_word.sub(" ", line)
    #tokenize the sentences into words
    tokens = word_tokenize(line)

    # lower-case before the stopword lookup
    words = [word.lower() for word in tokens if word.isalpha()]
    words = [word for word in words if word not in stop_words]
    cleaned_reviews.append(words)

  return cleaned_reviews


In [None]:
#test case: run clean_text on a single hand-written review
sample = "The food was great!! But no AC..delicious....i"
sample_cleaned = clean_text([sample])
#show the cleaned tokens of the sample review
print(sample_cleaned)

In [None]:
#cleaning the text: cleaned_review[i] is the list of cleaned words of reviews_list[i]
cleaned_review = clean_text(reviews_list)

In [None]:
from sklearn.model_selection import train_test_split

# First hold out 10% of all reviews as the test set, then hold out 10% of the remainder as the
# validation set. Both splits shuffle with a fixed random_state so they are reproducible.
X_train, X_test, y_train, y_test = train_test_split(cleaned_review, labels, test_size = 0.1, random_state = 100, shuffle=True)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.1, random_state = 100, shuffle=True)
print("Size of training set: ", len(X_train))
print("Size of validation set: ", len(X_val))
print("Size of test set: ", len(X_test))

In [None]:
#converting the three label lists to numpy arrays
y_train, y_val, y_test = (np.array(split_labels) for split_labels in (y_train, y_val, y_test))

In [None]:
#details about the data: number of fake reviews (label 0) in each split.
#np.where(condition) returns a tuple of index arrays; element [0] holds the matching positions.
y_train_zeroes = np.where(y_train==0)
print("Number of fake reviews in y_train = ", len(y_train_zeroes[0]))
y_val_zeroes = np.where(y_val==0)
print("Number of fake reviews in y_val = ", len(y_val_zeroes[0]))
y_test_zeroes = np.where(y_test==0)
print("Number of fake reviews in y_test = ", len(y_test_zeroes[0]))

In [None]:
import collections
#finding the average number of words in a review in the training set

#lengths_dict maps a review length (in words) to the number of training reviews of that length
lengths_dict = collections.defaultdict(int)
for review in X_train:
  lengths_dict[len(review)] += 1

#total word count, rebuilt from the length histogram
total_num = sum(length*count for length, count in lengths_dict.items())

#integer (floor) average
avg_word_nos = total_num//len(X_train)
print("Average number of words in a review: ",avg_word_nos)

#checking if the average number of words is in the range of the INPUT_LEN (within +/- 20 words)
if not (INPUT_LEN-20 <= avg_word_nos <= INPUT_LEN+20):
  print("Please update your INPUT_LEN variable.")

If training a new model from scratch, create a new tokenizer. <br>
If reusing the pre-trained model, load the tokenizer from the file instead.

In [None]:
#defining the tokenizer: vocabulary capped via num_words=VOCAB_SIZE, with "<OOV>" as the
#out-of-vocabulary token
tokenizer  = tf.keras.preprocessing.text.Tokenizer(num_words = VOCAB_SIZE,lower = True, oov_token="<OOV>")
#the vocabulary is fitted on the training split only (X_train holds lists of cleaned words)
tokenizer.fit_on_texts(X_train)

In [None]:
#tokenization and padding of the training reviews:
#words -> integer ids, then every review is brought to exactly INPUT_LEN ids
#(zeros are added in front of short reviews, long reviews lose their tail)
X_train_seq = pad_sequences(
    tokenizer.texts_to_sequences(X_train),
    maxlen=INPUT_LEN,
    padding="pre",
    truncating="post",
)
X_train_seq = np.array(X_train_seq)

In [None]:
#tokenization and padding of the validation reviews (same settings as the training set)
X_val_seq = pad_sequences(
    tokenizer.texts_to_sequences(X_val),
    maxlen=INPUT_LEN,
    padding="pre",
    truncating="post",
)
X_val_seq = np.array(X_val_seq)


In [None]:
#tokenization and padding of the test reviews (same settings as the training set)
X_test_seq = pad_sequences(
    tokenizer.texts_to_sequences(X_test),
    maxlen=INPUT_LEN,
    padding="pre",
    truncating="post",
)
X_test_seq = np.array(X_test_seq)

# Building the classifier

In [None]:
# load the whole embedding into memory: embeddings_index maps a word to its float32 vector
embeddings_index = dict()
#path to the GloVE 300-dimensional embedding file
#download the embeddings from https://nlp.stanford.edu/projects/glove/

# "with" closes the file even if a line fails to parse; the encoding is given explicitly so that
# decoding does not depend on the platform's default encoding
with open('glove.6B.300d.txt', encoding='utf-8') as f:
    for line in f:
        # each line is "<word> <v1> <v2> ..." separated by whitespace
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Loaded %s word vectors.' % len(embeddings_index))

In [None]:
#number of rows needed for the embedding matrix: one per entry of the tokenizer's word_index,
#plus one extra row so that the largest word index is a valid row number
num_words = len(tokenizer.word_index)+1
print("Total number words in the vocabulary: ", num_words)

In [None]:
# create a weight matrix for words in training set:
# row i holds the 300-d GloVe vector of the word whose tokenizer index is i;
# rows of words that have no GloVe vector stay all-zero
embedding_matrix = np.zeros((num_words, 300))

for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    # skip words unknown to GloVe and indices that fall outside the matrix
    if embedding_vector is None or i >= num_words:
        continue
    embedding_matrix[i] = embedding_vector

In [None]:
# reset the Keras backend session before the model is (re)built in the next cell
tf.keras.backend.clear_session()

In [None]:
#hyperparameters of the convolutional layer
num_of_filters = 64
kernel_size = 4

#Embedding -> Conv1D -> two stacked bidirectional LSTMs -> dense head with a 2-class softmax
classifier_model = tf.keras.models.Sequential([
    #embedding layer: initialised from embedding_matrix and frozen (trainable=False)
    layers.Embedding(num_words, 300, weights=[embedding_matrix], input_length=INPUT_LEN, trainable=False),

    #CNN layer
    layers.Conv1D(filters=num_of_filters, kernel_size=kernel_size, activation='relu'),
    layers.MaxPool1D(pool_size=2),
    layers.BatchNormalization(),

    #LSTM: the first one returns the full sequence so that the second one can consume it
    layers.Bidirectional(layers.LSTM(256, return_sequences=True)),
    layers.Dropout(0.5),

    layers.Bidirectional(layers.LSTM(128, return_sequences=False)),
    layers.Dropout(0.25),

    #Dense layers
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.25),
    layers.Dense(32, activation='relu'),
    #output: one softmax probability per class (index 0 = fake, index 1 = real)
    layers.Dense(2, activation='softmax'),
])

classifier_model.summary()




In [None]:
#loading the model weights from a previously saved file, then compiling the model
#NOTE(review): the weights are loaded unconditionally - presumably this line should be skipped
#when training from scratch; confirm.
classifier_model.load_weights("model_weights/classifier.keras")
#categorical cross-entropy is used because the targets are one-hot encoded
classifier_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0015),
             loss='categorical_crossentropy',
            metrics=['accuracy'])


In [None]:
# Load the extension and start TensorBoard
# (IPython magics; TensorBoard reads the event files under the "logs" directory)

%load_ext tensorboard
%tensorboard --logdir logs

In [None]:
#converting the 0/1 labels to one hot encoding with 2 classes,
#matching the 2-unit softmax output and the categorical_crossentropy loss
y_train_one_hot = tf.keras.utils.to_categorical(y_train, num_classes=2)
y_val_one_hot = tf.keras.utils.to_categorical(y_val, num_classes=2)
y_test_one_hot = tf.keras.utils.to_categorical(y_test, num_classes=2)

In [None]:
#NOTE(review): the callback is imported from the standalone keras package while the model is
#built with tf.keras - confirm that both refer to the same Keras installation.
from keras.callbacks import TensorBoard
from time import time
#every run logs into its own sub-directory of "logs", named after the current timestamp
tensorboard = TensorBoard(log_dir="logs/{}".format(time()))


In [None]:
#training for 5 epochs with batches of 256 reviews; the validation split is evaluated during
#training and the run is logged through the TensorBoard callback
history = classifier_model.fit(X_train_seq, y_train_one_hot,
                    epochs=5,
                    verbose=1,
                    batch_size = 256,
                    validation_data=(X_val_seq, y_val_one_hot),
                    callbacks=[tensorboard]
              )


In [None]:
#saving the model weights under a new file name (classifier_2), so the file loaded earlier
#(classifier.keras) is not overwritten
classifier_model.save_weights("model_weights/classifier_2.keras")

# Model Evaluation

In [None]:
#getting the prediction classes for the training set
#predict() returns one softmax probability per class; argmax over the last axis yields the
#class id (Sequential.predict_classes is no longer available in current TensorFlow releases)
train_predictions = np.argmax(classifier_model.predict(X_train_seq), axis=-1)

#confusion matrix for the training set
conf_mat_train = confusion_matrix(y_train, train_predictions)

print("Confusion Matrix: \n",conf_mat_train )

#printing the accuracy on the training set
loss,accuracy = classifier_model.evaluate(X_train_seq, y_train_one_hot)
print("Accuracy on the training set: ", accuracy)

In [None]:
#getting the prediction classes for the validation set
#predict() returns one softmax probability per class; argmax over the last axis yields the
#class id (Sequential.predict_classes is no longer available in current TensorFlow releases)
val_predictions = np.argmax(classifier_model.predict(X_val_seq), axis=-1)

#confusion matrix for the validation set
conf_mat_val = confusion_matrix(y_val, val_predictions)

print("Confusion Matrix: \n",conf_mat_val )

#printing the accuracy on the validation set
loss,accuracy = classifier_model.evaluate(X_val_seq, y_val_one_hot)
print("Accuracy on the validation set: ", accuracy)

In [None]:
#getting the prediction classes for the test set
#predict() returns one softmax probability per class; argmax over the last axis yields the
#class id (Sequential.predict_classes is no longer available in current TensorFlow releases)
test_predictions = np.argmax(classifier_model.predict(X_test_seq), axis=-1)

#confusion matrix for the test set
conf_mat_test = confusion_matrix(y_test, test_predictions)

print("Confusion Matrix: \n",conf_mat_test )

#printing the accuracy on the test set
loss,accuracy = classifier_model.evaluate(X_test_seq, y_test_one_hot)
print("Accuracy on the testing set: ", accuracy)

### Plotting precision-recall curve

In [None]:
#getting the actual prediction probabilities on training set and extracting the probability for the 0 class
#(column 0 of the softmax output, i.e. the probability that a review is fake)
t_preds = classifier_model.predict(X_train_seq)
t_preds_zeros = t_preds[:, 0]
#t_preds_zeros.shape

In [None]:
#calculating the precision and recall for the training set
#(pos_label=0: fake reviews are the positive class, scored by their class-0 probability)
t_precision, t_recall, thresholds = precision_recall_curve(y_train, t_preds_zeros, pos_label=0)

#finding the precision value at a recall of 90% and the threshold
#recall never increases along the curve, so the last position with recall >= 0.9 is the highest
#threshold that still reaches 90% recall. The final curve point (recall 0) has no matching
#threshold and is therefore excluded, which keeps train_ind a valid index into thresholds.
train_candidates = np.flatnonzero(t_recall[:-1] >= 0.9)
train_ind = int(train_candidates[-1]) if train_candidates.size else 0
print("Precision at recall 90%: ", t_precision[train_ind])
print("Threshold: ", thresholds[train_ind])

In [None]:
#getting the actual prediction probabilities on validation set and extracting the probability for the 0 class
#(column 0 of the softmax output, i.e. the probability that a review is fake)
v_preds = classifier_model.predict(X_val_seq)
v_preds_zeros = v_preds[:, 0]


In [None]:
#calculating the precision and recall for the validation set
#(pos_label=0: fake reviews are the positive class, scored by their class-0 probability)
v_precision, v_recall, v_thresholds = precision_recall_curve(y_val, v_preds_zeros, pos_label=0)

#finding the precision value at a recall of 90% and the threshold
#recall never increases along the curve, so the last position with recall >= 0.9 is the highest
#threshold that still reaches 90% recall. The final curve point (recall 0) has no matching
#threshold and is therefore excluded, which keeps val_ind a valid index into v_thresholds.
val_candidates = np.flatnonzero(v_recall[:-1] >= 0.9)
val_ind = int(val_candidates[-1]) if val_candidates.size else 0
print("Precision at recall 90%: ", v_precision[val_ind])
print("Threshold: ", v_thresholds[val_ind])

In [None]:
#getting the actual prediction probabilities on test set and extracting the probability for the 0 class
#(column 0 of the softmax output, i.e. the probability that a review is fake)
test_preds = classifier_model.predict(X_test_seq)
test_preds_zeros = test_preds[:, 0]


In [None]:
#calculating the precision and recall for the test set
#(pos_label=0: fake reviews are the positive class, scored by their class-0 probability)
test_precision, test_recall, test_thresholds = precision_recall_curve(y_test, test_preds_zeros, pos_label=0)

#finding the precision value at a recall of 90% and the threshold
#recall never increases along the curve, so the last position with recall >= 0.9 is the highest
#threshold that still reaches 90% recall. The final curve point (recall 0) has no matching
#threshold and is therefore excluded, which keeps test_ind a valid index into test_thresholds.
test_candidates = np.flatnonzero(test_recall[:-1] >= 0.9)
test_ind = int(test_candidates[-1]) if test_candidates.size else 0
print("Precision at recall 90%: ", test_precision[test_ind])
print("Threshold: ", test_thresholds[test_ind])