In [1]:
import pandas as pd
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.callbacks import EarlyStopping
import contractions
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import nltk

In [2]:
#read in training & testing dataset, and filter out unknown label
df = pd.read_csv('C:/Users/admin/Documents/USCDoc/CSCI 544/project_data/train_dataset_mixed.csv',encoding= 'unicode_escape')
df.columns = ['contents','status']
df[df.isnull().values==True]
df = df.loc[(df['status'] == 'real') | (df['status'] == 'fake')]

df2 = pd.read_csv('C:/Users/admin/Documents/USCDoc/CSCI 544/project_data/test_dataset_pure.csv',encoding= 'unicode_escape')
df2.columns = ['contents','status']
df2[df2.isnull().values==True]
df2 = df2.loc[(df2['status'] == 'real') | (df2['status'] == 'fake')]

In [3]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# function to contract words
def contraction_word(input_str):
    words = []
    for word in input_str.split():
        words.append(contractions.fix(word))
    ret_str = ' '.join(words)
    return ret_str

#function to remove stop words
def remove_stopwords(input_str):
    words = []

    for word in input_str.split(' '):
        if word not in stop_words:
            words.append(word)
    ret_str = ' '.join(words)
    return ret_str

# lemmatize word
def lemmatize_words(input_str):
    tokenizer = nltk.tokenize.WhitespaceTokenizer()
    lemmatizer = WordNetLemmatizer()
    res = ''
    for w in tokenizer.tokenize(input_str):
        res = res + lemmatizer.lemmatize(w) + ' '
    res = res[:-1]
    return res

# function for data cleaning in general
def data_cleaning(reviews):
    reviews_cleaned = []
    for review in reviews:
        # convert the all reviews into the lower case.
        review = review.lower()
        # remove the HTML and URLs from the reviews
        review = re.sub(r'http\S+', '', review)
        review = re.sub(r'www.\S+', '', review)
        # remove non-alphabetical characters
        review = re.sub("[^a-z]+", ' ', review)
        # remove extra spaces
        review = re.sub(' +', ' ', review)
        # perform contractions on the reviews
        review = contraction_word(review)
        reviews_cleaned.append(review)

    reviews2 = []
    for review in reviews_cleaned:
        reviews2.append(remove_stopwords(review))

    reviews3 = []
    for review in reviews2:
        reviews3.append(lemmatize_words(review))

    return reviews3

In [4]:
#concat training and testing temporary to tokenize input sentences
df_concat =  pd.concat([df, df2])
#factorize output value to 0/1, with respect to fake and real
df_concat['status_id'] = [0 if i == 'fake' else 1 for i in df_concat['status']]
#perform data cleaning
clean_review = data_cleaning(df_concat['contents'])
df_concat['cut_review'] = clean_review

In [5]:
#select 15000 commonly used word, in this case, upper bound of number of unique words
MAX_NB_WORDS = 15000
#set the max sequence length, 99.9% of input sentences are smaller than 100 words
MAX_SEQUENCE_LENGTH = 100
#set dimension of embedding layers
EMBEDDING_DIM = 100

#create tokenizer, to crrate dict of word index based on frequence
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(df_concat['cut_review'].values)

#create sequences based on word index
X = tokenizer.texts_to_sequences(df_concat['cut_review'].values)
#unify the length of sequences of each input sentence to 100
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)

#create one-hot matrix for output value
Y = pd.get_dummies(df_concat['status_id']).values

#split unified data into training and testing dataset based on input size
X_train = X[:df.shape[0]]
Y_train = Y[:df.shape[0]]
X_test = X[df.shape[0]:]
Y_test = Y[df.shape[0]:]
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(21182, 100) (21182, 2)
(1790, 100) (1790, 2)


In [6]:
#define model with embedding dimention, dropout rate, dense dimention
#optimizer method, and monitor metrics during training
model = Sequential()
# model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM))
model.add(LSTM(100, dropout=0.2))
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 100)         1500000   
                                                                 
 lstm (LSTM)                 (None, 100)               80400     
                                                                 
 dense (Dense)               (None, 2)                 202       
                                                                 
Total params: 1,580,602
Trainable params: 1,580,602
Non-trainable params: 0
_________________________________________________________________
None


In [7]:
epochs = 10
batch_size = 64
history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size,validation_split=0.2,
                    callbacks=[EarlyStopping(monitor='val_loss', min_delta=0.01)])

Epoch 1/10
Epoch 2/10


In [8]:
#predict the training dataset
y_pred = model.predict(X_test)
#use argmax to convert prediction result to 0 or 1
y_pred = y_pred.argmax(axis = 1)
#convert 2D array to 1D array
Y_test = Y_test.argmax(axis = 1)



In [9]:
class_names = ['fake','real']
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, confusion_matrix
print('accuracy %s' % accuracy_score(y_pred, Y_test))
print(classification_report(Y_test, y_pred,target_names=class_names))

accuracy 0.9167597765363128
              precision    recall  f1-score   support

        fake       0.90      0.89      0.90       737
        real       0.93      0.93      0.93      1053

    accuracy                           0.92      1790
   macro avg       0.91      0.91      0.91      1790
weighted avg       0.92      0.92      0.92      1790



In [None]:
# from keras.model to save the model for web application use
model.save("C:\\Users\\admin\\Documents\\USCDoc\\CSCI 544\\project\\lstm_model.h5")