In [1]:
# ----------------------- 1) IMPORT LIBRARIES -----------------------
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.tokenize import word_tokenize 
from nltk.stem import WordNetLemmatizer

from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers.core import Activation, Dropout, Dense
from keras.layers import Flatten
from keras.layers import GlobalMaxPooling1D
from keras.layers.embeddings import Embedding
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

import tensorflow as tf

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# ----------------------- 2) GET THE VOCABULARY OF DATASET -----------------------

#READ .xls FILE with preprocessed data
movie_reviews = pd.read_excel("C:\\Users\\DespoinaK\\Desktop\\NN-Project\\preprocessedData.xls")

X = movie_reviews['review']
y = movie_reviews['sentiment']
y = np.array(list(map(lambda x: 1 if x=="positive" else 0, y)))

#Split dataset: 80% -->training, 20% -->testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)


# -------------------- TFIDF(VECTORIZATION) --------------------

# settings that you use for count vectorizer will go here
tfidf_vectorizer=TfidfVectorizer(use_idf=True)

# just send in all your docs here
tfidf_vectorizer_vectors_XTrain=tfidf_vectorizer.fit_transform(X_train)

tfidf_vectorizer_vectors_XTest=tfidf_vectorizer.transform(X_test)

In [3]:
#Vocabulary
vocabulary = tfidf_vectorizer.get_feature_names()
#%store vocabulary
#%store -r vocabulary
vocabulary

['aa',
 'aaa',
 'aaaaaaaaaaaahhhhhhhhhhhhhh',
 'aaaaaaaargh',
 'aaaaagh',
 'aaaaah',
 'aaaaahhhh',
 'aaaaargh',
 'aaaaatch',
 'aaaaaw',
 'aaaahhhhhh',
 'aaaahhhhhhh',
 'aaaarrgh',
 'aaaawwwwww',
 'aaah',
 'aaahhhhhhh',
 'aaam',
 'aaand',
 'aaargh',
 'aaarrrgh',
 'aaaugh',
 'aab',
 'aachen',
 'aada',
 'aadha',
 'aag',
 'aage',
 'aaghh',
 'aah',
 'aahhh',
 'aahhhh',
 'aaila',
 'aailiyah',
 'aaja',
 'aajala',
 'aakash',
 'aake',
 'aaker',
 'aalcc',
 'aaliyah',
 'aalox',
 'aames',
 'aamir',
 'aamr',
 'aamto',
 'aan',
 'aankh',
 'aankhen',
 'aap',
 'aapke',
 'aapkey',
 'aaran',
 'aardman',
 'aardvark',
 'aarf',
 'aargh',
 'aarika',
 'aaron',
 'aarp',
 'aashok',
 'aasmaan',
 'aasman',
 'aatish',
 'aaton',
 'aau',
 'aauugghh',
 'aavjo',
 'aawip',
 'aaww',
 'ab',
 'aba',
 'aback',
 'abadi',
 'abagail',
 'abanazer',
 'abandon',
 'abandonment',
 'abanks',
 'abase',
 'abash',
 'abashidze',
 'abate',
 'abatement',
 'abattoir',
 'abba',
 'abbad',
 'abbas',
 'abbasi',
 'abbe',
 'abbey',
 'abbie',
 '

In [4]:
# ----------------------- 3) LOAD THE MODEL -----------------------

# Recreate the exact same model, including its weights and the optimizer
new_model = tf.keras.models.load_model('C:\\Users\\DespoinaK\\Desktop\\NN-Project\\my_model.h5')

# Show the model architecture
new_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 16)                1202976   
_________________________________________________________________
activation_1 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 16)                272       
_________________________________________________________________
activation_2 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 17        
_________________________________________________________________
activation_3 (Activation)    (None, 1)                 0         
Total params: 1,203,265
Trainable params: 1,203,265
Non-trainable params: 0
_________________________________________________________________


In [5]:
loss, acc = new_model.evaluate(tfidf_vectorizer_vectors_XTest, y_test, verbose=1)
print('Restored model, accuracy: {:5.2f}%'.format(100*acc))

Restored model, accuracy: 89.71%


In [6]:
# ----------------------- 4) DEFINE PREPROCESS FUNCTIONS -----------------------

def preprocess_text(sen):
    
    # -------------------- REMOVE HTML TAGS --------------------
    #remove_tags function: simply replaces anything between opening and closing <> with an empty space
    clean = re.compile('<.*?>')
    sentence = re.sub(clean, ' ', sen)

    # -------------------- REMOVE PUNCTUATIONS AND NUMBERS --------------------
    word1 = "can't" #it's a special exception because when we cut the apostrophe, then the can't --> can that we don't want it
    word2 = "won't" #it's a special exception because when we cut the apostrophe, then the won't --> won that we don't want it
    
    if word1 in sentence:
        sentence = sentence.replace(word1,"can not")
    
    if word2 in sen:
        sentence = sentence.replace(word2,"will not")
    
    sentence = re.sub('[^a-zA-Z]', ' ', sentence) #remove punctuations and numbers
    
    
    # -------------------- SINGLE CHARACTER REMOVAL --------------------
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)

    # -------------------- REMOVE MULTIPLE SPACES --------------------
    sentence = re.sub(r'\s+', ' ', sentence)
    
    # -------------------- CONVERT TO LOWERCASE --------------------
    sentence = sentence.lower()
    
    #-----------------------------------------------------------
    negations = {    'aren':'are not', 
                     'hasn':'has not',
                     'wasn':'was not',
                  '  doesn':'does not',
                  'shouldn':'should not',
                     'didn':'did not',
                    'mustn':'must not',
                     'hadn':'had not',
                    'weren':'were not',
                     'shan':'shall not',
                    'needn':'need not',
                   'wouldn':'would not',
                      'don':'do not',
                      'ain':'is not',
                    'haven':'have not',
                      'isn':'is not',
                   'mightn':'might not',
                   'couldn':'could not' }
    
    #replace only the whole word and NOT the part of the word (e.g aren --> are not BUT arena --> arena)
    sentence = ' '.join(negations[i] if i in negations else i for i in sentence.split())
    
    # -------------------- STOP-WORDS REMOVAL --------------------
    #stop_words = set(stopwords.words('english'))
    stop_words = set(stopwords.words('english')) - set(['no','nor','any','few','not'])
    word_tokens = word_tokenize(sentence)
    filtered_sentence = [w for w in word_tokens if not w in stop_words] 
    filtered_sentence = [] 
  
    for w in word_tokens: 
        if w not in stop_words: 
            filtered_sentence.append(w)

    sentence = TreebankWordDetokenizer().detokenize(filtered_sentence)
    
    # -------------------- LEMMATIZATION --------------------
    #Let's trying the lemmatization approach as it is more accurate 
    # 1. Init Lemmatizer
    lemmatizer = WordNetLemmatizer()  

    sentence = word_tokenize(sentence)
    #print(sen)
    filteredSen = [] 
    for i in sentence:
        wordnetPOS = get_wordnet_pos(i)
        #print (wordnetPOS)

        # 2. Lemmatize Single Word with the appropriate POS tag
        lemma = lemmatizer.lemmatize(i,wordnetPOS)
        #print("lemma of "+i+" is: "+lemma)
        i = i.replace(i,lemma)
        filteredSen.append(i)
 
        sentence = TreebankWordDetokenizer().detokenize(filteredSen)      
        #print(sen)
    
    return sentence

def get_wordnet_pos(word):
    
    tag_dict = {   'NN':'n',
                  'NNS':'n',
                  'NNP':'n',
                 'NNPS':'n',
                   'JJ':'a',
                  'JJR':'a',
                  'JJS':'a',
                   'RB':'r',
                  'RBR':'r',
                  'RBS':'r',
                  'VB':'v',
                  'VBD':'v',
                  'VBG':'v',
                  'VBN':'v',
                  'VBP':'v',
                  'VBZ':'v'}
    
    
    wordPOS = nltk.pos_tag([word])
    #print (wordPOS[0][0],wordPOS[0][1])
    tag = wordPOS[0][1]
    
    if tag in tag_dict:  
        return tag_dict.get(tag)
    else:
        return 'n'


In [7]:
# ----------------------- 5) TAKE THE USER'S REVIEW FROM KEYBOARD -----------------------

txt =  input("Write your review: ") 

Write your review: Perfect movie. I like it


In [8]:
# ----------------------- 6) PREPROCESS THE REVIEW -----------------------

txtnew = preprocess_text(txt)
txtnew = [txtnew]

tfidf_vectorizer_vectors_TXT=tfidf_vectorizer.transform(txtnew)

dfN = pd.DataFrame(tfidf_vectorizer_vectors_TXT.T.todense(), index=vocabulary, columns=["tfidf"])
dfN.sort_values(by=["tfidf"],ascending=False)

tfidf_vectorizer_vectors_TXT

<1x75185 sparse matrix of type '<class 'numpy.float64'>'
	with 3 stored elements in Compressed Sparse Row format>

In [9]:
# ----------------------- 7) LOAD THE MODEL AGAIN AND MAKE PREDICTION -----------------------

new_model = tf.keras.models.load_model('C:\\Users\\DespoinaK\\Desktop\\NN-Project\\my_model.h5')
prediction = new_model.predict(tfidf_vectorizer_vectors_TXT)
#print("Prediction (0 = negative, 1 = positive) = ", end="")
#print("%0.4f" % prediction[0][0])
#print( round(prediction[0][0]))
prediction = prediction[0][0]
if (prediction >= 0.50) & (prediction <= 1.0):
    print("Your review was positive (",prediction,")")
else:
    print("Your review was negative(",prediction,")")

Your review was positive ( 0.9903448 )
