In [1]:
# training data and information for this was got at:
# https://towardsdatascience.com/detecting-bad-customer-reviews-with-nlp-d8b36134dc7e

import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np
import json
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

testDataPercent = 5
epocs = 10
batchSize = 512
validationSize = 1000

# stems the data
def clean_sen(sen):
    cleaned_train_sentences = []
    stop_words = stopwords.words('english')
    porter = PorterStemmer()

    for x in sen:
        tokens = word_tokenize(x)
        words = [word for word in tokens if word.isalpha()]
        words = [w for w in words if not w in stop_words]
        stemmed = [porter.stem(word) for word in tokens]
        cleaned_train_sentences.append(stemmed)
    return cleaned_train_sentences
        

# turns sentence into list of ints and removes rare words
def text_to_numbers(text, cutoff_for_rare_words = 1):
    
    # Flatten list if sublists are present
    if len(text) > 1:
        flat_text = [item for sublist in text for item in sublist]
    else:
        flat_text = text
    
    # get word freuqncy
    fdist = nltk.FreqDist(flat_text)

    # Convert to Pandas dataframe
    df_fdist = pd.DataFrame.from_dict(fdist, orient='index')
    df_fdist.columns = ['Frequency']

    # Sort by word frequency
    df_fdist.sort_values(by=['Frequency'], ascending=False, inplace=True)

    # Add word index
    number_of_words = df_fdist.shape[0]
    df_fdist['word_index'] = list(np.arange(number_of_words)+1)

    # replace rare words with index zero
    frequency = df_fdist['Frequency'].values
    word_index = df_fdist['word_index'].values
    mask = frequency <= cutoff_for_rare_words
    word_index[mask] = 0
    df_fdist['word_index'] =  word_index
    
    # Convert pandas to dictionary
    word_dict = df_fdist['word_index'].to_dict()
    print(word_dict)
    with open('trainedKeys.json', 'w') as jd:
        json.dump(word_dict, jd)
    
    # Use dictionary to convert words in text to numbers
    text_numbers = []
    for string in text:
        string_numbers = [word_dict[word] for word in string]
        text_numbers.append(string_numbers)  
    
    return (text_numbers)

def pad(data):
    return keras.preprocessing.sequence.pad_sequences(data, value=0, padding='post', maxlen=250)

def trainer(text_numbers_train, train_score):
    # set up neural network
    model = keras.Sequential()
    model.add(keras.layers.Embedding(30000, 16))
    model.add(keras.layers.GlobalAveragePooling1D())
    model.add(keras.layers.Dense(16, activation="relu"))
    model.add(keras.layers.Dense(1, activation="sigmoid"))

    model.summary()

    # set up data loss
    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

    # split train data into train and validation data
    x_val = text_numbers_train[:validationSize] # strings in numbers
    x_train = text_numbers_train[validationSize:] #  strings in numbers

    y_val = np.asarray(train_score[:validationSize]) # score
    y_train = np.asarray(train_score[validationSize:]) # score

    # train the model
    # fitModel = model.fit(x_train, y_train, batch_size=batchSize, epochs=epocs, validation_data=(x_val, y_val), verbose=1)
    model.fit(x_train, y_train, batch_size=batchSize, epochs=epocs, validation_data=(x_val, y_val), verbose=1)
    # model.fit(x_train, y_train, epochs=epocs)

    model.save("model.h5")
    mod = tf.keras.models.load_model("model.h5")
    mod.summary()
    converter = tf.lite.TFLiteConverter.from_keras_model(mod)
    tflite_model = converter.convert()
    open("converted_model.tflite", "wb").write(tflite_model)
    
    return model

def getdata():
    # read data and store it
    reviews_df = pd.read_csv("Hotel_Reviews.csv")
    testSize = np.ceil(len(reviews_df)/testDataPercent)
    testSize = np.int_(len(reviews_df) - testSize)
    reviews_df["review"] = reviews_df["Negative_Review"] + reviews_df["Positive_Review"]
    reviews = reviews_df["Reviewer_Score"].apply(lambda x: 1 if x < 5 else 0)

    # break data into train and test data
    train_examples, test_examples = reviews_df["review"][0:-testSize], reviews_df["review"][-testSize:-1]
    train_score, test_score = reviews[0:-testSize], reviews[-testSize:-1]
    
    return train_examples, test_examples, train_score, test_score

def nlc():
    train_examples, test_examples, train_score, test_score = getdata()

    cleaned_train_sentences = clean_sen(train_examples)
    cleaned_test_sentences = clean_sen(test_examples)

    # turn all the sentence words to ints
    text_numbers_train = text_to_numbers(cleaned_train_sentences)
    text_numbers_test = text_to_numbers(cleaned_test_sentences)

    # pad the data
    text_numbers_train = pad(text_numbers_train)
    text_numbers_test = pad(text_numbers_test)

    model = trainer(text_numbers_train, train_score)

    # testing the data for accurasy
    results = model.evaluate(text_numbers_test, np.asarray(test_score))

    print(results)

    

nlc()


{'the': 1, 'and': 2, 'wa': 3, 'to': 4, 'room': 5, 'a': 6, 'in': 7, 'veri': 8, 'staff': 9, 'of': 10, 'for': 11, 'locat': 12, 'hotel': 13, 'I': 14, 'No': 15, 'is': 16, 'were': 17, 'not': 18, 'it': 19, 'breakfast': 20, 'neg': 21, 'good': 22, 'we': 23, 'with': 24, 'great': 25, 'but': 26, 'bed': 27, 'on': 28, 'at': 29, 'friendli': 30, 't': 31, 'had': 32, 'help': 33, 'have': 34, 'that': 35, 'clean': 36, 'from': 37, 'stay': 38, 'be': 39, 'nice': 40, 'comfort': 41, 'small': 42, 'as': 43, 'our': 44, 'thi': 45, 'you': 46, 'excel': 47, 'all': 48, 'there': 49, 'so': 50, 'love': 51, 'my': 52, 'they': 53, 'are': 54, 'servic': 55, 'We': 56, 'would': 57, 'no': 58, 'noth': 59, 'bathroom': 60, 'bar': 61, 'like': 62, 'which': 63, 'realli': 64, 'out': 65, 'when': 66, 'posit': 67, 'could': 68, 'too': 69, 'night': 70, 'one': 71, 'walk': 72, 'us': 73, 'restaur': 74, 'close': 75, 'onli': 76, 'an': 77, 'shower': 78, 's': 79, 'station': 80, 'london': 81, 'everyth': 82, 'time': 83, 'check': 84, 'book': 85, 'view



Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 16)          480000    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 16)                272       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 480,289
Trainable params: 480,289
Non-trainable params: 0
_________________________________________________________________
Train on 102148 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Model: "sequential"
_________________________________________________________________
Layer (type)               

ValueError: Input 0 of node sequential/embedding/embedding_lookup was passed float from sequential/embedding/embedding_lookup/Read/ReadVariableOp/resource:0 incompatible with expected resource.