In [11]:
#Importing libraries
import numpy as np 
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Dropout
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
import re
import json
import pandas as pd

# My code
from main import my_build_model, my_train

# HyperParameters:
embed_dim = 12
lstm_out = 300
batch_size = 32
vocab_size = 2500
# =============


In [3]:
## Preparing dataset

def convert(x):
    """
    Coverting JSON to pandas dataframe

    """    
    ob = json.loads(x)
    for k, v in ob.items():
        if isinstance(v, list):
            ob[k] = ','.join(v)
        elif isinstance(v, dict):
            for kk, vv in v.items():
                ob['%s_%s' % (k, kk)] = vv
            del ob[k]
    return ob



def filter_data(data):
    """
    Converting into pandas dataframe and filtering only text and ratings given by the users
    """

    df = pd.DataFrame([convert(line) for line in data])
    df.drop(columns=df.columns.difference(['text','stars']),inplace=True)
    df.loc[:, ("sentiment")] = 0
    

#     #I have considered a rating above 3 as positive and less than or equal to 3 as negative.
    df.loc[:,'sentiment']=['pos' if (x>3) else 'neg' for x in df.loc[:, 'stars']]
    df.loc[:,'text'] = df.loc[:,'text'].apply(lambda x: x.lower())
    df.loc[:,'text'] = df.loc[:,'text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))
    for idx,row in df.iterrows():
        df.loc[:,'text']= [x for x in df.loc[:,'text']]
    return df


In [4]:
##Buidling the LSTM network using Keras

def build_model(X):
    model = Sequential()
    model.add(Embedding(2500, embed_dim,input_length = X.shape[1]))
    model.add(LSTM(lstm_out))
    model.add(Dropout(0.2))
    model.add(Dense(2,activation='softmax'))
    model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])

    return model


def train(model):

    Y = pd.get_dummies(data['sentiment']).values
    X_train, X_valid, Y_train, Y_valid = train_test_split(X,Y, test_size = 0.20, random_state = 36)

    #Here we train the Network.
    model.fit(X_train, Y_train, batch_size=batch_size, epochs=10,  verbose=5)

    # Measuring score and accuracy on validation set
    score,acc = model.evaluate(X_valid, Y_valid, verbose=2, batch_size=batch_size)
    print("Logloss score: %.2f" % (score))
    print("Validation set Accuracy: %.2f" % (acc))

In [5]:
json_filename = 'review_mockup.json'
with open(json_filename,'rb') as f:
    data = f.readlines()
data = filter_data(data)


In [None]:
print(data)

      stars                                               text sentiment
0       3.0  if you decide to eat here just be aware it is ...       neg
1       5.0  ive taken a lot of spin classes over the years...       pos
2       3.0  family diner had the buffet eclectic assortmen...       neg
3       5.0  wow  yummy different  delicious   our favorite...       pos
4       4.0  cute interior and owner  gave us tour of upcom...       pos
...     ...                                                ...       ...
4995    4.0  after trying the pizza here on halloween night...       pos
4996    5.0  went here today with my family and i must say ...       pos
4997    1.0  my room was disgusting the odor when i walked ...       neg
4998    5.0  first time my girlfriend and i came here and i...       pos
4999    1.0  by far the worst effort at chain mexican food ...       neg

[5000 rows x 3 columns]


In [20]:
tokenizer = Tokenizer(num_words = vocab_size, split=' ')
tokenizer.fit_on_texts(data.loc[:,'text'].values)
#print(tokenizer.word_index)  # To see the dicstionary
X = tokenizer.texts_to_sequences(data.loc[:,'text'].values)
#print(X)
X = pad_sequences(X)
#print(X)

test=tokenizer.texts_to_sequences(["It was quite awesome and I know why people enjoy it", "and even more really nice than I expected keep doing more like this"])
test = pad_sequences(test)
print(test)


[[  0   0   9   6 324 185   2   4 131 289 109 293   9]
 [  2  84  67  60  74  97   4 667 388 554  67  46  15]]


In [None]:
model = build_model(X)

Metal device set to: Apple M2

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB



In [8]:
train(model)

Epoch 1/10


2023-05-27 22:08:52.167893: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/10
Epoch 3/10


KeyboardInterrupt: 

In [None]:
y_prob = model.predict(test) 
y_classes = y_prob.argmax(axis=-1)
print(y_prob)
print(y_classes)



In [26]:
############### My code
model = my_build_model(X, hidden_size=lstm_out, vocab_size = 2500)
Y = pd.get_dummies(data['sentiment']).values
print(Y)


        neg    pos
0      True  False
1     False   True
2      True  False
3     False   True
4     False   True
...     ...    ...
4995  False   True
4996  False   True
4997   True  False
4998  False   True
4999   True  False

[5000 rows x 2 columns]
[[ True False]
 [False  True]
 [ True False]
 ...
 [ True False]
 [False  True]
 [ True False]]


In [12]:
my_train(model, X, Y, lstm_out, vocab_size)

[[0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.

ValueError: shapes (300,2800) and (301,1) not aligned: 2800 (dim 1) != 301 (dim 0)