<a href="https://colab.research.google.com/github/iDunnnnno/-CTAPDEVL_EXERCISES_COM211/blob/main/Exercise4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Exercise 4

In [7]:
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [8]:
import numpy as np
import pandas as pd

# Load the reviews into a DataFrame (the cells below read its 'review' and
# 'rating' columns).
dataset = pd.read_json('reviews.json')

# Collapse the numeric rating into a binary label, in place:
#   ratings strictly between 0 and 3 -> 0
#   ratings strictly between 2 and 6 -> 1
# The low band is rewritten first; its new value (0) falls outside the high
# band, so the second pass cannot re-label it. Ratings outside both bands are
# left untouched.
column_name = 'rating'
for low, high, label in ((0, 3, 0), (2, 6, 1)):
    mask = (dataset[column_name] > low) & (dataset[column_name] < high)
    dataset.loc[mask, column_name] = label

In [9]:
dataset.head()

Unnamed: 0,review,rating
0,sir okay armygreen shorts nice,1
1,di pareha yong mga size nila may sobrang liit ...,1
2,super worth it ang ganda Sombra grabi order na...,1
3,ganda po salamat,1
4,maayos pagkadeliver maganda den sya,1


In [10]:
# Pull the text and label columns out of the frame as plain Python lists.
reviews = dataset['review'].tolist()
rating = dataset['rating'].tolist()

# Sequential (unshuffled) split: the first 80% of the rows are used for
# training, the remaining rows for testing.
training_size = int(0.8 * len(reviews))
training_reviews, testing_reviews = reviews[:training_size], reviews[training_size:]
training_rating, testing_rating = rating[:training_size], rating[training_size:]

# The labels are handed to the model as NumPy arrays rather than lists.
training_rating_final, testing_rating_final = (
    np.array(labels) for labels in (training_rating, testing_rating)
)

## 1. Tokenize the data

In [11]:
# answer here
# Settings shared by tokenization, padding and the embedding layer.
vocab_size = 3130
embedding_dim = 16
max_length = 100
trunc_type = 'post'
padding_type = 'post'
# Visible placeholder for out-of-vocabulary words. An empty string would still
# be registered as a token, but it decodes to nothing and shows up as a blank
# label when the vocabulary is written out, so use an explicit marker instead.
oov_tok = "<OOV>"

# Fit the vocabulary on the training reviews only, then encode both splits
# with that same vocabulary so unseen test words map to the OOV token.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_reviews)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(training_reviews)
testing_sequences = tokenizer.texts_to_sequences(testing_reviews)



## 2. Sequence the data

In [12]:
# answer here
# Invert the tokenizer vocabulary (word -> id) into an id -> word lookup.
reverse_word_index = {index: word for word, index in word_index.items()}

def decode_review(text):
    """Turn a sequence of token ids back into a space-separated string.

    Each id is looked up in the module-level ``reverse_word_index``; ids with
    no entry there are rendered as '?'.
    """
    words = (reverse_word_index.get(token_id, '?') for token_id in text)
    return ' '.join(words)


# Show the raw text of the second training review (index 1) for reference.
print(training_reviews[1])

di pareha yong mga size nila may sobrang liit akjejrjrjjfjjriro4k4jrjrjfjrjrjrjrjjtjrj


## 3. Pad the data

In [13]:
# answer here
# Bring both splits to a fixed length of max_length using one shared set of
# padding/truncation options.
pad_options = dict(maxlen=max_length, padding=padding_type, truncating=trunc_type)
padded = pad_sequences(sequences, **pad_options)
testing_padded = pad_sequences(testing_sequences, **pad_options)

# Decode the padded second training review to eyeball the result.
print(decode_review(padded[1]))

di pareha yong mga size nila may sobrang liit akjejrjrjjfjjriro4k4jrjrjfjrjrjrjrjjtjrj ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?


## 4. Train a sentiment model

In [14]:
# answer here
# Sentiment classifier: embedding -> flatten -> small dense layer -> a single
# sigmoid unit, trained with binary cross-entropy.
keras_layers = tf.keras.layers
layer_stack = [
    keras_layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    keras_layers.Flatten(),
    keras_layers.Dense(6, activation='relu'),
    keras_layers.Dense(1, activation='sigmoid'),
]
model = tf.keras.Sequential(layer_stack)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

# Train on the padded training split; the testing split is used only for
# per-epoch validation metrics.
num_epochs = 15
model.fit(
    padded,
    training_rating_final,
    epochs=num_epochs,
    validation_data=(testing_padded, testing_rating_final),
)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 16)           50080     
                                                                 
 flatten (Flatten)           (None, 1600)              0         
                                                                 
 dense (Dense)               (None, 6)                 9606      
                                                                 
 dense_1 (Dense)             (None, 1)                 7         
                                                                 
Total params: 59693 (233.18 KB)
Trainable params: 59693 (233.18 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epo

<keras.src.callbacks.History at 0x789eb4b9b970>

## Get files for visualizing the network

In [15]:
# answer here
# The embedding layer is the first layer of the model; take its first weight
# array, which is the embedding matrix.
e = model.layers[0]
embedding_arrays = e.get_weights()
weights = embedding_arrays[0]
print(weights.shape)  # shape: (vocab_size, embedding_dim)


(3130, 16)


In [16]:
import io

# Create the reverse word index (token id -> word)
reverse_word_index = {index: word for word, index in word_index.items()}

# Never index past the learned vocabulary: if the corpus has fewer distinct
# words than vocab_size - 1, the higher ids have no word to label and a plain
# lookup would raise KeyError.
last_word_num = min(vocab_size - 1, len(reverse_word_index))

# Write out the embedding vectors (vecs.tsv) and their word labels (meta.tsv),
# one line per word, in the same order so the two files stay aligned.
# Ids start at 1 because the tokenizer never assigns id 0 to a word.
# The with-block guarantees both files are closed even if a write fails.
with io.open('vecs.tsv', 'w', encoding='utf-8') as out_v, \
     io.open('meta.tsv', 'w', encoding='utf-8') as out_m:
  for word_num in range(1, last_word_num + 1):
    word = reverse_word_index[word_num]
    embeddings = weights[word_num]
    out_m.write(word + "\n")
    out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")

In [17]:
# Offer the two TSV files as browser downloads when running inside Google
# Colab; anywhere else the import fails and this step is skipped.
try:
  from google.colab import files
except ImportError:
  pass
else:
  for tsv_name in ('vecs.tsv', 'meta.tsv'):
    files.download(tsv_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## 5. Predict sentiment with new reviews

In [18]:
# answer here
# Use the model to predict the sentiment of a few hand-written reviews
fake_reviews = ['ang ganda sobra',
                'pangit amp',
                'di sakto sakin',
                'pangit hah thank you',
                'ganda parang tanga',
                'tagal dumating',
                'sagwa di bagay',
                'sakto lang']

print(fake_reviews)

# Create the sequences with the same preprocessing the model was trained on
# (max_length, padding_type and trunc_type from the tokenization cell).
# pad_sequences truncates from the front ('pre') unless told otherwise, which
# would keep a different part of an over-long review than training did.
sample_sequences = tokenizer.texts_to_sequences(fake_reviews)
fakes_padded = pad_sequences(sample_sequences, maxlen=max_length,
                             padding=padding_type, truncating=trunc_type)

print('\nHOT OFF THE PRESS! HERE ARE SOME NEWLY MINTED, ABSOLUTELY GENUINE REVIEWS!\n')

classes = model.predict(fakes_padded)

# The closer the class is to 1, the more positive the review is deemed to be
for review, score in zip(fake_reviews, classes):
  print(review)
  print(score)
  print('\n')

['ang ganda sobra', 'pangit amp', 'di sakto sakin', 'pangit hah thank you', 'ganda parang tanga', 'tagal dumating', 'sagwa di bagay', 'sakto lang']

HOT OFF THE PRESS! HERE ARE SOME NEWLY MINTED, ABSOLUTELY GENUINE REVIEWS!

ang ganda sobra
[0.86999255]


pangit amp
[0.314538]


di sakto sakin
[0.45107922]


pangit hah thank you
[0.74498576]


ganda parang tanga
[0.8171108]


tagal dumating
[0.6057337]


sagwa di bagay
[0.45626557]


sakto lang
[0.90785676]


