Giulia Midulla - 23330406

#Preparatory steps

In [3]:
# The %pip commands below are for use if you have to install the packages
# you can comment them out once you have installed them.
# %pip install nltk
# %pip install gensim
import glob # string manipulation for constructing directory paths
import nltk # bring in the Natural Language Tool Kit
import gensim # bring in Gensim
import os # handle Operating System file tasks
import os.path # Used to determine if a file or directory exists
import numpy as np # convenient mathematical handling
from random import shuffle # facility to generate random selections
from nltk.tokenize import TreebankWordTokenizer # String tokenizer
from gensim import models
from gensim.models.keyedvectors import KeyedVectors # word vector mapping
# Keras network construction and handling libraries
from keras.preprocessing import sequence
from keras import layers
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten, SimpleRNN, Conv1D, GlobalMaxPooling1D
print("Necessary packages installed....")

Necessary packages installed....


In [4]:
# Mount Google Drive so the notebook can read and write persistent files.
from google.colab import drive
drive.mount('/content/drive')
print("Google drive mounted....")

# The data, the word vectors and the saved model files are read from / written
# to this Drive folder by the cells below.
working_directory = '/content/drive/MyDrive/Colab_Notebooks/Sem_2/Section_2'

# os.makedirs (unlike os.mkdir) also creates any missing parent folders, so
# this works on a Drive where Colab_Notebooks/Sem_2 does not exist yet.
if not os.path.exists(working_directory):
  os.makedirs(working_directory, exist_ok=True)
  print("Created working directory....")

os.chdir(working_directory)
print("Set current directory to working directory.... :", working_directory)

Mounted at /content/drive
Google drive mounted....
Set current directory to working directory.... : /content/drive/MyDrive/Colab_Notebooks/Sem_2/Section_2


In [5]:
# Extract the Stanford aclImdb review archive if it is still compressed.
# Each step only runs when its input file is present in the working directory.
os.chdir(working_directory)
print("Set current directory to working directory.... :", working_directory)

if os.path.exists('aclImdb_v1.tar.gz') :
  !gunzip 'aclImdb_v1.tar.gz'

if os.path.exists('aclImdb_v1.tar') :
  !tar -xvf 'aclImdb_v1.tar'

# Extract the Google News trained word vectors if you haven't already
if os.path.exists('GoogleNews-vectors-negative300.bin.gz') :
  !gunzip 'GoogleNews-vectors-negative300.bin.gz'

#
# Delete the tarball to save space on Google Drive.
# NOTE: this block is active and does not check that the tar extraction above
# succeeded; comment it out if you want to keep the tarball.
#
if os.path.exists(os.path.join(working_directory, 'aclImdb_v1.tar')) :
   print("Removing Stanford directory tarball to save space....")
   os.remove(os.path.join(working_directory, 'aclImdb_v1.tar'))
   print("Removed")

# Check directory contents
print("The files in my working directory are", os.listdir(working_directory))

Set current directory to working directory.... : /content/drive/MyDrive/Colab_Notebooks/Sem_2/Section_2
The files in my working directory are ['GoogleNews-vectors-negative300.bin', 'aclImdb', 'prova.gdoc', 'prova cartella', 'model.png', 'cnn1_model.json', 'cnn1_weights.h5']


#General Assignment Tasks

In [6]:
def pre_process_data(filepath):
    """
    Load the labelled reviews stored under ``filepath``.

    Every ``*.txt`` file in ``filepath/pos`` is labelled 1 (positive
    sentiment) and every ``*.txt`` file in ``filepath/neg`` is labelled 0
    (negative sentiment).

    Args:
        filepath: directory containing the ``pos`` and ``neg`` sub-directories.

    Returns:
        A list of ``(label, review_text)`` tuples in random order.
    """
    labelled_dirs = ((1, 'pos'), (0, 'neg'))
    dataset = []

    for label, subdir in labelled_dirs:
        for filename in glob.glob(os.path.join(filepath, subdir, '*.txt')):
            # Pin the encoding so the result does not depend on the platform's
            # default locale encoding.
            with open(filename, 'r', encoding='utf-8') as f:
                dataset.append((label, f.read()))

    # Mix positive and negative reviews so that a later positional split
    # does not separate the two classes.
    shuffle(dataset)

    return dataset

In [7]:
# Load the labelled reviews from the "train" folder of the extracted aclImdb data
dataset = pre_process_data(os.path.join(working_directory,"aclImdb",'train'))
# Have a quick look at a sample
dataset[42]

(0,
 'I suppose that today this film has relevance because it was an early Sofia Loren film. She was 19 years old when the film was made in 1953.<br /><br />I viewed this film because I wanted to see some of Sofia Loren\'s early work. I was surprised when she came on camera having had her skin bronzed over in brown makeup to resemble an Ethiopian princess. Surely, today, this would have been viewed as a slur and to be avoided in movie making. It actually became annoying watching Ms. Loren in skin color paint throughout the film.<br /><br />Yes, this film would have been better made if the real opera singers had made this movie. Then, the singing and the actual facial gestures of the real artists would have been apparent. I discount the comments by others about whether the real opera singers are older and heavier in weight.<br /><br />As beautiful as Ms. Loren was at age 19 and still is today, the film would have been better received as though it were being performed on the stage. After

In [8]:
# Load the pretrained Google News word2vec vectors from the working directory.
# `limit` caps how many word vectors are read from the file (presumably to keep
# memory use down); tokens outside that vocabulary are skipped later by
# tokenize_and_vectorize.
w = models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', limit = 20000, binary=True)

In [9]:
def tokenize_and_vectorize(dataset, word_vectors=None, tokenizer=None):
    """
    Turn every review into the list of word vectors of its known tokens.

    Args:
        dataset: iterable of ``(label, text)`` samples; only the text
            (element 1) is read.
        word_vectors: optional mapping from token to vector that raises
            ``KeyError`` for unknown tokens. Defaults to the notebook-level
            word2vec model ``w``.
        tokenizer: optional object with a ``tokenize(text)`` method.
            Defaults to a new ``TreebankWordTokenizer``.

    Returns:
        A list with one entry per sample, each a list of word vectors in
        token order. Tokens missing from ``word_vectors`` are skipped, so an
        entry can be shorter than the token count, or empty.
    """
    if word_vectors is None:
        word_vectors = w  # word2vec vectors loaded earlier in the notebook
    if tokenizer is None:
        tokenizer = TreebankWordTokenizer()

    vectorized_data = []
    for sample in dataset:
        sample_vecs = []
        for token in tokenizer.tokenize(sample[1]):
            try:
                sample_vecs.append(word_vectors[token])
            except KeyError:
                # No matching token in the loaded vocabulary: skip it
                continue
        vectorized_data.append(sample_vecs)

    return vectorized_data

In [10]:
def collect_expected(dataset):
    """Return the target sentiment (element 0) of every sample, in order."""
    return [labelled_sample[0] for labelled_sample in dataset]

In [11]:
# Pad or truncate each sequence of input tokens in each review so that we have
# a fixed input length (**maxlen**) of tokens for network input

def pad_trunc(data, maximumlen, vector_dim=None):
    """
    Pad or truncate each review to exactly ``maximumlen`` word vectors,
    because all the inputs of the CNN need to be of fixed size.

    Args:
        data: list of samples, each a list of word vectors.
        maximumlen: number of vectors every returned sample must contain.
        vector_dim: optional length of a single word vector, used to build
            the zero padding vector. When omitted it is taken from the first
            non-empty sample.

    Returns:
        A new list of new lists. Longer samples are cut to their first
        ``maximumlen`` vectors and shorter ones are extended with zero
        vectors. The input samples are not modified.

    Raises:
        ValueError: if padding is needed but the vector size is unknown
            (every sample is empty and ``vector_dim`` was not given).
    """
    if vector_dim is None:
        # The first sample may be empty (a review with no in-vocabulary
        # tokens), so look for the first sample that has a vector.
        for sample in data:
            if len(sample) > 0:
                vector_dim = len(sample[0])
                break

    zero_vector = None if vector_dim is None else [0.0] * vector_dim

    new_data = []
    for sample in data:
        # Slicing into a fresh list truncates long samples and guarantees the
        # caller's list is never extended in place.
        temp = list(sample[:maximumlen])
        missing = maximumlen - len(temp)
        if missing > 0:
            if zero_vector is None:
                raise ValueError(
                    "Cannot pad: no non-empty sample to infer the vector "
                    "size from; pass vector_dim explicitly."
                )
            # The same zero vector object is reused for every padding slot.
            temp.extend([zero_vector] * missing)
        new_data.append(temp)
    return new_data

In [12]:
vectorized_data = tokenize_and_vectorize(dataset)
expected = collect_expected(dataset)
print("Number of vectorized training data -", len(vectorized_data))

Number of vectorized training data - 25000


In [13]:
# When we no longer need the dataset, we delete it
# This is both good housekeeping and it stops
# Colab from complaining about RAM or disk overrun
del(dataset)

In [14]:
# Hold out the last 20% of the samples for evaluation. pre_process_data has
# already shuffled the samples, so a positional split mixes both classes.
# Only the aclImdb "train" folder was loaded, so the "test" data here is a
# slice of it.
split_point = int(len(vectorized_data)*.8)
x_train = vectorized_data[:split_point]
y_train = expected[:split_point]
x_test = vectorized_data[split_point:]
y_test = expected[split_point:]

print("Number of training vectors - ", len(x_train))

Number of training vectors -  20000


In [15]:
del(vectorized_data) # Clear some more storage!
print("Cleared out unneeded data")

Cleared out unneeded data


In [46]:
# Hyperparameters used below when padding the input, building the CNN and training it
maxlen = 50 # number of word vectors kept per review (reviews are padded/truncated to this)
batch_size = 16 # samples per batch passed to fit()
embedding_dims = 300 # This is fixed as Google News used 300 dimensional vectors
filters = 125 # number of filters in the Conv1D layer
kernel_size = 1 # width of the convolution window, in tokens
hidden_dims = 125 # units in the fully connected hidden layer
epochs = 2 # number of passes over the training data

In [47]:
# Pad or truncate the reviews to maxlen vectors each, then convert the inputs
# to NumPy arrays of shape (number of reviews, maxlen, embedding_dims).
# The labels are converted to NumPy arrays as well.

x_train = pad_trunc(x_train, maxlen)
x_test = pad_trunc(x_test, maxlen)   # Does the same with the test cases

x_train = np.reshape(x_train, (len(x_train), maxlen, embedding_dims))
y_train = np.array(y_train)

x_test = np.reshape(x_test, (len(x_test), maxlen, embedding_dims))
y_test = np.array(y_test)

In [48]:
# Build and compile the CNN: Conv1D -> global max pooling -> dense hidden
# layer -> single sigmoid output.
print('Build CNN model for our MN5002 Advanced NLP course....')
model_cnn1 = Sequential()
model_cnn1.add(Conv1D(
    filters,
    kernel_size,
    padding='valid',
    activation='relu',
    strides=1,
    input_shape=(maxlen, embedding_dims))
)
model_cnn1.add(GlobalMaxPooling1D()) # Keep the maximum of each filter over the sequence
model_cnn1.add(Dense(hidden_dims)) # Fully connected hidden layer
model_cnn1.add(Dropout(0.2)) # A dropout of 0.2 is added here
model_cnn1.add(Activation('relu')) # The Rectified Linear Unit activation function is used here
model_cnn1.add(Dense(1)) # Output layer with a single unit
model_cnn1.add(Activation('sigmoid')) # Apply a sigmoid to the output of that layer to take the output to between 0 and 1
print('Model built')
model_cnn1.summary()
# Binary cross-entropy matches the single sigmoid output and the 0/1 labels
model_cnn1.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
print('Model compiled')

Build CNN model for our MN5002 Advanced NLP course....
Model built
Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_2 (Conv1D)           (None, 50, 125)           37625     
                                                                 
 global_max_pooling1d_2 (Gl  (None, 125)               0         
 obalMaxPooling1D)                                               
                                                                 
 dense_4 (Dense)             (None, 125)               15750     
                                                                 
 dropout_2 (Dropout)         (None, 125)               0         
                                                                 
 activation_4 (Activation)   (None, 125)               0         
                                                                 
 dense_5 (Dense)             (None, 1)               

In [49]:
# Train the CNN; the held-out split (x_test, y_test) is passed as validation data
model_cnn1.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(x_test, y_test))

Epoch 1/2
Epoch 2/2


<keras.src.callbacks.History at 0x7fcb7f29a650>

In [50]:
# Save the trained model in the working directory as two files:
# the architecture as JSON and the weights as HDF5.
os.chdir(working_directory)
model_cnn1_structure = model_cnn1.to_json()
with open("cnn1_model.json", "w") as json_file:
    json_file.write(model_cnn1_structure)
model_cnn1.save_weights("cnn1_weights.h5")

In [51]:
# Reload the saved model from the working directory.
# Import the function that rebuilds a model from its JSON description.


from keras.models import model_from_json

# Now instantiate a model from the saved architecture
# (this replaces the model_cnn1 object trained above)

os.chdir(working_directory)

with open("cnn1_model.json", "r") as json_file:
  json_string = json_file.read()
model_cnn1 = model_from_json(json_string)

# Once the model structure exists, set its characteristic weights

model_cnn1.load_weights('cnn1_weights.h5')
model_cnn1.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_2 (Conv1D)           (None, 50, 125)           37625     
                                                                 
 global_max_pooling1d_2 (Gl  (None, 125)               0         
 obalMaxPooling1D)                                               
                                                                 
 dense_4 (Dense)             (None, 125)               15750     
                                                                 
 dropout_2 (Dropout)         (None, 125)               0         
                                                                 
 activation_4 (Activation)   (None, 125)               0         
                                                                 
 dense_5 (Dense)             (None, 1)                 126       
                                                      

#Main Task 1

In [52]:
test1 = 'this movie is terrible'
print(test1, '- straightforward negative sentiment, used to check if the model works correctly\n')
test2 = 'this movie is wonderful, the best thing I\'ve ever seen I believe it to be an undisputed masterpiece'
print(test2, '- straightforward positive sentiment, used to check if the model works correctly\n')
test3 = 'this isn\'t the best movie of all time'
print(test3, '- negation of a positive sentiment, used to check the model\'s reaction\n')
test4 = 'this movie is as good as The Room'
print(test4, '- \"The Room\" is considered to be one of the worst movies ever made. This test is a trick\n')
test5 = 'how this movie is considered good will never fail to surprise me'
print(test5, '- different type of trick: the reviewer is admitting that the movie is generally considered good, but they have an opposing opinion\n')
test6 = 'watching this movie will make you feel like holding a puppy or eating chocolate'
print(test6, '- not explicit positive review\n')
test7 = 'Parasite is best described as a melancholy ghost story, albeit one disguised beneath umpteen layers of superbly designed (and impeccably photographed) generic mutations. Thrillingly played by a flawless ensemble cast who hit every note and harmonic resonance of Bong and co-writer Han Jin-won’s multitonal script, it’s a tragicomic masterclass that will get under your skin and eat away at your cinematic soul.'
print(test7, '- long, glowing review (source: The Guardian)')


this movie is terrible - straightforward negative sentiment, used to check if the model works correctly

this movie is wonderful, the best thing I've ever seen I believe it to be an undisputed masterpiece - straightforward positive sentiment, used to check if the model works correctly

this isn't the best movie of all time - negation of a positive sentiment, used to check the model's reaction

this movie is as good as The Room - "The Room" is considered to be one of the worst movies ever made. This test is a trick

how this movie is considered good will never fail to surprise me - different type of trick: the reviewer is admitting that the movie is generally considered good, but they have an opposing opinion

watching this movie will make you feel like holding a puppy or eating chocolate - not explicit positive review

Parasite is best described as a melancholy ghost story, albeit one disguised beneath umpteen layers of superbly designed (and impeccably photographed) generic mutations.

#Main Task 2

##Test 5

After a few attempts, it looks like the test that yielded the most accurate results was test 2, where all the hyperparameters were halved. In this attempt I halved all the hyperparameters except for the number of epochs, to avoid undertraining the model.

In [53]:
# Make a sample
sample_2 ="This is one of the worst movies I have ever seen. I am surprised anyone has recommended it, it is as bad as they get."

# Tokenize and vectorize the sample. The label in the tuple (1) is only a
# placeholder: tokenize_and_vectorize reads just the text.
vec_list = tokenize_and_vectorize([(1, sample_2)])

# Convert the input to a fixed input length by padding or truncating
test_vec_list = pad_trunc(vec_list, maxlen)

# Shape the input as (samples, maxlen, embedding_dims) for the model
test_vec = np.reshape(test_vec_list, (len(test_vec_list), maxlen, embedding_dims))

# Present the prediction: the sigmoid output is near 1 for positive, near 0 for negative
model_cnn1.predict(test_vec)




array([[0.15233505]], dtype=float32)

Negative - OK

This is an explicitly negative review, and the result is consistent with that.

In [54]:
# Tokenize and vectorize test1. The label in the tuple (1) is only a
# placeholder: tokenize_and_vectorize reads just the text.
vec_list = tokenize_and_vectorize([(1, test1)])

# Convert the input to a fixed input length by padding or truncating
test_vec_list = pad_trunc(vec_list, maxlen)

# Shape the input as (samples, maxlen, embedding_dims) for the model
test_vec = np.reshape(test_vec_list, (len(test_vec_list), maxlen, embedding_dims))

# Present the prediction: the sigmoid output is near 1 for positive, near 0 for negative
model_cnn1.predict(test_vec)



array([[0.13452852]], dtype=float32)

Negative - OK

Another explicitly negative review, correctly recognised.

In [55]:
# Tokenize and vectorize test2. The label in the tuple (1) is only a
# placeholder: tokenize_and_vectorize reads just the text.
vec_list = tokenize_and_vectorize([(1, test2)])

# Convert the input to a fixed input length by padding or truncating
test_vec_list = pad_trunc(vec_list, maxlen)

# Shape the input as (samples, maxlen, embedding_dims) for the model
test_vec = np.reshape(test_vec_list, (len(test_vec_list), maxlen, embedding_dims))

# Present the prediction: the sigmoid output is near 1 for positive, near 0 for negative
model_cnn1.predict(test_vec)



array([[0.9658818]], dtype=float32)

Positive - OK

This review is explicitly positive, and the result is very close to 1.

In [56]:
# Tokenize and vectorize test3. The label in the tuple (1) is only a
# placeholder: tokenize_and_vectorize reads just the text.
vec_list = tokenize_and_vectorize([(1, test3)])

# Convert the input to a fixed input length by padding or truncating
test_vec_list = pad_trunc(vec_list, maxlen)

# Shape the input as (samples, maxlen, embedding_dims) for the model
test_vec = np.reshape(test_vec_list, (len(test_vec_list), maxlen, embedding_dims))

# Present the prediction: the sigmoid output is near 1 for positive, near 0 for negative
model_cnn1.predict(test_vec)



array([[0.88083464]], dtype=float32)

Negative - NO

This review is confusing: it is not meant to be strongly negative, but it is constructed in a way that could confuse the model. The review says that this is NOT the best movie of all time, but the model has recognised it as strongly positive.

In [57]:
# Tokenize and vectorize test4. The label in the tuple (1) is only a
# placeholder: tokenize_and_vectorize reads just the text.
vec_list = tokenize_and_vectorize([(1, test4)])

# Convert the input to a fixed input length by padding or truncating
test_vec_list = pad_trunc(vec_list, maxlen)

# Shape the input as (samples, maxlen, embedding_dims) for the model
test_vec = np.reshape(test_vec_list, (len(test_vec_list), maxlen, embedding_dims))

# Present the prediction: the sigmoid output is near 1 for positive, near 0 for negative
model_cnn1.predict(test_vec)



array([[0.85251075]], dtype=float32)

Negative - NO

This is another confusing review: the tone is ironic, with the reviewer saying it is as good as a movie that's universally considered to be terrible. The model fell into the trap and recognised it as a positive review, not knowing the pop-culture reference.

In [58]:
# Tokenize and vectorize test5. The label in the tuple (1) is only a
# placeholder: tokenize_and_vectorize reads just the text.
vec_list = tokenize_and_vectorize([(1, test5)])

# Convert the input to a fixed input length by padding or truncating
test_vec_list = pad_trunc(vec_list, maxlen)

# Shape the input as (samples, maxlen, embedding_dims) for the model
test_vec = np.reshape(test_vec_list, (len(test_vec_list), maxlen, embedding_dims))

# Present the prediction: the sigmoid output is near 1 for positive, near 0 for negative
model_cnn1.predict(test_vec)



array([[0.6730415]], dtype=float32)

Split - NO

In this case, the review states that the movie is considered good, but the reviewer disagrees with this sentiment. The model doesn't understand the disagreement and recognises the review as positive, though not strongly.

In [59]:
# Tokenize and vectorize test6. The label in the tuple (1) is only a
# placeholder: tokenize_and_vectorize reads just the text.
vec_list = tokenize_and_vectorize([(1, test6)])

# Convert the input to a fixed input length by padding or truncating
test_vec_list = pad_trunc(vec_list, maxlen)

# Shape the input as (samples, maxlen, embedding_dims) for the model
test_vec = np.reshape(test_vec_list, (len(test_vec_list), maxlen, embedding_dims))

# Present the prediction: the sigmoid output is near 1 for positive, near 0 for negative
model_cnn1.predict(test_vec)



array([[0.55929637]], dtype=float32)

Positive - NO

This review is very positive, but not explicit. The reviewer compares watching the movie to a series of pleasurable activities, but it is never outwardly stated that the movie is good and there are no positive adjectives. The model recognises it as a neutral review.

In [60]:
# Tokenize and vectorize test7. The label in the tuple (1) is only a
# placeholder: tokenize_and_vectorize reads just the text.
vec_list = tokenize_and_vectorize([(1, test7)])

# Convert the input to a fixed input length by padding or truncating
# (test7 is a long review; anything beyond maxlen vectors is cut off)
test_vec_list = pad_trunc(vec_list, maxlen)

# Shape the input as (samples, maxlen, embedding_dims) for the model
test_vec = np.reshape(test_vec_list, (len(test_vec_list), maxlen, embedding_dims))

# Present the prediction: the sigmoid output is near 1 for positive, near 0 for negative
model_cnn1.predict(test_vec)



array([[0.7029168]], dtype=float32)

Positive - OK

This review is very positive. The model recognises it as such, but doesn't understand how strongly positive it is.