<img src="header.png" align="left"/>

# Exercise Sentiment Classification

The goal of this example is to classify the sentiment of movie reviews as positive or negative. The same approach can be used to classify, for example, social media postings.

Parts of the example are taken from [1]. The code uses the GloVe model [2].

- [1] [https://stackabuse.com/python-for-nlp-movie-sentiment-analysis-using-deep-learning-in-keras/](https://stackabuse.com/python-for-nlp-movie-sentiment-analysis-using-deep-learning-in-keras/)
- [2] [https://nlp.stanford.edu/pubs/glove.pdf](https://nlp.stanford.edu/pubs/glove.pdf)


Citation GloVe [4] and dataset [5]:
```
[4] Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014. GloVe: Global Vectors for Word Representation.

[5] Maas, Andrew L.  and  Daly, Raymond E.  and  Pham, Peter T.  and  Huang, Dan  and  Ng, Andrew Y.  and  Potts, Christopher, Learning Word Vectors for Sentiment Analysis, Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies, June 2011, Portland, Oregon, USA, Association for Computational Linguistics, http://www.aclweb.org/anthology/P11-1015

```

**NOTE**

Document your results by simply adding a markdown cell or a python cell (as comment) and writing your statements into this cell. For some tasks the result cell is already available.


# Import of Modules

In [None]:
#
# Import of modules
#
import os
import re
import string
from urllib.request import urlretrieve
import tarfile
import zipfile
from glob import glob

import pandas as pd
import numpy as np
from numpy import array
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers.core import Activation, Dropout, Dense, SpatialDropout1D
from keras.layers import Flatten, LSTM
from keras.layers import GlobalMaxPooling1D
from keras.layers.embeddings import Embedding
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer

In [None]:
#
# Silence warnings (this filters warnings only; exceptions are still raised)
#
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)
# ignore every warning category; Warning is the base class of FutureWarning,
# so this filter also covers the one registered above
simplefilter(action='ignore', category=Warning)

In [None]:
#
# GPU support
#
import tensorflow as tf
print ( tf.__version__ ) 

# restrict TensorFlow log output to the ERROR level
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR )
physical_devices = tf.config.experimental.list_physical_devices('GPU')
if len(physical_devices) > 0:
    # memory growth is switched on for the first listed GPU only
    tf.config.experimental.set_memory_growth(physical_devices[0], True)
# additionally register a TF1-compat session with allow_growth as the Keras backend session
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
tf.compat.v1.keras.backend.set_session(tf.compat.v1.Session(config=config))

# Constants

In [None]:
#
# Source URL of the IMDB review archive and the local locations derived from it
#
urlDataSource = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
localExtractionFolder = 'data/moviereviews'
# archive file and extracted text folder both live below the extraction folder
localDataArchive = '{}/{}'.format(localExtractionFolder, 'aclImdb_v1.tar.gz')
textData = '{}/aclImdb/'.format(localExtractionFolder)

# Support functions

In [None]:
#
# Load data from URL
#
def download_dataset(url,dataset_file_path,extraction_directory):
    """Download ``url`` to ``dataset_file_path`` unless that file already exists.

    Parameters
    ----------
    url : str
        Location of the archive to fetch.
    dataset_file_path : str
        Target file; if it already exists nothing is downloaded.
    extraction_directory : str
        Folder that is created (including parents) when missing.

    Returns
    -------
    None

    Any exception raised by ``urlretrieve`` is propagated to the caller.
    """
    os.makedirs(extraction_directory, exist_ok=True)
    if os.path.exists(dataset_file_path):
        print("archive already downloaded.")
        return

    print("started loading archive from url {}".format(url))
    # Download into a temporary sibling file first: an interrupted transfer must
    # not leave a truncated file under the final name, because the existence
    # check above would accept it as a complete archive on the next run.
    partial_path = dataset_file_path + ".part"
    try:
        urlretrieve(url, partial_path)
        os.replace(partial_path, dataset_file_path)
    finally:
        # only still present when the download or the rename failed
        if os.path.exists(partial_path):
            os.remove(partial_path)
    print("finished loading archive from url {} to {}".format(url,dataset_file_path))

def extract_dataset(dataset_file_path, extraction_directory):
    """Extract a tar archive into ``extraction_directory``.

    Parameters
    ----------
    dataset_file_path : str
        Archive to unpack. Names ending in ``tar.gz`` / ``.tgz`` are opened as
        gzip-compressed tar, names ending in ``tar`` as plain tar.
    extraction_directory : str
        Destination folder; created (including parents) when missing.

    Returns
    -------
    None

    For any other file name nothing is extracted and a message saying so is
    printed. Errors raised by ``tarfile`` are propagated.
    """
    os.makedirs(extraction_directory, exist_ok=True)

    if dataset_file_path.endswith(("tar.gz", ".tgz")):
        mode = "r:gz"
    elif dataset_file_path.endswith("tar"):
        mode = "r:"
    else:
        # do not claim success when nothing was unpacked
        print("unsupported archive type, nothing extracted from {}".format(dataset_file_path))
        return

    # the context manager closes the archive even if extraction fails
    with tarfile.open(dataset_file_path, mode) as tar:
        if hasattr(tarfile, "data_filter"):
            # the archive comes from the network: refuse members that would be
            # written outside of the destination folder (Python versions with
            # extraction filters only)
            tar.extractall(path=extraction_directory, filter="data")
        else:
            tar.extractall(path=extraction_directory)
    print("extraction of dataset from {} to {} done.".format(dataset_file_path,extraction_directory) )

# Load the data

In [None]:
#
# Download the review archive; nothing is fetched if the archive file already exists
#
download_dataset(urlDataSource,localDataArchive,localExtractionFolder)

In [None]:
#
# Extract the downloaded archive into the local extraction folder
#
extract_dataset(localDataArchive,localExtractionFolder)

# How are the files organized on the file system?

Take a quick look at how the files are organized on the file system.


In [None]:
#
# Collect data from the files
#
def load_texts_labels_from_folders(path, folders):
    """Read all text files below ``path/<folder>`` and label them by folder position.

    Parameters
    ----------
    path : str
        Base directory containing one sub folder per class.
    folders : sequence of str
        Sub folder names; the position of a name in this sequence is used as
        the integer label of every file found in that folder.

    Returns
    -------
    (list of str, numpy.ndarray)
        The file contents and the matching labels as an ``int8`` array, grouped
        by folder in the given order and sorted by file name within a folder.
    """
    print('scanning path {}'.format(path))
    texts,labels = [],[]
    for idx,label in enumerate(folders):
        print('scanning {}'.format(idx))
        # glob returns files in arbitrary order; sort so every run sees the same sequence
        for fname in sorted(glob(os.path.join(path, label, '*.*'))):
            # close every file right after reading (thousands of files are opened)
            # and decode explicitly instead of relying on the platform default
            with open(fname, 'r', encoding='utf-8') as review_file:
                texts.append(review_file.read())
            labels.append(idx)
    return texts, np.array(labels).astype(np.int8)

In [None]:
#
# Loading of positive and negative examples
# (the position in `classes` becomes the label: 'neg' -> 0, 'pos' -> 1)
#
classes = ['neg','pos']
x_train,y_train = load_texts_labels_from_folders( textData + 'train', classes)
x_test,y_test = load_texts_labels_from_folders( textData + 'test', classes)

# First checks on the data

In [None]:
#
# Check number of texts and labels for the train and test split
# (texts and labels of one split must have the same count)
#
len(x_train),len(y_train),len(x_test),len(y_test)

In [None]:
#
# Check container types of the training texts and the training labels
#
(type(x_train),type(y_train))

In [None]:
#
# Check which distinct label values occur in the training labels
#
np.unique(y_train)

In [None]:
#
# Print the first training example with its label
# (range(0,1) yields index 0 only; the 'neg' folder is loaded first)
#
for index in range (0,1):
    print(x_train[index])
    print("label {}".format(y_train[index]))
    print()

In [None]:
#
# Print the training example at index 13001 with its label
# NOTE(review): presumably a positive review because the 'neg' folder is loaded
# first; this relies on train/neg holding at most 13001 files - confirm
#
for index in range (13001,13002):
    print(x_train[index])
    print("label {}".format(y_train[index]))
    print()


# Task: Clean text (1 point)

Write a function called preprocess_text(text) which takes a text piece and **cleans out** the following artifacts:

1. html tags, but leave text between tags intact
1. punctuation and numbers
1. single characters
1. multiple white spaces

In [None]:
#
# Result: the cleaner
#
def preprocess_text(sen):
    """Clean a single review text (exercise placeholder).

    The task asks for removal of html tags, punctuation and numbers, single
    characters and repeated white space. None of this is implemented yet: the
    body currently hands the input back unchanged.

    Parameters
    ----------
    sen : the text to clean

    Returns
    -------
    The cleaned text (at present identical to ``sen``).
    """
    sentence = sen
    # ...
    return sentence

In [None]:
#
# Run the cleaner over every review of both splits
#
x_train_clean = [preprocess_text(review) for review in x_train]
x_test_clean = [preprocess_text(review) for review in x_test]

# from here on the raw texts are replaced by their cleaned versions
x_train = x_train_clean
x_test = x_test_clean

# Find mean text length

In [None]:
#
# Number of characters of every training text
#
textLength = [len(text) for text in x_train]
lengthArray = np.array(textLength)

#
# Histogram of the lengths and their mean
#
plt.hist(textLength)
print('text character length mean {}'.format(lengthArray.mean()))

# Convert words into tokens

In [None]:
#
# Split text up into tokens
# (lower-casing enabled, num_words set to 10000, 'unknwn' used as out-of-vocabulary token)
#
tokenizer = Tokenizer(num_words=10000, lower=True, oov_token='unknwn')
#
# Train tokenizer on the training texts only; the test texts are not used here
#
tokenizer.fit_on_texts(x_train)

In [None]:
#
# Convert words into integer sequences
# (both splits are converted with the tokenizer fitted on the training texts)
#
x_train_v = tokenizer.texts_to_sequences(x_train)
x_test_v = tokenizer.texts_to_sequences(x_test)

In [None]:
# check original sentence: first training text and its length in characters
print(x_train[0], len(x_train[0]) )

In [None]:
# check token sequence: integer sequence of the first training text and its length in tokens
print(x_train_v[0], len(x_train_v[0]))

In [None]:
# reverse tokens to text for check
# (sequences_to_texts takes a list of sequences, hence the wrapping list)
text = tokenizer.sequences_to_texts([x_train_v[0]])
print(text)

In [None]:
#
# Number of tokens (aka words) of every training sequence
#
textLength = [len(sequence) for sequence in x_train_v]
lengthArray = np.array(textLength)

#
# Histogram of the lengths and their mean
#
plt.hist(textLength)
print('vectorized length mean {}'.format(lengthArray.mean()))

In [None]:
#
# Get size of vocabulary of tokenizer
# (number of entries in word_index plus one; presumably the extra slot accounts
# for index 0, which is not assigned to a word - confirm against the Tokenizer docs)
#
vocab_size = len(tokenizer.word_index) + 1
print('count of words {}'.format(vocab_size))

# Task: select a proper maximum length of text (1 point)

Set maxlen to a suitable value for the text length. Longer text sequences are cut off, shorter sequences are padded.

In [None]:
#maxlen = ???

In [None]:
#
# Pad sequences
# (requires `maxlen` to be set in the task cell before this one;
# padding='post' places the padding values after the tokens)
#
x_train_v = pad_sequences(x_train_v, padding='post', maxlen=maxlen)
x_test_v = pad_sequences(x_test_v, padding='post', maxlen=maxlen)

# Download Glove models

In [None]:
gloveUrl = 'http://nlp.stanford.edu/data/glove.6B.zip'
gloveExtractionFolder = 'data/glove'
gloveDataArchive = gloveExtractionFolder + '/glove.6B.zip'

#
# Select 100 dims for embedding space
# (the vector file name is derived from gloveDims so that changing the
# dimension in one place also selects the matching glove file)
#
gloveDims = 100
gloveData = gloveExtractionFolder + '/' + 'glove.6B.{}d.txt'.format(gloveDims)

In [None]:
def unzip_dataset(dataset_file_path, extraction_directory):
    """Extract a zip archive into ``extraction_directory``.

    Parameters
    ----------
    dataset_file_path : str
        Path of the zip archive.
    extraction_directory : str
        Destination folder; created (including parents) when missing.

    Returns
    -------
    None

    Errors raised by ``zipfile`` (missing or damaged archive) are propagated.
    """
    os.makedirs(extraction_directory, exist_ok=True)
    # context manager closes the archive handle; the local name no longer
    # shadows the builtin zip()
    with zipfile.ZipFile(dataset_file_path) as archive:
        archive.extractall(path=extraction_directory)
    print("extraction of dataset from {} to {} done.".format(dataset_file_path,extraction_directory) )

In [None]:
#
# Execute download (skipped when the selected glove vector file is already present)
#
if ( not os.path.exists(gloveData)):
    download_dataset(gloveUrl,gloveDataArchive,gloveExtractionFolder)

In [None]:
#
# Unzip glove (skipped when the selected glove vector file is already present)
#
if ( not os.path.exists(gloveData)):
    unzip_dataset(gloveDataArchive,gloveExtractionFolder)

# Load glove embeddings into memory

In [None]:
#
# Create dict of glove vectors for each word in glove model
# (each line of the file is: word followed by its vector components)
#
embeddings_dictionary = dict()
# the context manager closes the file even when a line fails to parse
with open(gloveData, encoding="utf8") as glove_file:
    for line in glove_file:
        records = line.split()
        if not records:
            # tolerate blank lines instead of failing on records[0]
            continue
        word = records[0]
        vector_dimensions = np.asarray(records[1:], dtype='float32')
        embeddings_dictionary [word] = vector_dimensions

In [None]:
#
# Build the embedding weights: the row at a word's tokenizer index receives the
# glove vector of that word; words without a glove vector keep an all-zero row
#
embedding_matrix = np.zeros((vocab_size, gloveDims))
for token, row in tokenizer.word_index.items():
    glove_vector = embeddings_dictionary.get(token)
    if glove_vector is None:
        continue
    embedding_matrix[row] = glove_vector

In [None]:
# check shape: the matrix was allocated as (vocab_size, gloveDims)
print(embedding_matrix.shape)

# Print some examples of glove vectors for words (1 point)

Select some random words from the tokenizer and print the glove vectors for those words.

In [None]:
#
# Result:
#

# Create a simple model

**Note** how the embedding_matrix is used in the first layer to embed the token integers into vectors.

In [None]:
def createNNModel():
    """Build the baseline classifier (not compiled).

    Layer stack: embedding initialised from ``embedding_matrix`` and excluded
    from training, flatten, dense layer with 100 sigmoid units, dropout of 0.2
    and a single sigmoid output unit.

    Reads the notebook globals ``vocab_size``, ``gloveDims``,
    ``embedding_matrix`` and ``maxlen``.

    Returns
    -------
    The assembled ``Sequential`` model.
    """
    return Sequential([
        Embedding(vocab_size, gloveDims, weights=[embedding_matrix], input_length=maxlen , trainable=False),
        Flatten(),
        Dense(100, activation='sigmoid'),
        Dropout(0.2),
        Dense(1, activation='sigmoid'),
    ])

In [None]:
model = createNNModel()
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" line to the output
model.summary()

In [None]:
# validation_split takes the LAST 20% of the arrays exactly as passed in. The
# samples are ordered by class (all 'neg' first, then all 'pos'), so without
# shuffling the validation part would contain positive reviews only.
shuffled_idx = np.random.RandomState(42).permutation(len(x_train_v))
history = model.fit(x_train_v[shuffled_idx], y_train[shuffled_idx], batch_size=128, epochs=12, verbose=1, validation_split=0.2)

In [None]:
# evaluate on the test split; score[0] is read as loss and score[1] as accuracy in the next cell
score = model.evaluate(x_test_v, y_test, verbose=1)

In [None]:
# report the two values returned by model.evaluate for the test split
print("test loss:", score[0])
print("test accuracy:", score[1])

In [None]:
def plotResults(history):
    """Plot accuracy and loss per epoch for the training and the validation part.

    Parameters
    ----------
    history :
        Object returned by ``model.fit``; its ``history`` dict must contain the
        keys 'acc', 'val_acc', 'loss' and 'val_loss'.

    Returns
    -------
    None; two figures are shown, accuracy first and loss second.
    """
    for metric, label in (('acc', 'accuracy'), ('loss', 'loss')):
        plt.plot(history.history[metric])
        plt.plot(history.history['val_' + metric])
        plt.title('model ' + label)
        plt.ylabel(label)
        plt.xlabel('epoch')
        # the second curve is the validation split of fit(), not the test set
        plt.legend(['train','validation'], loc='upper left')
        plt.show()

In [None]:
# plot the training curves of the most recent fit() call
plotResults(history)

# Save model

In [None]:
#
# Save a model for later use
#
from keras.models import model_from_json

# common path prefix of the two files written for a saved model
prefix = 'results/02_'
modelName = '{}model.json'.format(prefix)
weightName = '{}model.h5'.format(prefix)


def handle_model(model,save_model):
    """Save ``model`` to disk, or load the previously saved model instead.

    The architecture is stored as JSON in ``modelName`` and the weights as
    HDF5 in ``weightName`` (both are notebook globals).

    Parameters
    ----------
    model :
        Model to save; ignored when ``save_model`` is false.
    save_model : bool
        True: write ``model`` to disk and return it.
        False: rebuild the model from the files on disk and return that one.

    Returns
    -------
    ``model`` itself after saving, otherwise the model restored from disk.
    NOTE(review): the restored model is not compiled here - compile it before
    calling fit/evaluate.

    Loading raises the usual file errors when nothing was saved before.
    """
    if save_model:
        # the target folder may not exist yet (first run); create it on demand
        target_dir = os.path.dirname(modelName)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        with open( modelName , "w") as json_file:
            json_file.write(model.to_json())
        # serialize weights to HDF5
        model.save_weights( weightName )
        print("saved model to disk as {} {}".format(modelName,weightName))
        return model

    # load model (has to be saved before, is not part of git)
    with open(modelName, 'r') as json_file:
        loaded_model = model_from_json(json_file.read())
    # load weights into new model
    loaded_model.load_weights(weightName)
    print("loaded model from disk")
    return loaded_model
    
#
# Load or save model
# (True saves the current model; pass False to load the stored model instead)
#
model = handle_model(model,True)

# Task: Improved model based on LSTMs (2 points)

The previous model reaches a test accuracy of around 70%. This is not sufficient for your customer, so we need a better model. Search the internet for sentiment analysis models using LSTMs and implement a better version of the model based on this information.

1. Implement an LSTM based model version for sentiment analysis (you can also use a different model if you find publications for it)
1. Document the sources you have found
1. Test the model in comparison to the older model version

In [None]:
#
# Result: new model
#
def createLSTMModel():
    """Build and compile the improved model (exercise placeholder).

    The layers still have to be added where the ``# ...`` marker is; as it
    stands an empty ``Sequential`` is compiled with the adam optimizer, binary
    cross-entropy loss and the 'acc' metric.

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    model = Sequential()
    
    # ...
    
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
    return model

In [None]:
model2 = createLSTMModel()
model2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" line to the output
model2.summary()

In [None]:
# validation_split takes the LAST 20% of the arrays exactly as passed in. The
# samples are ordered by class (all 'neg' first, then all 'pos'), so without
# shuffling the validation part would contain positive reviews only.
shuffled_idx2 = np.random.RandomState(42).permutation(len(x_train_v))
history = model2.fit(x_train_v[shuffled_idx2], y_train[shuffled_idx2], batch_size=128, epochs=30, verbose=1, validation_split=0.2)
score = model2.evaluate(x_test_v, y_test, verbose=1)

In [None]:
# report the two values returned by model2.evaluate for the test split
print("test loss:", score[0])
print("test accuracy:", score[1])

In [None]:
# plot the training curves of the most recent fit() call
plotResults(history)

In [None]:
# save model2; handle_model writes to the fixed modelName / weightName files,
# so this replaces the files written for the first model
model2 = handle_model(model2,True)

# Task: Replace 100 d model with 300 d model for embedding (2 points)

Try a better embedding model with 300 dimensions instead of the 100-dimensional model. Load the corresponding GloVe weights, then update the vector matrix for the embedding layer and the model structure to match the larger GloVe model.


In [67]:
#
# Result: Code and accuracy of new model
#

# Task: Replace Glove model with BERT model vectors (2 points)

Try to replace GloVe with a BERT model. This is not an easy task. Search the internet for tutorials on this topic and write down all the changes you would need to implement (concept only, implementation optional).



In [None]:
#
# Result: Concept for switching from Glove to BERT
#

# Test with your own data

In [None]:
# pick one cleaned test review (index 56) and show it
instance = x_test_clean[56]
print(instance)

In [None]:
def sentiment(text):
    """Rate a single text with ``model2`` and attach a verbal comment.

    Parameters
    ----------
    text : str
        The text to rate.

    Returns
    -------
    (prediction, comment)
        ``prediction`` is the raw output of ``model2.predict`` for the text,
        ``comment`` is 'very good' (> 0.85), 'good' (> 0.75),
        'moderate' (> 0.50) or 'meh' otherwise.

    Reads the notebook globals ``tokenizer``, ``maxlen`` and ``model2``.
    """
    # apply the same cleaning that the training texts went through
    cleaned = preprocess_text(text)
    # texts_to_sequences expects a LIST of texts. Passing the bare string made
    # it treat every character as a separate text, so the model was fed single
    # letters instead of words.
    sequences = tokenizer.texts_to_sequences([cleaned])
    instance = pad_sequences(sequences, padding='post', maxlen=maxlen)
    prediction = model2.predict(instance)

    comment = 'meh'
    if prediction > 0.85:
        comment = 'very good'
    elif prediction > 0.75:
        comment = 'good'
    elif prediction > 0.50:
        comment = 'moderate'
    return prediction,comment

In [None]:
# hand-written example with negative wording
test1 = "I simply don't like this film."
print ( sentiment(test1))

In [None]:
# hand-written example with strongly negative wording
test1 = "I hate this film."
print ( sentiment(test1))

In [None]:
# hand-written example with positive wording
test1 = "I love this film."
print ( sentiment(test1))

In [None]:
# rate a cleaned review taken from the test split (index 13000)
test1 = x_test_clean[13000]
print ( sentiment(test1))