### Template for NLP project

The aim of the project is to achieve the following:
 - Train a neural network that is **at least better than random guessing** on your dataset. The template contains the IMDB dataset for sentiment analysis, however, you can choose any other language related data set with the appropriate NLP task.
 - Investigate different neural network architectures (different hyperparameters, different layers, different pre-processing). Explain in the presentation, why the final network was selected! **Do not rely on black-box mechanisms.**
 

In [1]:
# tensorflow modules
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, Dense, SimpleRNN, LayerNormalization
import tensorflow

from nltk.corpus import stopwords

# if you have installed a different version, replace 'r2.6'  with your version in links provided below
print(tensorflow.__version__)

2.9.0


In [2]:
# load imdb dataset
# links to dataset
# original dataset: https://ai.stanford.edu/~amaas/data/sentiment/
# version in tensorflow: https://www.tensorflow.org/versions/r2.6/api_docs/python/tf/keras/datasets/imdb

# select your vocabulary size
vocabularySize = 5000
# load data (it is already pre-processed)
# optional: add other pre.processing steps like stopword removal
(xTrain, yTrain), (xTest, yTest) = imdb.load_data(num_words=vocabularySize)
print('Loaded dataset with {} training samples, {} test samples'.format(len(xTrain), len(xTest)))

# look at the data
print('---review---')
print(xTrain[123])
print(xTrain[124])
print('---label---')
print(yTrain[123])

# look at the respective words
word2id = imdb.get_word_index()
id2word = {i: word for word, i in word2id.items()}
print('---review with words---')
print([id2word.get(i, ' ') for i in xTrain[123]])


# other related dataset already in tensorflow:  reuters newswire classification dataset
# see https://www.tensorflow.org/versions/r2.6/api_docs/python/tf/keras/datasets/reuters

Loaded dataset with 25000 training samples, 25000 test samples
---review---
[1, 307, 5, 1301, 20, 1026, 2511, 87, 2775, 52, 116, 5, 31, 7, 4, 91, 1220, 102, 13, 28, 110, 11, 6, 137, 13, 115, 219, 141, 35, 221, 956, 54, 13, 16, 11, 2714, 61, 322, 423, 12, 38, 76, 59, 1803, 72, 8, 2, 23, 5, 967, 12, 38, 85, 62, 358, 99]
[1, 518, 12, 304, 6, 22, 231, 1300, 40, 2, 8, 721, 15, 1727, 117, 142, 15, 955, 2, 5, 2, 2, 15, 2, 6, 87, 20, 42, 6, 87, 229, 83, 6, 991, 31, 18, 4, 2088, 10, 10, 45, 24, 43, 15, 1660, 4669, 65, 47, 195, 3549, 5, 2, 231, 12, 878, 18, 60, 4, 91, 2, 7, 907, 8, 717, 2304, 60, 711, 309, 161, 2396, 38, 78, 45, 89, 2, 2, 4669, 2518, 89, 29, 2, 4, 1511, 83, 268, 58, 15, 2, 4, 3537, 199, 6, 1114, 2, 5, 6, 2, 2, 11, 940, 10, 10, 2, 717, 2, 136, 9, 17, 633, 1307, 4, 20, 4608, 19, 6, 2, 2455, 4764, 1062, 60, 151, 45, 1082, 702, 885, 2699, 1993, 5, 12, 2, 33, 57, 329, 74, 2, 234, 4, 370, 2, 143, 4, 2, 2, 7, 4, 4909, 1455, 40, 12, 9, 49, 243, 7, 2, 2, 2, 18, 4665, 2, 2, 665, 2, 4, 2, 

# Preprocessing

In [3]:
from nltk.corpus import stopwords

In [4]:
#load Stopwords
stopWords = set(stopwords.words('english'))
print(stopWords)

{'i', "it's", 'be', 'hers', 'myself', 'while', 'between', "weren't", 'had', 'hadn', "doesn't", "don't", 'wasn', 'some', 'ourselves', 'about', 'on', 'nor', 'or', 'yourselves', 'through', 'few', "won't", 'am', "hadn't", "mightn't", 'then', 'any', 'down', 'don', 's', 'both', 'above', "wouldn't", 'into', 'having', 'yourself', 'from', 'before', 'being', 'm', 'who', 'been', 'herself', "couldn't", 'below', 'there', 'over', 'shouldn', 'did', 'can', 'your', 'with', 'mightn', 'where', "wasn't", 'out', 'very', 'ma', "hasn't", 'shan', 'd', 'needn', 'too', 'because', 'up', 'his', 'couldn', 'all', 'didn', 'hasn', 'whom', 'haven', 'we', 'against', 'further', 'me', 'own', 've', 'should', 'is', "you're", 'that', 'off', 'aren', 'and', 'those', 'not', 'during', 'by', 'y', 'as', 'was', "needn't", 'only', 'ours', "she's", 'an', 'himself', 'here', 'her', 'such', 'has', 'itself', 'of', "shan't", 'which', 'you', "you've", 'were', "didn't", 'wouldn', 'same', 'the', 'a', "you'd", 'in', "you'll", 'ain', 'now', '

In [5]:
stopWordNumbers = []

for word in stopWords:
    stopwordId = word2id.get(word, 0)
    stopWordNumbers.append(stopwordId)

print(stopWordNumbers)

[10, 42, 27, 6139, 543, 134, 197, 1170, 66, 0, 149, 89, 29877, 46, 3144, 41, 20, 882, 39, 9888, 140, 168, 525, 241, 1866, 52778, 92, 98, 177, 1558, 587, 196, 749, 583, 80, 257, 621, 36, 156, 109, 1980, 34, 74, 762, 423, 1905, 47, 117, 0, 119, 67, 126, 16, 0, 118, 283, 43, 52, 8634, 1478, 41501, 1092, 0, 96, 85, 53, 24, 26232, 29, 15496, 41026, 934, 19932, 72, 426, 1034, 69, 202, 13340, 141, 6, 332, 12, 122, 0, 2, 145, 21, 312, 31, 5132, 14, 13, 12421, 61, 11292, 439, 32, 306, 130, 38, 138, 44, 407, 4, 24086, 60, 22, 871, 68, 158, 39964, 169, 1, 3, 1387, 8, 487, 0, 147, 135, 710, 215, 18, 65, 260, 9127, 20781, 9, 5460, 171, 131, 254, 71, 25, 88, 11, 35, 1613, 277, 124, 0, 95, 48, 86, 58, 8725, 771, 530, 45, 6444, 827, 56, 793, 91, 78, 54, 77, 23, 24007, 100, 20830, 0, 26, 15, 464, 51, 87, 30, 82, 1601, 5, 396, 9540, 363, 33, 40, 1196, 50]


In [6]:
print(len(xTrain))
print(len(xTrain[0]))

25000
218


In [17]:

#for review in xTrain:
def removeStopWordsSpace(minValue, maxValue):
    removedWords = 0
    counter = minValue

    while counter <= maxValue:
        #print("StartLen:",len(xTrain[counter]))
        for word in xTrain[counter]:
            if word in stopWordNumbers:
                xTrain[counter].remove(word)
                removedWords+=1
        #print("EndLine:", len(xTrain[counter]))
        counter+=1
    return removedWords

def remvoeStopWords():
    print("Remove Words")
    startSpace = 0
    endSpace = 999
    while endSpace <= 25000:
        removedWords = removeStopWordsSpace(startSpace, endSpace)
        startSpace+=1000
        endSpace+=1000
        print("Step start:", startSpace, ", End:", endSpace, ", RemovedWords: ", removedWords)

remvoeStopWords()


#print("xTrain: len: ", len(xTrain[123], ", words:", xTrain[123])
        

Step start: 1000 , End: 2000
Step start: 2000 , End: 3000
Step start: 3000 , End: 4000
Step start: 4000 , End: 5000
Step start: 5000 , End: 6000
Step start: 6000 , End: 7000
Step start: 7000 , End: 8000
Step start: 8000 , End: 9000
Step start: 9000 , End: 10000
Step start: 10000 , End: 11000
Step start: 11000 , End: 12000
Step start: 12000 , End: 13000
Step start: 13000 , End: 14000
Step start: 14000 , End: 15000
Step start: 15000 , End: 16000
Step start: 16000 , End: 17000
Step start: 17000 , End: 18000
Step start: 18000 , End: 19000
Step start: 19000 , End: 20000
Step start: 20000 , End: 21000
Step start: 21000 , End: 22000
Step start: 22000 , End: 23000
Step start: 23000 , End: 24000
Step start: 24000 , End: 25000


IndexError: index 25000 is out of bounds for axis 0 with size 25000

In [None]:
# get properties of the dataset
print('Maximum train review length: {}'.format(len(max(xTrain, key=len))))
print('Maximum test review length: {}'.format(len(max(xTest, key=len))))
print('Minimum train review length: {}'.format(len(min(xTrain, key=len))))
print('Minimum test review length: {}'.format(len(min(xTest, key=len))))

In [None]:
# select maximum number of words as input lengt
# pad or truncated (this is done automatically) your data
maxWords = 1000
xTrain = sequence.pad_sequences(xTrain, maxlen=maxWords)
xTest = sequence.pad_sequences(xTest, maxlen=maxWords)

In [None]:
# setup the neural network architecture
# check out the respective tensorflow help page: https://www.tensorflow.org/guide/keras/rnn
model=Sequential()

# define size of embedding, see https://www.tensorflow.org/versions/r2.9/api_docs/python/tf/keras/layers/Embedding
# optional: use a different embedding like word2vec or other options available within tensorflow 
embeddingSize = 128
model.add(Embedding(vocabularySize, embeddingSize, input_length=maxWords))

# add recurrent layers: 
# e.g. a SimpleRNN (https://www.tensorflow.org/versions/r2.9/api_docs/python/tf/keras/layers/SimpleRNN) with
# LayerNormalization (https://www.tensorflow.org/versions/r2.9/api_docs/python/tf/keras/layers/LayerNormalization)
model.add(SimpleRNN(100))
model.add(LayerNormalization())

# add layer for output
model.add(Dense(1, activation='sigmoid'))

# print model and check number of parameters
print(model.summary())

In [None]:
# set parameters for network training
batchSize = 64
numEpochs = 5

# train your model
model.compile(loss='binary_crossentropy',  optimizer='adam', metrics=['accuracy'])
xValid, yValid = xTrain[:batchSize], yTrain[:batchSize]
xTrain2, yTrain2 = xTrain[batchSize:], yTrain[batchSize:]
hist = model.fit(xTrain2, yTrain2, validation_data=(xValid, yValid), batch_size=batchSize, epochs=numEpochs)

# check result
scores = model.evaluate(xTest, yTest, verbose=0)
print('Test accuracy:', scores[1])