Hand-In of Group 13, Jonathan Ehrengruber (jonathan.ehrengruber@students.fhnw.ch), Christian Renold (christian.renold@hslu.ch)

In [1]:
# setup_gpus() function for configuration the gpu on icolab jupyter hub implemented in separate file for reusability
import gpu_init

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
1 Physical GPUs, 1 Logical GPU
GPU Setup done


In [2]:
import numpy as np

In [3]:
def read_vocabulary_from_file(filename):
    with open(filename , 'r', encoding="ISO-8859-1") as f:
        content = f.readlines() # content is a list of lines
        content = [x.strip() for x in content] # removing newline chars
        content = [x for x in content if not x.startswith(';') and not x.endswith('+') and not x == ''] # remove comments, endswith-+ and empty strings
    return content

In [4]:
negative_words = read_vocabulary_from_file('negative-words.txt')
positive_words = read_vocabulary_from_file('positive-words.txt')
print('negative words: {0}'.format(len(negative_words)))
print('positive words: {0}'.format(len(positive_words)))

negative words: 4783
positive words: 2005


In [5]:
# to get GloVe vectors: wget http://nlp.stanford.edu/data/glove.6B.zip
def load_glove_embeddings(path):
    embeddings = {}
    with open(path , 'r', encoding='utf-8') as f:
        for line in f:
            values = line.strip().split()
            w = values[0]
            vectors = np.asarray(values[1:], dtype='float32')
            embeddings[w] = vectors
    return embeddings

In [6]:
# word embeddings - load GloVe dictionary
word_dict = load_glove_embeddings('glove.6B.50d.txt')

In [7]:
train_size = 1500
# had to adjust test size from 500 to 390, because we have 112 unknown words in the positive list, 
# which leads to a total of 1893 valid positive words
test_size = 390 

def get_word_vectors_for_dataset(dataset):
    data_vector = []
    data_words = []
    for x in dataset:
        try:
            word_vector = word_dict[x]
            data_vector.append(word_vector)
            data_words.append(x)
        except KeyError as e:
            # skip unknown words
            pass #print('KeyError {0}'.format(x))
    old_size = len(dataset)
    new_size = len(data_vector)
    ignored = len(dataset) - new_size
    print('found {0} word vectors out of {1}, ignored {2} unknown words'.format(new_size, old_size, ignored))
    return data_vector, data_words

def prepare_dataset(dataset, is_negative=0):
    
    np.random.shuffle(dataset) # randomize dataset, so we don't have (example) A-M in train and N-Z in test
    vectors, words = get_word_vectors_for_dataset(dataset) # get word vectors
    
    # split into train and test
    train_vectors, test_vectors, remainder = np.split(vectors, [train_size, train_size+test_size])
    # we don't really need the words anymore for the classification, but if we want to analyse results, we have to preserve the mapping to the words
    train_words, test_words, remainder = np.split(words, [train_size, train_size+test_size])
    
    train_words= train_words.reshape(train_size, 1) # reshape for concate
    test_words = test_words.reshape(test_size, 1)
    
    train_x = np.concatenate((train_words, train_vectors), axis=1)
    test_x = np.concatenate((test_words, test_vectors), axis=1)
    
    train_y = np.zeros((train_size, 2))
    test_y = np.zeros((test_size, 2))
    train_y[:,is_negative] = 1
    test_y[:,is_negative] = 1
    
    train_x = np.concatenate((train_x, train_y), axis=1) # concat y-values, so we can shuffle [positive+negative] and preserve y mappings
    test_x = np.concatenate((test_x, test_y), axis=1)
    
    return train_x, test_x

In [8]:
train_data_a, test_data_a = prepare_dataset(positive_words, 0)
train_data_b, test_data_b = prepare_dataset(negative_words, 1)
train_data = np.concatenate((train_data_a, train_data_b))
test_data = np.concatenate((test_data_a, test_data_b))

np.random.shuffle(train_data) # shuffle, to have positive and negative words mixed
np.random.shuffle(test_data)

# extract subarrays from main array
words_train = train_data[:,0]
X_train = train_data[:,1:51]
Y_train = train_data[:,51:]
print('\nTrain words: ', words_train.shape)
print('X Train: ', X_train.shape)
print('Y Train: ', Y_train.shape)

words_test = test_data[:,0]
X_test = test_data[:,1:51]
Y_test = test_data[:,51:]
print('\nTest words: ', words_test.shape)
print('X Test: ', X_test.shape)
print('Y Test: ', Y_test.shape)

found 1893 word vectors out of 2005, ignored 112 unknown words
found 4345 word vectors out of 4783, ignored 438 unknown words

Train words:  (3000,)
X Train:  (3000, 50)
Y Train:  (3000, 2)

Test words:  (780,)
X Test:  (780, 50)
Y Test:  (780, 2)


Train and evaluate a classifier. Build a model, e.g. a double Dense layers in Keras
(MLP) and train it. Report on the evolution of the loss and accuracy along the epochs.
You should reach about 90% accuracy on the training set and 85% accuracy on the test
set. Report on your model structure and fitting strategy.

In [9]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding
from tensorflow.keras.initializers import Constant

In [25]:
model = Sequential([
    # Embedding(3000, 50, input_length=50, trainable=False),
    Dense(units=100, input_shape=(50,), activation='relu'),
    Dense(units=2, activation='softmax')
])

model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_13 (Dense)             (None, 100)               5100      
_________________________________________________________________
dense_14 (Dense)             (None, 2)                 202       
Total params: 5,302
Trainable params: 5,302
Non-trainable params: 0
_________________________________________________________________


In [27]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
log = model.fit(X_train, Y_train, validation_data=(X_test, Y_test), verbose=1) # , batch_size=128

Train on 3000 samples, validate on 780 samples


## Notes

I prepared the data so that `X_train` is a `(3000, 50)` array with each row representing a word with the vector from the GloVe word embeddings.

The Kernel seems to get stuck on the above code block on `model.fit()` and I can't figure out why. 
Unfortunately I couldn't find any hints on the lecture notes or on the internet on how to build such a model to get a word classifier 
and because of that I wasn't able to debug the error above. Therefor the exercise could not be completed properly.



