In [1]:
import csv
import os
import pickle

import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

from get_model import get_model

In [2]:
# --- Vocabulary / embedding settings ---
vocab_size = 2000        # num_words for the sentence Tokenizer; also passed to get_model
embedding_dim = 32       # passed to get_model
oov_tok = "<OOV>"        # oov_token for the sentence Tokenizer

# --- Sequence shaping settings ---
max_length = 50          # maxlen for pad_sequences; also passed to get_model
padding_type = 'post'    # `padding` argument for pad_sequences
trunc_type = 'post'      # presumably meant for pad_sequences' `truncating` argument

In [5]:
def prepare_data(file_name):
    """Load a labelled text CSV and convert it into padded model inputs.

    The file is read as UTF-8 with ';' as delimiter; the first row is treated
    as a header and skipped. For every data row, columns 2 and 3 are joined
    with a single space to form the sentence, and column 1 (with all spaces
    removed) is the label. The first tenth of the rows becomes the test set,
    the remainder the training set.

    Uses the module-level settings ``vocab_size``, ``oov_tok``,
    ``max_length``, ``padding_type`` and ``trunc_type``.

    Args:
        file_name: Path of the CSV file to read.

    Returns:
        A 6-tuple ``(train_padded, test_padded, training_label_seq,
        test_label_seq, tokenizer, label_tokenizer)``:
        the padded index sequences for the training and test sentences, the
        arrays built from the per-label index lists for both splits, the
        sentence tokenizer (fitted on the training sentences only) and the
        label tokenizer (fitted on all labels).

    Raises:
        IndexError: If a non-empty data row has fewer than four columns.
    """
    # Read sentences and labels from the file.
    sentences = []
    labels = []
    with open(file_name, 'r', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile, delimiter=';')
        next(reader, None)  # skip the header row; tolerate an empty file
        for row in reader:
            if not row:
                # csv.reader yields an empty list for blank lines
                # (e.g. a trailing newline at the end of the file).
                continue
            sentences.append(row[2] + " " + row[3])
            labels.append(row[1].replace(' ', ''))

    # Split data into test (first tenth) and train (rest) sets.
    split = len(sentences) // 10
    test_sentences = sentences[:split]
    train_sentences = sentences[split:]
    test_labels = labels[:split]
    train_labels = labels[split:]

    # Create a tokenizer for the most common words, fitted on training data only.
    tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
    tokenizer.fit_on_texts(train_sentences)

    # Translate sentences into index sequences and fill up with zeros,
    # because the model expects a fixed input length.
    # `truncating` is passed explicitly so the configured trunc_type is
    # honoured; without it pad_sequences falls back to its own default side.
    train_sequences = tokenizer.texts_to_sequences(train_sentences)
    train_padded = pad_sequences(train_sequences, padding=padding_type,
                                 truncating=trunc_type, maxlen=max_length)
    test_sequences = tokenizer.texts_to_sequences(test_sentences)
    test_padded = pad_sequences(test_sequences, padding=padding_type,
                                truncating=trunc_type, maxlen=max_length)

    # Prepare labels: the label tokenizer is fitted on every label (train and
    # test) so both splits share one label-to-index mapping.
    label_tokenizer = Tokenizer()
    label_tokenizer.fit_on_texts(labels)
    training_label_seq = np.array(label_tokenizer.texts_to_sequences(train_labels))
    test_label_seq = np.array(label_tokenizer.texts_to_sequences(test_labels))

    return (train_padded, test_padded,
            training_label_seq, test_label_seq,
            tokenizer, label_tokenizer)

In [8]:
# Build the padded inputs, the label arrays and both tokenizers from the CSV.
(
    train_padded,
    test_padded,
    training_label_seq,
    test_label_seq,
    tokenizer,
    label_tokenizer,
) = prepare_data("testset_C.csv")


In [9]:
# Build the model through the project's get_model helper, using the
# configured vocabulary size, embedding dimension and sequence length.
model = get_model(vocab_size, embedding_dim, max_length)

In [10]:
num_epochs = 30

# Train on the training split; the held-out test split is passed as
# validation data so its loss/accuracy are reported after every epoch.
history = model.fit(
    train_padded,
    training_label_seq,
    epochs=num_epochs,
    validation_data=(test_padded, test_label_seq),
    verbose=2,
)

Epoch 1/30
225/225 - 1s - loss: 1.3121 - accuracy: 0.6026 - val_loss: 0.8865 - val_accuracy: 0.9388
Epoch 2/30
225/225 - 0s - loss: 0.4020 - accuracy: 0.9772 - val_loss: 0.1549 - val_accuracy: 0.9825
Epoch 3/30
225/225 - 0s - loss: 0.0962 - accuracy: 0.9904 - val_loss: 0.0692 - val_accuracy: 0.9887
Epoch 4/30
225/225 - 0s - loss: 0.0476 - accuracy: 0.9950 - val_loss: 0.0452 - val_accuracy: 0.9912
Epoch 5/30
225/225 - 0s - loss: 0.0305 - accuracy: 0.9957 - val_loss: 0.0350 - val_accuracy: 0.9912
Epoch 6/30
225/225 - 0s - loss: 0.0221 - accuracy: 0.9969 - val_loss: 0.0300 - val_accuracy: 0.9925
Epoch 7/30
225/225 - 0s - loss: 0.0170 - accuracy: 0.9969 - val_loss: 0.0260 - val_accuracy: 0.9925
Epoch 8/30
225/225 - 0s - loss: 0.0137 - accuracy: 0.9976 - val_loss: 0.0229 - val_accuracy: 0.9937
Epoch 9/30
225/225 - 0s - loss: 0.0112 - accuracy: 0.9978 - val_loss: 0.0203 - val_accuracy: 0.9950
Epoch 10/30
225/225 - 0s - loss: 0.0094 - accuracy: 0.9979 - val_loss: 0.0199 - val_accuracy: 0.9975

In [None]:
# Persist the trained weights and both tokenizers so they can be reloaded
# later. All artifacts go into a single output directory.
output_dir = "dl_data"
os.makedirs(output_dir, exist_ok=True)

model.save_weights(f"{output_dir}/gfk_model.h5")

# The sentence tokenizer and the label tokenizer are pickled together as a
# two-element list, in that order.
# NOTE: pickle files must only ever be loaded from trusted sources.
PIK = f"{output_dir}/tokenizers.dat"
data = [tokenizer, label_tokenizer]
with open(PIK, "wb") as f:
    pickle.dump(data, f)

In [None]:
# Show the label tokenizer's word_index mapping, i.e. which integer index
# each label string was assigned when the tokenizer was fitted.
print(label_tokenizer.word_index)

In [None]:
# Run the model on the test inputs and report every sample whose predicted
# class differs from its true label.
results = model.predict(test_padded)

# The predicted class of a sample is the index of the highest score in its
# output row.
result_labels = [np.argmax(res) for res in results]

for predicted, expected_seq in zip(result_labels, test_label_seq):
    expected = expected_seq[0]  # each label sequence holds a single index
    if predicted != expected:
        print("wrong prediction:", predicted, expected)