In [2]:
import csv
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from unigram_model import unigram_model
import numpy as np

In [2]:
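# Optional vocabulary cap and out-of-vocabulary token for the word Tokenizer.
# Currently disabled: the Tokenizer is built without them, so the full vocabulary is used.
#vocab_size = 1000
#oov_tok = "<OOV>"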

In [3]:
def prepare_data(file_name):
    """Load a labelled text corpus from a CSV file and tokenize it.

    The file is read as UTF-8 with ``;`` as the field delimiter. The first
    row is treated as a header and discarded. For every remaining non-blank
    row, columns 2, 3 and 4 are joined with single spaces to form the
    sentence, and column 1 (with all spaces removed) is used as the label.

    The first ``int(n / 10)`` rows form the test set and the rest form the
    training set. Rows are not shuffled, so the split follows file order.

    Args:
        file_name: Path of the CSV file to read.

    Returns:
        A 6-tuple ``(train_sequences, test_sequences, training_label_seq,
        test_label_seq, tokenizer, label_tokenizer)`` where

        * ``train_sequences`` / ``test_sequences`` are the outputs of
          ``tokenizer.texts_to_sequences`` for the respective split,
        * ``training_label_seq`` / ``test_label_seq`` are NumPy arrays built
          from ``label_tokenizer.texts_to_sequences`` for the respective split,
        * ``tokenizer`` is fitted on the training sentences only,
        * ``label_tokenizer`` is fitted on the labels of both splits.

    Raises:
        IndexError: If a non-blank data row has fewer than five columns.
    """
    sentences = []
    labels = []

    # newline='' is what the csv module expects so that it can do its own
    # newline handling (e.g. for line breaks embedded in quoted fields).
    with open(file_name, 'r', encoding='utf-8', newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=';')
        # Skip the header row; the default keeps an empty file from raising
        # StopIteration.
        next(reader, None)
        for row in reader:
            # csv.reader yields an empty list for blank lines (typically a
            # stray trailing newline); they carry no data, so skip them.
            if not row:
                continue
            sentences.append(row[2]+" "+row[3]+" "+row[4])
            labels.append(row[1].replace(' ',''))

    # split data into train and test sets: first tenth is held out for testing

    split = int(len(sentences)/10)
    test_sentences=sentences[:split]
    train_sentences=sentences[split:]
    test_labels=labels[:split]
    train_labels=labels[split:]

    # create tokenizer; the vocabulary is learned from the training split only

    #tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(train_sentences)

    train_sequences = tokenizer.texts_to_sequences(train_sentences)
    test_sequences = tokenizer.texts_to_sequences(test_sentences)

    # prepare label data; fitted on all labels so that categories occurring
    # only in the test split still receive an index

    label_tokenizer = Tokenizer()
    label_tokenizer.fit_on_texts(labels)

    training_label_seq = np.array(label_tokenizer.texts_to_sequences(train_labels))
    test_label_seq = np.array(label_tokenizer.texts_to_sequences(test_labels))

    return (train_sequences, test_sequences,
            training_label_seq, test_label_seq,
            tokenizer, label_tokenizer)

In [4]:
# Load the corpus once and unpack the tokenized train/test splits together
# with the two fitted tokenizers returned by prepare_data.
(
    train_sequences,
    test_sequences,
    training_label_seq,
    test_label_seq,
    tokenizer,
    label_tokenizer,
) = prepare_data("testset_C.csv")

In [5]:
# Build one unigram language model per category id known to the label
# tokenizer, keyed by that id.
# NOTE(review): each model is given len(tokenizer.word_index) as its size while
# Keras word ids start at 1 -- confirm that unigram_model accounts for this.
unigram_models = {
    category_id: unigram_model(category_id, len(tokenizer.word_index))
    for category_id in label_tokenizer.word_index.values()
}

# Feed every training word into the model of the sentence's category.
for sequence, label_row in zip(train_sequences, training_label_seq):
    # the category id is the first (index 0) entry of the label row
    category_id = label_row[0]
    for word_id in sequence:
        unigram_models[category_id].add_word_entity(word_id)

# Turn the accumulated counts into probabilities.
for category_model in unigram_models.values():
    category_model.normalize_probs()

In [7]:
# analyze accuracy
#
# Every test sequence is scored by each category's unigram model; the category
# whose model returns the highest score is the prediction. Misclassified
# samples are printed as: sample index, ground-truth label, predicted label.

right=0.0
wrong=0.0
for i in range(len(test_sequences)):
    label_gt = test_label_seq[i][0]

    # Reset the prediction for every sample and start the running maximum at
    # -inf. Starting at 0 with a strict '>' meant that a sample for which no
    # model scored above 0 silently reused the previous sample's prediction
    # (or raised NameError on the very first sample).
    label_predict = None
    p_max = float("-inf")
    for key in unigram_models.keys():
        p = unigram_models[key].get_sequence_probability(test_sequences[i])
        # strict '>' keeps the first model on ties
        if p > p_max:
            p_max = p
            label_predict = key

    if label_gt != label_predict:
        wrong+=1
        print(i,label_gt, label_predict)
    else:
        right+=1

# Guard against an empty test set (fewer than 10 rows in the corpus), which
# would otherwise end the cell with a ZeroDivisionError.
total = right + wrong
if total:
    print("Accuracy: ", right/total)
else:
    print("Accuracy: n/a (empty test set)")

13 1 3
83 2 3
Accuracy:  0.9975


In [9]:
import os
import pickle

# Persist the trained models together with both tokenizers so that they can be
# restored later without retraining.
# NOTE: unpickling executes code -- only load this file from a trusted source.
PIK = "unigram_data/unigrams.dat"
data = [unigram_models, tokenizer, label_tokenizer]

# Create the target directory first; on a fresh checkout it may not exist and
# open(..., "wb") would fail with FileNotFoundError.
target_dir = os.path.dirname(PIK)
if target_dir:
    os.makedirs(target_dir, exist_ok=True)

with open(PIK, "wb") as f:
    pickle.dump(data, f)