In [9]:
from sklearn.preprocessing import OneHotEncoder
from numpy import array
import sys
import numpy
import scipy

import json
import re
import nltk
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords, wordnet
from sklearn.feature_extraction.text import TfidfVectorizer 
import pandas as pd
import csv
import os
import pickle
Stopwords = set(stopwords.words('english'))
wordlemmatizer = WordNetLemmatizer()
# stemmer = PorterStemmer()
stemmer = nltk.stem.SnowballStemmer('english')


In [10]:
# FOR_VECTORIZING
# Preprocessing functions

def remove_special_characters(text):
    regex = r'[^a-zA-Z0-9\-\s]'
    text = re.sub(regex, '', text)
    return text

def noun_verb_extraction(text):
    pos_tag = nltk.pos_tag(text.split())
    pos_tagged_noun_verb = []
    for word, tag in pos_tag:
        if tag == "NN" or tag == "NNP" or tag == "NNS" or tag == "VB" or tag == "VBD" or tag == "VBG" or tag == "VBN" or tag == "VBP" or tag == "VBZ":
            pos_tagged_noun_verb.append(word)
    return pos_tagged_noun_verb

def stem_words(words):
    stemmed_words = []
    for word in words:
        stemmed_words.append(stemmer.stem(word))
    return stemmed_words

def lemmatize_words(words):
    lemmatized_words = []
    for word in words:
        tag = nltk.pos_tag([word])[0][1]
        lemmatized_words.append(wordlemmatizer.lemmatize(word, get_wordnet_pos(tag)))
    return lemmatized_words

def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
    
def preprocess_sentence(text):
    text = remove_special_characters(str(text))
    text = re.sub(r'\d+', '', text)
    text = noun_verb_extraction(text)
    text = [word.lower() for word in text if len(word) > 1 and word not in Stopwords]
    text = stem_words(text)
#     text = lemmatize_words(text)
    return text


In [11]:
# Helper functions

def save_obj(obj, name ):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name ):
    try:
        with open(name + '.pkl', 'rb') as f:
            return pickle.load(f)    
    except FileNotFoundError as e:
        return False;

In [12]:
# FOR_VECTORIZING
# VECTORIZE DATASET & ONE HOT ENCODE CONCEPTS

concepts = []      

## ONE HOT ENCODE FILTERED CONCEPTS
dict = load_obj('helper_objects/dict_concept_filtered')
concepts = array(list(dict.keys()))
print(len(concepts))
concepts = array(concepts)
onehot_encoder = OneHotEncoder()
onehot_encoder.fit(concepts.reshape(len(concepts), 1))


documents_list = []
documents_concepts = []      
with open('../data/dataset/dataset_filtered.csv', 'r', newline='', encoding='utf-8') as file:
    reader = csv.reader(file, delimiter=',')
    header = next(reader)
    print(header)
    counter = 0
    for row in reader:
        combined = "\n".join([row[0], row[1], row[2]]) #, row[0], row[1], row[2], row[3], row[4]
        preprocessed = preprocess_sentence(combined)
        documents_list.append(' '.join(preprocessed))
        c = array(eval(row[5]))
        # Combine vectorized concepts to one array.
        concept_vector_to_process = array([])
        temp = onehot_encoder.transform(c.reshape(len(c), 1)).toarray()
        for concept_vector in temp:
            concept_vector_to_process = concept_vector.astype(int) | concept_vector_to_process.astype(int) if len(concept_vector_to_process) > 0 else concept_vector
        documents_concepts.append(concept_vector_to_process)
        
        if counter % 500 == 0:
            print("STIGAO DO " + str(counter))
        counter = counter + 1
        
         
print("TFIDF poceo")
vectorizer = TfidfVectorizer()
tfidf_vectorizer=TfidfVectorizer() 
tfidf_vectorizer_vectors=tfidf_vectorizer.fit_transform(documents_list)
print("TFIDF gotov")


scipy.sparse.save_npz("vectors/vectors_stemmed_dash_tf_idf_title_header_recitals.npz", tfidf_vectorizer_vectors)
scipy.sparse.save_npz("vectors/concepts_stemmed_dash_tf_idf_title_header_recitals.npz", scipy.sparse.csr_matrix(documents_concepts))

1289
['title', 'header', 'recitals', 'main_body', 'attachments', 'concepts']
STIGAO DO 0
STIGAO DO 500
STIGAO DO 1000
STIGAO DO 1500
STIGAO DO 2000
STIGAO DO 2500
STIGAO DO 3000
STIGAO DO 3500
STIGAO DO 4000
STIGAO DO 4500
STIGAO DO 5000
STIGAO DO 5500
STIGAO DO 6000
STIGAO DO 6500
STIGAO DO 7000
STIGAO DO 7500
STIGAO DO 8000
STIGAO DO 8500
STIGAO DO 9000
STIGAO DO 9500
STIGAO DO 10000
STIGAO DO 10500
STIGAO DO 11000
STIGAO DO 11500
STIGAO DO 12000
STIGAO DO 12500
STIGAO DO 13000
STIGAO DO 13500
STIGAO DO 14000
STIGAO DO 14500
STIGAO DO 15000
STIGAO DO 15500
STIGAO DO 16000
STIGAO DO 16500
STIGAO DO 17000
STIGAO DO 17500
STIGAO DO 18000
STIGAO DO 18500
STIGAO DO 19000
STIGAO DO 19500
STIGAO DO 20000
STIGAO DO 20500
STIGAO DO 21000
STIGAO DO 21500
STIGAO DO 22000
STIGAO DO 22500
STIGAO DO 23000
STIGAO DO 23500
STIGAO DO 24000
TFIDF poceo
TFIDF gotov


In [None]:
# Test if keras using gpu (it does automatically if device_type: "GPU" is listed as one of devices) - CUDA drivers are needed, tensorflow_gpu also
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

In [1]:
# Load data and split

from sklearn.model_selection import train_test_split
import scipy

X = scipy.sparse.load_npz("vectors/vectors_lemmatized_tf_idf_title_header_recitals.npz")
y = scipy.sparse.load_npz("vectors/concepts_lemmatized_tf_idf_title_header_recitals.npz")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape)

(19361, 26022)


In [2]:
# NEURAL NETWORK RELATED STUFF
# Research other loss and metric functions

from tensorflow.keras import metrics
from tensorflow.keras import layers, metrics
import tensorflow as tf
import keras.backend.tensorflow_backend as tfb

batch_size = 64
epochs = 13

POS_WEIGHT = 1.7  # multiplier for positive targets, needs to be tuned

def weighted_binary_crossentropy(target, output):
    """
    Weighted binary crossentropy between an output tensor 
    and a target tensor. POS_WEIGHT is used as a multiplier 
    for the positive targets.

    Combination of the following functions:
    * keras.losses.binary_crossentropy
    * keras.backend.tensorflow_backend.binary_crossentropy
    * tf.nn.weighted_cross_entropy_with_logits
    """
    # transform back to logits
    _epsilon = tfb._to_tensor(tfb.epsilon(), output.dtype.base_dtype)
    output = tf.clip_by_value(output, _epsilon, 1 - _epsilon)
    output = tf.math.log(output / (1 - output))
    # compute weighted loss
    loss = tf.compat.v1.nn.weighted_cross_entropy_with_logits(labels=target,
                                                    logits=output,
                                                    pos_weight=POS_WEIGHT)
    return tf.reduce_mean(loss, axis=-1)


def define_model_architecture(input_dimension, output_dimension):
    model = tf.keras.Sequential()
    model.add(layers.Dense(units=2048, input_shape=(input_dimension,)))
    model.add(layers.Activation('selu'))
    model.add(layers.Dense(output_dimension, activation='sigmoid'))
    model.compile(loss=weighted_binary_crossentropy, optimizer="adam",
                  metrics=[metrics.top_k_categorical_accuracy])
    return model

Using TensorFlow backend.


In [3]:
# Run model and save it

model = define_model_architecture(X_train.shape[1], y_train.shape[1])
print("DEFINED")
history = model.fit(X_train.todense(), y_train.todense(), validation_split=0.2, epochs=epochs)
print("TRAINED")

DEFINED
Train on 15488 samples, validate on 3873 samples
Epoch 1/13
Epoch 2/13
Epoch 3/13
Epoch 4/13
Epoch 5/13
Epoch 6/13
Epoch 7/13
Epoch 8/13
Epoch 9/13
Epoch 10/13
Epoch 11/13
Epoch 12/13
Epoch 13/13
TRAINED


In [4]:
import numpy
from sklearn.metrics import classification_report
y_pred = model.predict(X_test.todense(), batch_size=64, verbose=1)
y_pred = (y_pred > 0.5)
print("Testing documents: " + str(X_test.shape[0]))
print("Concepts in test data: " + str(len(numpy.unique(numpy.ravel(numpy.argmax((y_test.todense() > 0.5) , axis=1))))))
y_pred_bool = numpy.argmax(y_pred, axis=1)
print("Concepts predicted: " + str(len(numpy.unique(y_pred_bool))))

print(classification_report(y_test, y_pred, zero_division=0)) # , labels=numpy.unique(y_pred_bool)

Testing documents: 4841
Concepts in test data: 634
Concepts predicted: 594
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       0.47      0.51      0.49        68
           2       0.00      0.00      0.00         6
           3       0.00      0.00      0.00         1
           4       1.00      1.00      1.00         2
           5       0.00      0.00      0.00         4
           6       0.80      0.59      0.68        41
           7       0.00      0.00      0.00         7
           8       0.67      0.17      0.27        24
           9       1.00      0.33      0.50         6
          10       1.00      0.75      0.86         4
          11       0.87      1.00      0.93        26
          12       0.88      0.64      0.74        11
          13       1.00      0.33      0.50         3
          14       0.95      0.83      0.89        24
          15       0.85      0.63      0.72        27
      

In [None]:
# Model saving

model.save("nn_model.h5")

In [4]:
# Model loading

from tensorflow import keras
model = keras.models.load_model("nn_model.h5", compile=False)
model.compile(loss=weighted_binary_crossentropy, optimizer="adam",
                  metrics=[metrics.top_k_categorical_accuracy])