In [2]:
#Ignore warnings/info messages
import warnings
import logging
warnings.filterwarnings('ignore')
logging.getLogger("tensorflow").setLevel(logging.WARNING)

In [3]:
#Import libraries
import os
import pandas as pd
import numpy as np
import spacy
from keras.preprocessing.text import Tokenizer
from keras.models import *
from keras.layers import *
from keras.callbacks import *
from text_preprocessing import CleanText
from utils import *

2021-09-22 12:30:44.011479: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-09-22 12:30:44.011508: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


#### News pieces are grouped within subfolders in ./data directory. Names of subfolder mean category to which news piece is related to. 
#### Loading data is done using load_data() function from utils module.

In [4]:
df_data = load_data('./data')

#### Punctuations, stop words, digits, whitespaces, empty lines from each news piece were removed. Also words were converted to their base form using lemmatization technique. These actions were done using CleanText class and its methods from text_preprocessing module.

In [5]:
df_data['Text_cleaned'] = apply_many_functions(df_data['Text'], lambda text: text.lower(),
              lambda text: CleanText(text).remove_punctuations(),
              lambda text: CleanText(text).remove_stop_words(), 
              lambda text: CleanText(text).remove_digits(), 
              lambda text: CleanText(text).remove_whitespaces(), 
              lambda text: CleanText(text).remove_empty_lines(),
              lambda text: CleanText(text).lemmatize_words())

#### In classification I will use pretrained word embeddings. So, I downloaded spaCy library and  its trained word embeddings. I decided to use spaCy because the embeddings load faster than for example the GloVe ones. For this case we can also use word2vec vectors using gensim library.

In [6]:
#If you don't have spacy model already downloaded, pease uncomment the line below.
#! python -m spacy download en_core_web_sm 
spacy_nlp_object = spacy.load('en_core_web_md')

#### Using count_word_freq() function from utils module I counted how many times each word appeared in the text corpus. Text corpus is all news pieces.

#### During tokenization words that appeared in text corpus more than 1 time were took into account.  Tokenizer vectorized a text corpus into a list of integers. and after that text sequences were padded by zeros.

In [7]:
word_freq_dict = count_word_freq(list(df_data.Text_cleaned))
vocab_size = len([(word, count) for (word, count) in word_freq_dict.items() if count > 1])
tokenizer_object = Tokenizer(num_words=vocab_size + 1)
text_pad_sequences = text_to_keras_sequence(tokenizer_object, list(df_data.Text_cleaned))

####  I will use trained embeddings in further model, so I need to create embedding matrix for it. Each encoded word  from tokenizer object will be a row index in the embedding matrix and the vector  for that word will be the the vector from the spaCy model. 

In [8]:
embedding_matrix = create_spacy_embedding_matrix(tokenizer_object, spacy_nlp_object)

#### Labels were converted to categorical data matrix

In [9]:
categorical_labels = transform_dependent_variable(np.array(df_data['Label']))

#### Identifying simple neural network architecture also for this case we can try CNN model

In [11]:
model=Sequential()
model.add(Embedding(tokenizer_object.num_words, embedding_matrix.shape[1], weights=[embedding_matrix],
                  trainable=False)) 
model.add(LSTM(64,return_sequences=True,dropout=0.1))
model.add(GlobalMaxPooling1D())
model.add(Dense(16,activation='relu')) 
model.add(Dense(3,activation='softmax')) 
model.compile(optimizer='adam', loss='categorical_crossentropy',metrics=["acc"]) 

#### Uczenie sieci neuronowej without splitting to train/test/val dataset, because we have not enouhg source data

In [12]:
history=model.fit(text_pad_sequences,categorical_labels,
                  epochs=10,
                  verbose=2)

2021-09-22 12:36:02.162603: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/10
1/1 - 2s - loss: 1.2593 - acc: 0.3529
Epoch 2/10
1/1 - 0s - loss: 1.1466 - acc: 0.3529
Epoch 3/10
1/1 - 0s - loss: 1.0987 - acc: 0.3529
Epoch 4/10
1/1 - 0s - loss: 1.0681 - acc: 0.3529
Epoch 5/10
1/1 - 0s - loss: 1.0439 - acc: 0.4706
Epoch 6/10
1/1 - 0s - loss: 1.0193 - acc: 0.5294
Epoch 7/10
1/1 - 0s - loss: 0.9962 - acc: 0.7059
Epoch 8/10
1/1 - 0s - loss: 0.9620 - acc: 0.7059
Epoch 9/10
1/1 - 0s - loss: 0.9390 - acc: 0.7059
Epoch 10/10
1/1 - 0s - loss: 0.9075 - acc: 0.7059


#### Loss and accuracy for the model

In [13]:
loss, accuracy = model.evaluate(text_pad_sequences,categorical_labels, verbose=False)

#### Saving model to the file

In [14]:
model.save('keras_model.h5')