In [None]:
# Mount Google Drive at /content/gdrive (this import is only available on Colab).
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# change the path to where your notebook is located
%cd "gdrive/MyDrive/Introduction to Natural Language Processing"

/content/gdrive/.shortcut-targets-by-id/1syPiNVQUgh60gmvbP12NyL0c-k21dKeU/Introduction to Natural Language Processing


## Collection of texts

In [None]:
# Download the Reuters news collection (articles with categories assigned)
import nltk
nltk.download('reuters')
# Corpus reader used below to check what is inside the collection.
# You will see that each article might have more than one category.
from nltk.corpus import reuters

def collection_stats():
    """Print summary statistics and one sample article from the `reuters` corpus reader.

    Reports the number of documents, the train/test split sizes (derived from the
    "train"/"test" prefix of each file id), the number of categories, the categories
    of the first two "acq" documents, and the tokenized and raw text of the first one.
    """
    # Every file id in the corpus
    all_ids = reuters.fileids()
    print("Number of documents: %d" % (len(all_ids)))

    train_ids = [doc_id for doc_id in all_ids if doc_id.startswith("train")]
    print("Size of a training set: %d" % (len(train_ids)))

    test_ids = [doc_id for doc_id in all_ids if doc_id.startswith("test")]
    print("Size of a test set: %d" % (len(test_ids)))

    # Category inventory
    all_categories = reuters.categories()
    print("Number of various categories: %d" % (len(all_categories)))

    # First two documents filed under the "acq" category, with all of their categories
    sample_ids = reuters.fileids("acq")[:2]
    print("Possible document categories:")
    print([(doc_id, reuters.categories(doc_id)) for doc_id in sample_ids])

    # Tokenized view of the first sample document
    first_id = sample_ids[0]
    tokens = reuters.words(first_id)
    print("Example of a tokenized text:")
    print(tokens)

    # Raw text of the same document
    print("Corresponding raw text:")
    print(reuters.raw(first_id))
# Print the corpus overview defined above
collection_stats()

Number of documents: 10788
Size of a training set: 7769
Size of a test set: 3019
Number of various categories: 90
Possible document categories:
[('test/14843', ['acq']), ('test/14852', ['acq', 'copper'])]
Example of a tokenized text:
['SUMITOMO', 'BANK', 'AIMS', 'AT', 'QUICK', 'RECOVERY', ...]
Corresponding raw text:
SUMITOMO BANK AIMS AT QUICK RECOVERY FROM MERGER
  Sumitomo Bank Ltd &lt;SUMI.T> is certain to
  lose its status as Japan's most profitable bank as a result of
  its merger with the Heiwa Sogo Bank, financial analysts said.
      Osaka-based Sumitomo, with desposits of around 23.9
  trillion yen, merged with Heiwa Sogo, a small, struggling bank
  with an estimated 1.29 billion dlrs in unrecoverable loans, in
  October.
      But despite the link-up, Sumitomo President Koh Komatsu
  told Reuters he is confident his bank can quickly regain its
  position.
      "We'll be back in position in first place within three
  years," Komatsu said in an interview.
      He said that w

[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data]   Package reuters is already up-to-date!


In [None]:
# Evaluate F-measure values for different classes

from sklearn.metrics import f1_score

def print_f1(model, x_test, y_test, label_names=None):
    """Print macro, micro and per-class F1 scores for a single-label classifier.

    Args:
        model: object whose ``predict(x_test)`` returns one row of class scores per sample.
        x_test: inputs passed unchanged to ``model.predict``.
        y_test: one row per sample; the position of the row maximum is the true class.
        label_names: optional sequence of class names, ordered by class index. When
            omitted, the names come from ``keras.datasets.reuters.get_label_names()``.

    Returns:
        The macro-averaged F1 score.
    """
    # Imported locally so the helper does not depend on which cells ran before it.
    import numpy as np

    if label_names is None:
        # The module-level name `reuters` refers to the NLTK corpus reader when this
        # helper is defined and is only rebound to the Keras dataset module by a later
        # cell, so the Keras module is resolved explicitly here.
        from keras.datasets import reuters as keras_reuters
        label_names = keras_reuters.get_label_names()
    label_names = list(label_names)

    # Here we take the top class, but in fact, you can use more than one most-probable predictions
    #   from probability distribution (e.g., when one example may correspond to several classes)
    y_pred = np.argmax(np.asarray(model.predict(x_test)), axis=1)
    y_true = np.argmax(np.asarray(y_test), axis=1)

    macro_f1 = f1_score(y_true, y_pred, average='macro')
    micro_f1 = f1_score(y_true, y_pred, average='micro')
    print("Macro F1-score: %.2f" % macro_f1)
    print("Micro F1-score: %.2f" % micro_f1)

    # Without an explicit `labels` list, f1_score(average=None) only returns scores for
    # classes that occur in y_true or y_pred, which would shift the pairing with names.
    class_ids = list(range(len(label_names)))
    per_class_f1 = f1_score(y_true, y_pred, average=None, labels=class_ids, zero_division=0)
    print(list(zip(label_names, per_class_f1)))
    return macro_f1

## First model (simple multi-layer perceptron)

In [None]:
from __future__ import print_function

import numpy as np
import keras
from keras.datasets import reuters
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.preprocessing.text import Tokenizer

# Hyper-parameters of the bag-of-words MLP baseline
max_words = 10000
batch_size = 64
epochs = 10

print('Loading data...')
(x_train, y_train), (x_test, y_test) = reuters.load_data(
    num_words=max_words, test_split=0.2)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

# Class ids are consecutive integers, so the largest training id + 1 is the class count
num_classes = np.max(y_train) + 1
print(num_classes, 'classes')

print('Vectorizing sequence data...')
# Each sequence becomes a binary vector of length max_words (word present / absent)
tokenizer = Tokenizer(num_words=max_words)
x_train_mlp, x_test_mlp = (
    tokenizer.sequences_to_matrix(sequences, mode='binary')
    for sequences in (x_train, x_test)
)
print('x_train shape:', x_train_mlp.shape)
print('x_test shape:', x_test_mlp.shape)

print('Convert class vector to binary class matrix '
      '(for use with categorical_crossentropy)')
y_train, y_test = (
    keras.utils.to_categorical(labels, num_classes)
    for labels in (y_train, y_test)
)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

print('Building model...')
# Two hidden ReLU layers with dropout, followed by a softmax over the classes
model = Sequential([
    Dense(512, input_shape=(max_words,)),
    Activation('relu'),
    Dropout(0.5),
    Dense(256),
    Activation('relu'),
    Dropout(0.5),
    Dense(num_classes),
    Activation('softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# 10% of the training data is held out for validation during fitting
history = model.fit(
    x_train_mlp,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    validation_split=0.1,
)
score = model.evaluate(x_test_mlp, y_test, batch_size=batch_size, verbose=1)
print('Test score:', score[0])
print('Test accuracy:', score[1])

f1 = print_f1(model, x_test_mlp, y_test)

Loading data...
8982 train sequences
2246 test sequences
46 classes
Vectorizing sequence data...
x_train shape: (8982, 10000)
x_test shape: (2246, 10000)
Convert class vector to binary class matrix (for use with categorical_crossentropy)
y_train shape: (8982, 46)
y_test shape: (2246, 46)
Building model...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test score: 1.0818123817443848
Test accuracy: 0.8107746839523315
Macro F1-score: 0.62
Micro F1-score: 0.81
[('cocoa', 0.761904761904762), ('grain', 0.74235807860262), ('veg-oil', 0.6666666666666665), ('earn', 0.9182926829268293), ('acq', 0.8556806550665302), ('wheat', 0.0), ('copper', 0.896551724137931), ('housing', 0.5), ('money-supply', 0.6987951807228916), ('coffee', 0.7916666666666667), ('sugar', 0.9180327868852459), ('trade', 0.7039106145251396), ('reserves', 0.3529411764705882), ('ship', 0.6486486486486487), ('cotton', 0.6666666666666666), ('carcass', 0.3076923076923076

## Best Model (Bidirectional LSTM)

In [None]:
from keras.datasets import reuters
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, GRU, SimpleRNN, Bidirectional
from sklearn.metrics import f1_score
import numpy as np
from pyfasttext import FastText
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Dropout, Bidirectional, TimeDistributed
from keras.layers import LSTM, GRU, SimpleRNN
from keras.models import Model

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Embedding
from keras.preprocessing.sequence import pad_sequences

# Constants
max_words = 30000  # vocabulary size passed to load_data and used for the embedding matrix
max_sequence_length = 100
batch_size = 32

embedding_dim_glove = 100
embedding_dim_fasttext = 300
using_pyfasttext = False

# Keras reserves ids 0 (padding), 1 (start) and 2 (out-of-vocabulary), so every word id
# in the loaded sequences is shifted by this offset relative to reuters.get_word_index().
index_from = 3

# Load the dataset and preprocess
print('Loading data...')
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=max_words, test_split=0.2, index_from=index_from)
# Apply the same shift to the vocabulary so that word_index[word] is the id that actually
# appears in x_train / x_test; without it pretrained vectors are written to the wrong
# rows of the embedding matrix.
word_index = {word: rank + index_from for word, rank in reuters.get_word_index().items()}

# Ensure that classes are in a one-hot encoded format
num_classes = max(max(y_train), max(y_test)) + 1
y_train = to_categorical(y_train, num_classes)
y_test = to_categorical(y_test, num_classes)

# Default padded inputs; the training loop re-pads with each configuration's own settings
x_train_seq = pad_sequences(x_train, maxlen=max_sequence_length, padding='post', truncating='post')
x_test_seq = pad_sequences(x_test, maxlen=max_sequence_length, padding='post', truncating='post')

In [None]:
def print_f1_scores(model, x_test, y_test):
    """Print macro and micro F1 for a classifier and return the macro score.

    The predicted class is the argmax of each row returned by ``model.predict(x_test)``;
    the true class is the argmax of each row of ``y_test``.
    """
    predicted = np.argmax(model.predict(x_test), axis=1)
    expected = np.argmax(y_test, axis=1)
    macro_f1, micro_f1 = (
        f1_score(expected, predicted, average=averaging)
        for averaging in ('macro', 'micro')
    )
    print(f"Macro F1-score: {macro_f1:.2f}")
    print(f"Micro F1-score: {micro_f1:.2f}")
    return macro_f1

def load_glove_embeddings(filepath):
    """Read a whitespace-separated text embedding file into a dictionary.

    Each non-blank line is expected to hold a word followed by its vector components.

    Args:
        filepath: path to a UTF-8 text file of embeddings.

    Returns:
        A tuple ``(embeddings_index, embedding_dim)`` where ``embeddings_index`` maps each
        word to a float32 numpy vector and ``embedding_dim`` is the length of the last
        vector read (0 when the file contains no vectors).
    """
    embeddings_index = {}
    embedding_dim = 0  # stays 0 for an empty file instead of failing on an unbound name
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank (e.g. trailing) lines carry no word and would break values[0]
                continue
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[values[0]] = coefs
            embedding_dim = len(coefs)
    return embeddings_index, embedding_dim

def load_fasttext_embeddings(filepath):
    """Create a ``FastText`` object, load the model stored at ``filepath`` into it and return it."""
    loaded_model = FastText()
    loaded_model.load_model(filepath)
    return loaded_model

def create_embedding_matrix_glove(word_index, embeddings_index, embedding_dim, max_words):
    """Build a ``(max_words, embedding_dim)`` matrix of pretrained vectors.

    Row ``i`` receives the vector of the word whose index in ``word_index`` is ``i``.
    Words with an index of ``max_words`` or more are skipped, and rows for words that
    are missing from ``embeddings_index`` stay zero.
    """
    matrix = np.zeros((max_words, embedding_dim))
    for token, row in word_index.items():
        if row >= max_words:
            continue
        vector = embeddings_index.get(token)
        if vector is None:
            continue
        matrix[row] = vector
    return matrix

def create_embedding_matrix_fasttext(word_index, fasttext_model, embedding_dim, max_words):
    """Build a ``(max_words, embedding_dim)`` matrix from a fastText-style model.

    For every word whose index is below ``max_words`` the vector returned by
    ``fasttext_model.get_numpy_vector(word)`` is stored in the row with that index;
    a ``None`` result leaves the row at zero.
    """
    matrix = np.zeros((max_words, embedding_dim))
    in_vocabulary = (
        (token, row) for token, row in word_index.items() if row < max_words
    )
    for token, row in in_vocabulary:
        vector = fasttext_model.get_numpy_vector(token)
        if vector is not None:
            matrix[row] = vector
    return matrix

def create_embedding_matrix(embedding_type, embedding_dim, word_index, max_words):
    """Load pretrained vectors of the requested type and build the embedding matrix.

    Args:
        embedding_type: 'glove' (reads ``glove.6B.<embedding_dim>d.txt``) or
            'fasttext' (reads ``cc.en.300.bin``).
        embedding_dim: width of the returned matrix.
        word_index: mapping from word to row index.
        max_words: number of rows of the returned matrix.

    Returns:
        The matrix produced by the matching ``create_embedding_matrix_*`` helper.

    Raises:
        ValueError: if ``embedding_type`` is neither 'glove' nor 'fasttext'.
    """
    if embedding_type == 'glove':
        filepath = f'glove.6B.{embedding_dim}d.txt'
        embeddings_index, _ = load_glove_embeddings(filepath)
        return create_embedding_matrix_glove(word_index, embeddings_index, embedding_dim, max_words)
    if embedding_type == 'fasttext':
        # NOTE(review): the file name suggests 300-d vectors - confirm embedding_dim is 300 here
        fasttext_model = load_fasttext_embeddings('cc.en.300.bin')
        return create_embedding_matrix_fasttext(word_index, fasttext_model, embedding_dim, max_words)
    # Fail loudly: falling through used to return None, which only surfaced later
    # as a confusing error wherever the matrix was consumed.
    raise ValueError(
        f"Unsupported embedding_type {embedding_type!r}; expected 'glove' or 'fasttext'"
    )

In [None]:
def build_configurable_model(layer_type, num_units, dropout_rate, return_sequences, second_layer, embedding_dim, max_sequence_length, num_classes, optimizer, embedding_type, padding='post', truncating='post', stack_layers=2, trainable_embeddings=True):
    """Build and compile a stacked LSTM / Bidirectional LSTM text classifier.

    The model is: embedding -> (stack_layers - 1) sequence-returning recurrent layers ->
    one final recurrent layer -> optional dropout -> softmax dense layer.

    Args:
        layer_type: 'LSTM' or 'Bidirectional LSTM'.
        num_units: number of units of every recurrent layer.
        dropout_rate: dropout applied after the last recurrent layer; skipped when not > 0.
        return_sequences, second_layer, padding, truncating: accepted so configuration
            dicts can be passed through, but not used when building the model.
        embedding_dim: embedding width; for 'glove' it also selects the file
            ``glove.6B.<embedding_dim>d.txt``.
        max_sequence_length: length of the integer input sequences.
        num_classes: size of the softmax output.
        optimizer: optimizer passed to ``model.compile``.
        embedding_type: 'glove' loads pretrained vectors; any other value starts from a
            random matrix.
        stack_layers: total number of recurrent layers; values below 2 still produce
            the single final recurrent layer.
        trainable_embeddings: whether the embedding weights are updated during training.

    Returns:
        The compiled model.

    Raises:
        ValueError: if ``layer_type`` is not one of the supported values.

    Note:
        Reads the module-level ``word_index`` and ``max_words``.
    """
    supported_layer_types = ('LSTM', 'Bidirectional LSTM')
    if layer_type not in supported_layer_types:
        # Previously an unknown type silently produced a model with no recurrent layer.
        raise ValueError(
            f"Unsupported layer_type {layer_type!r}; expected one of {supported_layer_types}"
        )

    def add_recurrent(inputs, keep_sequences):
        # One recurrent layer of the configured type applied to `inputs`.
        recurrent = LSTM(num_units, return_sequences=keep_sequences)
        if layer_type == 'Bidirectional LSTM':
            recurrent = Bidirectional(recurrent)
        return recurrent(inputs)

    if embedding_type == 'glove':
        # The file is chosen from embedding_dim so vector width and matrix width always agree
        # (falling back to the 100d file broke any dimension other than 100 or 300).
        embeddings_index, _ = load_glove_embeddings(f'glove.6B.{embedding_dim}d.txt')
        embedding_matrix = create_embedding_matrix_glove(word_index, embeddings_index, embedding_dim, max_words)
    else:
        embedding_matrix = np.random.random((max_words, embedding_dim))

    embedding_layer = Embedding(
        input_dim=len(embedding_matrix),
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        input_length=max_sequence_length,
        trainable=trainable_embeddings
    )
    sequence_input = Input(shape=(max_sequence_length,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)

    layer_out = embedded_sequences
    # Intermediate layers must return full sequences so the next recurrent layer can consume them
    for _ in range(stack_layers - 1):
        layer_out = add_recurrent(layer_out, keep_sequences=True)
    # The last recurrent layer collapses the sequence into a single vector
    layer_out = add_recurrent(layer_out, keep_sequences=False)

    if dropout_rate > 0:
        layer_out = Dropout(dropout_rate)(layer_out)

    output = Dense(num_classes, activation='softmax')(layer_out)
    model = Model(sequence_input, output)
    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['acc'])
    return model

In [None]:
# One dict per experiment; the training loop builds, trains and evaluates a model for each.
# With the current model builder a 'stack_layers' value below 2 still yields one recurrent layer.
configurations = [
    {
        'layer_type': 'Bidirectional LSTM',
        'num_units': 256,
        'dropout_rate': 0.0,
        'return_sequences': True,
        'second_layer': True,
        'embedding_dim': 300,
        'max_sequence_length': 50,
        'batch_size': 32,
        'epochs': 10,
        'optimizer': 'adam',
        'padding': 'pre',
        'truncating': 'pre',
        'embedding_type': 'glove',
        'stack_layers': 0,
    },
]

In [None]:
# Train and evaluate each model
results = []
for config in configurations:
    # Settings forwarded verbatim from the configuration to the model builder
    model = build_configurable_model(
        num_classes=num_classes,
        stack_layers=config.get('stack_layers', 2),
        **{key: config[key] for key in (
            'layer_type', 'num_units', 'dropout_rate', 'return_sequences', 'second_layer',
            'embedding_dim', 'max_sequence_length', 'optimizer', 'embedding_type',
            'padding', 'truncating',
        )},
    )

    # Update input sequences with the specified padding and truncating methods
    pad_options = {
        'maxlen': config['max_sequence_length'],
        'padding': config['padding'],
        'truncating': config['truncating'],
    }
    x_train_seq = pad_sequences(x_train, **pad_options)
    x_test_seq = pad_sequences(x_test, **pad_options)

    print(f"Training {config['layer_type']} model with {config['num_units']} units, {config['embedding_dim']}d embeddings, {config['max_sequence_length']} sequence length, {config['batch_size']} batch size, and {config['epochs']} epochs using {config['optimizer']}...")
    model.fit(
        x_train_seq,
        y_train,
        epochs=config['epochs'],
        batch_size=config['batch_size'],
        validation_split=0.1,
        verbose=2,
    )
    # scores[1] is the accuracy metric requested at compile time
    scores = model.evaluate(x_test_seq, y_test, verbose=0)
    print(f"Test accuracy for {config['layer_type']} model: {scores[1] * 100:.2f}%")
    f1_macro = print_f1_scores(model, x_test_seq, y_test)
    results.append((config['layer_type'], scores[1], f1_macro))

# Sort results by accuracy (best first) and print them as a table
results_sorted = sorted(results, key=lambda row: row[1], reverse=True)
print("\nResults ordered by accuracy:")
print("{:<20} | {:<10} | {:<10}".format("Model Type", "Accuracy", "F1 Score"))
print("-" * 40)
for name, accuracy, f1_macro in results_sorted:
    print("{:<20} | {:<10.2f} | {:<10.2f}".format(name, accuracy * 100, f1_macro * 100))