This colab notebook is based on : https://developers.google.com/machine-learning/guides/text-classification.

Note this is the Option B implementation.

Read In Data

In [1]:
import pandas as pd
import json

from google.colab import drive

drive.mount('/content/drive', force_remount=True)

# the base Google Drive directory
root_dir = "/content/drive/My Drive/"

# Should probably organize by project
#project_folder = "Colab Notebooks/My Project Folder/"

base_data_location = root_dir + 'Colab Notebooks/data'
devotional_corpus = base_data_location + '/corpus_mod.json'

#load devotionals
df = pd.read_json(devotional_corpus)

# Get devotional ids for each collection and use as a label for supervised training
#collections = {"782" : "love", "924" : "joy", "290" : "peace", "906" : "hope", "809" : "depression"}
# Will also encode here
collections = {"782" : 0, "924" : 1, "290" : 2, "906" : 3, "809" : 4}
#collections = {"1" : "toy"}
devo_labels = {}
for collection in collections.keys():
    input_file = open(base_data_location + '/collections/collection_' + str(collection) + '.json')
    collection_data = json.load(input_file)
    page = {}
    for page in collection_data.values():
        for reading_plan in page['collections'][0]['items']:
            reading_plan_id = reading_plan['id']
            #create array with reading_plan id and collection id
            devo_labels[reading_plan_id] = collections[collection]
    input_file.close()





Mounted at /content/drive


In [2]:
  import nltk
  nltk.download('stopwords')
  nltk.download('punkt')
  nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [3]:
import nltk
def clean(text):
    wn = nltk.WordNetLemmatizer()
    stopword = nltk.corpus.stopwords.words('english')
    tokens = nltk.word_tokenize(text)
    lower = [word.lower() for word in tokens]
    no_stopwords = [word for word in lower if word not in stopword]
    no_alpha = [word for word in no_stopwords if word.isalpha()]
    lemm_text = [wn.lemmatize(word) for word in no_alpha]
    clean_text = lemm_text
    return clean_text

In [4]:
print(df.head)
#print(df.iloc[0:2,])
print(df.iloc[0])
print(df.iloc[10])

<bound method NDFrame.head of                source_id  ...                                         references
YV_RP_29045_1      29045  ...  [JHN.3.16+JHN.3.17, LUK.2.11, ISA.7.14, COL.1....
YV_RP_29045_2      29045  ...  [EPH.2.8+EPH.2.9, 2CO.5.21, EPH.1.7, ROM.6.6, ...
YV_RP_29045_3      29045  ...                     [JHN.14.6, JHN.11.25, ACT.1.8]
YV_RP_28889_1      28889  ...                                         [JAS.2.14]
YV_RP_28889_2      28889  ...                                        [PRO.12.18]
...                  ...  ...                                                ...
YV_RP_28716_1      28716  ...  [GEN.1.28, GEN.1.31, PSA.116.7, LUK.10.30+LUK....
YV_RP_28716_2      28716  ...                                  [GEN.5.1+GEN.5.2]
YV_RP_28716_3      28716  ...  [GEN.1.26, EXO.20.4+EXO.20.5+EXO.20.6, DEU.5.8...
YV_RP_28716_4      28716  ...  [GEN.1.28, GEN.14.18+GEN.14.19+GEN.14.20, PSA....
YV_RP_28716_5      28716  ...  [PSA.110.1, ISA.53, MRK.10.45, HEB.2.14+HEB.2.15

In [5]:
print(df.head)
print(df[0:1])

# Create dataframe from label dictionary
label_df = pd.DataFrame.from_dict(devo_labels, orient='index', columns=['Collection'])
print(label_df.head)

# Combine devotionals with collection labels
combined_df = pd.merge(df, label_df, how='left', left_on=['source_id'], right_index=True)
print(combined_df.head)
print(combined_df.iloc[0])
print(combined_df.iloc[10])

combined_df['clean_text'] = combined_df.apply(lambda row : clean(row['text']), axis = 1)
# Need a version with tokens recombined so they can be retokenized later...
combined_df['clean_text_combined'] = combined_df.apply(lambda row : " ".join(row['clean_text']), axis = 1)

print(combined_df.head)
print(combined_df.iloc[0])
print(combined_df.iloc[10])



<bound method NDFrame.head of                source_id  ...                                         references
YV_RP_29045_1      29045  ...  [JHN.3.16+JHN.3.17, LUK.2.11, ISA.7.14, COL.1....
YV_RP_29045_2      29045  ...  [EPH.2.8+EPH.2.9, 2CO.5.21, EPH.1.7, ROM.6.6, ...
YV_RP_29045_3      29045  ...                     [JHN.14.6, JHN.11.25, ACT.1.8]
YV_RP_28889_1      28889  ...                                         [JAS.2.14]
YV_RP_28889_2      28889  ...                                        [PRO.12.18]
...                  ...  ...                                                ...
YV_RP_28716_1      28716  ...  [GEN.1.28, GEN.1.31, PSA.116.7, LUK.10.30+LUK....
YV_RP_28716_2      28716  ...                                  [GEN.5.1+GEN.5.2]
YV_RP_28716_3      28716  ...  [GEN.1.26, EXO.20.4+EXO.20.5+EXO.20.6, DEU.5.8...
YV_RP_28716_4      28716  ...  [GEN.1.28, GEN.14.18+GEN.14.19+GEN.14.20, PSA....
YV_RP_28716_5      28716  ...  [PSA.110.1, ISA.53, MRK.10.45, HEB.2.14+HEB.2.15

In [6]:
#Create single arrays of cleaned text and labels for training/testing
y = combined_df['Collection'].to_numpy()
X = combined_df['clean_text_combined'].to_numpy()

In [7]:
#Split training and test sets
from sklearn.model_selection import train_test_split
train_texts, val_texts, train_labels, val_labels = train_test_split(X, y, test_size = 0.3, random_state = 0)



Step 3: Prepare Data

In [8]:
from tensorflow.python.keras.preprocessing import sequence
from tensorflow.python.keras.preprocessing import text

# Vectorization parameters
# Limit on the number of features. We use the top 20K features.
TOP_K = 20000

# Limit on the length of text sequences. Sequences longer than this
# will be truncated.
MAX_SEQUENCE_LENGTH = 500

def sequence_vectorize(train_texts, val_texts):
    """Vectorizes texts as sequence vectors.

    1 text = 1 sequence vector with fixed length.

    # Arguments
        train_texts: list, training text strings.
        val_texts: list, validation text strings.

    # Returns
        x_train, x_val, word_index: vectorized training and validation
            texts and word index dictionary.
    """
    # Create vocabulary with training texts.
    tokenizer = text.Tokenizer(num_words=TOP_K)
    tokenizer.fit_on_texts(train_texts)

    # Vectorize training and validation texts.
    x_train = tokenizer.texts_to_sequences(train_texts)
    x_val = tokenizer.texts_to_sequences(val_texts)

    # Get max sequence length.
    max_length = len(max(x_train, key=len))
    if max_length > MAX_SEQUENCE_LENGTH:
        max_length = MAX_SEQUENCE_LENGTH

    # Fix sequence length to max value. Sequences shorter than the length are
    # padded in the beginning and sequences longer are truncated
    # at the beginning.
    x_train = sequence.pad_sequences(x_train, maxlen=max_length)
    x_val = sequence.pad_sequences(x_val, maxlen=max_length)
    return x_train, x_val, tokenizer.word_index

Construct a four-layer sepCNN model

In [15]:
def _get_last_layer_units_and_activation(num_classes):
    """Gets the # units and activation function for the last network layer.
    # Arguments
        num_classes: int, number of classes.
    # Returns
        units, activation values.
    """
    if num_classes == 2:
        activation = 'sigmoid'
        units = 1
    else:
        activation = 'softmax'
        units = num_classes
    return units, activation

In [24]:
#from tensorflow.python.keras import models
#from tensorflow.python.keras import initializers
#from tensorflow.python.keras import regularizers

#from tensorflow.python.keras.layers import Dense
#from tensorflow.python.keras.layers import Dropout
#from tensorflow.python.keras.layers import Embedding
#from tensorflow.python.keras.layers import SeparableConv1D
#from tensorflow.python.keras.layers import MaxPooling1D
#from tensorflow.python.keras.layers import GlobalAveragePooling1D

from tensorflow.keras import models
from tensorflow.keras import initializers
from tensorflow.keras import regularizers

from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import SeparableConv1D
from tensorflow.keras.layers import MaxPooling1D
from tensorflow.keras.layers import GlobalAveragePooling1D

def sepcnn_model(blocks,
                 filters,
                 kernel_size,
                 embedding_dim,
                 dropout_rate,
                 pool_size,
                 input_shape,
                 num_classes,
                 num_features,
                 use_pretrained_embedding=False,
                 is_embedding_trainable=False,
                 embedding_matrix=None):
    """Creates an instance of a separable CNN model.

    # Arguments
        blocks: int, number of pairs of sepCNN and pooling blocks in the model.
        filters: int, output dimension of the layers.
        kernel_size: int, length of the convolution window.
        embedding_dim: int, dimension of the embedding vectors.
        dropout_rate: float, percentage of input to drop at Dropout layers.
        pool_size: int, factor by which to downscale input at MaxPooling layer.
        input_shape: tuple, shape of input to the model.
        num_classes: int, number of output classes.
        num_features: int, number of words (embedding input dimension).
        use_pretrained_embedding: bool, true if pre-trained embedding is on.
        is_embedding_trainable: bool, true if embedding layer is trainable.
        embedding_matrix: dict, dictionary with embedding coefficients.

    # Returns
        A sepCNN model instance.
    """
    op_units, op_activation = _get_last_layer_units_and_activation(num_classes)
    model = models.Sequential()

    # Add embedding layer. If pre-trained embedding is used add weights to the
    # embeddings layer and set trainable to input is_embedding_trainable flag.
    if use_pretrained_embedding:
        model.add(Embedding(input_dim=num_features,
                            output_dim=embedding_dim,
                            input_length=input_shape[0],
                            weights=[embedding_matrix],
                            trainable=is_embedding_trainable))
    else:
        model.add(Embedding(input_dim=num_features,
                            output_dim=embedding_dim,
                            input_length=input_shape[0]))

    for _ in range(blocks-1):
        model.add(Dropout(rate=dropout_rate))
        model.add(SeparableConv1D(filters=filters,
                                  kernel_size=kernel_size,
                                  activation='relu',
                                  bias_initializer='random_uniform',
                                  depthwise_initializer='random_uniform',
                                  padding='same'))
        model.add(SeparableConv1D(filters=filters,
                                  kernel_size=kernel_size,
                                  activation='relu',
                                  bias_initializer='random_uniform',
                                  depthwise_initializer='random_uniform',
                                  padding='same'))
        model.add(MaxPooling1D(pool_size=pool_size))

    model.add(SeparableConv1D(filters=filters * 2,
                              kernel_size=kernel_size,
                              activation='relu',
                              bias_initializer='random_uniform',
                              depthwise_initializer='random_uniform',
                              padding='same'))
    model.add(SeparableConv1D(filters=filters * 2,
                              kernel_size=kernel_size,
                              activation='relu',
                              bias_initializer='random_uniform',
                              depthwise_initializer='random_uniform',
                              padding='same'))
    model.add(GlobalAveragePooling1D())
    model.add(Dropout(rate=dropout_rate))
    model.add(Dense(op_units, activation=op_activation))
    return model

Build and Train Model

In [25]:
def get_num_classes(labels):
    """Gets the total number of classes.
    # Arguments
        labels: list, label values.
            There should be at lease one sample for values in the
            range (0, num_classes -1)
    # Returns
        int, total number of classes.
    # Raises
        ValueError: if any label value in the range(0, num_classes - 1)
            is missing or if number of classes is <= 1.
    """
    num_classes = max(labels) + 1
    missing_classes = [i for i in range(num_classes) if i not in labels]
    if len(missing_classes):
        raise ValueError('Missing samples with label value(s) '
                         '{missing_classes}. Please make sure you have '
                         'at least one sample for every label value '
                         'in the range(0, {max_class})'.format(
                            missing_classes=missing_classes,
                            max_class=num_classes - 1))

    if num_classes <= 1:
        raise ValueError('Invalid number of labels: {num_classes}.'
                         'Please make sure there are at least two classes '
                         'of samples'.format(num_classes=num_classes))
    return num_classes

In [26]:
"""Module to train sequence model.

Vectorizes training and validation texts into sequences and uses that for
training a sequence model - a sepCNN model. We use sequence model for text
classification when the ratio of number of samples to number of words per
sample for the given dataset is very large (>~15K).
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import time

import tensorflow as tf
import numpy as np

#import build_model
#import load_data
#import vectorize_data
#import explore_data

FLAGS = None

# Limit on the number of features. We use the top 20K features.
TOP_K = 20000


def train_sequence_model(data,
                         learning_rate=1e-3,
                         epochs=1000,
                         batch_size=128,
                         blocks=2,
                         filters=64,
                         dropout_rate=0.2,
                         embedding_dim=200,
                         kernel_size=3,
                         pool_size=3):
    """Trains sequence model on the given dataset.

    # Arguments
        data: tuples of training and test texts and labels.
        learning_rate: float, learning rate for training model.
        epochs: int, number of epochs.
        batch_size: int, number of samples per batch.
        blocks: int, number of pairs of sepCNN and pooling blocks in the model.
        filters: int, output dimension of sepCNN layers in the model.
        dropout_rate: float: percentage of input to drop at Dropout layers.
        embedding_dim: int, dimension of the embedding vectors.
        kernel_size: int, length of the convolution window.
        pool_size: int, factor by which to downscale input at MaxPooling layer.

    # Raises
        ValueError: If validation data has label values which were not seen
            in the training data.
    """
    # Get the data.
    (train_texts, train_labels), (val_texts, val_labels) = data

    # Verify that validation labels are in the same range as training labels.
    #num_classes = explore_data.get_num_classes(train_labels)
    num_classes = get_num_classes(train_labels)
    unexpected_labels = [v for v in val_labels if v not in range(num_classes)]
    if len(unexpected_labels):
        raise ValueError('Unexpected label values found in the validation set:'
                         ' {unexpected_labels}. Please make sure that the '
                         'labels in the validation set are in the same range '
                         'as training labels.'.format(
                             unexpected_labels=unexpected_labels))

    # Vectorize texts.
    #x_train, x_val, word_index = vectorize_data.sequence_vectorize(train_texts, val_texts)
    x_train, x_val, word_index = sequence_vectorize(train_texts, val_texts)
    
    # Number of features will be the embedding input dimension. Add 1 for the
    # reserved index 0.
    num_features = min(len(word_index) + 1, TOP_K)

    # Create model instance.
    #model = build_model.sepcnn_model(blocks=blocks,
    model = sepcnn_model(blocks=blocks,
                                     filters=filters,
                                     kernel_size=kernel_size,
                                     embedding_dim=embedding_dim,
                                     dropout_rate=dropout_rate,
                                     pool_size=pool_size,
                                     input_shape=x_train.shape[1:],
                                     num_classes=num_classes,
                                     num_features=num_features)

    # Compile model with learning parameters.
    if num_classes == 2:
        loss = 'binary_crossentropy'
    else:
        loss = 'sparse_categorical_crossentropy'
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss=loss, metrics=['acc'])

    # Create callback for early stopping on validation loss. If the loss does
    # not decrease in two consecutive tries, stop training.
    callbacks = [tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=2)]

    # Train and validate model.
    history = model.fit(
            x_train,
            train_labels,
            epochs=epochs,
            callbacks=callbacks,
            validation_data=(x_val, val_labels),
            verbose=2,  # Logs once per epoch.
            batch_size=batch_size)

    # Print results.
    history = history.history
    print('Validation accuracy: {acc}, loss: {loss}'.format(
            acc=history['val_acc'][-1], loss=history['val_loss'][-1]))

    # Save model.
    model.save('devo_classifier_sepcnn_model.h5')
    return history['val_acc'][-1], history['val_loss'][-1]




In [28]:
data = [(train_texts, train_labels), (val_texts, val_labels)]
train_sequence_model(data)


Epoch 1/1000
42/42 - 24s - loss: 1.5985 - acc: 0.2511 - val_loss: 1.5987 - val_acc: 0.2499 - 24s/epoch - 567ms/step
Epoch 2/1000
42/42 - 23s - loss: 1.5929 - acc: 0.2615 - val_loss: 1.5993 - val_acc: 0.2499 - 23s/epoch - 548ms/step
Epoch 3/1000
42/42 - 23s - loss: 1.5931 - acc: 0.2615 - val_loss: 1.5990 - val_acc: 0.2499 - 23s/epoch - 556ms/step
Validation accuracy: 0.2498900145292282, loss: 1.5990124940872192


(0.2498900145292282, 1.5990124940872192)

In [19]:
print(tf.__version__)

2.7.0
