This colab notebook is based on : https://developers.google.com/machine-learning/guides/text-classification.

Note this is the Option A implementation.

Read In Data

In [2]:
import pandas as pd
import json

#from google.colab import drive
#drive.mount('/content/drive', force_remount=True)

# the base Google Drive directory
#root_dir = "/content/drive/My Drive/"

# Should probably organize by project
#project_folder = "Colab Notebooks/My Project Folder/"

#base_data_location = root_dir + 'Colab Notebooks/data'
#devotional_corpus = base_data_location + '/corpus_mod.json'


base_data_location = 'C:/Users/Dave/git_repos/mec-mini-projects/mec-14-8-capstone-model/data'
base_model_location = 'C:/Users/Dave/git_repos/mec-mini-projects/mec-14-8-capstone-model/models'
devotional_corpus = base_data_location + '/corpus_mod.json'



#load devotionals
df = pd.read_json(devotional_corpus)

# Get devotional ids for each collection and use as a label for supervised training
#collections = {"782" : "love", "924" : "joy", "290" : "peace", "906" : "hope", "809" : "depression"}
# Will also encode here
collections = {"782" : 0, "924" : 1, "290" : 2, "906" : 3, "809" : 4}
#collections = {"1" : "toy"}
devo_labels = {}
for collection in collections.keys():
    input_file = open(base_data_location + '/collections/collection_' + str(collection) + '.json')
    collection_data = json.load(input_file)
    page = {}
    for page in collection_data.values():
        for reading_plan in page['collections'][0]['items']:
            reading_plan_id = reading_plan['id']
            #create array with reading_plan id and collection id
            devo_labels[reading_plan_id] = collections[collection]
    input_file.close()





In [3]:
  import nltk
  nltk.download('stopwords')
  nltk.download('punkt')
  nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dave\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dave\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dave\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [4]:
import nltk
def clean(text):
    wn = nltk.WordNetLemmatizer()
    stopword = nltk.corpus.stopwords.words('english')
    tokens = nltk.word_tokenize(text)
    lower = [word.lower() for word in tokens]
    no_stopwords = [word for word in lower if word not in stopword]
    no_alpha = [word for word in no_stopwords if word.isalpha()]
    lemm_text = [wn.lemmatize(word) for word in no_alpha]
    clean_text = lemm_text
    return clean_text

In [5]:
print(df.head)
#print(df.iloc[0:2,])
print(df.iloc[0])
print(df.iloc[10])

<bound method NDFrame.head of                source_id      source          type  day  \
YV_RP_29045_1      29045  YouVersion  reading plan    1   
YV_RP_29045_2      29045  YouVersion  reading plan    2   
YV_RP_29045_3      29045  YouVersion  reading plan    3   
YV_RP_28889_1      28889  YouVersion  reading plan    1   
YV_RP_28889_2      28889  YouVersion  reading plan    2   
...                  ...         ...           ...  ...   
YV_RP_28716_1      28716  YouVersion  reading plan    1   
YV_RP_28716_2      28716  YouVersion  reading plan    2   
YV_RP_28716_3      28716  YouVersion  reading plan    3   
YV_RP_28716_4      28716  YouVersion  reading plan    4   
YV_RP_28716_5      28716  YouVersion  reading plan    5   

                                                            text  \
YV_RP_29045_1  [IMAGE CONTENT] \n\nTHE GIFT OF JESUS \n\n  \n...   
YV_RP_29045_2  [IMAGE CONTENT] \n\nTHE GIFT OF SALVATION \n\n...   
YV_RP_29045_3  SHARE JESUS, THE PERFECT GIFT \n\n  \n\n\n

In [6]:
print(df.head)
print(df[0:1])

# Create dataframe from label dictionary
label_df = pd.DataFrame.from_dict(devo_labels, orient='index', columns=['Collection'])
print(label_df.head)

# Combine devotionals with collection labels
combined_df = pd.merge(df, label_df, how='left', left_on=['source_id'], right_index=True)
print(combined_df.head)
print(combined_df.iloc[0])
print(combined_df.iloc[10])

combined_df['clean_text'] = combined_df.apply(lambda row : clean(row['text']), axis = 1)
# Need a version with tokens recombined so they can be retokenized later...
combined_df['clean_text_combined'] = combined_df.apply(lambda row : " ".join(row['clean_text']), axis = 1)

print(combined_df.head)
print(combined_df.iloc[0])
print(combined_df.iloc[10])



<bound method NDFrame.head of                source_id      source          type  day  \
YV_RP_29045_1      29045  YouVersion  reading plan    1   
YV_RP_29045_2      29045  YouVersion  reading plan    2   
YV_RP_29045_3      29045  YouVersion  reading plan    3   
YV_RP_28889_1      28889  YouVersion  reading plan    1   
YV_RP_28889_2      28889  YouVersion  reading plan    2   
...                  ...         ...           ...  ...   
YV_RP_28716_1      28716  YouVersion  reading plan    1   
YV_RP_28716_2      28716  YouVersion  reading plan    2   
YV_RP_28716_3      28716  YouVersion  reading plan    3   
YV_RP_28716_4      28716  YouVersion  reading plan    4   
YV_RP_28716_5      28716  YouVersion  reading plan    5   

                                                            text  \
YV_RP_29045_1  [IMAGE CONTENT] \n\nTHE GIFT OF JESUS \n\n  \n...   
YV_RP_29045_2  [IMAGE CONTENT] \n\nTHE GIFT OF SALVATION \n\n...   
YV_RP_29045_3  SHARE JESUS, THE PERFECT GIFT \n\n  \n\n\n

In [7]:
#Create single arrays of cleaned text and labels for training/testing
y = combined_df['Collection'].to_numpy()
#X = combined_df['clean_text_combined'].to_numpy()
X = combined_df['clean_text_combined'].to_numpy()

In [8]:
#Split training and test sets
from sklearn.model_selection import train_test_split
train_texts, val_texts, train_labels, val_labels = train_test_split(X, y, test_size = 0.3, random_state = 0)



Step 3: Prepare Data

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

# Vectorization parameters

# Range (inclusive) of n-gram sizes for tokenizing text.
NGRAM_RANGE = (1, 2)

# Limit on the number of features. We use the top 20K features.
TOP_K = 20000

# Whether text should be split into word or character n-grams.
# One of 'word', 'char'.
TOKEN_MODE = 'word'

# Minimum document/corpus frequency below which a token will be discarded.
MIN_DOCUMENT_FREQUENCY = 2

# Limit on the length of text sequences. Sequences longer than this
# will be truncated.
MAX_SEQUENCE_LENGTH = 500

def ngram_vectorize(train_texts, train_labels, val_texts):
    """Vectorizes texts as ngram vectors.
    1 text = 1 tf-idf vector the length of vocabulary of uni-grams + bi-grams.
    # Arguments
        train_texts: list, training text strings.
        train_labels: np.ndarray, training labels.
        val_texts: list, validation text strings.
    # Returns
        x_train, x_val: vectorized training and validation texts
    """
    # Create keyword arguments to pass to the 'tf-idf' vectorizer.
    kwargs = {
            'ngram_range': NGRAM_RANGE,  # Use 1-grams + 2-grams.
            'dtype': 'int32',
            'strip_accents': 'unicode',
            'decode_error': 'replace',
            'analyzer': TOKEN_MODE,  # Split text into word tokens.
            'min_df': MIN_DOCUMENT_FREQUENCY,
    }
    vectorizer = TfidfVectorizer(**kwargs)

    # Learn vocabulary from training texts and vectorize training texts.
    #x_train = vectorizer.fit_transform(train_texts)
    x_train = vectorizer.fit_transform(train_texts).todense()
    
    # Vectorize validation texts.
    #x_val = vectorizer.transform(val_texts)
    x_val = vectorizer.transform(val_texts).todense()

    # Select top 'k' of the vectorized features.
    selector = SelectKBest(f_classif, k=min(TOP_K, x_train.shape[1]))
    selector.fit(x_train, train_labels)
    x_train = selector.transform(x_train)
    x_val = selector.transform(x_val)

    x_train = x_train.astype('float32')
    x_val = x_val.astype('float32')
    return x_train, x_val

Construct a four-layer sepCNN model

In [10]:
def _get_last_layer_units_and_activation(num_classes):
    """Gets the # units and activation function for the last network layer.
    # Arguments
        num_classes: int, number of classes.
    # Returns
        units, activation values.
    """
    if num_classes == 2:
        activation = 'sigmoid'
        units = 1
    else:
        activation = 'softmax'
        units = num_classes
    return units, activation

In [11]:
from tensorflow.keras import models
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout

def mlp_model(layers, units, dropout_rate, input_shape, num_classes):
    """Creates an instance of a multi-layer perceptron model.
    # Arguments
        layers: int, number of `Dense` layers in the model.
        units: int, output dimension of the layers.
        dropout_rate: float, percentage of input to drop at Dropout layers.
        input_shape: tuple, shape of input to the model.
        num_classes: int, number of output classes.
    # Returns
        An MLP model instance.
    """
    op_units, op_activation = _get_last_layer_units_and_activation(num_classes)
    model = models.Sequential()
    model.add(Dropout(rate=dropout_rate, input_shape=input_shape))

    for _ in range(layers-1):
        model.add(Dense(units=units, activation='relu'))
        model.add(Dropout(rate=dropout_rate))

    model.add(Dense(units=op_units, activation=op_activation))
    return model

Build and Train Model

In [12]:
def get_num_classes(labels):
    """Gets the total number of classes.
    # Arguments
        labels: list, label values.
            There should be at lease one sample for values in the
            range (0, num_classes -1)
    # Returns
        int, total number of classes.
    # Raises
        ValueError: if any label value in the range(0, num_classes - 1)
            is missing or if number of classes is <= 1.
    """
    num_classes = max(labels) + 1
    missing_classes = [i for i in range(num_classes) if i not in labels]
    if len(missing_classes):
        raise ValueError('Missing samples with label value(s) '
                         '{missing_classes}. Please make sure you have '
                         'at least one sample for every label value '
                         'in the range(0, {max_class})'.format(
                            missing_classes=missing_classes,
                            max_class=num_classes - 1))

    if num_classes <= 1:
        raise ValueError('Invalid number of labels: {num_classes}.'
                         'Please make sure there are at least two classes '
                         'of samples'.format(num_classes=num_classes))
    return num_classes

In [13]:
"""Module to train n-gram model.

Vectorizes training and validation texts into n-grams and uses that for
training a n-gram model - a simple multi-layer perceptron model. We use n-gram
model for text classification when the ratio of number of samples to number of
words per sample for the given dataset is very small (<~1500).
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import time

import tensorflow as tf
import numpy as np

FLAGS = None


def train_ngram_model(data,
                      learning_rate=1e-3,
                      epochs=1000,
                      batch_size=128,
                      layers=2,
                      units=64,
                      dropout_rate=0.2):
    """Trains n-gram model on the given dataset.

    # Arguments
        data: tuples of training and test texts and labels.
        learning_rate: float, learning rate for training model.
        epochs: int, number of epochs.
        batch_size: int, number of samples per batch.
        layers: int, number of `Dense` layers in the model.
        units: int, output dimension of Dense layers in the model.
        dropout_rate: float: percentage of input to drop at Dropout layers.

    # Raises
        ValueError: If validation data has label values which were not seen
            in the training data.
    """
    # Get the data.
    (train_texts, train_labels), (val_texts, val_labels) = data

    # Verify that validation labels are in the same range as training labels.
    num_classes = get_num_classes(train_labels)
    unexpected_labels = [v for v in val_labels if v not in range(num_classes)]
    if len(unexpected_labels):
        raise ValueError('Unexpected label values found in the validation set:'
                         ' {unexpected_labels}. Please make sure that the '
                         'labels in the validation set are in the same range '
                         'as training labels.'.format(
                             unexpected_labels=unexpected_labels))

    # Vectorize texts.
    #x_train, x_val = vectorize_data.ngram_vectorize(
    x_train, x_val = ngram_vectorize(
        train_texts, train_labels, val_texts)

    # Create model instance.
    #model = build_model.mlp_model(layers=layers,
    model = mlp_model(layers=layers,
                                  units=units,
                                  dropout_rate=dropout_rate,
                                  input_shape=x_train.shape[1:],
                                  num_classes=num_classes)

    
    # Compile model with learning parameters.
    if num_classes == 2:
        loss = 'binary_crossentropy'
    else:
        loss = 'sparse_categorical_crossentropy'
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss=loss, metrics=['acc'])

    # Create callback for early stopping on validation loss. If the loss does
    # not decrease in two consecutive tries, stop training.
    callbacks = [tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=3)]

    # Train and validate model.
    history = model.fit(
            x_train,
            train_labels,
            epochs=epochs,
            callbacks=callbacks,
            validation_data=(x_val, val_labels),
            verbose=2,  # Logs once per epoch.
            batch_size=batch_size)

    # Print results.
    history = history.history
    print('Validation accuracy: {acc}, loss: {loss}'.format(
            acc=history['val_acc'][-1], loss=history['val_loss'][-1]))

    # Save model.
    model.save('devo_classification_mlp_model.h5')
    return history['val_acc'][-1], history['val_loss'][-1]



In [15]:
# With early stop 2
data = [(train_texts, train_labels), (val_texts, val_labels)]
train_ngram_model(data)





Epoch 1/1000
42/42 - 1s - loss: 1.5722 - acc: 0.3465 - val_loss: 1.5348 - val_acc: 0.3647
Epoch 2/1000
42/42 - 0s - loss: 1.4234 - acc: 0.5374 - val_loss: 1.4217 - val_acc: 0.4932
Epoch 3/1000
42/42 - 0s - loss: 1.2181 - acc: 0.7287 - val_loss: 1.2987 - val_acc: 0.5636
Epoch 4/1000
42/42 - 0s - loss: 0.9994 - acc: 0.8148 - val_loss: 1.1977 - val_acc: 0.5768
Epoch 5/1000
42/42 - 0s - loss: 0.8140 - acc: 0.8596 - val_loss: 1.1267 - val_acc: 0.5908
Epoch 6/1000
42/42 - 0s - loss: 0.6645 - acc: 0.8812 - val_loss: 1.0818 - val_acc: 0.5970
Epoch 7/1000
42/42 - 0s - loss: 0.5431 - acc: 0.9047 - val_loss: 1.0468 - val_acc: 0.6058
Epoch 8/1000
42/42 - 0s - loss: 0.4531 - acc: 0.9249 - val_loss: 1.0257 - val_acc: 0.6076
Epoch 9/1000
42/42 - 0s - loss: 0.3812 - acc: 0.9345 - val_loss: 1.0148 - val_acc: 0.6054
Epoch 10/1000
42/42 - 0s - loss: 0.3204 - acc: 0.9536 - val_loss: 1.0062 - val_acc: 0.6146
Epoch 11/1000
42/42 - 0s - loss: 0.2807 - acc: 0.9598 - val_loss: 1.0021 - val_acc: 0.6128
Epoch 12

(0.6159260869026184, 1.0084409713745117)

In [14]:
print(tf.__version__)

2.3.0


In [16]:
from sklearn.metrics import accuracy_score, classification_report
def print_report(y_test,y_pred):
    class_report = classification_report(y_test, y_pred)
    print('\nAccuracy: ', accuracy_score(y_test, y_pred))
    print('\n', class_report)

In [28]:
from sklearn.model_selection import GridSearchCV
from joblib import parallel_backend
import joblib
def cv_optimize(clf, parameters, X_train, y_train, n_folds=5):
    gs = GridSearchCV(clf, param_grid=parameters, cv=n_folds)
    gs.fit(X_train, y_train)
    print("Optimal params: " + str(gs.best_params_))
    return gs.best_estimator_

In [23]:
from joblib import parallel_backend
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier(n_neighbors=11)

#data = [(train_texts, train_labels), (val_texts, val_labels)]

with parallel_backend('threading', n_jobs=-1):
    x_train, x_val = ngram_vectorize(train_texts, train_labels, val_texts)
    clf.fit(x_train, train_labels)
    y_pred = clf.predict(x_val)

print_report(val_labels,y_pred)




Accuracy:  0.2833260008798944

               precision    recall  f1-score   support

           0       0.90      0.05      0.09       365
           1       0.26      0.99      0.41       568
           2       0.94      0.03      0.07       435
           3       0.91      0.02      0.05       423
           4       0.72      0.07      0.14       482

    accuracy                           0.28      2273
   macro avg       0.75      0.24      0.15      2273
weighted avg       0.71      0.28      0.17      2273



In [29]:
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier()
parameters = {"n_neighbors": range(1,40,5) }
with parallel_backend('threading', n_jobs=-1):
    x_train, x_val = ngram_vectorize(train_texts, train_labels, val_texts)
    clf = cv_optimize(clf, parameters, x_train, train_labels)
    clf.fit(x_train, train_labels)
    y_pred = clf.predict(x_val)

print_report(val_labels,y_pred)



Optimal params: {'n_neighbors': 1}

Accuracy:  0.33040035195776507

               precision    recall  f1-score   support

           0       0.79      0.13      0.23       365
           1       0.28      0.94      0.43       568
           2       0.53      0.11      0.19       435
           3       0.63      0.12      0.20       423
           4       0.67      0.14      0.24       482

    accuracy                           0.33      2273
   macro avg       0.58      0.29      0.25      2273
weighted avg       0.56      0.33      0.27      2273



In [30]:
# With early stop 3
data = [(train_texts, train_labels), (val_texts, val_labels)]
train_ngram_model(data)




Epoch 1/1000
42/42 - 1s - loss: 1.5707 - acc: 0.3496 - val_loss: 1.5297 - val_acc: 0.3704
Epoch 2/1000
42/42 - 0s - loss: 1.4157 - acc: 0.5695 - val_loss: 1.4128 - val_acc: 0.5029
Epoch 3/1000
42/42 - 0s - loss: 1.2077 - acc: 0.7295 - val_loss: 1.2921 - val_acc: 0.5517
Epoch 4/1000
42/42 - 0s - loss: 0.9903 - acc: 0.8214 - val_loss: 1.1902 - val_acc: 0.5856
Epoch 5/1000
42/42 - 0s - loss: 0.8087 - acc: 0.8563 - val_loss: 1.1202 - val_acc: 0.5904
Epoch 6/1000
42/42 - 0s - loss: 0.6548 - acc: 0.8817 - val_loss: 1.0723 - val_acc: 0.6027
Epoch 7/1000
42/42 - 0s - loss: 0.5417 - acc: 0.9085 - val_loss: 1.0426 - val_acc: 0.6014
Epoch 8/1000
42/42 - 0s - loss: 0.4489 - acc: 0.9245 - val_loss: 1.0247 - val_acc: 0.6023
Epoch 9/1000
42/42 - 0s - loss: 0.3763 - acc: 0.9396 - val_loss: 1.0087 - val_acc: 0.6106
Epoch 10/1000
42/42 - 0s - loss: 0.3222 - acc: 0.9562 - val_loss: 0.9974 - val_acc: 0.6150
Epoch 11/1000
42/42 - 0s - loss: 0.2743 - acc: 0.9630 - val_loss: 0.9928 - val_acc: 0.6168
Epoch 12

(0.6119665503501892, 0.9986212849617004)