# Use Apriori analysis to find phrases, or interesting patterns in a novel.

Use the nltk library corpus gutenberg API and load the novel 'carroll-alice.txt' which is the Alice in Wonderland by L. Carroll. Use any means to parse/extract words and save in CSV format to be read by Weka framework.

In [1]:
import nltk
import pandas as pd
import re
import numpy as np
import csv
from keras.datasets import mnist

In [2]:
alice = nltk.corpus.gutenberg.sents('carroll-alice.txt')
alice_words = nltk.corpus.gutenberg.words('carroll-alice.txt')

In [3]:
alice_words = np.array(alice_words)

In [4]:
stop_words = nltk.corpus.stopwords.words('english')

In [19]:
TermsSentences = []
for terms in alice:
    terms = [w for w in terms if w not in stop_words]
    terms = [w for w in terms if re.search(r'^[a-zA-Z]{2}', w) is not None]
    TermsSentences.append(terms)

In [20]:
alice_df = pd.DataFrame(TermsSentences)

In [24]:
alice_csv = alice_df.to_csv('alice.csv')

In [25]:
Transactions_list = []  # a list of transactions
Items_names = {}  # Lookup item ID to name
Items_ids = {}  # Lookup item name to ID

Items = None  # a list of item IDs, normally an increasing sequence of numbers

# Process the data
with open('alice.csv', 'r') as fin:
    reader = csv.reader(fin, delimiter=',')
    item_id = 0
    for row in reader:
        transaction = []
        for item in row:
            if item not in Items_ids:
                Items_ids[item] = item_id
                Items_names[item_id] = item
                item_id += 1
            #
            transaction += [Items_ids[item]]
        #
        Transactions_list += [transaction]

M, N = len(Items_ids), len(Transactions_list)

Items = np.arange(0,M)

# Information
print(f'M={M} items, N={N} transactions')

M=4497 items, N=1704 transactions


In [26]:
# Convert to numpy arrays
Transactions = np.full((N,M), False, dtype=bool)

for i, t in enumerate(Transactions_list):
    for item in t:
        Transactions[i][item] = True

# Sanity, print row index 10, 11
print(f'{Transactions[10:12].astype(int)}')

[[1 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]]


In [27]:
Filename = 'alice.csv'

with open(Filename, 'w') as fout:
    writer = csv.writer(fout, delimiter=',', quoting=csv.QUOTE_ALL, quotechar="'", lineterminator='\n')
    writer.writerow([Items_names[i] for i in range(M)])
    for i in range(N):
        writer.writerow(list(map(lambda x: '' if x == False else 'True',  Transactions[i])))

## Interesting Patterns

A lowerBoundMinimumSupport of 0.004 produced 20 rules and a lowerBoundMinimumSupport of 0.003 produces 38 rules.

Everytime the word "golden" appears the word "little" also appears.

The word "said" commonly occurs with "turtle" and "hare."

When the word "join" appears, so does the word "dance."

Alice does not appear in the rules until the lowerBoundMinimumSupport is decreased to 0.003.

As more rules are produced, more common words appear, such as "went", "would", and "said."

# Two Hidden Layer Network

In the lecture module, the class NeuralNetMLP is a single hidden layer neural network implementation. Make the necessary modifications to upgrade it to a 2 hidden layer network. Run it on the MNIST dataset and report its performance.

In [12]:
(X_train, y_train), (X_test, y_test) = mnist.load_data()

print(f'Rows= {X_train.shape[0]}, columns= {X_train.shape[1]}')
print(f'Rows= {X_test.shape[0]}, columns= {X_test.shape[1]}')

Rows= 60000, columns= 28
Rows= 10000, columns= 28


In [13]:
X_train = X_train.reshape((60000, 784))
X_test = X_test.reshape((10000, 784))

print(f'Rows= {X_train.shape[0]}, columns= {X_train.shape[1]}')
print(f'Rows= {X_test.shape[0]}, columns= {X_test.shape[1]}')

Rows= 60000, columns= 784
Rows= 10000, columns= 784


In [14]:
class NeuralNetMLP(object):

    def __init__(self, n_hidden=30, n_hidden2 = 60, epochs=100, eta=0.001, minibatch_size=1, seed=None):
        self.random = np.random.RandomState(seed)  # used to randomize weights
        self.n_hidden = n_hidden  # size of the hidden layer
        self.n_hidden2 = n_hidden2
        self.epochs = epochs  # number of iterations
        self.eta = eta  # learning rate
        self.minibatch_size = minibatch_size  # size of training batch - 1 would not work
    
    @staticmethod
    def onehot(y, n_classes):  # one hot encode the input class y
        onehot = np.zeros((n_classes, y.shape[0]))
        for idx, val in enumerate(y.astype(int)):
            onehot[val, idx] = 1.0
        return onehot.T
    
    @staticmethod
    def sigmoid(z):  # Eq 1
        return 1.0 / (1.0 + np.exp(-np.clip(z, -250, 250)))

    def _forward(self, X):  # Eq 2       (takes in batch from training test)
        z_h = np.dot(X, self.w_h)
        a_h = self.sigmoid(z_h)
        z_h2 = np.dot(a_h, self.w_h2)
        a_h2 = self.sigmoid(z_h2)
        z_out = np.dot(a_h2, self.w_out)
        a_out = self.sigmoid(z_out)
        return z_h, a_h, z_h2, a_h2, z_out, a_out

    @staticmethod
    def compute_cost(y_enc, output):  # Eq 4
        term1 = -y_enc * (np.log(output))
        term2 = (1.0-y_enc) * np.log(1.0-output)
        cost = np.sum(term1 - term2)
        return cost

    def predict(self, X):
        z_h, a_h, z_h2, a_h2, z_out, a_out = self._forward(X)
        y_pred = np.argmax(z_out, axis=1)
        return y_pred

    def fit(self, X_train, y_train, X_valid, y_valid):
        import sys
        n_output = np.unique(y_train).shape[0]  # number of class labels
        n_features = X_train.shape[1]
        self.w_out = self.random.normal(loc=0.0, scale=0.1, size=(self.n_hidden2, n_output)) #output weights
        self.w_h = self.random.normal(loc=0.0, scale=0.1, size=(n_features, self.n_hidden)) #hidden layer weights
        self.w_h2 = self.random.normal(loc=0.0, scale=0.1, size=(self.n_hidden, self.n_hidden2)) #2nd hidden layer weights
        y_train_enc = self.onehot(y_train, n_output)  # one-hot encode original y
        for i in range(self.epochs):
            indices = np.arange(X_train.shape[0])
            for start_idx in range(0, indices.shape[0] - self.minibatch_size + 1, self.minibatch_size):
                batch_idx = indices[start_idx:start_idx + self.minibatch_size]
                z_h, a_h, z_h2, a_h2, z_out, a_out = self._forward(X_train[batch_idx]) #(forward propagation)
                sigmoid_derivative_h2 = a_h2 * (1.0-a_h2)  # Eq 3 (sigmoid function derivative)
                sigmoid_derivative_h = a_h * (1.0-a_h)  # Eq 3 (sigmoid function derivative)
                delta_out = a_out - y_train_enc[batch_idx]  # Eq 5 (backpropagate error) (predicted - actual)
                delta_h2 = (np.dot(delta_out, self.w_out.T) * sigmoid_derivative_h2)  # Eq 6 (hidden layer error matrix)
                delta_h = (np.dot(delta_h2, self.w_h2.T) * sigmoid_derivative_h)  # Eq 6
                grad_w_out = np.dot(a_h2.T, delta_out)  # Eq 7 (loss gradient)
                grad_w_h2 = np.dot(a_h.T, delta_h2)  # Eq 8
                grad_w_h = np.dot(X_train[batch_idx].T, delta_h)  # Eq 8
                self.w_out -= self.eta*grad_w_out  # Eq 9
                self.w_h2 -= self.eta*grad_w_h2  # Eq 9
                self.w_h -= self.eta*grad_w_h  # Eq 9

            # Evaluation after each epoch during training
            z_h, a_h, z_h2, a_h2, z_out, a_out = self._forward(X_train)
            cost = self.compute_cost(y_enc=y_train_enc, output=a_out)
            y_train_pred = self.predict(X_train)  # monitoring training progress through reclassification
            y_valid_pred = self.predict(X_valid)  # monitoring training progress through validation
            train_acc = ((np.sum(y_train == y_train_pred)).astype(float) / X_train.shape[0])
            valid_acc = ((np.sum(y_valid == y_valid_pred)).astype(float) / X_valid.shape[0])
            sys.stderr.write('\r%d/%d | Cost: %.2f ' '| Train/Valid Acc.: %.2f%%/%.2f%% '%
                (i+1, self.epochs, cost, train_acc*100, valid_acc*100))
            sys.stderr.flush()
        return self

In [15]:
# Define and fit the neural network
nn = NeuralNetMLP(n_hidden=20, n_hidden2=40, epochs=300, eta=0.0005, minibatch_size=100, seed=1)

nn.fit(X_train=X_train[:55000], y_train=y_train[:55000], X_valid=X_train[55000:], y_valid=y_train[55000:]) ;

300/300 | Cost: 26000.77 | Train/Valid Acc.: 91.85%/93.16% 

# Performance

1 hidden layer cost = 30051, training accuracy = 91.21%, validation accuracy = 92.96%

2 hidden layer cost = 26000, training accuracy = 91.85%, validation accuracy = 93.16%

Adding a second hidden layer improved both the training and validation accuracy by a very small amount.  The benefit likely does not outweigh the additional cost of adding a layer in this case.