In [6]:
import os
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import  LatentDirichletAllocation
import numpy as np
from bs4 import BeautifulSoup
import re
import xml.sax.saxutils as saxutils
from nltk.tokenize import RegexpTokenizer, sent_tokenize
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
import sys
import tensorflow as tf


  from ._conv import register_converters as _register_converters


In [7]:
# Handle to the current module (the notebook's namespace). The helper
# functions below keep their shared state as attributes on this object.
this = sys.modules[__name__]


# Download the NLTK 'punkt' and 'stopwords' resources (no-op if already present).
nltk.download('punkt')
nltk.download('stopwords')

# A token is a run of ASCII letters and apostrophes; everything else separates tokens.
this.tokenizer = RegexpTokenizer('[\'a-zA-Z]+')
this.lemmatizer = WordNetLemmatizer()
# Both lists start empty; `categories` is filled by generate_categories().
this.vocabulary = []
this.categories = []
# Stored as a set so the membership test in tokenize() is a hash lookup.
this.stop_words = set(stopwords.words('english'))

def generate_categories():
    """Load the list of topic categories into ``this.categories``.

    Reads ``./reuters21578/all-topics-strings.lc.txt`` (one category per line)
    and replaces ``this.categories`` with the stripped, lower-cased entries in
    file order. Any previous content of ``this.categories`` is discarded.
    """
    topics = 'all-topics-strings.lc.txt'

    with open('./reuters21578/' + topics, 'r') as file:
        # Rebind in one step so a re-run never appends to an old list.
        this.categories = [category.strip().lower() for category in file]


def vectorize_docs(documents, w2v_model, document_max_num_words=100,
                   num_features=500):
    """Turn tokenized documents into a dense tensor of word vectors.

    Arguments:
    documents -- sequence of documents, each a sequence of tokens
    w2v_model -- mapping-like object supporting ``word in w2v_model`` and
                 ``w2v_model[word]`` (the notebook passes a gensim Word2Vec)
    document_max_num_words -- tokens kept per document; longer documents are
                 truncated, shorter ones are zero-padded
    num_features -- length of each word vector

    Returns:
    float32 array of shape (len(documents), document_max_num_words,
    num_features). Tokens missing from ``w2v_model`` are left as zero rows.
    """
    # Size the output from the argument itself instead of the global
    # this.number_of_documents, which only matches if tokenize_docs() was
    # last called on this very collection.
    documents = list(documents)

    x = np.zeros(shape=(len(documents), document_max_num_words, num_features),
                 dtype=np.float32)

    for idx, document in enumerate(documents):
        for jdx, word in enumerate(document):
            if jdx == document_max_num_words:
                break

            # Unknown words keep the all-zero row the array was created with.
            if word in w2v_model:
                x[idx, jdx, :] = w2v_model[word]

    return x


def vectorize_categories(categories):
    """Stack per-document category vectors into one label matrix.

    Arguments:
    categories -- dict mapping a document id to its category vector (the
                  notebook passes the one-hot vectors built by
                  to_category_onehot)

    Returns:
    float32 array of shape (len(categories), len(this.categories)); row i
    holds the vector of the i-th key in the dict's iteration order.
    """
    num_categories = len(this.categories)

    # Row count comes from the argument, not from the global
    # this.number_of_documents set as a side effect of tokenize_docs().
    y = np.zeros(shape=(len(categories), num_categories), dtype=np.float32)

    for idx, key in enumerate(categories.keys()):
        y[idx, :] = categories[key]

    return y


def unescape(text):
    """Decode escaped XML characters in ``text`` via ``saxutils.unescape``."""
    decoded = saxutils.unescape(text)
    return decoded


def unique(arr):
    """Return the distinct elements of ``arr`` as a list (order not guaranteed)."""
    distinct = set(arr)
    return list(distinct)


def add_to_vocab(elements):
    """Append to ``this.vocabulary`` every element it does not yet contain."""
    vocab = this.vocabulary
    for candidate in elements:
        if candidate in vocab:
            continue
        vocab.append(candidate)


def add_to_categories(elements):
    """Append to ``this.categories`` every element it does not yet contain."""
    known = this.categories
    for candidate in elements:
        if candidate in known:
            continue
        known.append(candidate)


def transform_to_indices(elements):
    """Map each element to its position in ``this.vocabulary``.

    Raises ValueError (from ``list.index``) for an element not in the vocabulary.
    """
    position_of = this.vocabulary.index
    return [position_of(element) for element in elements]


def transform_to_category_indices(element):
    """Return the position of ``element`` in ``this.categories``.

    Raises ValueError (from ``list.index``) when the category is unknown.
    """
    position = this.categories.index(element)
    return position


def strip_tags(text):
    """Strip markup tags from ``text`` and trim surrounding whitespace."""
    tag_pattern = '<[^<]+?>'
    without_tags = re.sub(tag_pattern, '', text)
    return without_tags.strip()


def to_category_onehot(categories):
    """Encode ``categories`` as a multi-hot float32 vector over ``this.categories``.

    Position i is 1.0 when ``this.categories[i]`` occurs in ``categories`` and
    0.0 otherwise.
    """
    known = this.categories
    onehot = np.zeros(len(known)).astype(np.float32)

    for position, name in enumerate(known):
        if name in categories:
            onehot[position] = 1.0

    return onehot

def read_retuters_files(path="./reuters21578/", parser="html5lib"):
    """Parse every ``.sgm`` file under ``path`` into train/test dictionaries.

    Arguments:
    path -- directory holding the Reuters-21578 ``.sgm`` files
    parser -- parser name handed to BeautifulSoup. The recorded run of this
              notebook auto-selected "html5lib" (see the warning in its
              output), so that is the default; another installed parser
              (e.g. "html.parser") can be passed, but different parsers may
              build slightly different trees.

    Returns:
    ((x_train, y_train), (x_test, y_test)) where each x dict maps the
    document's ``newid`` to its tag-stripped text and each y dict maps the
    same id to the vector produced by to_category_onehot().
    """
    x_train = {}
    x_test = {}
    y_train = {}
    y_test = {}

    # Sorted so documents are read in the same order on every run/platform.
    for file in sorted(os.listdir(path)):
        if not file.endswith(".sgm"):
            continue

        file_path = os.path.join(path, file)
        print("reading ", file_path)
        # Context manager closes the handle; the original leaked one per file.
        with open(file_path, 'r') as f:
            data = f.read()

        # Explicit parser: avoids the "no parser was explicitly specified"
        # warning and keeps the parse independent of what else is installed.
        soup = BeautifulSoup(data, parser)

        for post in soup.findAll("reuters"):
            post_id = post['newid']
            # post('text') is a result list, so str() of it carries list
            # brackets into the body; kept as-is because the tokenizer only
            # keeps letters and apostrophes.
            body = unescape(strip_tags(str(post('text')))
                            .replace('reuter\n&#3;', ''))

            post_categories = [strip_tags(str(topic))
                               for topic in post.topics.contents]
            category_onehot = to_category_onehot(post_categories)

            # NOTE(review): every split value other than "TRAIN" lands in the
            # test set -- confirm this is intended for all lewissplit values.
            if post["lewissplit"] == "TRAIN":
                x_train[post_id] = body
                y_train[post_id] = category_onehot
            else:
                x_test[post_id] = body
                y_test[post_id] = category_onehot

    return (x_train, y_train), (x_test, y_test)

def tokenize(document):
    """Split ``document`` into lower-cased, lemmatized, non-stop-word tokens.

    The text is cut into sentences first; each sentence is tokenized with
    ``this.tokenizer``, stop words are dropped, and the remaining tokens are
    lemmatized with ``this.lemmatizer``.
    """
    lemmas = []

    for sentence in sent_tokenize(document):
        for raw_token in this.tokenizer.tokenize(sentence):
            lowered = raw_token.lower()
            if lowered in this.stop_words:
                continue
            lemmas.append(this.lemmatizer.lemmatize(lowered))

    return lemmas


def tokenize_docs(document):
    """Tokenize every value of the ``document`` dict.

    Side effect: stores ``len(document)`` in ``this.number_of_documents``.
    Returns a list of token lists in the dict's key iteration order.
    """
    this.number_of_documents = len(document)
    return [tokenize(document[doc_id]) for doc_id in document.keys()]

[nltk_data] Downloading package punkt to /Users/gobidasu/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gobidasu/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [9]:
# Categories must be loaded first: read_retuters_files() builds each label
# vector through to_category_onehot(), which reads this.categories.
generate_categories()
(x_train, y_train), (x_test, y_test) = read_retuters_files()

('reading ', './reuters21578/reut2-004.sgm')




 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "html5lib")

  markup_type=markup_type))


('reading ', './reuters21578/reut2-010.sgm')
('reading ', './reuters21578/reut2-011.sgm')
('reading ', './reuters21578/reut2-005.sgm')
('reading ', './reuters21578/reut2-013.sgm')
('reading ', './reuters21578/reut2-007.sgm')
('reading ', './reuters21578/reut2-006.sgm')
('reading ', './reuters21578/reut2-012.sgm')
('reading ', './reuters21578/reut2-016.sgm')
('reading ', './reuters21578/reut2-002.sgm')
('reading ', './reuters21578/reut2-003.sgm')
('reading ', './reuters21578/reut2-017.sgm')
('reading ', './reuters21578/reut2-001.sgm')
('reading ', './reuters21578/reut2-015.sgm')
('reading ', './reuters21578/reut2-014.sgm')
('reading ', './reuters21578/reut2-000.sgm')
('reading ', './reuters21578/reut2-019.sgm')
('reading ', './reuters21578/reut2-018.sgm')
('reading ', './reuters21578/reut2-020.sgm')
('reading ', './reuters21578/reut2-008.sgm')
('reading ', './reuters21578/reut2-009.sgm')
('reading ', './reuters21578/reut2-021.sgm')


In [10]:
# WordNet resource download; this.lemmatizer is a WordNetLemmatizer.
nltk.download("wordnet")
# Passed as `size` to Word2Vec in the following cells.
num_features = 500



[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/gobidasu/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [11]:
# Tokenize the training documents (also sets this.number_of_documents).
x_train_token = tokenize_docs(x_train)

# Fit Word2Vec on the training tokens only.
w2v_model = Word2Vec(x_train_token,
                     size=num_features,
                     min_count=1,
                     window=10)
w2v_model.init_sims(replace=True)

# NOTE(review): x_train / y_train are rebound from dicts to arrays here, so
# this cell cannot be re-run without first re-running the loading cell.
x_train = vectorize_docs(x_train_token, w2v_model)
y_train = vectorize_categories(y_train)




In [12]:
# Tokenize the test documents (also sets this.number_of_documents).
x_test_token = tokenize_docs(x_test)

# Reuse the Word2Vec model fitted on the training corpus. Fitting a second
# model on the test set (as this cell used to) puts the test features in an
# unrelated embedding space, so a classifier trained on x_train cannot be
# evaluated on them; it also leaks test data into feature learning.
# Test-only words are simply absent from the model and become zero vectors
# inside vectorize_docs().
x_test = vectorize_docs(x_test_token, w2v_model)
y_test = vectorize_categories(y_test)



In [13]:
# Sanity check: container type and array shapes after vectorization.
print(type(x_train))
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)


<type 'numpy.ndarray'>
(14668, 100, 500)
(14668, 135)
(6910, 100, 500)


In [14]:
# Flatten each document from (words, features) to one long feature vector.
# The sizes are taken from the arrays themselves rather than hard-coded, so
# the cell keeps working if the corpus split or the vector sizes change
# (and is a harmless no-op if re-run on already-flattened arrays).
x_train = np.reshape(x_train, [x_train.shape[0], -1])
x_test = np.reshape(x_test, [x_test.shape[0], -1])

In [15]:
# Shapes after flattening the per-document word matrices.
print(x_train.shape)

print(y_train.shape)
print(x_test.shape)

(14668, 50000)
(14668, 135)
(6910, 50000)


In [16]:
def next_batch(num, data, labels):
    '''
    Return up to `num` random samples and their labels.

    Arguments:
    num -- number of samples to draw (fewer are returned if len(data) < num)
    data -- array-like of samples, indexed along the first axis
    labels -- array-like of labels aligned with `data`

    Returns:
    (samples, labels) as numpy arrays, drawn without replacement using the
    global numpy random state.
    '''
    idx = np.arange(0, len(data))
    np.random.shuffle(idx)
    idx = idx[:num]

    # Fancy indexing selects all rows in one vectorized step instead of
    # building Python lists of rows and re-stacking them.
    return np.asarray(data)[idx], np.asarray(labels)[idx]

In [None]:
# One-hidden-layer dense classifier over the flattened document vectors.
# Layer sizes are read from the data so they follow the vectorization cells.
num_inputs = x_train.shape[1]
num_outputs = y_train.shape[1]
num_hidden = 50

X = tf.placeholder(tf.float32, [None, num_inputs])
Y = tf.placeholder(tf.float32, [None, num_outputs])

# Define parameters W and b of your model
W1 = tf.get_variable("W11", shape=[num_inputs, num_hidden], initializer=tf.contrib.layers.xavier_initializer())
b1 = tf.Variable(tf.zeros([num_hidden]))
W2 = tf.get_variable("W12", shape=[num_hidden, num_outputs], initializer=tf.contrib.layers.xavier_initializer())
b2 = tf.Variable(tf.zeros([num_outputs]))

# Define your model's tensorflow graph
Z1 = tf.matmul(X, W1) + b1
A1 = tf.nn.relu(Z1)
Z2 = tf.matmul(A1, W2) + b2
A2 = tf.nn.softmax(Z2)

# Compute the cost function from the logits. This is the same quantity as
# -sum(Y * log(softmax(Z2))) but computed in a numerically stable way: the
# hand-written form yields NaN/inf as soon as a softmax output underflows to 0.
# NOTE(review): Y rows come from to_category_onehot() and can hold several
# (or zero) ones, so softmax + argmax accuracy treats a multi-label problem
# as single-label -- confirm this is intended.
cross_entropy_cost = tf.reduce_sum(
    tf.nn.softmax_cross_entropy_with_logits(labels=Y, logits=Z2))

# Define accuracy metric
num_correct = tf.equal(tf.argmax(A2, 1), tf.argmax(Y, 1))
num_correct = tf.cast(num_correct, tf.float32)
accuracy = tf.reduce_mean(num_correct)

# Define optimization method, learning rate and the training step
optimizer = tf.train.AdamOptimizer(0.00005)
train_step = optimizer.minimize(cross_entropy_cost)

# Initialize the variables of the graph, create tensorflow session and run the initialization of global variables.
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)

# Optimization loop: 20000 iterations, each on a random batch of 100 samples
for i in range(20000):
    # Load batch of documents and labels
    batch_X, batch_Y = next_batch(100, x_train, y_train)
    # Create feed dictionary
    feed_dict = {X: batch_X, Y: batch_Y}
    # Run the session train
    _, cost = sess.run([train_step, cross_entropy_cost], feed_dict=feed_dict)
    # Print cost and iteration
    if i % 1000 == 0:
        print("Iteration: " + str(i) + ", training cost = " + str(cost))

# Evaluate your accuracy and cost on the train and test sets
train_data = {X: x_train, Y: y_train}
a, c = sess.run([accuracy, cross_entropy_cost], feed_dict=train_data)
print("Train accuracy = " + str(a) + ", Train cost = " + str(c))

test_data = {X: x_test, Y: y_test}
a, c = sess.run([accuracy, cross_entropy_cost], feed_dict=test_data)
print("Test accuracy = " + str(a) + ", Test cost = " + str(c))

Iteration: 0, training cost = 353.79593
Iteration: 1000, training cost = 95.28506
Iteration: 2000, training cost = 89.67037
Iteration: 3000, training cost = 70.17113
Iteration: 4000, training cost = 34.530533
Iteration: 5000, training cost = 29.625038


In [None]:
# Epoch-based training of the dense classifier defined in the previous cell
# (reuses its X, Y, A2 and cross_entropy_cost graph nodes).
num_epochs = 100
m = x_train.shape[0]          # number of training examples
seed = 1
minibatch_size = 64
learning_rate = 0.005
print_cost = True

optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cross_entropy_cost)
costs = []

# Define accuracy metric (built before the session so the graph is complete)
num_correct = tf.equal(tf.argmax(A2, 1), tf.argmax(Y, 1))
num_correct = tf.cast(num_correct, tf.float32)
accuracy = tf.reduce_mean(num_correct)

# Initialize all the variables
init = tf.global_variables_initializer()

# Number of minibatches per epoch, counting the final partial one
num_minibatches = (m + minibatch_size - 1) // minibatch_size

# Start the session to compute the tensorflow graph
with tf.Session() as sess:
    # Run the initialization
    sess.run(init)

    # Do the training loop
    for epoch in range(num_epochs):

        epoch_cost = 0.               # average minibatch cost over this epoch
        seed = seed + 1

        # Shuffle once per epoch. The minibatches are sliced here directly
        # because x_train is 2-D at this point, while random_mini_batches()
        # is defined later in the notebook and indexes a 3-D input.
        np.random.seed(seed)
        permutation = np.random.permutation(m)

        for start in range(0, m, minibatch_size):
            batch_idx = permutation[start:start + minibatch_size]
            minibatch_X = x_train[batch_idx]
            minibatch_Y = y_train[batch_idx]

            _, minibatch_cost = sess.run([optimizer, cross_entropy_cost],
                                         feed_dict={X: minibatch_X, Y: minibatch_Y})

            epoch_cost += minibatch_cost / num_minibatches

        # Log once per epoch, after the epoch's cost is complete
        if print_cost == True and epoch % 100 == 0:
            print ("Cost after epoch %i: %f" % (epoch, epoch_cost))
        if print_cost == True and epoch % 5 == 0:
            costs.append(epoch_cost)

    # plot the cost
    # NOTE(review): `plt` is never imported in this notebook; the imports cell
    # needs `import matplotlib.pyplot as plt` for the plot below to run.
    plt.plot(np.squeeze(costs))
    plt.ylabel('cost')
    plt.xlabel('iterations (per tens)')
    plt.title("Learning rate =" + str(learning_rate))
    plt.show()

    print ("Train Accuracy:", accuracy.eval({X: x_train, Y: y_train}))
    print ("Test Accuracy:", accuracy.eval({X: x_test, Y: y_test}))

In [None]:
# LSTM code
# NOTE(review): these imports sit in the middle of the notebook; moving them
# to the first imports cell would make the dependencies visible up front.

import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, LSTM
import sys

# Same module handle as the one created in the first code cell.
this = sys.modules[__name__]


def lstm(X_train, Y_train, X_test, Y_test):
    """Train and evaluate a single-layer LSTM classifier, then save it.

    Arguments:
    X_train, X_test -- arrays indexed as (documents, words, features); the
                       un-flattened output of vectorize_docs fits
    Y_train, Y_test -- arrays indexed as (documents, categories)

    Side effects: writes TensorBoard logs to ``./tb``, saves the trained model
    to ``lstm_reuters.h5`` and prints the test score and accuracy.
    """
    # Take the layer sizes from the data instead of repeating the constants
    # used when the arrays were built.
    document_max_num_words = X_train.shape[1]
    num_features = X_train.shape[2]
    num_categories = Y_train.shape[1]

    tb_callback = keras.callbacks.TensorBoard(log_dir='./tb', histogram_freq=0,
                                              write_graph=True, write_images=True)

    model = Sequential()

    model.add(LSTM(int(document_max_num_words * 1.5), input_shape=(document_max_num_words, num_features)))
    model.add(Dropout(0.3))
    model.add(Dense(num_categories))
    # Sigmoid + binary cross-entropy: each category is an independent yes/no.
    model.add(Activation('sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    # `epochs` is the Keras 2 name of the argument formerly called `nb_epoch`.
    model.fit(X_train, Y_train, batch_size=128, epochs=5,
              validation_data=(X_test, Y_test), callbacks=[tb_callback])

    model.save('lstm_reuters.h5')

    score, acc = model.evaluate(X_test, Y_test, batch_size=128)

    print('Score: %1.4f' % score)
    print('Accuracy: %1.4f' % acc)

In [None]:
def random_mini_batches(X, Y, mini_batch_size = 64, seed = 0):
    """
    Creates a list of random minibatches from (X, Y)

    Arguments:
    X -- input data, numpy array with one example per index along axis 0
         (any number of trailing dimensions)
    Y -- labels, numpy array aligned with X along axis 0
    mini_batch_size -- size of the mini-batches, integer
    seed -- value used to seed numpy's global random state, so the same seed
            always produces the same shuffle

    Returns:
    mini_batches -- list of (mini_batch_X, mini_batch_Y) tuples; the last one
                    is smaller when the number of examples is not a multiple
                    of mini_batch_size
    """

    m = X.shape[0]                  # number of training examples
    np.random.seed(seed)

    # Step 1: Shuffle (X, Y) with one shared permutation so pairs stay aligned.
    # Indexing only axis 0 works for 2-D and 3-D inputs alike.
    permutation = np.random.permutation(m)
    shuffled_X = X[permutation]
    shuffled_Y = Y[permutation]

    # Step 2: Partition. Slicing past the end simply yields the shorter final
    # batch, so no separate end case (and no `math` import) is needed.
    mini_batches = []
    for start in range(0, m, mini_batch_size):
        stop = start + mini_batch_size
        mini_batches.append((shuffled_X[start:stop], shuffled_Y[start:stop]))

    return mini_batches

# clear old variables
# (this also invalidates the X / Y placeholders created in earlier cells)
tf.reset_default_graph()

# setup input (e.g. the data that changes every batch)
# The first dim is None, and gets set automatically based on batch size fed in
# NOTE(review): `l` is not defined anywhere in this notebook, so this line
# raises NameError on a fresh kernel; the trailing comment suggests 1014 --
# confirm the intended sequence length.
L = l # 1014 
NUM_FILTERS = 256 # 1024 for large, 256 for small
NOUT = 1024 # 2048 for large, 1024 for small 
NUM_CHAR_OPTIONS = 70
# NOTE(review): per the comment below, 0.02 is the "large" setting while
# NUM_FILTERS / NOUT use the "small" ones -- confirm which is intended.
STD_INIT = 0.02 # 0.02 for large, 0.05 for small 
x = tf.placeholder(tf.float32, [None, L, NUM_CHAR_OPTIONS])
y = tf.placeholder(tf.int64, [None, 1])
# Not referenced by any later code in this notebook.
is_training = tf.placeholder(tf.bool)

# Training hyper-parameters for the loop further down in this cell.
num_epochs = 5
minibatch_size = 10
seed = 1
num_categories = 10
costs = [] 

def simple_model(x,y):
    """Build the character-level CNN and return its class logits.

    Six 1-D convolutions (max-pooling after conv1, conv2 and conv6) followed
    by three fully connected layers. Layer widths come from the module-level
    NUM_FILTERS, NOUT and num_categories; STD_INIT is the stddev of conv1's
    kernel initializer.

    Arguments:
    x -- input tensor fed to the first convolution (channels-last)
    y -- unused; kept so existing calls simple_model(x, y) keep working

    Returns:
    Tensor of unnormalized logits with num_categories entries per example.
    """
    conv1 = tf.layers.conv1d(x, filters=NUM_FILTERS, kernel_size=7, strides=1, padding='SAME',
                             kernel_initializer=tf.initializers.random_normal(mean=0, stddev=STD_INIT),
                             # defaults to NWC
                             activation=tf.nn.relu, reuse=None, name="conv1")
    pool1 = tf.layers.max_pooling1d(conv1, pool_size=3, strides=1, padding='VALID', data_format='channels_last', name="pool1")

    conv2 = tf.layers.conv1d(pool1, filters=NUM_FILTERS, kernel_size=7, strides=1, padding='SAME',
                             activation=tf.nn.relu, reuse=None, name="conv2")
    pool2 = tf.layers.max_pooling1d(conv2, pool_size=3, strides=1, padding='VALID', data_format='channels_last', name="pool2")

    conv3 = tf.layers.conv1d(pool2, filters=NUM_FILTERS, kernel_size=3, strides=1, padding='SAME',
                             activation=tf.nn.relu, reuse=None, name="conv3")

    conv4 = tf.layers.conv1d(conv3, filters=NUM_FILTERS, kernel_size=3, strides=1, padding='SAME',
                             activation=tf.nn.relu, reuse=None, name="conv4")

    conv5 = tf.layers.conv1d(conv4, filters=NUM_FILTERS, kernel_size=3, strides=1, padding='SAME',
                             activation=tf.nn.relu, reuse=None, name="conv5")

    conv6 = tf.layers.conv1d(conv5, filters=NUM_FILTERS, kernel_size=3, strides=1, padding='SAME',
                             activation=tf.nn.relu, reuse=None, name="conv6")
    pool6 = tf.layers.max_pooling1d(conv6, pool_size=3, strides=1, padding='VALID', data_format='channels_last', name="pool6")

    fc7 = tf.contrib.layers.fully_connected(inputs=tf.contrib.layers.flatten(pool6), num_outputs=NOUT, reuse=None,
                                            scope="fc7")
    fc8 = tf.contrib.layers.fully_connected(inputs=tf.contrib.layers.flatten(fc7), num_outputs=NOUT, reuse=None,
                                            scope="fc8")
    # The output layer reads fc8 (it used to read fc7, leaving fc8 unused) and
    # has no activation: fully_connected applies ReLU by default, which would
    # clamp the logits handed to the softmax cross-entropy loss to >= 0.
    fc9 = tf.contrib.layers.fully_connected(inputs=tf.contrib.layers.flatten(fc8), num_outputs=num_categories,
                                            activation_fn=None, reuse=None, scope="fc9")

    return fc9

# Build, train and evaluate the character-level CNN.
# NOTE(review): X_int and Y_train are not created anywhere in this notebook;
# the code below assumes X_int matches placeholder `x` and Y_train matches
# placeholder `y` (one integer class id per row) -- confirm against the cell
# that produces them.
print('x.shape', x.shape)
print('y.shape', y.shape)
print('X_int[0, :,  :]', X_int[0, :,  :].shape)
y_out = simple_model(x,y)

# define our loss
# y has shape [batch, 1]; drop the trailing axis so one_hot yields
# [batch, num_categories], matching the logits (one_hot on the raw
# placeholder would produce a rank-3 tensor).
class_ids = tf.squeeze(y, axis=1)
total_loss = tf.nn.softmax_cross_entropy_with_logits(
    logits=y_out, labels=tf.one_hot(class_ids, num_categories))
mean_loss = tf.reduce_mean(total_loss)

# define our optimizer
learning_rate = 5e-4
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(mean_loss)

# Accuracy: predicted class = argmax over the class axis (axis 1). Without
# an explicit axis, tf.argmax reduces over axis 0, i.e. over the batch.
correct_prediction = tf.equal(tf.argmax(y_out, 1), class_ids)
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))

init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)

    for epoch in range(num_epochs):
        epoch_cost = 0.               # average minibatch cost over this epoch
        seed = seed + 1
        minibatches = random_mini_batches(X_int, Y_train, minibatch_size, seed)
        # Count the batches actually produced (includes a final partial one).
        num_minibatches = len(minibatches)

        for minibatch in minibatches:
            # Select a minibatch
            (minibatch_X, minibatch_Y) = minibatch

            # Run one optimization step and fetch the loss on this minibatch.
            _ , minibatch_cost = sess.run([optimizer, mean_loss], feed_dict={x: minibatch_X, y: minibatch_Y})

            epoch_cost += minibatch_cost / num_minibatches

        # Print and record the cost every epoch
        print ("Cost after epoch %i: %f" % (epoch, epoch_cost))
        costs.append(epoch_cost)

    # plot the cost
    # NOTE(review): `plt` is never imported in this notebook; the imports cell
    # needs `import matplotlib.pyplot as plt` for the plot below to run.
    plt.plot(np.squeeze(costs))
    plt.ylabel('cost')
    plt.xlabel('iterations (per tens)')
    plt.title("Learning rate =" + str(learning_rate))
    plt.show()

    # Feed this graph's placeholders (x, y). The X / Y placeholders from the
    # earlier cells belong to the graph discarded by tf.reset_default_graph().
    print ("Train Accuracy:", accuracy.eval({x: X_int, y: Y_train}))
    