In [1]:
import numpy as np
import tensorflow as tf
import pandas as pd
import urllib
import tarfile

In [2]:
url = 'https://github.com/le-scientifique/torchDatasets/raw/master/dbpedia_csv.tar.gz'

def getunzipped(theurl):
    name = './dbpedia_csv.tar.gz'
    try:
        name, hdrs = urllib.request.urlretrieve(theurl, name)
    except IOError as e:
        print("Can't retrieve %r to %r: %s" % (theurl, thedir, e))
        return
    try:
        z = tarfile.open(name, "r:gz")
        z.extractall()
        z.close()
    except tarfile.error as e:
        print("Bad zipfile (from %r): %s" % (theurl, e))
        return

    print("Data Downloaded and unzipped!")

In [3]:
getunzipped(url)

Data Downloaded and unzipped!


In [38]:
train = pd.read_csv('dbpedia_csv/train.csv',header=None,names=['class','title','text'], nrows=1500)
train.head()

Unnamed: 0,class,title,text
0,1,E. D. Abbott Ltd,Abbott of Farnham E D Abbott Limited was a Br...
1,1,Schwan-Stabilo,Schwan-STABILO is a German maker of pens for ...
2,1,Q-workshop,Q-workshop is a Polish company located in Poz...
3,1,Marvell Software Solutions Israel,Marvell Software Solutions Israel known as RA...
4,1,Bergan Mercy Medical Center,Bergan Mercy Medical Center is a hospital loc...


In [39]:
test = pd.read_csv('dbpedia_csv/test.csv',header=None,names=['class','title','text'], nrows=1500)
test.head()

Unnamed: 0,class,title,text
0,1,TY KU,TY KU /taɪkuː/ is an American alcoholic bever...
1,1,Odd Lot Entertainment,OddLot Entertainment founded in 2001 by longt...
2,1,Henkel,Henkel AG & Company KGaA operates worldwide w...
3,1,GOAT Store,The GOAT Store (Games Of All Type Store) LLC ...
4,1,RagWing Aircraft Designs,RagWing Aircraft Designs (also called the Rag...


In [40]:
X_train = train['text'].get_values()
X_test = test['text'].get_values()
y_train = train['class'].get_values()
y_test= train['class'].get_values()

In [41]:
max_doc_length = 50

train_lengths = train['text'].apply(lambda x: min(max_doc_length, len(x.split(' '))))
test_lengths = test['text'].apply(lambda x: min(max_doc_length, len(x.split(' '))))

In [42]:
train_lengths = train_lengths.values
test_lengths = test_lengths.values

### VocabularyProcessor 
Has an sklearn type interface - instantiate, call .fit() and .transform()

Creates a mapping between words and ID's.

Fit creates the mapping word-> ID, and transform replaces the words with relevant IDS.

We can then use tf.nn.embedding_lookup(...) to look up the vector representations that correspond to each ID.

In [43]:
preprocessor = tf.contrib.learn.preprocessing.VocabularyProcessor(max_doc_length, min_frequency=2)

In [44]:
X_train_transformed = np.array(list(preprocessor.fit_transform(X_train)))

As you can see, we have replaced each word with a numerical ID corresponding to a word in our vocabulary

In [45]:
X_train_transformed

array([[   0,    4,    0, ...,    0,    0,    0],
       [   0,    5,    6, ...,    0,    0,    0],
       [   0,    5,    6, ...,   11,    0,    0],
       ..., 
       [2263, 1804,   54, ...,  946,  630,  279],
       [   0,  425,    0, ...,    0,    0,    0],
       [3196,  397,    5, ...,  104,   75,    2]])

In [46]:
print("The word 'the' corresponds to ID:",preprocessor.vocabulary_.get('the')) #Get the ID for a word
print("ID 25 corresponds to:",preprocessor.vocabulary_.reverse(25)) #Get the word that corresponds to an ID

The word 'the' corresponds to ID: 3
ID 25 corresponds to: which


In [47]:
vocab_length = len(preprocessor.vocabulary_)

In [48]:
X_test_transformed = np.array(list(preprocessor.transform(X_test)))

In [49]:
X_test_transformed[:2]

array([[   0,    0,    0,    5,   14,   28, 3332, 1057,    7,   21,  242,
           2,    0,    1,   64,    0,    8,    0,    7,    9,   19,    2,
         240,    1,    5,   39,    2,   36,   86,   77,   36,   86, 2376,
          18,    2,   36,   86,    0,    0,  461,    0,   33,  414,    2,
         248,   97,    6,  526,  333,   15],
       [   0,    0,  169,   19,    2,  324,   11,    0, 3616,    0,    0,
           1,    0, 2164,    0,    8,    0,    0,    5,    6,  136,   78,
           1,    0,    7,   18,    2,    0,   77,   71,    0,    0,  236,
           3,  136,    0,    4,    0, 1036,    0,    0, 1972,    0, 1134,
          80,  136,    0,    4,  478, 1972]])

In [50]:
next(preprocessor.reverse(X_train_transformed[0,None].tolist()))

'<UNK> of <UNK> E D <UNK> Limited was a British <UNK> business based in <UNK> Surrey trading under that name from 1929 A major part of their output was under <UNK> to motor vehicle manufacturers Their business closed in 1972 <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK>'

### Now our data is prepared we can start modelling...

The basics have been implemented, but you need to implement the RNN functionality, and define the loss function.

As a starting point, you several comments have been provided in the relevant name scopes.

In [51]:
graph = tf.Graph()
embedding_size=100
rnn_cell_size = 100
n_classes=19
learning_rate = 0.001

with graph.as_default():
    with tf.name_scope('placeholders'):
        tf_X = tf.placeholder(shape=[None, max_doc_length], dtype=tf.int32) #This is a matrix of word IDs
        tf_y = tf.placeholder(shape=[None,1], dtype=tf.int64) #This is a list of class labels e.g. 1,5,3 etc.
        tf_lengths = tf.placeholder(shape=[None,], dtype=tf.int32) #This is the lengths 
                                                                    #of each document in sample
        
    with tf.name_scope('variables'):
        embedding_matrix = tf.Variable(tf.random_uniform(shape=[vocab_length, embedding_size],
                                                         dtype=tf.float32))
        W = tf.Variable(tf.truncated_normal(shape=[rnn_cell_size, n_classes ]))
        b = tf.Variable(tf.random_uniform(shape=[n_classes,]))
        tf_y_one_hot = tf.one_hot(tf_y,n_classes)
    
    with tf.name_scope('embbeding_lookup'):
        #You need to look up the word ID and get replace with the associated word vector.
        #Verify the size is (?, max_doc_length, embedding_size)
        tf_X_embedded = tf.nn.embedding_lookup(embedding_matrix, tf_X) 
        
    with tf.name_scope('run_rnn'):
        #You need to define a cell type - use GRUCell
        #Also need to feed data into the RNN and get the final_state - use dynamic_rnn
        cell = tf.contrib.rnn.GRUCell(rnn_cell_size)
        outputs, final_state = tf.nn.dynamic_rnn(cell,tf_X_embedded, tf_lengths, dtype=tf.float32)
        
    with tf.name_scope('output_layer'):
        #Create a variable logits which is final_state * W + b
        #Pass this through a softmax to create prediction probabilities.
        logits = tf.matmul(final_state,W)+b
        predictions = tf.nn.softmax(logits)
        
    with tf.name_scope('loss'):
        #Add a loss and an optimizer
        loss= tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=tf_y_one_hot))
        optimizer = tf.train.AdamOptimizer(learning_rate).minimize(loss)
        
    with tf.name_scope('validation'):
        accuracy = tf.reduce_mean(tf.cast(tf.equal(tf_y,tf.argmax(predictions,1)), tf.float32))
        
    with tf.name_scope('init'):
        init_op = tf.global_variables_initializer()

In [56]:
batch_size = 32
print_step = 100

with tf.Session(graph=graph) as session:
    number_of_steps = 500
    train_indicies = np.arange(X_train_transformed.shape[0])
    test_indicies = np.arange(X_test_transformed.shape[0])
    
    session.run(init_op)
    
    for step in range(number_of_steps):
        ind = np.random.choice(train_indicies, size=batch_size, replace=False)
        feed_dict = {tf_X:X_train_transformed[ind], 
                     tf_y:y_train[ind,None], 
                     tf_lengths: train_lengths[ind] }
        _, l = session.run([optimizer, loss], feed_dict=feed_dict)
        
        if step%print_step == 0:
            ind = np.random.choice(test_indicies, size=batch_size, replace=False)
            feed_dict = {tf_X:X_test_transformed[ind], 
                     tf_y:y_test[ind,None], 
                     tf_lengths: test_lengths[ind] }
            a = session.run([accuracy], feed_dict=feed_dict)
            print('-'*40, "step "+str(step), '-'*40)
            print(a)

---------------------------------------- step 0 ----------------------------------------
[1.0]
---------------------------------------- step 100 ----------------------------------------
[1.0]


KeyboardInterrupt: 