In [1]:
import numpy as np
import tensorflow as tf
import pandas as pd
import urllib
import tarfile

In [2]:
url = 'https://github.com/le-scientifique/torchDatasets/raw/master/dbpedia_csv.tar.gz'

def getunzipped(theurl):
    name = './dbpedia_csv.tar.gz'
    try:
        name, hdrs = urllib.request.urlretrieve(theurl, name)
    except IOError as e:
        print("Can't retrieve %r to %r: %s" % (theurl, thedir, e))
        return
    try:
        z = tarfile.open(name, "r:gz")
        z.extractall()
        z.close()
    except tarfile.error as e:
        print("Bad zipfile (from %r): %s" % (theurl, e))
        return

    print("Data Downloaded and unzipped!")

In [3]:
getunzipped(url)

Data Downloaded and unzipped!


In [3]:
train = pd.read_csv('dbpedia_csv/train.csv',header=None,names=['class','title','text'])
train.head()

Unnamed: 0,class,title,text
0,1,E. D. Abbott Ltd,Abbott of Farnham E D Abbott Limited was a Br...
1,1,Schwan-Stabilo,Schwan-STABILO is a German maker of pens for ...
2,1,Q-workshop,Q-workshop is a Polish company located in Poz...
3,1,Marvell Software Solutions Israel,Marvell Software Solutions Israel known as RA...
4,1,Bergan Mercy Medical Center,Bergan Mercy Medical Center is a hospital loc...


In [4]:
test = pd.read_csv('dbpedia_csv/test.csv',header=None,names=['class','title','text'])
test.head()

Unnamed: 0,class,title,text
0,1,TY KU,TY KU /taɪkuː/ is an American alcoholic bever...
1,1,Odd Lot Entertainment,OddLot Entertainment founded in 2001 by longt...
2,1,Henkel,Henkel AG & Company KGaA operates worldwide w...
3,1,GOAT Store,The GOAT Store (Games Of All Type Store) LLC ...
4,1,RagWing Aircraft Designs,RagWing Aircraft Designs (also called the Rag...


In [5]:
X_train = train['text'].get_values()
X_test = test['text'].get_values()
y_train = train['class'].get_values()
y_test= test['class'].get_values()

In [6]:
max_doc_length = 50

train_lengths = train['text'].apply(lambda x: min(max_doc_length, len(x.split(' '))))
test_lengths = test['text'].apply(lambda x: min(max_doc_length, len(x.split(' '))))

In [7]:
train_lengths = train_lengths.values
test_lengths = test_lengths.values

### VocabularyProcessor 
Has an sklearn type interface - instantiate, call .fit() and .transform()

Creates a mapping between words and ID's.

Fit creates the mapping word-> ID, and transform replaces the words with relevant IDS.

We can then use tf.nn.embedding_lookup(...) to look up the vector representations that correspond to each ID.

In [24]:
preprocessor = tf.contrib.learn.preprocessing.VocabularyProcessor(max_doc_length,min_frequency=3)

In [25]:
X_train_transformed = np.array(list(preprocessor.fit_transform(X_train)))

As you can see, we have replaced each word with a numerical ID corresponding to a word in our vocabulary

In [26]:
print("The word 'the' corresponds to ID:",preprocessor.vocabulary_.get('the')) #Get the ID for a word
print("ID 25 corresponds to:",preprocessor.vocabulary_.reverse(25)) #Get the word that corresponds to an ID

The word 'the' corresponds to ID: 1
ID 25 corresponds to: known


In [27]:
vocab_length = len(preprocessor.vocabulary_)

In [28]:
X_test_transformed = np.array(list(preprocessor.transform(X_test)))

In [29]:
X_test_transformed[:2]

array([[     0,  33095,      0,      4,     15,     34,  13638,  12026,
            58,     32,   2092,      2,  34220,      6,    154,  19432,
             8,      0,     58,      7,    108,      2,    254,      6,
             4,    672,      2,     46,    111,    133,     46,    111,
          1789,     69,      2,     46,    111,      0,  33095,   1012,
         12873,     40,    218,      2,    342,    137,      5,   2554,
          2603,     17],
       [  7874,  13438,   1130,    108,      2,    346,      9,   5924,
          3640,  26563,  53628,      6,   9446,   4389, 115134,      8,
         10402, 102537,      4,      5,     22,    454,      6,  11213,
            58,     69,      2,  14655,    133,    144,   7874,  13438,
           138,      1,     22,    579,      3,   9822,   1198,  38006,
         13657,     82,  27928,   2842,     56,     22,    579,      3,
           155,     82]], dtype=int64)

In [30]:
next(preprocessor.reverse(X_train_transformed[0,None].tolist()))

'Abbott of Farnham E D Abbott Limited was a British coachbuilding business based in Farnham Surrey trading under that name from 1929 A major part of their output was under sub-contract to motor vehicle manufacturers Their business closed in 1972 <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK>'

### Now our data is prepared we can start modelling...

The basics have been implemented, but you need to implement the RNN functionality, and define the loss function.

As a starting point, you several comments have been provided in the relevant name scopes.

In [33]:
graph = tf.Graph()
embedding_size=100
rnn_cell_size = 100
n_classes=15
learning_rate = 0.005

with graph.as_default(), tf.device('/cpu:0'):
    with tf.name_scope('placeholders'):
        tf_X = tf.placeholder(shape=[None, max_doc_length], dtype=tf.int32) #This is a matrix of word IDs
        tf_y = tf.placeholder(shape=[None], dtype=tf.int64) #This is a list of class labels e.g. 1,5,3 etc.
        tf_lengths = tf.placeholder(shape=[None], dtype=tf.int32) #This is the lengths 
                                                                    #of each document in sample
        
    with tf.name_scope('variables'):
        embedding_matrix = tf.Variable(tf.random_uniform([vocab_length,embedding_size],-1.0,1.0)) 
        
        W = tf.Variable(tf.truncated_normal(shape=[rnn_cell_size, n_classes ], dtype=tf.float32))
        b = tf.Variable(tf.random_uniform([n_classes], -1.0,1.0))
        tf_y_one_hot = tf.one_hot(tf_y,n_classes, dtype=tf.int64)
    
    with tf.name_scope('embbeding_lookup'):
        #You need to look up the word ID and get replace with the associated word vector.
        #Verify the size is (?, max_doc_length, embedding_size)
        tf_X_embedded = tf.nn.embedding_lookup(embedding_matrix, tf_X) 
        
    with tf.name_scope('run_rnn'):
        #You need to define a cell type - use GRUCell
        #Also need to feed data into the RNN and get the final_state - use dynamic_rnn
        cell = tf.contrib.rnn.GRUCell(rnn_cell_size)
        outputs, final_state = tf.nn.dynamic_rnn(cell,tf_X_embedded, sequence_length=tf_lengths, dtype=tf.float32, swap_memory=True)
        
    with tf.name_scope('output_layer'):
        #Create a variable logits which is final_state * W + b
        #Pass this through a softmax to create prediction probabilities.
        logits = tf.matmul(final_state,W)+b
        predictions=tf.argmax(tf.nn.softmax(logits),1)
        
    with tf.name_scope('loss'):
        #Add a loss and an optimizer
        loss= tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=tf_y_one_hot,logits=logits))
        optimizer = tf.train.AdamOptimizer(learning_rate).minimize(loss)
        
    with tf.name_scope('validation'):
        correct_prediction = tf.equal(tf_y, predictions)
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
        
    with tf.name_scope('init'):
        init_op = tf.global_variables_initializer()

In [34]:
batch_size = 128
print_step = 25
from tqdm import tqdm

with tf.Session(graph=graph) as session:
    number_of_steps = 251
    train_indicies = np.arange(X_train_transformed.shape[0])
    test_indicies = np.arange(X_test_transformed.shape[0])
    
    session.run(init_op)
    average_loss = 0
    
    for step in tqdm(range(number_of_steps)):
        ind =  np.random.choice(np.arange(X_train_transformed.shape[0]), batch_size)
        feed_dict = {tf_X:X_train_transformed[ind], 
                     tf_y:y_train[ind], 
                     tf_lengths: train_lengths[ind] }
        l,_ = session.run([loss,optimizer ], feed_dict=feed_dict)
        average_loss+=l
        
        if step%print_step == 0:
            print ("average loss:",average_loss/print_step)
            ind2 =  np.random.choice(np.arange(X_test_transformed.shape[0]), batch_size*200)
            test_dict = {tf_X:X_test_transformed[ind2], 
                     tf_y:y_test[ind2], 
                     tf_lengths: test_lengths[ind2] }
            print("accuracy:",accuracy.eval(feed_dict=test_dict))
            #print(predictions.eval(feed_dict=test_dict))
            #print(y_test[ind2])
            print('-'*40, "step "+str(step), '-'*40)
            
            average_loss=0

  0%|                                                                                          | 0/251 [00:00<?, ?it/s]

0.288281898499
0.102461
---------------------------------------- step 0 ----------------------------------------


 10%|████████                                                                         | 25/251 [00:10<01:05,  3.47it/s]

5.08914985657
0.16375
---------------------------------------- step 25 ----------------------------------------


 20%|████████████████▏                                                                | 50/251 [00:21<00:55,  3.60it/s]

3.37159119606
0.241953
---------------------------------------- step 50 ----------------------------------------


 30%|████████████████████████▏                                                        | 75/251 [00:31<00:49,  3.56it/s]

2.84929045677
0.239648
---------------------------------------- step 75 ----------------------------------------


 40%|███████████████████████████████▊                                                | 100/251 [00:42<00:42,  3.58it/s]

2.81688102722
0.27957
---------------------------------------- step 100 ----------------------------------------


 50%|███████████████████████████████████████▊                                        | 125/251 [00:52<00:35,  3.54it/s]

2.48666309357
0.305117
---------------------------------------- step 125 ----------------------------------------


 60%|███████████████████████████████████████████████▊                                | 150/251 [01:03<00:28,  3.54it/s]

2.35196380615
0.31293
---------------------------------------- step 150 ----------------------------------------


 70%|███████████████████████████████████████████████████████▊                        | 175/251 [01:13<00:21,  3.59it/s]

2.25806551933
0.361875
---------------------------------------- step 175 ----------------------------------------


 80%|███████████████████████████████████████████████████████████████▋                | 200/251 [01:24<00:14,  3.61it/s]

2.20005481243
0.340078
---------------------------------------- step 200 ----------------------------------------


 90%|███████████████████████████████████████████████████████████████████████▋        | 225/251 [01:35<00:07,  3.57it/s]

2.50613738537
0.253594
---------------------------------------- step 225 ----------------------------------------


100%|███████████████████████████████████████████████████████████████████████████████▋| 250/251 [01:45<00:00,  3.52it/s]

2.47864121914
0.343477
---------------------------------------- step 250 ----------------------------------------


100%|████████████████████████████████████████████████████████████████████████████████| 251/251 [01:49<00:00,  1.35s/it]


In [None]:
session.close()