In [1]:
%qtconsole

"""
Notebook to look at a how to use a word vector approach to catagorise sentances.

This is to help with analysing and assessing transaction data from a company based upon the 
description of the transaction.

The approach is to use a pre-trained set of word vectors - in this case the GloVe set 6.B with each vector being of 
length 300.

In this analysis the approach is to define a fixed sentance length and pad sentances where they are less than this.

I have tried lengths of 5 to 10, and 6 or 7 seems to work best. At present I am padding the sentances in my routine, 
I intend to see how well the padding in the pytorch embedding layer works and whether this is any better since
I am not sure if the padding is having a detrimental effect upon the analysis.

After the embedding layer I am using a simple 3 level neural network, the first two layers with rectified linear and 
then finally a softmax output

The previous analysis achieved accuracy of 88.9% on training data but 66.2% on the test data using a simple 
bag of words approach.

The present analysis gives me 89% on the training data and 92% on the test data, hence it is a big improvement in the analysis of the untrained datasets.  More could be done but the actual data itself needs some work and some of the catagories are not very well represented.

An improvement I would like to try at some point is to use the natural language tool kit and 
stemmer from the nltk library, however, I think this model is fundamentally limited and so won't 
do so with this version

"""

In [2]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from myData import Dataset
from myData import DataLoader
#from torchtext import data as t_data
#from torchtext import utils
from torchtext.vocab import load_word_vectors
import numpy as np
#import re
from collections import Counter
import itertools
import sys
import csv
import os
import word_processing as wp


In [32]:
# System parameters
my_file_path='johnrichmond/Dropbox/Machine Learning/text classification/Andrew/'
csv_file_name='Payment items.csv'

stop_list= set("for a c e do h i if is it in g o p or r t u v y 's ' of the and mr ms to nd we".
               split())
pad='<pad>'

remove_single_words=True
max_sent_length=7
min_freq=1
use_subset_data=True
max_cases=30000

# only there are catagories 1-14 are valid, all others should be rejected

min_cat=0
max_cat=13
num_cat=max_cat+1
label_to_idx={"Activity":0, 
              "Course": 1,
              "Exam resit":2,
              "Fees or contribution": 3,
              "Letting": 4,
              "Meal":5,
              "Patents evening": 6,
              "School bus":7,
              "Tickets":8,
              "Trip":9,
              "Tuition":10,
              "Uniform":11,
              "Wraparound care":12,
              "Other":13
             }
load_word_vector_set='glove.6B'
word_vector_length=300
word_vector_path='johnrichmond/Dropbox/Machine Learning/text classification/'

In [49]:
# Analysis Hyper parameters
batch_size=32
no_epochs=300
HL1_size=200
HL2_size=80
HL3_size=30
lr=0.006
momentum=0.2
nesterov=False
L2=0.000
val_percentage=20


In [34]:
# Make the file paths the same whether using Mac or Linux
if sys.platform == 'darwin':
    start='/Users/'
else: start='/home/'
    
file_name=my_file_path+csv_file_name
txt_file=start+file_name
word_vec_path=os.path.join(start,word_vector_path)

In [35]:
replace_list={"years":"year", "yr":"year", "wks":"week","tickets": "ticket",
              "terms":"term", "students":"student","pupils":"pupil","meals": "meal",
              "lakes":"lake"}

In [36]:
# Utility function to measure classification accuracy - move to module once programme 
# is operating

def accuracy(output, target, topk=(1,)):
    """Computes the precision@k for the specified values of k"""
    maxk = max(topk)
    batch_size = target.size(0)

    # The next line is not needed in this case since it is done prior to the call
    #_, pred = output.topk(maxk, 1, True, True) # topk is torch function to return highest values in array
    pred = output.t()  #Transpose
    correct = pred.eq(target.view(1, -1).expand_as(pred))
    """
    Note - the expand is a torch command to expend one tensor to the size of another
    target os a one D tensor. target.view(1,-1) reshapes the tensor.  The -1 means this 
    is chosen by the software to get the right total size.  The first 1 indicates the number 
    of rows to use.
    The net outcome is an array with one column of length maxk for each target value.  
    The entire column is filled with the target value to facilitate easy comparison. 
    The correct array then contains an array with true wherever the tar
    """
    #get value matches the prediction
    res = []
    for k in topk:
        correct_k = correct[:k].view(-1).float().sum(0)    # The nomeclature[:k] returns the top k rows Since
                                                           # there is no second array we get the every column.
        res.append(correct_k.mul_(100.0 / batch_size))
    return res

# Load sentances to catagorize

In [37]:
with open(txt_file,'rU') as file_obj:
    f_data=[]
    num=0
    lines=[]
    reader=csv.reader(file_obj)
    for line in reader:
        if reader.line_num<>1:
    #       with col in line:
            text_str=line[0]
            catagory=line[15]
            if wp.is_integer(catagory):
                f_data.append([text_str,int(catagory)])
                num=num+1
        if reader.line_num>max_cases and use_subset_data: break

In [38]:
# All data loaded, will now process each line of data
# Initially remove numbers and punctuation
sentances=[]
catagories=[]
catagory=[]
identity=np.identity(num_cat)
for row in f_data:
    row[0]=wp.clean_str(row[0])
    row[0]=wp.rem_numbers(row[0])# Done separately since I might not always want to do this

    if row[0] in (None, ""):
        # row rejected
        continue
    elif row[1] <1 or row[1]> num_cat or row[1] in (None, ""):
        continue
    sentance=row[0].split(" ")
    # remove stop list words
    sentance=wp.remove_stop_words(sentance,stop_list)
    sentance=wp.replace_similar_words(sentance,replace_list)
    if len(sentance)==0: continue
    row[1]=row[1]-1   
    
    # Getting to this point implies the row is ok and still has valid words, therefore will add
    sentances.append(sentance)
    catagories.append(identity[row[1]-1,:])
    catagory.append(row[1])
    
# Remove single words  
if remove_single_words==True:
    final_sentances=[]
    final_catagories=[]
    final_catagory=[]
    word_counts = Counter(itertools.chain(*sentances))
    new_sentances=[[word for word in sentance if word_counts[word]>1] 
                    for sentance in sentances]
    #Remove empty entries from both sentances and catagories
    for index,sentance in enumerate(new_sentances):
        if len(sentance)<>0:
            final_sentances.append(sentance)
            final_catagories.append(catagories[index])
            final_catagory.append(catagory[index])
    sentances=final_sentances
    catagories=final_catagories
    catagory=final_catagory
    
# Identify the longest sentance
max_length=0
for sentance in sentances:
    if len(sentance)> max_length: max_length=len(sentance)
print "Maximum sentance length: {} words".format(max_length)

# Pad sentances to maximum length
if max_sent_length<>0:
    pad_to=max_sent_length
else:
    pad_to=max_length
for sentance in sentances:
    len_sent=len(sentance)
    if len_sent<pad_to:
        for pos in range(len_sent,pad_to):
            sentance.append(pad)

for i in range(len(sentances)):
    sentances[i]=sentances[i][0:max_sent_length]
    


Maximum sentance length: 22 words


In [39]:
# Make use of the use of the pytext Vocab class 
from torchtext import vocab
test=Counter(itertools.chain(*sentances))
v=vocab.Vocab(test,wv_type=load_word_vector_set,wv_dim=300, unk_init='random', specials=[pad], min_freq=min_freq)

# We now have v.itos which containes an ordered list of words
# v.stoi which is a disctionary that links a word to an index
# The number of words is given by len(v.itos)
vocab_size=len(v.itos)

loading word vectors from /Users/johnrichmond/Dropbox/Machine Learning/text classification/glove.6B.300d.pt


In [40]:
# Now need to shuffle the input data
from random import shuffle
no_sentances=len(catagory)
# Create index and shuffle
shuffle_idx=[i for i in range(no_sentances)]
shuffle(shuffle_idx)
new_sentances=[]
new_catagory=[]
for idx in shuffle_idx:
    new_sentances.append(sentances[idx])
    new_catagory.append(catagory[idx])
# for now will save the originals but do not need to in future
sentances_bak=sentances[:]
catagory_bak=catagory[:]
sentances=new_sentances
catagory=new_catagory

# The lists are now shuffled


In [41]:
# will setup simple batches and sets for now
total_sentances=len(sentances)
total_batches=int(total_sentances/batch_size)
train_batches=int(total_batches*0.8)
val_batches=total_batches-train_batches

print "Total valid sentances: {:d}".format(total_sentances)

Total valid sentances: 16370


In [43]:
class sent_dataset(Dataset):
    """
    This is a very simplified version of a Pytorch dataset that I am using with my own 
    bespoke versions of the Pytorch dataiterator since I could not get the proper versions
    to work and this seemed the easiest way.  It does not support multi workers, transforms etc
    
    Args:
        sentances: expected to be a list of sentnaces, each containing a list of words 
                    as strings
        catagory: the corresponding catagory number of each sentance
        word_to_idx: a dictionary linking a word to an index
    """

    def __init__(self, sentances, catagory, word_to_idx):

        self.sentances = sentances
        self.catagory = catagory
        self.word_to_idx = word_to_idx


    def __getitem__(self, index):
        """
        Args:
            index (int): Index
        Returns:
            context_idxs: a list of the word indexes in the sentance corresponding to the 
            sentance index
            catagory[index]: the catagory of the sentance
        """
        context_idxs=(map(lambda w: self.word_to_idx[w], sentances[index]))
             
        #print "sentance nums= ",sentance

        return context_idxs, catagory[index]

    def __len__(self):
        return len(self.catagory) 

In [44]:
train_ds=sent_dataset(sentances[0:train_batches*batch_size],
                      catagory[0:train_batches*batch_size],
                      v.stoi)
val_ds=sent_dataset(sentances[train_batches*batch_size:total_batches*batch_size],
                      catagory[train_batches*batch_size:total_batches*batch_size],
                      v.stoi)
train_loader=DataLoader(dataset=train_ds,
                                   batch_size=batch_size,
                                   shuffle=True)
val_loader=DataLoader(dataset=val_ds,
                                   batch_size=batch_size,
                                   shuffle=False)

# Simple model

In [45]:
class TextClassifier_simple(nn.Module):
    def __init__(self, catagories, vocab_size, sent_length,embedding_dim, HL1_size, HL2_size):
        super(TextClassifier_simple, self).__init__()
        self.embeddings=nn.Embedding(vocab_size, embedding_dim)
        self.linear1=nn.Linear(embedding_dim*sent_length,HL1_size)
        self.dropout=nn.Dropout()
        self.linear2=nn.Linear(HL1_size, HL2_size)
        self.linear3=nn.Linear(HL2_size, catagories)
        
    def forward(self, inputs):
        #embeds=self.embeddings(inputs).view(1,-1)
        embeds=self.embeddings(inputs).view(len(inputs),-1)
        out=F.relu(self.linear1(embeds))
        out=self.dropout(out)
        out=F.relu(self.linear2(out))
        out=self.linear3(out)
        log_probs=F.log_softmax(out)
        return log_probs

In [50]:
# Define classifier, optimiser and prevent modification of the word vector weights
losses=[]
loss_function=nn.NLLLoss()
model=TextClassifier_simple(num_cat, vocab_size, pad_to, word_vector_length, HL1_size, HL2_size)
optimizer=optim.SGD(model.parameters(),lr, weight_decay=L2, momentum=momentum, nesterov=nesterov)
model.embeddings.weight.data.copy_(v.vectors)
model.embeddings.weight.requires_grad = False
loss_function.parameters = filter(lambda p: p.requires_grad, model.parameters())

In [51]:

ntopk=3
predict=torch.LongTensor(train_batches*batch_size,ntopk).zero_()
all_targets=torch.LongTensor(train_batches*batch_size).zero_()
for epoch in xrange(no_epochs):
    total_loss = torch.Tensor([0])
    train_iter=iter(train_loader)
    for batch in xrange(train_batches):
        context_idxs=[]
        cats=[]
        # Step 1. Prepare the inputs to be passed to the model.  To do this we will:
        #  iterate around each sentance in the batch creating a numerical array of the 
        #  word indecies
        inputs,targets=train_iter.next()
        t_inputs=autograd.Variable(inputs)
        t_targets=autograd.Variable(targets)
        model.zero_grad()
        log_probs = model(t_inputs)
        _,ind=log_probs.data.topk(3,1,True,True)
        if epoch==no_epochs-1:
            predict[batch*batch_size:(batch+1)*batch_size,:]=ind
            all_targets[batch*batch_size:(batch+1)*batch_size]=targets
        loss=loss_function(log_probs, t_targets)
        #print "Batch: {0}, Loss= {1}".format(batch,loss.data)
        loss.backward()
        optimizer.step()
        total_loss+=loss.data
    print "Epoch: {0}, Loss= {1}".format(epoch,total_loss)


Epoch: 0, Loss= 
 811.1234
[torch.FloatTensor of size 1]

Epoch: 1, Loss= 
 741.5076
[torch.FloatTensor of size 1]

Epoch: 2, Loss= 
 664.9062
[torch.FloatTensor of size 1]

Epoch: 3, Loss= 
 604.8054
[torch.FloatTensor of size 1]

Epoch: 4, Loss= 
 564.0308
[torch.FloatTensor of size 1]

Epoch: 5, Loss= 
 535.8111
[torch.FloatTensor of size 1]

Epoch: 6, Loss= 
 515.1380
[torch.FloatTensor of size 1]

Epoch: 7, Loss= 
 500.0435
[torch.FloatTensor of size 1]

Epoch: 8, Loss= 
 481.6049
[torch.FloatTensor of size 1]

Epoch: 9, Loss= 
 471.8219
[torch.FloatTensor of size 1]

Epoch: 10, Loss= 
 456.4569
[torch.FloatTensor of size 1]

Epoch: 11, Loss= 
 449.1542
[torch.FloatTensor of size 1]

Epoch: 12, Loss= 
 435.5252
[torch.FloatTensor of size 1]

Epoch: 13, Loss= 
 431.2939
[torch.FloatTensor of size 1]

Epoch: 14, Loss= 
 420.8241
[torch.FloatTensor of size 1]

Epoch: 15, Loss= 
 417.2932
[torch.FloatTensor of size 1]

Epoch: 16, Loss= 
 410.5732
[torch.FloatTensor of size 1]

Epoch: 

In [52]:
# Training accuracy
print "Training error: ", total_loss
train_res=accuracy(predict,all_targets,topk=(1,3))
print "Training accuracy: ", train_res   

Training error:  
 121.7610
[torch.FloatTensor of size 1]

Training accuracy:  [
 89.0625
[torch.FloatTensor of size 1]
, 
 98.6060
[torch.FloatTensor of size 1]
]


In [53]:
class_correct = list(0. for i in range(len(label_to_idx)))
class_total = list(0. for i in range(len(label_to_idx)))
for i in range(len(predict)):
    c = (predict[i][0] == all_targets[i])
    label = all_targets[i]
    class_correct[label] += c
    class_total[label] += 1

for i in range(len(label_to_idx)):
    #print 'Accuracy of {0}'.format(label_to_idx[0])
    if class_total[i]>0:
        acc=(100 * class_correct[i] / class_total[i])
    else:
        acc=0
    print 'Accuracy of {0:2d} : {1:6.2f} % from a population of {2:3d}'.format(
        i, acc, int(class_total[i]))
print 'Total training cases: {0}, Vocab: {1}'.format(len(predict),vocab_size)

Accuracy of  0 :  87.40 % from a population of 3103
Accuracy of  1 :  75.68 % from a population of 148
Accuracy of  2 :  83.05 % from a population of  59
Accuracy of  3 :  77.15 % from a population of 827
Accuracy of  4 :   0.00 % from a population of  11
Accuracy of  5 :  92.97 % from a population of 725
Accuracy of  6 :   0.00 % from a population of   6
Accuracy of  7 :  93.14 % from a population of 102
Accuracy of  8 :  78.69 % from a population of 244
Accuracy of  9 :  95.81 % from a population of 4227
Accuracy of 10 :  90.73 % from a population of 561
Accuracy of 11 :  89.88 % from a population of 336
Accuracy of 12 :  88.83 % from a population of 573
Accuracy of 13 :  83.69 % from a population of 2134
Total training cases: 13056, Vocab: 3508


In [54]:
ntopk=3
val_predict=torch.LongTensor(val_batches*batch_size,ntopk).zero_()
val_targets=torch.LongTensor(val_batches*batch_size).zero_()
val_total_loss = torch.Tensor([0])
val_iter=iter(val_loader)
v_total_loss=0
for batch in xrange(val_batches):
    context_idxs=[]
    cats=[]
    inputs,targets=val_iter.next()
    model.eval()
    #bp() # This is a breakpoint.
    vinputs = autograd.Variable(inputs)
    #targets=autograd.Variable(torch.LongTensor(np.asarray(cats,dtype='int64')).view(batch_size,-1))
    vtargets=autograd.Variable(targets)
    log_probs = model(vinputs)
    _,ind=log_probs.data.topk(3,1,True,True)
    val_predict[batch*batch_size:(batch+1)*batch_size,:]=ind
    val_targets[batch*batch_size:(batch+1)*batch_size]=targets
    loss=loss_function(log_probs, vtargets)
    #print "Batch: {0}, Loss= {1}".format(batch,loss.data)
    v_total_loss+=loss.data
print "Validation total loss, Loss= {0}".format(v_total_loss)

Validation total loss, Loss= 
 20.3725
[torch.FloatTensor of size 1]



In [55]:
 # Test accuracy
test_res=accuracy(val_predict,val_targets,topk=(1,3))
print "Test accuracy: ", test_res 

Test accuracy:  [
 92.1420
[torch.FloatTensor of size 1]
, 
 99.3325
[torch.FloatTensor of size 1]
]


In [56]:
class_correct = list(0. for i in range(len(label_to_idx)))
class_total = list(0. for i in range(len(label_to_idx)))
for i in range(len(val_predict)):
    c = (val_predict[i][0] == val_targets[i])
    label = val_targets[i]
    class_correct[label] += c
    class_total[label] += 1

for i in range(len(label_to_idx)):
    #print 'Accuracy of {0}'.format(label_to_idx[0])
    if class_total[i]>0:
        acc=(100 * class_correct[i] / class_total[i])
    else:
        acc=0
    label=label_to_idx.keys()[label_to_idx.values().index(i)]
    print 'Accuracy of {0:20s} : {1:6.2f} %% from a population of {2:3d}'.format(
        label, acc, int(class_total[i]))

Accuracy of Activity             :  92.90 %% from a population of 746
Accuracy of Course               :  76.74 %% from a population of  43
Accuracy of Exam resit           : 100.00 %% from a population of  13
Accuracy of Fees or contribution :  75.60 %% from a population of 209
Accuracy of Letting              :   0.00 %% from a population of   3
Accuracy of Meal                 :  94.33 %% from a population of 194
Accuracy of Patents evening      :   0.00 %% from a population of   2
Accuracy of School bus           : 100.00 %% from a population of  29
Accuracy of Tickets              :  77.59 %% from a population of  58
Accuracy of Trip                 :  98.10 %% from a population of 1055
Accuracy of Tuition              :  95.86 %% from a population of 169
Accuracy of Uniform              :  97.50 %% from a population of  80
Accuracy of Wraparound care      :  90.54 %% from a population of 148
Accuracy of Other                :  86.65 %% from a population of 547
