In [None]:
%qtconsole

"""
Notebook exploring how to use a word-vector approach to categorise sentences.

This is to help with analysing and assessing transaction data from a company based upon the 
description of the transaction.

The approach is to use a pre-trained set of word vectors - in this case the GloVe 6B set, with each vector being of 
length 300.

In this analysis the approach is to define a fixed sentence length and pad sentences that are shorter than this.

I have tried lengths of 5 to 10, and 6 or 7 seems to work best. At present I am padding the sentences in my own routine. 
I intend to see how well the padding in the PyTorch embedding layer works and whether it is any better, since
I am not sure whether the padding is having a detrimental effect upon the analysis.

After the embedding layer I am using a simple 3-layer neural network: the first two layers use rectified linear (ReLU) 
activations and the final layer gives a (log-)softmax output.

The previous analysis achieved accuracy of 88.9% on training data but 66.2% on the test data using a simple 
bag of words approach.

The present analysis gives me 93% on the training data and 95% on the test data, hence it is a big improvement in the analysis of data not used in training.  More could be done, but the actual data itself needs some work and some of the categories are not very well represented.

"""

In [None]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from myData import Dataset
from myData import DataLoader
#from torchtext import data as t_data
#from torchtext import utils
from torchtext.vocab import load_word_vectors
import numpy as np
#import re
from collections import Counter
import itertools
import sys
import csv
import os
import word_processing as wp


In [None]:
# System parameters
# Location of the transaction CSV, relative to the platform's home-directory root
my_file_path = 'johnrichmond/Dropbox/Machine Learning/text classification/Andrew/'
csv_file_name = 'Payment items.csv'

# Tokens that are stripped out of every description
stop_list = set(
    "for a c e do h i if is it in g o p or r t u v y 's ' of the and mr ms to nd we".split()
)
# Token used to fill short sentences up to the fixed length
pad = '<pad>'

remove_single_words = True  # drop words that appear only once in the whole corpus
max_sent_length = 7         # fixed sentence length; 0 means "use the longest sentence"
min_freq = 1                # minimum word frequency handed to the vocabulary
use_subset_data = True      # when True, stop reading the CSV after max_cases lines
max_cases = 30000

# Only category numbers 1-14 in the source data are valid, all others are rejected.
# They are shifted to the zero-based range 0-13 when the rows are processed.
min_cat = 0
max_cat = 13
num_cat = max_cat + 1

# Note actual category labels have been removed for reasons of commercial sensitivity
label_to_idx = {
    "Cat 0": 0,
    "Cat 1": 1,
    "Cat 2": 2,
    "Cat 3": 3,
    "Cat 4": 4,
    "Cat 5": 5,
    "Cat 6": 6,
    "Cat 7": 7,
    "Cat 8": 8,
    "Cat 9": 9,
    "Cat 10": 10,
    "Cat 11": 11,
    "Cat 12": 12,
    "Cat 13": 13,
}

# Pre-trained word vectors
load_word_vector_set = 'glove.6B'
word_vector_length = 300
word_vector_path = 'johnrichmond/Dropbox/Machine Learning/text classification/'

In [None]:
# Analysis hyper-parameters
batch_size = 32
no_epochs = 800

# Hidden layer sizes (first, second, third)
HL1_size = 300
HL2_size = 120
HL3_size = 50

# Stochastic gradient descent settings: learning rate, momentum term and
# whether Nesterov momentum is used
lr = 0.006
momentum = 0.2
nesterov = False

# Weight decay (L2 regularisation) term
L2 = 0.000

# Percentage of the data to use for validation
val_percentage = 20


In [None]:
# Home directories sit under /Users/ on a Mac and under /home/ otherwise, so pick the
# matching root and build every path from it
start = '/Users/' if sys.platform == 'darwin' else '/home/'

# Full path of the CSV file holding the transactions
file_name = '{0}{1}'.format(my_file_path, csv_file_name)
txt_file = '{0}{1}'.format(start, file_name)

# Directory holding the pre-trained word vectors
word_vec_path = os.path.join(start, word_vector_path)

In [None]:
# Maps plural / abbreviated words onto the single form used for the analysis
replace_list = {
    "years": "year",
    "yr": "year",
    "wks": "week",
    "tickets": "ticket",
    "terms": "term",
    "students": "student",
    "pupils": "pupil",
    "meals": "meal",
    "lakes": "lake",
}

In [None]:
# Utility function to measure classification accuracy - move to module once programme 
# is operating

def accuracy(output, target, topk=(1,)):
    """Computes the precision@k for the specified values of k

    Args:
        output: 2-D tensor of predicted class indices, one row per sample, with the
            columns ordered from most to least likely.  The ranking (``topk``) has
            already been done by the caller, so it is not repeated here.
        target: 1-D tensor holding the true class index of each sample.
        topk: tuple of k values to report.  Each k should be no larger than the
            number of columns in ``output``.

    Returns:
        A list with one tensor per k, holding the percentage of samples whose target
        appears among the first k predictions.
    """
    batch_size = target.size(0)

    # Transpose so there is one row per rank and one column per sample
    pred = output.t()
    # target.view(1, -1) turns the targets into a single row; expand_as repeats that
    # row for every rank, so each column is filled with the sample's target value.
    # correct is then true wherever the prediction at that rank matches the target.
    correct = pred.eq(target.view(1, -1).expand_as(pred))

    res = []
    for k in topk:
        # correct[:k] keeps the first k ranks for every sample.  The comparison result
        # comes from a transposed tensor, so make it contiguous before flattening;
        # view(-1) alone can fail on a non-contiguous tensor.
        correct_k = correct[:k].contiguous().view(-1).float().sum(0)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res

# Load sentences to categorize

In [None]:
# Read the transaction description (column 0) and the category (column 15) of each
# CSV row into f_data as [text, category] pairs.  The first line is the header.
text_col=0
cat_col=15
f_data=[]
with open(txt_file,'rU') as file_obj:
    reader=csv.reader(file_obj)
    for line in reader:
        # Skip the header, and any row (e.g. a blank line) too short to hold a
        # category column - indexing it would raise an IndexError
        if reader.line_num!=1 and len(line)>cat_col:
            text_str=line[text_col]
            cat_str=line[cat_col]
            # Only rows whose category field passes wp.is_integer are kept
            if wp.is_integer(cat_str):
                f_data.append([text_str,int(cat_str)])
        # Optionally stop early so only a subset of the file is used
        if reader.line_num>max_cases and use_subset_data: break
# Number of rows accepted
num=len(f_data)

In [None]:
# All data loaded, will now process each line of data
# Initially remove numbers and punctuation
sentances=[]    # list of word lists, one per accepted row
catagories=[]   # one-hot category vector for each accepted row
catagory=[]     # zero-based category index for each accepted row
identity=np.identity(num_cat)
for raw_text,raw_cat in f_data:
    # Work on local values rather than writing back into f_data, so that re-running
    # this cell does not shift the category numbers a second time
    text=wp.clean_str(raw_text)
    text=wp.rem_numbers(text)# Done separately since I might not always want to do this

    if text in (None, ""):
        # row rejected
        continue
    # Only category numbers 1..num_cat are accepted from the source data
    if raw_cat in (None, "") or raw_cat<1 or raw_cat>num_cat:
        continue
    sentance=text.split(" ")
    # remove stop list words
    sentance=wp.remove_stop_words(sentance,stop_list)
    sentance=wp.replace_similar_words(sentance,replace_list)
    if len(sentance)==0: continue
    # Shift to a zero-based index; the SAME index is used for the one-hot row and
    # for the integer label
    cat_idx=raw_cat-1

    # Getting to this point implies the row is ok and still has valid words, therefore will add
    sentances.append(sentance)
    catagories.append(identity[cat_idx,:])
    catagory.append(cat_idx)

# Remove single words
if remove_single_words:
    word_counts = Counter(itertools.chain(*sentances))
    new_sentances=[[word for word in sentance if word_counts[word]>1]
                    for sentance in sentances]
    # Remove empty entries from both sentances and catagories
    final_sentances=[]
    final_catagories=[]
    final_catagory=[]
    for index,sentance in enumerate(new_sentances):
        if len(sentance)!=0:
            final_sentances.append(sentance)
            final_catagories.append(catagories[index])
            final_catagory.append(catagory[index])
    sentances=final_sentances
    catagories=final_catagories
    catagory=final_catagory

# Identify the longest sentance
max_length=max(len(sentance) for sentance in sentances) if sentances else 0
print("Maximum sentance length: {} words".format(max_length))

# A max_sent_length of 0 means "use the longest sentance found"
if max_sent_length!=0:
    pad_to=max_sent_length
else:
    pad_to=max_length

# Pad short sentances and cut long ones so every sentance has exactly pad_to words.
# The cut uses pad_to (not max_sent_length) so that a setting of 0 does not empty
# every sentance.
sentances=[(list(sentance)+[pad]*(pad_to-len(sentance)))[:pad_to]
           for sentance in sentances]
    


In [None]:
# Make use of the torchtext Vocab class
from torchtext import vocab
# Word frequencies over all (padded) sentances
test=Counter(itertools.chain(*sentances))
# wv_dim comes from word_vector_length so the loaded vectors always match the
# embedding size used by the model
v=vocab.Vocab(test,wv_type=load_word_vector_set,wv_dim=word_vector_length, unk_init='random', specials=[pad], min_freq=min_freq)

# We now have v.itos which contains an ordered list of words
# v.stoi which is a dictionary that links a word to an index
# The number of words is given by len(v.itos)
vocab_size=len(v.itos)

In [None]:
# Shuffle the input data: one random permutation of the indices is applied to both
# lists so that each sentance stays paired with its catagory
from random import shuffle
no_sentances=len(catagory)
shuffle_idx=list(range(no_sentances))
shuffle(shuffle_idx)

# for now will save the originals but do not need to in future
sentances_bak=sentances[:]
catagory_bak=catagory[:]

new_sentances=[sentances[idx] for idx in shuffle_idx]
new_catagory=[catagory[idx] for idx in shuffle_idx]
sentances=new_sentances
catagory=new_catagory

# The lists are now shuffled


In [None]:
# will setup simple batches and sets for now
total_sentances=len(sentances)
# Number of whole batches available
total_batches=total_sentances//batch_size
# Hold back val_percentage % of the batches for validation, the rest are for training
train_batches=int((total_batches*(100-val_percentage))//100)
val_batches=total_batches-train_batches

print("Total valid sentances: {:d}".format(total_sentances))

In [None]:
class sent_dataset(Dataset):
    """
    This is a very simplified version of a Pytorch dataset that I am using with my own 
    bespoke versions of the Pytorch dataiterator since I could not get the proper versions
    to work and this seemed the easiest way.  It does not support multi workers, transforms etc
    
    Every item is read from the lists handed to the constructor (never from module level
    variables), so separate training and validation datasets return their own data.
    
    Args:
        sentances: expected to be a list of sentences, each containing a list of words 
                    as strings
        catagory: the corresponding category number of each sentence
        word_to_idx: a dictionary linking a word to an index
    """

    def __init__(self, sentances, catagory, word_to_idx):

        self.sentances = sentances
        self.catagory = catagory
        self.word_to_idx = word_to_idx


    def __getitem__(self, index):
        """
        Args:
            index (int): Index
        Returns:
            context_idxs: a list of the word indexes in the sentence corresponding to the 
            sentence index
            catagory[index]: the category of the sentence
        Raises:
            KeyError: if a word of the sentence is missing from word_to_idx
            IndexError: if index is outside the dataset
        """
        # Built as a real list (not a lazy map object) so the result can be indexed
        context_idxs=[self.word_to_idx[w] for w in self.sentances[index]]

        return context_idxs, self.catagory[index]

    def __len__(self):
        """Number of sentences held by this dataset"""
        return len(self.catagory) 

In [None]:
# The first train_batches batches of the shuffled data form the training set and the
# remaining whole batches form the validation set
train_end=train_batches*batch_size
val_end=total_batches*batch_size

train_ds=sent_dataset(sentances[0:train_end], catagory[0:train_end], v.stoi)
val_ds=sent_dataset(sentances[train_end:val_end], catagory[train_end:val_end], v.stoi)

# Only the training batches are shuffled
train_loader=DataLoader(dataset=train_ds, batch_size=batch_size, shuffle=True)
val_loader=DataLoader(dataset=val_ds, batch_size=batch_size, shuffle=False)

# Simple model

In [None]:
class TextClassifier_simple(nn.Module):
    """
    Simple feed-forward classifier: the word embeddings of a fixed-length sentence are
    flattened into one vector and passed through two ReLU hidden layers (with dropout
    after the first) and a final linear layer followed by a log-softmax.

    Args:
        catagories: number of output classes
        vocab_size: number of rows in the embedding table
        sent_length: number of words in every (padded) sentence
        embedding_dim: length of each word vector
        HL1_size: size of the first hidden layer
        HL2_size: size of the second hidden layer
    """

    def __init__(self, catagories, vocab_size, sent_length,embedding_dim, HL1_size, HL2_size):
        super(TextClassifier_simple, self).__init__()
        # Length of one sentence once its word vectors are laid end to end
        flat_size = embedding_dim * sent_length
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(flat_size, HL1_size)
        self.dropout = nn.Dropout()
        self.linear2 = nn.Linear(HL1_size, HL2_size)
        self.linear3 = nn.Linear(HL2_size, catagories)

    def forward(self, inputs):
        """
        Args:
            inputs: batch of word indexes, one row per sentence
        Returns:
            log probabilities with one row per sentence and one column per class
        """
        n_rows = len(inputs)
        # One row per sentence holding all of its word vectors concatenated
        flat = self.embeddings(inputs).view(n_rows, -1)
        hidden = self.dropout(F.relu(self.linear1(flat)))
        hidden = F.relu(self.linear2(hidden))
        scores = self.linear3(hidden)
        return F.log_softmax(scores)

In [None]:
# Define classifier, optimiser and prevent modification of the word vector weights
losses=[]
loss_function=nn.NLLLoss()
model=TextClassifier_simple(num_cat, vocab_size, pad_to, word_vector_length, HL1_size, 
                            HL2_size)

# Load the pre-trained word vectors into the embedding layer and freeze them BEFORE
# the optimiser is created
model.embeddings.weight.data.copy_(v.vectors)
model.embeddings.weight.requires_grad = False

# Hand the optimiser only the parameters that are still trainable, so the frozen
# embedding weights are never updated
trainable_params=[p for p in model.parameters() if p.requires_grad]
optimizer=optim.SGD(trainable_params,lr, weight_decay=L2, momentum=momentum, 
                    nesterov=nesterov)

In [None]:

# Train the classifier.  The top ntopk predictions and the targets seen during the
# final epoch are stored in predict / all_targets.
ntopk=3
predict=torch.LongTensor(train_batches*batch_size,ntopk).zero_()
all_targets=torch.LongTensor(train_batches*batch_size).zero_()
# Put the model in training mode so dropout is active, even if the model was
# switched to evaluation mode before this cell is (re-)run
model.train()
for epoch in range(no_epochs):
    # Loss summed over all batches of this epoch
    total_loss = torch.Tensor([0])
    train_iter=iter(train_loader)
    for batch in range(train_batches):
        # Step 1. Fetch the next batch of word index rows and their categories
        inputs,targets=next(train_iter)
        t_inputs=autograd.Variable(inputs)
        t_targets=autograd.Variable(targets)
        # Step 2. Clear the gradients left over from the previous batch
        model.zero_grad()
        # Step 3. Forward pass
        log_probs = model(t_inputs)
        if epoch==no_epochs-1:
            # Only the predictions of the last epoch are kept, so the ranking is
            # only worked out then
            _,ind=log_probs.data.topk(ntopk,1,True,True)
            first=batch*batch_size
            predict[first:first+batch_size,:]=ind
            all_targets[first:first+batch_size]=targets
        # Step 4. Loss, backward pass and weight update
        loss=loss_function(log_probs, t_targets)
        loss.backward()
        optimizer.step()
        total_loss+=loss.data
    if epoch % 25==0:
        print("Epoch: {0}, Loss= {1}".format(epoch,total_loss.numpy()[0]))


In [None]:
# Training accuracy
# total_loss is the loss summed over the batches of the last training epoch
print "Training error: ", total_loss
# Top-1 and top-3 accuracy (in percent) of the predictions stored during the final
# training epoch
train_res=accuracy(predict,all_targets,topk=(1,3))
print "Training accuracy: ", train_res   

In [None]:
# Per-category accuracy on the training predictions (top-1 prediction only)
# Reverse lookup from category index to label, built once
idx_to_label = dict((idx, name) for name, idx in label_to_idx.items())
class_correct = [0.] * len(label_to_idx)
class_total = [0.] * len(label_to_idx)
for i in range(len(predict)):
    # int() makes sure plain Python numbers are used for the list indexing below
    label = int(all_targets[i])
    c = (int(predict[i][0]) == label)
    class_correct[label] += c
    class_total[label] += 1

for i in range(len(label_to_idx)):
    # Categories with no cases are reported with an accuracy of 0
    if class_total[i]>0:
        acc=(100 * class_correct[i] / class_total[i])
    else:
        acc=0
    label=idx_to_label[i]
    print('Accuracy of {0:20s} : {1:6.2f} % from a population of {2:3d}'.format(
        label, acc, int(class_total[i])))
print('Total training cases: {0}, Vocab: {1}'.format(len(predict),vocab_size))

In [None]:
# Run the model over the validation batches, storing the top ntopk predictions and
# the targets of every case
ntopk=3
val_predict=torch.LongTensor(val_batches*batch_size,ntopk).zero_()
val_targets=torch.LongTensor(val_batches*batch_size).zero_()
val_iter=iter(val_loader)
# Loss summed over all validation batches
v_total_loss=0
# Evaluation mode only needs to be set once, before the loop
model.eval()
for batch in range(val_batches):
    inputs,targets=next(val_iter)
    vinputs = autograd.Variable(inputs)
    vtargets=autograd.Variable(targets)
    log_probs = model(vinputs)
    _,ind=log_probs.data.topk(ntopk,1,True,True)
    first=batch*batch_size
    val_predict[first:first+batch_size,:]=ind
    val_targets[first:first+batch_size]=targets
    loss=loss_function(log_probs, vtargets)
    v_total_loss+=loss.data
print("Validation total loss, Loss= {0}".format(v_total_loss))

In [None]:
 # Test accuracy
# Top-1 and top-3 accuracy (in percent) of the stored validation predictions
test_res=accuracy(val_predict,val_targets,topk=(1,3))
print "Test accuracy: ", test_res 

In [None]:
# Per-category accuracy on the validation predictions (top-1 prediction only)
# Reverse lookup from category index to label, built once
idx_to_label = dict((idx, name) for name, idx in label_to_idx.items())
class_correct = [0.] * len(label_to_idx)
class_total = [0.] * len(label_to_idx)
for i in range(len(val_predict)):
    # int() makes sure plain Python numbers are used for the list indexing below
    label = int(val_targets[i])
    c = (int(val_predict[i][0]) == label)
    class_correct[label] += c
    class_total[label] += 1

for i in range(len(label_to_idx)):
    # Categories with no cases are reported with an accuracy of 0
    if class_total[i]>0:
        acc=(100 * class_correct[i] / class_total[i])
    else:
        acc=0
    label=idx_to_label[i]
    print('Accuracy of {0:20s} : {1:6.2f} % from a population of {2:3d}'.format(
        label, acc, int(class_total[i])))
print('Total validation cases: {0}'.format(len(val_predict)))