In [None]:
"""
Notebook to look at a how to use a very simple bag of words approach to catagorise sentances.
This is to help with analysing and assessing transaction data from a company based upon the 
description of the transaction.

It is not anticipated that this will work very well, since the approach below does not include 
any use of word vectors and hence the dictionary very quickly becomes very large and filled 
with words that should be related but can't be with the model being used. Hence this is just a 
stepping stone to a more sophisticated model.

The approach followed is largely that defined by rgunthrie in the Github repository:
https://github.com/rguthrie3/DeepLearningForNLPInPytorch
, although several functions have been copied from the example by Alexander Rakhlin in 
'CNN-for-sentance classification-in -keras'

I have also looked at Gensim, and am likely to adopt some of their software for the next 
version of the analysis

The present analysis gets an accuracy of 88.9% on training data but this falls to 66.2% on 
the test data.  I beleive this is largely due to additional geographic words coming up in the 
test data that are not included in the training data, and with no word vectors, there is no easy
way to relate such words together

An improvement I would like to try at some point is to use the natural language tool kit and 
stemmer from the nltk library, however, I think this model is fundamentally limited and so won't 
do so with this version
"""

In [None]:
%qtconsole


In [6]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from gensim.models import word2vec
import re
from collections import Counter
import itertools
import sys
import csv

In [7]:
# System parameters
my_file_path='johnrichmond/Dropbox/Machine Learning/text classification/Andrew/'
csv_file_name='Payment items.csv'

stop_list= set("for a c do h i if is it in g o p or e r t 's of the and mr ms to nd we".split())


remove_single_words=True
use_subset_data=True
max_cases=10000

# only there are catagories 1-14 are valid, all others should be rejected

min_cat=0
max_cat=13
num_cat=max_cat+1
label_to_idx={"Activity":0, 
              "Course": 1,
              "Exam resit":2,
              "Fees or contribution": 3,
              "Letting": 4,
              "Meal":5,
              "Patents evening": 6,
              "School bus":7,
              "Tickets":8,
              "Trip":9,
              "Tuition":10,
              "Uniform":11,
              "Wraparound care":12,
              "Other":13
             }

In [8]:
# Hyper parameters
no_epochs=20
lr=0.1

In [9]:
replace_list={"years":"year", "yr":"year", "wks":"week","tickets": "ticket",
              "terms":"term", "students":"student","pupils":"pupil","meals": "meal"}


In [10]:
# Make the file path the same whether using Mac or Linux
if sys.platform == 'darwin':
    start='/Users/'
else: start='/home/'
    
file_name=my_file_path+csv_file_name
txt_file=start+file_name

# Some functions that are needed later

In [11]:
def is_integer(s):
    try:
        int(s)
        return True
    except ValueError:
        print s
        return False
    

In [12]:
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    #string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"[^A-Za-z0-9,!?\'\`]", " ", string)
    # matches any single character not in the above list and replaces with a white space
    string = re.sub(r"\'s", " \'s", string)
    # adds a space prior to '\apostrophy with an s
    string = re.sub(r"\'ve", " \'ve", string)
    # adds a space before apostrphy with ve
    string = re.sub(r"n\'t", " n\'t", string)
    # as above but for a 't
    string = re.sub(r"\'re", " \'re", string)
    # as above but with 're
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    # not sure why this is necessary since I have not seen this option before
    string = re.sub(r",", " , ", string)
    # adds space before and after commas
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " ( ", string)
    # adds spaces before and after the brackets
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    # replaces multiple white space sections with single whitespace
    return string.strip().lower()

In [13]:
def rem_numbers(string):
    string=re.sub(r"[0123456789]","",string)
    string = re.sub(r"\s{2,}", " ", string)
    string=string.strip()
    return string

In [14]:
def remove_stop_words(sentance, stop_list):
    filtered = [word for word in sentance if word not in stop_list] 
    return filtered
    

In [15]:
def replace_similar_words(sentance, replace_list):
    new_sentance=[]
    for word in sentance:
        if word in replace_list:
            word=replace_list[word]
        new_sentance.append(word)
    return new_sentance

In [16]:
def accuracy(output, target, topk=(1,)):
    """Computes the precision@k for the specified values of k"""
    maxk = max(topk)
    batch_size = target.size(0)

    # The next line is not needed in this case since it is done prior to the call
    #_, pred = output.topk(maxk, 1, True, True) # topk is torch function to return highest values in array
    pred = output.t()  #Transpose
    correct = pred.eq(target.view(1, -1).expand_as(pred))
       #Note - the expand is a torch command to expend one tensor to the size of another
       # target os a one D tensor. target.view(1,-1) reshapes the tensor.  The -1 means this is chosem
       # by the software to get the right total size.  The first 1 indicates the number of rows to use
       # The net outcome is an array with one column of length maxk for each target value.  The entire column 
       # is filled with the target value to facilitate easy comparison.
       # The correct array then contains an array with true wherever the target value matches the prediction
    res = []
    for k in topk:
        correct_k = correct[:k].view(-1).float().sum(0)    # The nomeclature[:k] returns the top k rows Since
                                                           # there is no second array we get the every column.
        res.append(correct_k.mul_(100.0 / batch_size))
    return res

In [17]:
def build_vocab(sentences):
    """
    Builds a vocabulary mapping from word to index based on the sentences.
    Returns vocabulary mapping and inverse vocabulary mapping.
    """
    # Build vocabulary
    word_counts = Counter(itertools.chain(*sentences))
    # Mapping from index to word
    vocabulary_inv = [x[0] for x in word_counts.most_common()]
    # Mapping from word to index
    vocabulary = {x: i for i, x in enumerate(vocabulary_inv)}
    return [vocabulary, vocabulary_inv]

In [18]:
with open(txt_file,'rU') as file_obj:
    data=[]
    num=0
    lines=[]
    reader=csv.reader(file_obj)
    for line in reader:
        if reader.line_num<>1:
    #       with col in line:
            text_str=line[0]
            catagory=line[15]
            if is_integer(catagory):
                data.append([text_str,int(catagory)])
                num=num+1
        if reader.line_num>max_cases and use_subset_data: break
    







In [19]:
# All data loaded, will now process each line of data
# Initial work will focus upon removing numbers and punctuation
sentances=[]
catagories=[]
catagory=[]
identity=np.identity(num_cat)
for row in data:
    row[0]=clean_str(row[0])
    row[0]=rem_numbers(row[0])# Done separately since I might not always want to do this

    if row[0] in (None, ""):
        # row rejected
        continue
    elif row[1] <1 or row[1]> num_cat or row[1] in (None, ""):
        continue
    sentance=row[0].split(" ")
    # remove stop list words
    sentance=remove_stop_words(sentance,stop_list)
    sentance=replace_similar_words(sentance,replace_list)
    if len(sentance)==0: continue
    row[1]=row[1]-1   
    
    # Getting to this point implies the row is ok and still has valid words, therefore will add
    sentances.append(sentance)
    catagories.append(identity[row[1]-1,:])
    catagory.append(row[1])
    
# Remove single words  
if remove_single_words==True:
    final_sentances=[]
    final_catagories=[]
    final_catagory=[]
    word_counts = Counter(itertools.chain(*sentances))
    new_sentances=[[word for word in sentance if word_counts[word]>1] 
                    for sentance in sentances]
    #Remove empty entries from both sentances and catagories
    for index,sentance in enumerate(new_sentances):
        if len(sentance)<>0:
            final_sentances.append(sentance)
            final_catagories.append(catagories[index])
            final_catagory.append(catagory[index])
    sentances=final_sentances
    catagories=final_catagories
    catagory=final_catagory
            
# note - might have to pad the sentances in future



In [20]:
vocab, vocab_inv=build_vocab(sentances)
VOCAB_SIZE=len(vocab)
word_to_ix=vocab
dataset_total=len(sentances)
train_data_max=int(dataset_total*75/100)
test_data_start=train_data_max+1
test_data_end=dataset_total-1


In [21]:
# Simple BOW classifier
class BoWClassifier(nn.Module): # inheriting from nn.Module!
    
    def __init__(self, num_labels, vocab_size):
        # calls the init function of nn.Module.  Dont get confused by syntax,
        # just always do it in an nn.Module
        super(BoWClassifier, self).__init__()
        
        # Define the parameters that you will need.  In this case, we need A and b,
        # the parameters of the affine mapping.
        # Torch defines nn.Linear(), which provides the affine map.
        # Make sure you understand why the input dimension is vocab_size
        # and the output is num_labels!
        self.linear = nn.Linear(vocab_size, num_labels)
        
        # NOTE! The non-linearity log softmax does not have parameters! So we don't need
        # to worry about that here
        
    def forward(self, bow_vec):
        # Pass the input through the linear layer,
        # then pass that through log_softmax.
        # Many non-linearities and other functions are in torch.nn.functional
        return F.log_softmax(self.linear(bow_vec))

In [22]:
def make_bow_vector(sentence, word_to_ix):
    vec = torch.zeros(len(word_to_ix))
    for word in sentence:
        vec[word_to_ix[word]] += 1
    return vec.view(1, -1)

def make_target(label, label_to_idx):
    return torch.LongTensor([label_to_idx[label]])

In [23]:

model = BoWClassifier(num_cat, VOCAB_SIZE)

# the model knows its parameters.  The first output below is A, the second is b.
# Whenever you assign a component to a class variable in the __init__ function of a module,
# which was done with the line
# self.linear = nn.Linear(...)
# Then through some Python magic from the Pytorch devs, your module (in this case, BoWClassifier)
# will store knowledge of the nn.Linear's parameters
for param in model.parameters():
    print param

Parameter containing:
 4.8404e-03  2.4111e-02 -2.4233e-02  ...   1.0330e-02 -1.3646e-02  6.2271e-03
-1.4234e-02  4.3335e-03  1.3638e-02  ...  -5.5681e-03 -8.5464e-03 -7.7517e-03
-1.3090e-02  2.0289e-02 -1.1494e-02  ...   2.4310e-02  2.1889e-02 -1.9190e-02
                ...                   ⋱                   ...                
 1.8329e-02 -1.4673e-02 -1.7435e-02  ...   2.4863e-02 -8.9059e-03  9.3438e-03
-2.3373e-02  1.3154e-02  3.2752e-03  ...  -1.8734e-02 -5.6787e-03  7.3283e-04
 2.1252e-02 -1.1654e-02  1.6604e-02  ...   2.3214e-02 -8.3679e-03  2.3990e-02
[torch.FloatTensor of size 14x1572]

Parameter containing:
1.00000e-02 *
  2.4566
 -2.1840
 -1.8227
 -1.8128
  0.0598
 -1.2959
  1.0790
  0.9844
 -1.0496
 -0.7943
  0.8890
  1.0108
 -1.2464
 -1.6383
[torch.FloatTensor of size 14]



In [24]:
# To run the model, pass in a BoW vector, but wrapped in an autograd.Variable
sample = sentances[0]
bow_vector = make_bow_vector(sample, word_to_ix)
log_probs = model(autograd.Variable(bow_vector))
print log_probs

Variable containing:

Columns 0 to 9 
-2.6345 -2.6061 -2.7163 -2.6778 -2.6326 -2.6315 -2.5896 -2.6110 -2.6341 -2.6664

Columns 10 to 13 
-2.6764 -2.6100 -2.6163 -2.6519
[torch.FloatTensor of size 1x14]



In [25]:
ntopk=3
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr)
predict=torch.LongTensor(train_data_max,ntopk).zero_()
targets=torch.LongTensor(catagory[0:train_data_max])


for epoch in xrange(no_epochs):
    for i in xrange(train_data_max):
        # Step 1. Remember that Pytorch accumulates gradients.  We need to clear them out
        # before each instance
        model.zero_grad()
    
        # Step 2. Make our BOW vector and also we must wrap the target in a Variable
        # as an integer.  For example, if the target is SPANISH, then we wrap the integer
        # 0.  The loss function then knows that the 0th element of the log probabilities is
        # the log probability corresponding to SPANISH
        bow_vec = autograd.Variable(make_bow_vector(sentances[i], word_to_ix))
        myVar=torch.LongTensor([catagory[i]])
        target = autograd.Variable(torch.LongTensor([catagory[i]]))
    
        # Step 3. Run our forward pass.
        log_probs = model(bow_vec)
        _,ind=log_probs.data.topk(3,1,True,True)
        predict[i,:]=ind
    
        # Step 4. Compute the loss, gradients, and update the parameters by calling
        # optimizer.step()
        loss = loss_function(log_probs, target)
        loss.backward()
        optimizer.step()
    print "Epoch= {:d}, loss= {:s}".format(epoch,loss)
    
    # Access training accuracy
    #for i in xrange(0,train_data_max):
        
    
    

Epoch= 0, loss= Variable containing:
1.00000e-02 *
  2.1889
[torch.FloatTensor of size 1]

Epoch= 1, loss= Variable containing:
1.00000e-02 *
  1.4978
[torch.FloatTensor of size 1]

Epoch= 2, loss= Variable containing:
1.00000e-02 *
  1.1714
[torch.FloatTensor of size 1]

Epoch= 3, loss= Variable containing:
1.00000e-03 *
  9.8241
[torch.FloatTensor of size 1]

Epoch= 4, loss= Variable containing:
1.00000e-03 *
  8.6266
[torch.FloatTensor of size 1]

Epoch= 5, loss= Variable containing:
1.00000e-03 *
  7.8236
[torch.FloatTensor of size 1]

Epoch= 6, loss= Variable containing:
1.00000e-03 *
  7.2652
[torch.FloatTensor of size 1]

Epoch= 7, loss= Variable containing:
1.00000e-03 *
  6.8672
[torch.FloatTensor of size 1]

Epoch= 8, loss= Variable containing:
1.00000e-03 *
  6.5791
[torch.FloatTensor of size 1]

Epoch= 9, loss= Variable containing:
1.00000e-03 *
  6.3691
[torch.FloatTensor of size 1]

Epoch= 10, loss= Variable containing:
1.00000e-03 *
  6.2166
[torch.FloatTensor of size 1]

In [26]:
# Training accuracy
train_res=accuracy(predict,targets,topk=(1,3))
print "Training accuracy: ", train_res

Training accuracy:  [
 88.9036
[torch.FloatTensor of size 1]
, 
 98.4110
[torch.FloatTensor of size 1]
]


In [27]:
# Test data
no_cases=test_data_end-test_data_start+1
predict_test=torch.LongTensor(no_cases,ntopk).zero_()
targets_test=torch.LongTensor(catagory[test_data_start:test_data_end+1])
for i in xrange(test_data_start,test_data_end+1):
    bow_vec = autograd.Variable(make_bow_vector(sentances[i], word_to_ix))
    log_probs = model(bow_vec)
    _,ind=log_probs.data.topk(3,1,True,True)
    predict_test[(i-test_data_start),:]=ind
    #print log_probs
#print next(model.parameters()) # Index corresponding to Spanish goes up, English goes down!

In [28]:
# Test accuracy
test_res=accuracy(predict_test,targets_test,topk=(1,3))
print "Test accuracy: ", test_res

Test accuracy:  [
 66.2162
[torch.FloatTensor of size 1]
, 
 89.0302
[torch.FloatTensor of size 1]
]
