In [1]:
import re
import sys
import numpy as np
from scipy.sparse import csr_matrix
import xgboost as xgb
from sklearn.externals import joblib


In [2]:
def eliminate_whitespace(somestr):
    newstr = re.sub(r"[\n\t\s]*", "", somestr) #get rid of all whitespace characters including newline
    return newstr


In [3]:
def find_all_4grams(line_of_chars):
    thelist = []
    numchars = len(line_of_chars)
    num4grams = (numchars - 4) + 1 #given the number of chars in line_of_chars, 
                                   #this is how many 4grams we'll find within it
    for i in range(num4grams):
        fourgram = line_of_chars[i:i+4]
        thelist.append(fourgram)
    
    return(thelist)

def createLabels(someLine, list_of_4grams):
    someLineList = list(someLine)
    theLabels = []
    ptr = 0
    for fourGram in list_of_4grams:
        if someLineList[ptr + 2] == ' ':
            theLabels.append(1)
            someLineList.pop(ptr+2) #get rid of that space we just accounted for
        else:
            theLabels.append(0)
            
        ptr = ptr + 1
            
    return theLabels


In [4]:
def findIndexOfBigram(bigram, alphabet, unk_token):
    alphabetLength = len(alphabet)
    firstChar = bigram[0]
    secondChar = bigram[1]
    if (firstChar not in alphabet):
        firstChar = unk_token
    if(secondChar not in alphabet):
        secondChar = unk_token
    
    firstCharMultiplier = alphabet.index(firstChar)
    secondCharMultiplier = alphabet.index(secondChar)
    uniqueIndexOfBigram = firstCharMultiplier*alphabetLength**1 + secondCharMultiplier*alphabetLength**0
    return uniqueIndexOfBigram

def findBigramOfIndex(index, alphabet):
    alphabetLength = len(alphabet)
    row = index // alphabetLength
    col = index % alphabetLength
    firstchar = alphabet[row]
    secondchar = alphabet[col]
    return firstchar + secondchar

def findIndexOfUnigram(unigram, alphabet, unk_token):
    if unigram not in alphabet:
        unigram = unk_token
    theindex = alphabet.index(unigram)
    return theindex

def findUnigramOfIndex(index, alphabet):
    theunigram = alphabet[index]
    return theunigram


In [5]:
def create5dStrFeatureVector(fourGram):
    f1 = fourGram[0] + fourGram[1]
    f2 = fourGram[1]
    f3 = fourGram[1] + fourGram[2]
    f4 = fourGram[2]
    f5 = fourGram[2] + fourGram[3]
    featureVector = [f1,f2,f3,f4,f5]
    return featureVector

def create5dIntFeatureVector(strFeatureVector, alphabet, unk_token):
    lengthAlphabet = len(alphabet)
    lengthBigramPartOfVector = lengthAlphabet * lengthAlphabet
    lengthUnigramPartOfVector = lengthAlphabet
    lengthOneHotVector = lengthBigramPartOfVector + lengthUnigramPartOfVector

    f1,f2,f3,f4,f5 = strFeatureVector
    f1int = findIndexOfBigram(f1, alphabet, unk_token) 
    f2int = findIndexOfUnigram(f2, alphabet, unk_token) + lengthBigramPartOfVector
    f3int = findIndexOfBigram(f3, alphabet, unk_token) 
    f4int = findIndexOfUnigram(f4, alphabet, unk_token) + lengthBigramPartOfVector
    f5int = findIndexOfBigram(f5, alphabet, unk_token) 
    theVector = [f1int,f2int,f3int,f4int,f5int]
    return theVector


In [6]:
def loadChineseDataFile(filepath, numLines=None):
    encoding = 'big5hkscs'
    lines = []
    num_errors = 0
    #Need the 'rb' argument below when opening the file. This info, thanks to:
    #https://stackoverflow.com/questions/22216076/unicodedecodeerror-utf8-codec-cant-decode-byte-0xa5-in-position-0-invalid-s
    linecounter = 1

    for line in open(filepath, 'rb'):
        try:
            decodedLine = line.decode(encoding)
            cleanLine = ' '.join(decodedLine.split()) #replace double spaces with a single space
            cleanLine2 = re.sub(r"[\n]*", "", cleanLine) #get rid of newline character
            lines.append(cleanLine2)
        except UnicodeDecodeError as e:
            num_errors += 1
            print("error encountered at line", linecounter)
        linecounter = linecounter + 1
        
        if numLines is not None:
            if linecounter > numLines:
                break

    print('Encountered %d decoding errors.' % num_errors)
    # The `lines` list contains strings you can use.
    return lines


In [7]:
def createAlphabet(lines, unk_token, start_of_line_token, end_of_line_token):

    unique_chinese_chars_set = set()

    for line in lines:
        somestr = re.sub(r"[\n\t\s]*", "", line) #get rid of whitespace characters
        for somechar in somestr:
            unique_chinese_chars_set.add(somechar)

    #Let's make sure the characters i want to use as special tokens don't already occur in the
    #training data

    if start_of_line_token in unique_chinese_chars_set:
        print("error. The char", start_of_line_token, "which I use as a special token, appears as")
        print("a regular char in the datafile. Do not proceed!")

    if end_of_line_token in unique_chinese_chars_set:
        print("error. The char", end_of_line_token, "which I use as a special token, appears as")
        print("a regular char in the datafile. Do not proceed!")
        
    chinese_alphabet = [unk_token] + [start_of_line_token] + [end_of_line_token] + list(unique_chinese_chars_set) #we cast the set into a list here
    return chinese_alphabet
    

In [8]:
def buildSparseMatrix(lines, start_of_line_token, end_of_line_token, unk_token):
    row = np.array([])
    col = np.array([])
    humanLabelsNumpy = np.array([])
    sparseMatrixRowPtr = 0
    lineNumber = 0

    for someLineOriginal in lines:
        if (lineNumber % 1000 == 0):
            print("processing line", lineNumber, "...")

        someLine = start_of_line_token + someLineOriginal #we need a start of line token to form proper feature vector for 
        #first char in a line

        someLine = someLine + end_of_line_token #And we need a end of sentence token to form proper feature vector for
        #the last char in a line
        #print("Here is a line from the file:", someLine)

        someLineSansSpaces = eliminate_whitespace(someLine)
        all_the_4grams = []

        if len(someLine) >=3:
            all_the_4grams = find_all_4grams(someLineSansSpaces)
        else:
            print("An error occurred. Some line in the training file is either blank or has just a")
            print("single Chinese character on it without a punctuation mark at the end. This breaks")
            print("an important assumption I made about the data.")
            sys.exit()

        #print("Here are all its 4grams:", all_the_4grams)
        labels = createLabels(someLine, all_the_4grams)
        labels = np.asarray(labels)
        #print("Here are the labels for the 4grams:", labels)

        humanLabelsNumpy = np.concatenate((humanLabelsNumpy, labels))
        #print("the labels for the 4grams:", labels)
        #print("------------------")
        for i in range(len(all_the_4grams)):
            ngram = all_the_4grams[i]
            #an ngram will look like ABCD for instance
            strFeatureVector = create5dStrFeatureVector(ngram)
            #print("Here is the 5d feature vector string form:", strFeatureVector)
            intFeatureVector = create5dIntFeatureVector(strFeatureVector, chinese_alphabet, unk_token)
            #print("Here is the 5d feature vector int form:", intFeatureVector)
            #print("---------")
            intFeatureVectorNumpy = np.asarray(intFeatureVector)

            temp = np.ones((5), dtype=int) * sparseMatrixRowPtr
            row = np.concatenate((row, temp)) #which row in the sparse matrix we're building right now
            col = np.concatenate((col, intFeatureVectorNumpy)) #the col indexes that will have a 1 in them

            sparseMatrixRowPtr = sparseMatrixRowPtr + 1

        lineNumber = lineNumber + 1

    print("The number of 5d feature vectors built was:", sparseMatrixRowPtr)
    #print(humanLabelsNumpy.shape)
    #print(row.shape) #row and col should have same length
    #print(col.shape)

    #We can now build the sparse matrix
    lengthAlphabet = len(chinese_alphabet)
    lengthBigramPartOfVector = lengthAlphabet * lengthAlphabet
    lengthUnigramPartOfVector = lengthAlphabet
    lengthOneHotVector = lengthBigramPartOfVector + lengthUnigramPartOfVector

    data = np.ones_like(row) #We're building one-hot vector so non-zero entries in the matrix will have
    #values of 1
    
    sparseOneHotMatrix = csr_matrix((data, (row, col)), shape=(sparseMatrixRowPtr, lengthOneHotVector))
    dtrain = xgb.DMatrix(sparseOneHotMatrix,label=humanLabelsNumpy) #need to convert the sparse matrix 
    #to a dmatrix that the xgboost module understands

    return (dtrain,humanLabelsNumpy)


In [9]:
def evaluateResults(classifierLabels, humanLabels):
    assert len(classifierLabels) == len(humanLabels)
    #these two lists have to have the same length
    
    numPredictions = len(classifierLabels)
    truePositives = 0
    trueNegatives = 0

    falsePositives = 0
    falseNegatives = 0
    numCorrect = 0
    numWrong = 0

    for i in range(len(humanLabels)):
        if (humanLabels[i]==1):
            if (classifierLabels[i] == 1):
                truePositives = truePositives + 1
                numCorrect = numCorrect + 1
            elif(classifierLabels[i] == 0):
                falseNegatives = falseNegatives + 1
                numWrong = numWrong + 1
        elif(humanLabels[i]==0):
            if (classifierLabels[i] == 0):
                trueNegatives = trueNegatives + 1
                numCorrect = numCorrect + 1
            elif(classifierLabels[i]==1):
                falsePositives = falsePositives + 1
                numWrong = numWrong + 1


    #print("true positives:", truePositives)
    #print("false negatives:", falseNegatives)
    #print("false positives:", falsePositives)
    #print()

    accuracy = numCorrect/numPredictions
    precision = truePositives/(truePositives + falsePositives)
    recall = truePositives/(truePositives + falseNegatives)
    f1 = 2*precision*recall/(precision+recall)
    
    return (accuracy,precision,recall, f1)


### Let's load the training set into a list called lines. Each line of the file becomes an item in this list.
There are 745,806 (clean) lines in it.

In [21]:
numberLinesForTrainAndDevSets = 100000
numTrainLines = 90000
numDevLines = 10000


In [22]:
trainAndDevLines = loadChineseDataFile("./data/training.txt", numberLinesForTrainAndDevSets)
print(len(trainAndDevLines))


Encountered 0 decoding errors.
100000


In [23]:
trainLines = trainAndDevLines[0:numTrainLines]
devLines = trainAndDevLines[-numDevLines:]
print(len(trainLines))
print(len(devLines))


90000
10000


### Let's take a look at the first few lines of the file that we loaded into the list called lines

In [24]:
for line in trainLines[0:9]:
    print(line)
    
#As you can see below, each line ends with a punctuation mark and the punctuation mark has 
#a whitespace before it. I'm assuming every line in the file is like this. An error will occur
#otherwise.

時間 ：
三月 十日 （ 星期四 ） 上午 十時 。
地點 ：
學術 活動 中心 一樓 簡報室 。
主講 ：
民族所 所長 莊英章 先生 。
講題 ：
閩 、 台 漢人 社會 研究 的 若干 考察 。
李 院長 於 二月 二十六日 至 三月 十五日 赴 美 訪問 ，


Line 8 up above is a special case that needs to be addressed. The first character in that line is a word but there's no character to the left of it so how would a 5d feature vector be built here to discover that a space should come after that first char? <br /> <br />
Here's an example with English letters to clarify what I mean. Let's say the stream of characters on a line is: <br /> 
ABCDEFGH <br /> 
and the correct spacing is: <br /> 
A BCDEFGH <br />

There isn't a character to the left of A so how would be build that 5 dimensional feature vector for A and find out that a space should come right after it? The answer is we need a start of line token. Let's use the carrot character ^ as the start of line token. <br /> <br />
A similar argument can be made for the importance of an end-of-line token - how it's needed to determine if a space should appear before the last character in a line.
We manually add these special tokens to the alphabet in the function createAlphabet()

### Let's create the alphabet here. We add all the characters in a data file to a set in order to get rid of duplicates. We then convert the set to a list.

In [25]:
unk_token = '*' #The '*' char will represent an unknown char (i.e. unseen in the training data)
start_of_line_token = '^' #The '^' char will represent the start of a sentence
end_of_line_token = '$'
chinese_alphabet = createAlphabet(trainAndDevLines, unk_token, start_of_line_token, end_of_line_token)


In [26]:
print(len(chinese_alphabet))


4660


In [27]:
#take a look at the first few characters in the alphabet
print(chinese_alphabet[0:10])


['*', '^', '$', '鈴', '甦', '特', '藻', '並', '燹', '暨']


In [28]:
joblib.dump(chinese_alphabet, 'chinese_alphabet.pickle', compress=True)


['chinese_alphabet.pickle']

### Now let's create the feature vector for each 4gram and its label (i.e. let's create the training, dev and test sets)


In [29]:
(dtrain, ytrain) = buildSparseMatrix(trainLines, start_of_line_token, end_of_line_token, unk_token)


processing line 0 ...
processing line 1000 ...
processing line 2000 ...
processing line 3000 ...
processing line 4000 ...
processing line 5000 ...
processing line 6000 ...
processing line 7000 ...
processing line 8000 ...
processing line 9000 ...
processing line 10000 ...
processing line 11000 ...
processing line 12000 ...
processing line 13000 ...
processing line 14000 ...
processing line 15000 ...
processing line 16000 ...
processing line 17000 ...
processing line 18000 ...
processing line 19000 ...
processing line 20000 ...
processing line 21000 ...
processing line 22000 ...
processing line 23000 ...
processing line 24000 ...
processing line 25000 ...
processing line 26000 ...
processing line 27000 ...
processing line 28000 ...
processing line 29000 ...
processing line 30000 ...
processing line 31000 ...
processing line 32000 ...
processing line 33000 ...
processing line 34000 ...
processing line 35000 ...
processing line 36000 ...
processing line 37000 ...
processing line 38000 ...

In [30]:
#Now let's create the validation set
(dvalid, yvalid) = buildSparseMatrix(devLines, start_of_line_token, end_of_line_token, unk_token)


processing line 0 ...
processing line 1000 ...
processing line 2000 ...
processing line 3000 ...
processing line 4000 ...
processing line 5000 ...
processing line 6000 ...
processing line 7000 ...
processing line 8000 ...
processing line 9000 ...
The number of 5d feature vectors built was: 121960


### Let's implement and train an xgboost decision tree classifier on the data

In [32]:
#info from https://www.kdnuggets.com/2017/03/simple-xgboost-tutorial-iris-dataset.html

num_round = 40  # the number of training iterations
param = {
    'max_depth': 20,  # the maximum depth of each tree
    'eta': 0.3,  # the training step for each iteration
    'silent': 1,  # logging mode - quiet
    'objective': 'binary:logistic' # learning objective is binary classification
     }  

param['eval_metric'] = 'error'
#note that you are not supposed to use num_class with 'binary:logistic'

In [33]:
print("Training for", num_round, "epochs")


Training for 40 epochs


In [34]:
evallist = [(dvalid, 'eval'), (dtrain, 'train')]
bst = xgb.train(param, dtrain, num_round, evallist)


[0]	eval-error:0.31726	train-error:0.324748
[1]	eval-error:0.294228	train-error:0.29754
[2]	eval-error:0.273073	train-error:0.274109
[3]	eval-error:0.262193	train-error:0.262916
[4]	eval-error:0.249606	train-error:0.244611
[5]	eval-error:0.24135	train-error:0.236876
[6]	eval-error:0.232101	train-error:0.22577
[7]	eval-error:0.226722	train-error:0.218999
[8]	eval-error:0.218112	train-error:0.20735
[9]	eval-error:0.211504	train-error:0.197408
[10]	eval-error:0.196015	train-error:0.18136
[11]	eval-error:0.192899	train-error:0.178059
[12]	eval-error:0.191432	train-error:0.176252
[13]	eval-error:0.188972	train-error:0.175557
[14]	eval-error:0.185643	train-error:0.169669
[15]	eval-error:0.181076	train-error:0.164024
[16]	eval-error:0.1786	train-error:0.161765
[17]	eval-error:0.17714	train-error:0.160921
[18]	eval-error:0.175558	train-error:0.159366
[19]	eval-error:0.172614	train-error:0.156471
[20]	eval-error:0.170326	train-error:0.154205
[21]	eval-error:0.169392	train-error:0.153121
[22]	ev

In [35]:
#let's save the model
joblib.dump(bst, 'bst.pickle', compress=True)


['bst.pickle']

### Let's evaluate the trained model
You can load the trained model and its alphabet in the next cell if you need to

In [48]:
bst = joblib.load('chinese-xgb-model-final.pickle') #load the saved model
chinese_alphabet = joblib.load('chinese-alphabet-final.pickle') #load the saved alphabet


In [49]:
#Let's create the test set
testLines = loadChineseDataFile("./data/test.txt")
(dtest, ytest) = buildSparseMatrix(testLines, start_of_line_token, end_of_line_token, unk_token)


Encountered 0 decoding errors.
processing line 0 ...
processing line 1000 ...
The number of 5d feature vectors built was: 17345


In [50]:
preds = bst.predict(dtest)
print(preds)


[0.8608776  0.3243633  0.3243633  ... 0.3243633  0.3243633  0.99747854]


In [51]:
predsBinary = (preds >=0.5)
print(len(predsBinary))
print(predsBinary)


17345
[ True False False ... False False  True]


In [52]:
print(ytest)


[1. 0. 0. ... 1. 0. 1.]


In [53]:
acc,prec,recall,f1 = evaluateResults(predsBinary, ytest)
print("accuracy:", acc)
print("precision:", prec)
print("recall:", recall)
print("F1 score:", f1)


accuracy: 0.8504468146439896
precision: 0.9468858324946886
recall: 0.7998488712572023
F1 score: 0.867178699436764
