In [1]:
import re
import sys
import numpy as np
from scipy.sparse import csr_matrix
import xgboost as xgb
from sklearn.externals import joblib


In [2]:
# Single-character markers shared by the feature-building and prediction code below.
start_of_line_token = '^'  # placed in front of a line so its first char has a left neighbour
end_of_line_token = '$'    # placed after a line so its last char has a right neighbour
unk_token = '*'            # substituted for any char missing from the alphabet (i.e. unseen in training)


In [3]:
def eliminate_whitespace(somestr):
    """Return ``somestr`` with every whitespace character removed.

    Args:
        somestr: the text to strip (spaces, tabs, newlines and any other
            character matched by the regex class ``\\s`` are dropped).

    Returns:
        A new string containing only the non-whitespace characters, in their
        original order.
    """
    # `\s` already includes `\n` and `\t`, and `+` (unlike `*`) never matches
    # the empty string, so the regex engine only fires on real whitespace runs.
    return re.sub(r"\s+", "", somestr)

def find_all_4grams(line_of_chars):
    """Return every contiguous 4-character window of ``line_of_chars``.

    Windows are produced left to right, so a sequence of ``n`` characters
    yields ``n - 3`` windows. Inputs shorter than 4 characters yield an
    empty list.
    """
    window = 4
    last_start = len(line_of_chars) - window  # index at which the final window begins
    return [line_of_chars[start:start + window] for start in range(last_start + 1)]

def createLabels(someLine, list_of_4grams):
    """Produce one 0/1 label per 4-gram from a line that still contains spaces.

    For the 4-gram at position ``p`` the label is 1 when the character at
    index ``p + 2`` of the working copy of ``someLine`` is a space, else 0.
    A space that has been counted is removed from the working copy so later
    positions stay aligned with the space-free text.

    Args:
        someLine: the line including its spaces (presumably already wrapped
            in the start/end tokens -- confirm against the caller).
        list_of_4grams: the 4-grams of the space-free version of the line;
            only their count is used here.

    Returns:
        A list of ints (0 or 1), one per 4-gram.
    """
    chars = list(someLine)
    labels = []
    for position, _ in enumerate(list_of_4grams):
        boundary = position + 2
        followedBySpace = chars[boundary] == ' '
        labels.append(1 if followedBySpace else 0)
        if followedBySpace:
            # Drop the space we just accounted for.
            del chars[boundary]
    return labels


def findIndexOfBigram(bigram, alphabet, unk_token):
    """Map a two-character sequence to a unique integer.

    Each character is replaced by ``unk_token`` if it is not in ``alphabet``;
    the result is ``index(first) * len(alphabet) + index(second)``, i.e. the
    bigram read as a two-digit number in base ``len(alphabet)``.

    Raises:
        ValueError: if a character has to fall back to ``unk_token`` and
            ``unk_token`` itself is not in ``alphabet``.
    """
    base = len(alphabet)
    positions = []
    for char in (bigram[0], bigram[1]):
        symbol = char if char in alphabet else unk_token
        positions.append(alphabet.index(symbol))
    high, low = positions
    return high * base + low

def findBigramOfIndex(index, alphabet):
    """Invert the bigram indexing: turn an integer back into its two characters.

    ``index`` is split into quotient and remainder by ``len(alphabet)``; the
    quotient selects the first character and the remainder the second.
    """
    first_pos, second_pos = divmod(index, len(alphabet))
    return alphabet[first_pos] + alphabet[second_pos]

def findIndexOfUnigram(unigram, alphabet, unk_token):
    """Return the position of ``unigram`` in ``alphabet``.

    A character that is not in ``alphabet`` is looked up as ``unk_token``
    instead.
    """
    symbol = unk_token if unigram not in alphabet else unigram
    return alphabet.index(symbol)

def findUnigramOfIndex(index, alphabet):
    """Return the alphabet entry stored at position ``index``."""
    return alphabet[index]


def create5dStrFeatureVector(fourGram):
    """Build the five string features for a 4-gram ``ABCD``.

    Returns:
        ``[AB, B, BC, C, CD]`` -- three overlapping bigrams interleaved with
        the two middle unigrams.
    """
    first, second, third, fourth = (fourGram[k] for k in range(4))
    return [
        first + second,
        second,
        second + third,
        third,
        third + fourth,
    ]

def create5dIntFeatureVector(strFeatureVector, alphabet, unk_token):
    """Convert the five string features into column indices.

    Bigram features map into ``[0, len(alphabet)**2)`` via
    ``findIndexOfBigram``; unigram features map via ``findIndexOfUnigram``
    and are shifted by ``len(alphabet)**2`` so they land after the bigram
    range. All three bigram features share the same index range, as do both
    unigram features.

    Args:
        strFeatureVector: the 5-item list ``[bigram, unigram, bigram,
            unigram, bigram]`` from ``create5dStrFeatureVector``.
        alphabet: the sequence of known characters.
        unk_token: replacement for characters not in ``alphabet``.

    Returns:
        A list of five ints in the same order as the input features.
    """
    unigramOffset = len(alphabet) * len(alphabet)
    leftBigram, leftUnigram, middleBigram, rightUnigram, rightBigram = strFeatureVector
    return [
        findIndexOfBigram(leftBigram, alphabet, unk_token),
        findIndexOfUnigram(leftUnigram, alphabet, unk_token) + unigramOffset,
        findIndexOfBigram(middleBigram, alphabet, unk_token),
        findIndexOfUnigram(rightUnigram, alphabet, unk_token) + unigramOffset,
        findIndexOfBigram(rightBigram, alphabet, unk_token),
    ]


def loadChineseDataFile(filepath, numLines=None):
    """Read a Big5-HKSCS encoded text file into a list of cleaned lines.

    Each line is decoded individually; lines that fail to decode are
    reported on stdout, counted and skipped. In every kept line, runs of
    whitespace are collapsed to a single space and leading/trailing
    whitespace (including the newline) is removed.

    Args:
        filepath: path of the file to read.
        numLines: if given, stop after this many physical lines of the file
            have been examined (undecodable lines count towards the limit).
            At least one line is always examined.

    Returns:
        A list of the successfully decoded, cleaned lines.
    """
    encoding = 'big5hkscs'
    lines = []
    num_errors = 0

    # Open in 'rb' mode and decode per line, so one undecodable line does not
    # abort the whole read. Background:
    # https://stackoverflow.com/questions/22216076/unicodedecodeerror-utf8-codec-cant-decode-byte-0xa5-in-position-0-invalid-s
    # The `with` block guarantees the file handle is closed on every exit path.
    with open(filepath, 'rb') as datafile:
        for linecounter, line in enumerate(datafile, start=1):
            try:
                decodedLine = line.decode(encoding)
            except UnicodeDecodeError:
                num_errors += 1
                print("error encountered at line", linecounter)
            else:
                # split() + join collapses whitespace runs and also strips the
                # trailing newline, so no separate newline removal is needed.
                lines.append(' '.join(decodedLine.split()))

            if numLines is not None and linecounter >= numLines:
                break

    print('Encountered %d decoding errors.' % num_errors)
    return lines


def insertSpecialTokens(someLine, start_of_line_token, end_of_line_token):
    """Wrap ``someLine`` in the start-of-line and end-of-line markers.

    The markers give the first and last real characters of the line a
    neighbour on each side, so a full feature vector can be formed for them.
    """
    return start_of_line_token + someLine + end_of_line_token


def evaluateResults(classifierLabels, humanLabels):
    """Compare predicted 0/1 labels with reference 0/1 labels.

    Args:
        classifierLabels: sequence of predicted labels.
        humanLabels: sequence of reference labels, same length as
            ``classifierLabels``.

    Returns:
        A tuple ``(accuracy, precision, recall)``. A ratio whose denominator
        is zero (no predictions at all, no predicted positives, or no
        reference positives) is reported as ``0.0`` rather than raising
        ``ZeroDivisionError``.

    Raises:
        AssertionError: if the two sequences differ in length.
    """
    # These two sequences have to have the same length.
    assert len(classifierLabels) == len(humanLabels)

    numPredictions = len(classifierLabels)
    truePositives = 0
    trueNegatives = 0
    falsePositives = 0
    falseNegatives = 0

    for predicted, actual in zip(classifierLabels, humanLabels):
        if actual == 1:
            if predicted == 1:
                truePositives += 1
            elif predicted == 0:
                falseNegatives += 1
        elif actual == 0:
            if predicted == 0:
                trueNegatives += 1
            elif predicted == 1:
                falsePositives += 1
        # Pairs containing any value other than 0 or 1 are counted as neither
        # correct nor wrong, but still contribute to numPredictions.

    def _ratio(numerator, denominator):
        # Guard against empty input / absent positive class.
        return numerator / denominator if denominator else 0.0

    accuracy = _ratio(truePositives + trueNegatives, numPredictions)
    precision = _ratio(truePositives, truePositives + falsePositives)
    recall = _ratio(truePositives, truePositives + falseNegatives)

    return (accuracy, precision, recall)

In [4]:
def makePredictions(lines, start_of_line_token, end_of_line_token, unk_token):
    """Run the loaded model over every 4-gram of every line.

    Each line is wrapped in the start/end tokens, cut into 4-grams, and each
    4-gram is turned into five column indices. All 4-grams of all lines are
    stacked into one sparse matrix (one row per 4-gram), which is handed to
    the model in a single ``predict`` call.

    Reads two names from the enclosing notebook namespace: ``chinese_alphabet``
    (the character alphabet) and ``bst`` (the trained model). Both must be
    defined before this function is called.

    Args:
        lines: iterable of space-free text lines.
        start_of_line_token: marker prepended to each line.
        end_of_line_token: marker appended to each line.
        unk_token: replacement for characters not in the alphabet.

    Returns:
        Whatever ``bst.predict`` returns for the stacked matrix -- one entry
        per 4-gram, in line order (presumably a probability per position;
        confirm against the model's objective).

    Raises:
        SystemExit: if a line is shorter than 3 characters after the tokens
            are added.
    """
    lengthAlphabet = len(chinese_alphabet)
    # Column layout: all bigram indices first, then the unigram indices.
    lengthOneHotVector = lengthAlphabet * lengthAlphabet + lengthAlphabet

    # Collect indices in plain lists and convert once at the end; calling
    # np.concatenate for every 4-gram re-copies the whole array each time.
    rowIndices = []
    colIndices = []
    sparseMatrixRowPtr = 0  # next free row == number of 4-grams seen so far

    for lineNumber, someLineOriginal in enumerate(lines):
        if lineNumber % 1000 == 0:
            print("processing line", lineNumber, "...")

        someLine = insertSpecialTokens(someLineOriginal, start_of_line_token, end_of_line_token)

        if len(someLine) < 3:
            print("An error occurred. Some line in the training file is either blank or has just a")
            print("single Chinese character on it without a punctuation mark at the end. This breaks")
            print("an important assumption I made about the data.")
            sys.exit()

        for ngram in find_all_4grams(someLine):
            # An ngram looks like ABCD, for instance.
            strFeatureVector = create5dStrFeatureVector(ngram)
            intFeatureVector = create5dIntFeatureVector(strFeatureVector, chinese_alphabet, unk_token)

            # Every feature of this 4-gram lives in the same matrix row.
            rowIndices.extend([sparseMatrixRowPtr] * len(intFeatureVector))
            colIndices.extend(intFeatureVector)
            sparseMatrixRowPtr += 1

    # Integer index arrays (the original built them as float64 by seeding
    # the concatenation with an empty float array).
    row = np.asarray(rowIndices, dtype=np.int64)
    col = np.asarray(colIndices, dtype=np.int64)
    # Every recorded (row, col) entry carries the value 1. Kept as float64 to
    # match the values the original code passed to the model.
    # NOTE(review): repeated (row, col) pairs are expected to be summed by
    # csr_matrix, as with the original construction -- confirm in SciPy docs.
    data = np.ones(len(row), dtype=np.float64)

    sparseOneHotMatrix = csr_matrix((data, (row, col)), shape=(sparseMatrixRowPtr, lengthOneHotVector))
    # Convert the sparse matrix to the DMatrix type the xgboost module understands.
    dX = xgb.DMatrix(sparseOneHotMatrix)

    return bst.predict(dX)


In [5]:
def formatStream(someStreamOfChars, predictions):
    """Re-insert spaces into an unbroken character stream.

    For each position ``i`` covered by ``predictions``, character ``i`` is
    emitted, followed by a space when ``predictions[i] == 1``. The final
    character of the stream is then appended.

    Args:
        someStreamOfChars: the space-free text; must be non-empty.
        predictions: sequence of 0/1 flags (presumably one fewer than the
            number of characters -- confirm against the caller).

    Returns:
        The text with spaces inserted after the flagged characters.
    """
    pieces = []
    for position, prediction in enumerate(predictions):
        pieces.append(someStreamOfChars[position])
        if prediction == 1:
            pieces.append(' ')
    pieces.append(someStreamOfChars[-1])
    return "".join(pieces)


### Let's see how the classifier formats some sentences from the test file (test.txt)

In [6]:
# Load the two artefacts that makePredictions reads from the notebook's global
# namespace: `bst` (the model) and `chinese_alphabet` (the character alphabet).
# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code -- only load pickle files that come from a trusted source.
print("Loading trained xgboost decision tree. This will take a few minutes...")
bst = joblib.load('chinese-xgb-model-final.pickle') #load the saved model (used as a global by makePredictions)

print("Loading the chinese alphabet it uses...")
chinese_alphabet = joblib.load('chinese-alphabet-final.pickle') #load the saved alphabet (used as a global by makePredictions)


Loading trained xgboost decision tree. This will take a few minutes...
Loading the chinese alphabet it uses...


In [7]:
# Read the human-segmented test sentences; each entry keeps its single spaces
# between words so it can be shown next to the classifier's output.
testLines = loadChineseDataFile("./data/test.txt")


Encountered 0 decoding errors.


In [9]:
# Demo: for the first 10 test sentences, strip the human-inserted spaces, let
# the classifier re-segment the unbroken text, and print both versions.
print("Let's see how the computer formats the first 10 sentences in the test set.")
print("In the pairs of sentences you'll see below, the first sentence will be the way")
print("a human formatted the Chinese sentence and the second sentence will be the way")
print("the computer formats it:")
print("--------------------------")

for someSentence in testLines[0:10]:
    print("Getting rid of spaces in a line from the test set and sending the unbroken stream of")
    print("chars to the classifier for formatting...")
    someSentenceWithoutSpaces = eliminate_whitespace(someSentence)
    # makePredictions expects a list of lines, so wrap the single sentence.
    preds = makePredictions([someSentenceWithoutSpaces], start_of_line_token, end_of_line_token, unk_token)
    # Threshold the model output at 0.5: values >= 0.5 become 1 ("insert a space").
    predsBinary = (preds >= 0.5)
    predsBinary = predsBinary.astype(int)
    print(predsBinary)
    formattedSentence = formatStream(someSentenceWithoutSpaces, predsBinary)

    # 'h:' is the human-segmented line, 'c:' is the classifier's segmentation.
    print('h:', someSentence)
    print('c:', formattedSentence)
    print("--------------------------")
    

Let's see how the computer formats the first 10 sentences in the test set.
In the pairs of sentences you'll see below, the first sentence will be the way
a human formatted the Chinese sentence and the second sentence will be the way
the computer formats it:
--------------------------
Getting rid of spaces in a line from the test set and sending the unbroken stream of
chars to the classifier for formatting...
processing line 0 ...
[1 0 0 1 1 0 0 0 1 1 1]
h: 對 基督教 、 天主教徒 而 言 ，
c: 對 基督教 、 天主教徒 而 言 ，
--------------------------
Getting rid of spaces in a line from the test set and sending the unbroken stream of
chars to the classifier for formatting...
processing line 0 ...
[0 0 1 1 1 0 1 1 0 1 1 0 0 0 1 1 1 1 0 1]
h: 聖誕節 不 是 繽紛 的 裝飾 、 聖誕 大餐 或 舞會 狂歡 ，
c: 聖誕節 不 是 繽紛 的 裝飾 、 聖誕大餐 或 舞 會 狂歡 ，
--------------------------
Getting rid of spaces in a line from the test set and sending the unbroken stream of
chars to the classifier for formatting...
processing line 0 ...
[0 1 0 0 1 1 0 1 1 0 0 0 0 0 1