# 4.5 Classifying text with Python

### 4.5.1 Prepare: making word vectors from text

In [4]:
def loadDataSet():
    postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                  ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                    ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                    ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                    ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                    ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0, 1, 0, 1, 0, 1] # 1 is abusive, 0 not abusive
    return postingList, classVec

def createVocabList(dataSet):
    vocabSet = set([]) # create empty set
    for document in dataSet:
        vocabSet = vocabSet | set(document) # union of the two sets
    return list(vocabSet) # convert to list and return

def setOfWords2Vec(vocabList, inputSet):
    returnVec = [0]*len(vocabList) # create a list of 0's
    for word in inputSet:
        if word in vocabList: # check if the word is in the vocab list
            returnVec[vocabList.index(word)] = 1 # set the index to 1
        else: print("the word: %s is not in my Vocabulary!" % word)
    return returnVec # return the vector representation of the input set

listOPosts, listClasses = loadDataSet() # load the dataset
print("Number of documents:", len(listOPosts))
print("Number of labels:", len(listClasses))
myVocabList = createVocabList(listOPosts) # create the vocab list
print(myVocabList) # print the vocab list
print(setOfWords2Vec(myVocabList, listOPosts[0])) # print the vector representation of the first document
print(setOfWords2Vec(myVocabList, listOPosts[3])) # print the vector representation of the fourth document

Number of documents: 6
Number of labels: 6
['mr', 'garbage', 'stop', 'dalmation', 'park', 'how', 'dog', 'love', 'quit', 'to', 'steak', 'food', 'maybe', 'take', 'worthless', 'stupid', 'ate', 'him', 'please', 'so', 'buying', 'cute', 'has', 'is', 'help', 'flea', 'I', 'not', 'problems', 'my', 'posting', 'licks']
[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0]
[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]


### 4.5.2 Train: calculating probabilities from word vectors

In [10]:
import numpy as np

def trainNB0(trainMatrix, trainCategory):
    numTrainDocs = len(trainMatrix) # number of training documents
    numWords = len(trainMatrix[0]) # number of words in the vocab list
    pAbusive = sum(trainCategory) / float(numTrainDocs) # probability of abusive class
    p0Num = np.ones(numWords) # initialize p0Num to a zero vector of size numWords
    p1Num = np.ones(numWords) # initialize p1Num to a zero vector of size numWords
    p0Denom = 2.0 # initialize p0Denom to 2.0 (to avoid zero division)
    p1Denom = 2.0 # initialize p1Denom to 2.0 (to avoid zero division)
    for i in range(numTrainDocs): # iterate through all training documents
        if trainCategory[i] == 1: # if the document is abusive
            p1Num += trainMatrix[i] # add the word vector to p1Num
            p1Denom += sum(trainMatrix[i]) # add the sum of the word vector to p1Denom
        else: # if the document is not abusive
            p0Num += trainMatrix[i] # add the word vector to p0Num
            p0Denom += sum(trainMatrix[i]) # add the sum of the word vector to p0Denom
    p1Vect = np.log(p1Num / p1Denom) # calculate log probability of abusive class
    p0Vect = np.log(p0Num / p0Denom) # calculate log probability of non-abusive class
    return p0Vect, p1Vect, pAbusive # return the probabilities

trainMat = [] # initialize the training matrix
for postinDoc in listOPosts: # iterate through all documents
    trainMat.append(setOfWords2Vec(myVocabList, postinDoc)) # append the vector representation of the document to the training matrix
    
p0V, p1V, pAb = trainNB0(np.array(trainMat), np.array(listClasses)) # train the Naive Bayes classifier
print(p0V) # print the log probability of non-abusive class
print(p1V) # print the log probability of abusive class
print(pAb) # print the probability of abusive class

[-2.56494936 -3.25809654 -2.56494936 -2.56494936 -3.25809654 -2.56494936
 -2.56494936 -2.56494936 -3.25809654 -2.56494936 -2.56494936 -3.25809654
 -3.25809654 -3.25809654 -3.25809654 -3.25809654 -2.56494936 -2.15948425
 -2.56494936 -2.56494936 -3.25809654 -2.56494936 -2.56494936 -2.56494936
 -2.56494936 -2.56494936 -2.56494936 -3.25809654 -2.56494936 -1.87180218
 -3.25809654 -2.56494936]
[-3.04452244 -2.35137526 -2.35137526 -3.04452244 -2.35137526 -3.04452244
 -1.94591015 -3.04452244 -2.35137526 -2.35137526 -3.04452244 -2.35137526
 -2.35137526 -2.35137526 -1.94591015 -1.65822808 -3.04452244 -2.35137526
 -3.04452244 -3.04452244 -2.35137526 -3.04452244 -3.04452244 -3.04452244
 -3.04452244 -3.04452244 -3.04452244 -2.35137526 -3.04452244 -3.04452244
 -2.35137526 -3.04452244]
0.5


### 4.5.3 Test: modifying the classifier for real-world conditions

In [11]:
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    p1 = sum(vec2Classify * p1Vec) + np.log(pClass1) # calculate the log probability of abusive class
    p0 = sum(vec2Classify * p0Vec) + np.log(1.0 - pClass1) # calculate the log probability of non-abusive class
    if p1 > p0: return 1 # if the log probability of abusive class is greater, return 1
    else: return 0 # otherwise, return 0
    
def testingNB():
    listOPosts, listClasses = loadDataSet() # load the dataset
    myVocabList = createVocabList(listOPosts) # create the vocab list
    trainMat = [] # initialize the training matrix
    for postinDoc in listOPosts: # iterate through all documents
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc)) # append the vector representation of the document to the training matrix
    p0V, p1V, pAb = trainNB0(np.array(trainMat), np.array(listClasses)) # train the Naive Bayes classifier
    testEntry = ['love', 'my', 'dalmation'] # test entry 1
    thisDoc = np.array(setOfWords2Vec(myVocabList, testEntry)) # convert to vector representation
    print(testEntry, 'classified as:', classifyNB(thisDoc, p0V, p1V, pAb)) # classify and print result
    testEntry = ['stupid', 'garbage'] # test entry 2
    thisDoc = np.array(setOfWords2Vec(myVocabList, testEntry)) # convert to vector representation
    print(testEntry, 'classified as:', classifyNB(thisDoc, p0V, p1V, pAb)) # classify and print result
    
testingNB() # call the testing function to test the classifier
    

['love', 'my', 'dalmation'] classified as: 0
['stupid', 'garbage'] classified as: 1


### 4.5.4 Prepare: the bag-of-words document model

In [None]:
def bagOfWords2Vec(vocabList, inputSet):
    returnVec = [0]*len(vocabList) # create a list of 0's
    for word in inputSet: # iterate through the input set
        if word in vocabList: # check if the word is in the vocab list
            returnVec[vocabList.index(word)] += 1 # increment the index by 1
    return returnVec # return the vector representation of the input set

# 4.6 Example: classifying spam email with naïve Bayes

### 4.6.1 Prepare: tokenizing text

### 4.6.2 Test: cross validation with naïve Bayes

In [16]:
import re

def textParse(bigString): # function to parse the text
    listOfTokens = re.split(r'\W*', bigString) # split the string into tokens
    return [tok.lower() for tok in listOfTokens if len(tok) > 2] # convert to lowercase and filter out short tokens

def spamTest():
    docList = [] # initialize the document list
    classList = [] # initialize the class list
    fullText = [] # initialize the full text list
    for i in range(1, 26): # iterate through the spam and ham directories
        wordList = textParse(open('email/spam/%d.txt' %i, encoding='utf-8', errors='ignore').read()) # read and parse the spam emails
        docList.append(wordList) # append to the document list
        fullText.extend(wordList) # extend the full text list with the word list
        classList.append(1) # append 1 for spam class
        wordList = textParse(open('email/ham/%d.txt' %i , encoding='utf-8', errors='ignore').read()) # read and parse the ham emails
        docList.append(wordList) # append to the document list
        fullText.extend(wordList) # extend the full text list with the word list
        classList.append(0) # append 0 for ham class
        
    vocabList = createVocabList(docList) # create the vocab list from the document list
    trainingSet = list(range(50)) # create a training set of size 50
    testSet = [] # initialize the test set
    for i in range(10): # iterate through 10 times to create test set
        randIndex = int(np.random.uniform(0, len(trainingSet))) # generate a random index
        testSet.append(trainingSet[randIndex]) # append to the test set
        del(trainingSet[randIndex]) # delete from the training set
        
    trainMat = [] # initialize the training matrix
    trainClasses = [] # initialize the training classes
    for docIndex in trainingSet: # iterate through all documents in training set
        trainMat.append(setOfWords2Vec(vocabList, docList[docIndex])) # append vector representation to training matrix
        trainClasses.append(classList[docIndex]) # append class label to training classes
        
    p0V, p1V, pSpam = trainNB0(np.array(trainMat), np.array(trainClasses)) # train the Naive Bayes classifier
    
    errorCount = 0.0 # initialize error count to 0.0
    
    for docIndex in testSet: # iterate through all documents in test set
        wordVector = setOfWords2Vec(vocabList, docList[docIndex]) # convert to  vector representation
        if classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1.0
            
    print("the error rate is:", float(errorCount) / len(testSet)) # print the error rate
    
spamTest() # call the spam test function to test the classifier

the error rate is: 0.6
