In [1]:
import numpy as np
import pandas as pd
import scipy.sparse
import math
import statistics
from statistics import mode

In [2]:

#load sparse matrix
sparse_matrix = scipy.sparse.load_npz('training.npz')
matrix = sparse_matrix.toarray()
#print specific values

#row 12 if the word 'of', so most news stories have it
# print('matrix[1000][12]')
# print(matrix[1000][12])

In [20]:
#variables

f = open("vocabulary.txt", "r") 

#vocab length
V = len(f.readlines())

#beta and alpha values
#beta = (1 / V)
beta = .25
alpha = 1 + beta

#shape of matrix rows
shape0 = matrix.shape[0]


#turn far right col into its own matrix
ys = np.zeros(shape=(matrix.shape[0],1))
for i in range(shape0):
    ys[i][0] = matrix[i][-1]
#create a dictionary of the 20 labels and their occurences    
unique, counts = np.unique(ys, return_counts=True)
counts = dict(zip(unique, counts))


columnIndex = -1 
# Sort 2D numpy array by 2nd Column

#recreate matrix but sorted by last col value
sortedArr = matrix[matrix[:,columnIndex].argsort()]


#create list of each of the 20 classifiers positions in the matrix
amounts = []
counter = 0
index = 1
for i in range(sortedArr.shape[0]):
    if(index == sortedArr[i][-1]):
        counter+=1
    elif(i == sortedArr.shape[0] - 1):
        amounts.append([index,counter])       
    else:
        amounts.append([index,counter])
        index = sortedArr[i][-1]
        counter += 1
#amounts is the amount of each label in each doc (list of lists)        
amounts.append([20,sortedArr.shape[0]])
print(amounts)


[[1, 483], [2, 1107], [3, 1729], [4, 2372], [5, 2974], [6, 3604], [7, 4222], [8, 4836], [9, 5485], [10, 6113], [11, 6759], [12, 7398], [13, 8024], [14, 8645], [15, 9282], [16, 9933], [17, 10513], [18, 11106], [19, 11573], [20, 12000]]


In [21]:
#MLE function
#takes in a new label labelY (0-20)
#returns how many docs labeled the same as labelY / total docs
def MLE(labelY):
    #counts is a dictionary of label occurences
    return counts[labelY + 1]/shape0
    
#MLE will have to change for testing set    


#main purpose is to find the amount of uses of a wordI in labelY
#and the total words in labelY
def MAP(wordI,labelY):
    totalWords = 0
    totalY = 0
    #if label is 1, its a different algorithm
    if(labelY == 1):
        #basically sum the sorted array from for the column wordI for the label 1
        slice1 = amounts[0][1]
        totalWords = sum(sum(sortedArr[:slice1,[wordI]]))
        #sum all words for label 1
        totalY = sum(sum(sortedArr[:slice1,1:-2]))
    else: 
        slice1 = amounts[labelY -2][1]
        slice2 = amounts[labelY - 1][1]
        #sum all wordI for label
        totalWords = sum(sum(sortedArr[slice1:slice2,[wordI]]))
        #sum all words in the label, except first and last
        totalY = sum(sum(sortedArr[slice1:slice2,1:-2]))
    return (totalWords + beta) / (totalY + beta * V)

'''
#same as map
#but returns the totalwords and totalY
#done to save the results of out classifycations, but run them with different beta values
def altMAP(wordI,labelY):
    totalWords = 0
    totalY = 0
    #if label is 1, its a different algorithm
    if(labelY == 1):
        #basically sum the sorted array from for the column wordI for the label 1
        slice1 = amounts[0][1]
        totalWords = sum(sum(sortedArr[:slice1,[wordI]]))
        #sum all words for label 1
        totalY = sum(sum(sortedArr[:slice1,1:-2]))
    else: 
        slice1 = amounts[labelY -2][1]
        slice2 = amounts[labelY - 1][1]
        #sum all wordI for label
        totalWords = sum(sum(sortedArr[slice1:slice2,[wordI]]))
        #sum all words in the label, except first and last
        totalY = sum(sum(sortedArr[slice1:slice2,1:-2]))
    return [wordI, labelY, totalWords, totalY]
'''


def classify(wordI):
    wordCount = matrix[:, wordI].sum()
    arr = []
    #loop through all 20 labels        
    for i in range(20):
        #take the MLE and MAP for all labels
        mp = math.log(MAP(wordI,i),2)
        mle = MLE(i)
        if(mle > 0):
            mle = math.log(mle,2)
        #add to list    
        arr.append(mle + (wordCount * mp) )   
    #return the most likely label     
    idx = arr.index(max(arr)) 
    return idx + 1

    

dic = {}
#helper function
#classifies all words in a given doc
def allWordsClassify(row):
    
    #take row and get all non zero elements 
    areRow = matrix[row-1,:]
    indexes = np.nonzero(areRow)
    pred = []
    #loop through all words in doc
    for i in indexes:
        for j in i:
            #if we have classified the word then dont bother classifying again 
            if j in dic:
                pred.append(dic[j])
            else:
                cls = classify(j)
                pred.append(cls)
                dic[j] = cls
        break    
    #return most highest predicted label    
    return mode(pred)


In [None]:
%%time
'''
This section is to classify a word, and store it in our text file
Once a word is classified and stored, we never need to classify it again
since that is the major source of slowdown

Note this code is very very slow. It takes around 8 hours to classify every word
But once this is done, the documents can be classified instantly. 

'''
#file to write to 
f = open("words.txt", "a")

ls = []
start = 0
end = shape0
#loop through every word
for i in range(len(lines),matrix.shape[1]-1):
    #classify word, store it in text file for later
    x = classify(i)
    f.write(str(x) + '\n')
    print(str(x))
    #ls.append(x)
f.close()

In [93]:
'''
This section opens the text file with our classified words
It then takes all the words used in a given doc from the file 
Then it takes the mode (most used label), and adds it to a list 

'''
lines = []
#extract all classifications for words into a list
with open('words.txt') as file:
    lines = [line.rstrip('\n') for line in file]

f = open("classes5.txt", "a")   
print(len(lines))
print(matrix.shape[1])
classified = []
checking = []
#loop through every doc, and all words in doc
for i in range(matrix.shape[0] - 1):
    for j in range(matrix.shape[1] - 1):
        #if doc has that word
        if(matrix[i][j] != 0):
            #take the classification for word and store it
            checking.append(lines[j])
    #docs classification is most often classification from all words in doc        
    classified.append(mode(checking))
    #save data to file for later
    f.write(str(mode(checking)) + '\n')
    
    checking = []

f.close()


61190
61190


In [109]:
'''
Here we use the list created above with the label guess for each doc
Then get the actual label for each doc
Zip them together, check if we are correct
And return the % correct

'''
classes = []
#use data gathered above
with open('classes.txt') as file:
    classes = [line.rstrip('\n') for line in file]
    
start = 1
end = matrix.shape[0]
count = 0
actually = []
confusion = np.zeros((20, 20) , dtype=np.int64)
#loop through all docs and get actual label
for i in range(start,end):
    actually.append(matrix[i-1][-1])
    
paPairs = zip(classes,actually)
#loop through classifications and see if we are right
for prediction, actual in paPairs:
    if str(prediction) == str(actual):
        count+=1 
    #record value in confusion matrix    
    confusion[int(prediction) - 1][int(actual) - 1] += 1
    
    
print("Amount correct:")     
print(count)        
print("Percentage: ")
print(count / (end - start))

print("\n\nConfusion matrix:")
print(confusion)





amount correct:
8779
Percentage: 
0.7316443036919743


 Confusion matrix:
[[458   0   0  67 107  64   7  60  29  37  55 161 126 184  79 176  99  99
  228 326]
 [  0 485   0   0   5  20   0   0   1   0   2   7  13   5  13   0   0   0
    0   1]
 [  0   0 523   0   0   3   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [  0   0   0 528   0   0   4   1   0   1   1   0   9   1   0   1   0   0
    0   0]
 [  0   0   0   0 448   0   0   8   1   5   3   4  29   8   2   0   0   0
    0   0]
 [  0  28   0   0   0 520   0   0   0   0   0   5  10   2   0   0   0   2
    0   0]
 [  3  59  42   0   0   0 596   0   0  12   6  13  64  27  16   6   5   6
    7   0]
 [  0   6   5   8   0   0   0 514   0   0   4   5  14   3   2   4   2   3
    4   1]
 [  1   5  10   7   3   0   0   0 603   0   0  12  19   8   6   1   6   3
    8   3]
 [  5  10  16   9  20   6   0   0   0 547   0   0  28  34  19  12  12  12
   30   5]
 [  0   1   0   0   0   2   0   0   0   0 553   0   0   0   0   1   0   0
  