In [9]:
import numpy as np
import pandas as pd
import scipy.sparse
import math
import statistics
from statistics import mode

In [10]:

#load sparse matrix
sparse_matrix = scipy.sparse.load_npz('training.npz')
matrix = sparse_matrix.toarray()
#print specific values

#row 12 if the word 'of', so most news stories have it
# print('matrix[1000][12]')
# print(matrix[1000][12])

In [12]:
#variables

f = open("vocabulary.txt", "r") 

#vocab length
V = len(f.readlines())

#beta and alpha values
beta = 1 / V
alpha = 1 + beta

#shape of matrix rows
shape0 = matrix.shape[0]


#turn far right col into its own matrix
ys = np.zeros(shape=(matrix.shape[0],1))
for i in range(shape0):
    ys[i][0] = matrix[i][-1]
#create a dictionary of the 20 labels and their occurences    
unique, counts = np.unique(ys, return_counts=True)
counts = dict(zip(unique, counts))


columnIndex = -1 
# Sort 2D numpy array by 2nd Column

#recreate matrix but sorted by last col value
sortedArr = matrix[matrix[:,columnIndex].argsort()]


#create list of each of the 20 classifiers positions in the matrix
amounts = []
counter = 0
index = 1
for i in range(sortedArr.shape[0]):
    if(index == sortedArr[i][-1]):
        counter+=1
    elif(i == sortedArr.shape[0] - 1):
        amounts.append([index,counter])       
    else:
        amounts.append([index,counter])
        index = sortedArr[i][-1]
        counter += 1
#amounts is the amount of each label in each doc (list of lists)        
amounts.append([20,sortedArr.shape[0]])
print(amounts)


{1.0: 483, 2.0: 624, 3.0: 622, 4.0: 643, 5.0: 602, 6.0: 630, 7.0: 618, 8.0: 614, 9.0: 649, 10.0: 628, 11.0: 646, 12.0: 639, 13.0: 626, 14.0: 621, 15.0: 637, 16.0: 651, 17.0: 580, 18.0: 593, 19.0: 467, 20.0: 427}


In [13]:
%%time

#MLE function
#takes in a new label labelY (0-20)
#returns how many docs labeled the same as labelY / total docs
def MLE(labelY):
    #counts is a dictionary of label occurences
    return counts[labelY + 1]/shape0
    
#MLE will have to change for testing set    


#map
def MAP(wordI,labelY):
    totalWords = 0
    totalY = 0
    #if label is 1, its a different algorithm
    if(labelY == 1):
        #basically sum the sorted array from for the column wordI for the label 1
        slice1 = amounts[0][1]
        totalWords = sum(sum(sortedArr[:slice1,[wordI]]))
        #sum all words for label 1
        totalY = sum(sum(sortedArr[:slice1,1:-2]))
    else: 
        slice1 = amounts[labelY -2][1]
        slice2 = amounts[labelY - 1][1]
        #sum all wordI for label
        totalWords = sum(sum(sortedArr[slice1:slice2,[wordI]]))
        #sum all words in the label, except first and last
        totalY = sum(sum(sortedArr[slice1:slice2,1:-2]))
    return (totalWords + beta) / (totalY + beta * V)


def classify(wordI):
    wordCount = matrix[:, wordI].sum()
    arr = []
    #loop through all 20 labels        
    for i in range(20):
        #take the MLE and MAP for all labels
        mp = math.log(MAP(wordI,i),2)
        mle = MLE(i)
        if(mle > 0):
            mle = math.log(mle,2)
        #add to list    
        arr.append(mle + (wordCount * mp) )   
    #return the most likely label     
    idx = arr.index(max(arr)) 
    return idx + 1

    

dic = {}
#classifies all words in a given doc
def allWordsClassify(row):
    #take row and get all non zero elements 
    areRow = matrix[row-1,:]
    indexes = np.nonzero(areRow)
    pred = []
    #loop through all words in doc
    for i in indexes:
        for j in i:
            #if we have classified the word then dont bother classifying again 
            if j in dic:
                pred.append(dic[j])
            else:
                cls = classify(j)
                pred.append(cls)
                dic[j] = cls
        break    
    #return most highest predicted label    
    return mode(pred)




Wall time: 0 ns


In [14]:
%%time
ls = []
start = 100
end = 200
for i in range(start,end):
    ls.append(allWordsClassify(i))
    
    
actual = []
for i in range(start,end):
    actual.append(matrix[i - 1][-1])
count = 0
print("prediction:")
pred = [x - 1 for x in ls]
print(pred)
print("actual")
print(actual)
paPairs = zip(pred,actual)
for prediction, actual in paPairs:
    if prediction == actual:
        count+=1
print(count)        
print("Percentage: ")
print(count / (end - start))

prediction:
[6, 16, 7, 14, 16, 6, 18, 0, 18, 13, 10, 8, 10, 3, 7, 15, 3, 1, 10, 16, 18, 2, 13, 5, 4, 7, 16, 13, 18, 8, 6, 7, 12, 4, 5, 1, 7, 12, 3, 0, 10, 3, 10, 7, 16, 17, 11, 16, 7, 13, 16, 12, 14, 1, 5, 18, 18, 5, 8, 1, 10, 17, 4, 8, 3, 6, 1, 1, 13, 2, 18, 7, 10, 10, 11, 1, 14, 1, 7, 12, 10, 16, 9, 8, 1, 18, 4, 7, 5, 11, 6, 15, 0, 7, 12, 16, 16, 1, 6, 16]
actual
[6, 16, 15, 14, 16, 6, 18, 20, 18, 13, 10, 8, 10, 3, 7, 15, 3, 1, 10, 16, 11, 2, 13, 5, 4, 7, 20, 13, 18, 8, 6, 7, 12, 4, 5, 17, 3, 12, 3, 20, 15, 3, 10, 9, 16, 17, 11, 16, 4, 13, 16, 12, 14, 1, 5, 18, 6, 5, 8, 3, 2, 17, 7, 8, 3, 6, 16, 16, 13, 2, 18, 7, 10, 10, 11, 1, 14, 20, 15, 12, 10, 16, 9, 8, 1, 18, 4, 19, 5, 11, 6, 15, 20, 7, 12, 16, 16, 19, 6, 16]


NameError: name 'p' is not defined

79
Percentage: 
-99.605


In [None]:
#backups

    '''
    labelCount = 0
    for i in range(shape0):
        if(matrix[i][-1] == labelY):
            labelCount += 1
    return labelCount / shape0
    '''
    
'''
#MAP function
#takes in a wordI, which is 1 word used in the docs 
#takes in a labelY (0-20)
def MAP(wordI, labelY):
    wordCount = 0
    totalYCount = 0
    #create matrix of indexes of matrix with matching label
    x = np.where(ys == labelY)
    for num in x:
        for y in num:
            wordCount += matrix[y][wordI] 
            totalYCount += sum(matrix[y]) - matrix[y][0] - matrix[y][-1]
        break
            
    #return formula given
    print("wot")
    return (wordCount + beta) / (totalYCount + beta * V)


X = sortedArr[:amounts[0][1],[-1]]
#print(X)

X = sortedArr[amounts[0][1]:amounts[1][1],[-1]]
#print(X)
X = sortedArr[amounts[1][1]:amounts[2][1],[-1]]
#print(X)




print(betterMAP(100,1))

'''