In [1]:
from scipy.io import loadmat
import numpy as np
from io import StringIO
import scipy as sci
from scipy import stats
from scipy.sparse import csc_matrix
from scipy.sparse import csr_matrix
from scipy.sparse import hstack
from scipy.sparse import vstack
news = loadmat('news.mat')
data = news['data']
labels = news['labels']
tdata = news['testdata']
tlabels = news['testlabels']

In [2]:
def problem1 (datatrained, datatested, labelstrained, labelstested, classes):
    priors = create_priors(labelstrained,classes)
    collapse = create_mus(datatrained,labelstrained,classes)
    preds = model(collapse,datatested,priors,classes)
    error = check_error(preds,labelstested)
    return error

In [3]:
def create_mus(usedata,uselabels,classes):
    #create matrix of laplace smoothed mus - creates a 20 x 60k
    indices = [a for a, x in enumerate(uselabels) if x in [0]]
    collapse = usedata[indices].sum(axis=0)
    collapse = csc_matrix((collapse +1)/(2+len(indices)))
    for i in range(1,classes):
        indices = [a for a, x in enumerate(uselabels) if x in [i]]
        subdata1 = usedata[indices].sum(axis=0)
        subdata1 = csc_matrix((subdata1 +1)/(2+len(indices)))
        collapse = vstack([csc_matrix(collapse), subdata1],format="csc")
    return collapse

In [4]:
def create_priors(priorslabels,classes):
    indicesp = [a for a, x in enumerate(priorslabels) if x in [0]]
    priors = [(len(priorslabels[indicesp]))/len(priorslabels)]
    for i in range(1,classes):
        indicesp = [a for a, x in enumerate(priorslabels) if x in [i]]
        subprior = (len(priorslabels[indicesp]))/len(priorslabels)
        priors.append(subprior)
    priors= csr_matrix(np.log(np.array(priors)))
    return priors

In [5]:
def model(collapse,datamodel,priorsmodel,classes):
    #create matrix of (log(1-mus)) - creates a 20 x 60k
    minusmu = csc_matrix(np.log(1-(collapse.toarray())))

    #create matrix of (logmu - log (1-mu)) - creates a 20 x 60k
    minusmu2 = csc_matrix((np.log(collapse.toarray())) - minusmu)
    #multiply data by minusmu2 and add to minusmu to obtain Prob(y=1)
    firstprob = (datamodel.multiply(minusmu2[0])).sum(axis=1)
    summedmu = (minusmu[0].sum(axis=1))+ priorsmodel[0,0]
    allprobsY = csc_matrix(firstprob + summedmu)

    for y in range(1,classes):
        probY = (datamodel.multiply(minusmu2[y])).sum(axis=1)
        summedmu = (minusmu[y].sum(axis=1)) + priorsmodel[0,y]
        probY = csc_matrix(probY + summedmu)
        allprobsY = hstack([allprobsY, probY],format="csc") 

    preds = (np.argmax(allprobsY.toarray(),axis=1))
    return preds

In [6]:
def check_error(preds,labelscheck):
    check = ((preds.astype(np.int8))) - (labelscheck.astype(np.int8))
    error = (np.sum(check.astype(np.bool)))/len(preds)
    return error

In [7]:
#pruning test data
positiveindices = [a for a, x in enumerate(tlabels) if x in [17,18,19]]
negativeindices = [b for b, j in enumerate(tlabels) if j in [1,16,20]]
newtdata = vstack((tdata[positiveindices],tdata[negativeindices]), format='csc')
newtlabels =(np.concatenate((np.ones(len(tdata[positiveindices].toarray())),np.zeros(len(tdata[negativeindices].toarray()))),axis=0)).astype(np.bool)

#pruning training data
positiveindices = [a for a, x in enumerate(labels) if x in [17,18,19]]
negativeindices = [b for b, j in enumerate(labels) if j in [1,16,20]]
newdata = vstack((data[positiveindices],data[negativeindices]), format='csc')
newlabels =(np.concatenate((np.ones(len(data[positiveindices].toarray())),np.zeros(len(data[negativeindices].toarray()))),axis=0)).astype(np.bool)

In [8]:
def findalpha (minusmu2):
    alphas = minusmu2[1]-minusmu2[0]
    return alphas

In [9]:
datatrained = newdata
labelstrained = newlabels
classes = 2
collapse = create_mus(datatrained,labelstrained,classes)
minusmu = csc_matrix(np.log(1-(collapse.toarray())))
minusmu2 = csc_matrix((np.log(collapse.toarray())) - minusmu)
alphas = findalpha(minusmu2)
alphas = np.argsort(alphas.toarray()[0])
low20 = alphas[:20]
top20 = alphas[-20:]

In [10]:
vocab = np.genfromtxt('news.vocab', dtype='str')

In [11]:
for y in top20:
    print(y,(vocab[y-1]))
for y in low20:
    print(y,(vocab[y-1]))

47471 gandhi
12178 dobb
47918 occurrred
45016 radioed
39353 yearwood
28710 parabellum
49252 asala
50290 pabl
49315 appressian
49216 kinsley
45378 kms
49316 armenia
49317 sahak
49313 argic
49314 ohanus
49312 serdar
49311 tahassusler
48903 clintonpz
20611 syst
38732 sfsu
45948 khayash
2 name
877 do
46033 questionnaires
2234 already
2604 implied
3548 perspective
896 trimmed
9 version
897 livesey
898 solntze
7886 martyred
1973 entropy
2883 runs
7018 alink
7017 corruption
1880 discussion
1923 came
3919 mohammad
3301 snm
