In [1]:
from scipy.sparse import coo_matrix

In [2]:
# Read the raw matrix file into a list of lines (header lines included).
with open('bbc/bbc.mtx') as mtx_file:
    content = list(mtx_file)

In [3]:
# Remove the two leading header lines so the parsing cell only sees numeric rows.
# NOTE(review): this mutates `content` in place, so re-running the cell drops two
# more lines each time; re-run the loading cell first.
content.pop(0)  # first header line (presumably the MatrixMarket banner; confirm)
content.pop(0)  # second header line; as the last expression it is shown as the cell output

'9635 2225 286774\n'

In [4]:
# Turn every remaining line into a tuple of ints. Each token goes through
# float() first, so values written in float notation (e.g. "1.0") are accepted.
sparsemat = [
    tuple(int(float(token)) for token in line.split())
    for line in content
]

In [5]:
# Transpose the per-entry tuples into one tuple per file column, so that
# sparsemat[0], sparsemat[1] and sparsemat[2] are the three columns of the file.
# list() keeps the result indexable on Python 3, where zip() returns a lazy
# iterator; on Python 2 it is just a copy of the list zip() already returned.
# NOTE: re-running this cell transposes the data back; re-run the parsing cell first.
sparsemat = list(zip(*sparsemat))

In [6]:
def _first_tokens(lines):
    """Return the first whitespace-separated token of every line."""
    return [line.split()[0] for line in lines]


# Vocabulary labels: first token of each line of bbc.terms.
with open('bbc/bbc.terms') as terms_file:
    content = terms_file.readlines()
words = _first_tokens(content)

# Document labels: first token of each line of bbc.docs.
with open('bbc/bbc.docs') as docs_file:
    content = docs_file.readlines()
docs = _first_tokens(content)

In [7]:
# Build the sparse matrix with the file's second column as row index and its
# first column as column index, i.e. transposed relative to the file layout.
# NOTE(review): the indices are used verbatim. The resulting shape is one larger
# on each axis than the sizes in the .mtx header, which suggests the file is
# 1-based and row 0 / column 0 are empty padding. The top-words cell relies on
# this by looking words up with `index - 1`, so do not shift here without
# updating that cell as well.
counts = list(sparsemat[2])
row_ids = list(sparsemat[1])
col_ids = list(sparsemat[0])
coo = coo_matrix((counts, (row_ids, col_ids)))

In [8]:
# Show the matrix summary (shape, dtype, number of stored elements).
coo

<2226x9636 sparse matrix of type '<type 'numpy.int64'>'
	with 286774 stored elements in COOrdinate format>

In [9]:
# Vocabulary size; compare with the first number of the .mtx size line.
len(words)

9635

In [10]:
# Number of documents; compare with the second number of the .mtx size line.
len(docs)

2225

In [11]:
from sklearn.decomposition import NMF

# Factorise the matrix into five non-negative components. k=5 is a hand-picked
# value; random_state is fixed so the random initialisation is repeatable.
N_TOPICS = 5
model = NMF(
    n_components=N_TOPICS,
    init='random',
    random_state=0,
)
# One row of component weights per matrix row.
doc_topics = model.fit_transform(coo)

**One way to cluster the articles is to assign each one to the NMF feature (topic) on which it has the largest weight (we could also have fed the feature weights into a clustering algorithm):**

In [16]:
# Hard-assign every row to the component on which it has the largest weight.
doc_cluster = doc_topics.argmax(axis=1)

In [20]:
# Peek at the first 20 cluster assignments.
# NOTE(review): the matrix has one more row than there are documents, so row 0
# is presumably an empty padding row (1-based .mtx indices), not an article; confirm.
doc_cluster[0:20]

array([0, 4, 1, 1, 1, 4, 1, 1, 1, 4, 1, 4, 1, 4, 1, 1, 1, 4, 1, 4])

In [14]:
# Label every NMF component with its seven highest-weighted terms.
topic_words = []
for component in model.components_:
    # Sort (weight, column) pairs in descending order; ties fall back to the column index.
    ranked = sorted(((weight, col) for col, weight in enumerate(component)),
                    reverse=True)
    # Column indices come verbatim from the .mtx file and are offset by one
    # relative to `words` (the matrix has one more column than there are words),
    # hence the `- 1`.
    topic_words.append([words[col - 1] for weight, col in ranked[:7]])

In [15]:
# Show the seven top-weighted terms of each component, one list per component.
topic_words

[['music', 'song', 'best', 'year', '25', 'award', 'angel'],
 ['govern', 'labour', 'parti', 'peopl', 'elect', 'year', 'blair'],
 ['film', 'best', 'award', 'actor', 'director', 'year', 'star'],
 ['game', 'plai', 'time', 'player', 'world', 'first', 'get'],
 ['peopl', 'mobil', 'phone', 'technolog', 'servic', 'firm', 'compani']]

**We chose k=5 above based on our knowledge of the dataset (it has five categories). We can also look at the singular values (sigma) from the SVD to judge how many components matter:**

In [31]:
from sklearn.utils.extmath import randomized_svd

# Approximate the 15 leading singular triplets of the matrix.
# random_state is pinned (as in the NMF cell) so the singular values shown below
# are reproducible on a fresh run; with random_state=None the randomized solver
# can return slightly different values each time.
U, Sigma, VT = randomized_svd(coo, n_components=15,
                              n_iter=5,
                              random_state=0)

In [32]:
# Approximate singular values; look for the point where the drop-off flattens.
Sigma

array([ 299.79347177,  167.5517242 ,  160.50368625,  146.84261753,
        129.05623159,  121.23164679,  115.47318041,  112.718921  ,
        108.76917291,  106.84279339,  100.50798632,   94.83184242,
         93.40277623,   90.61801678,   87.26416544])

In [21]:
# Clusters: business,entertainment,politics,sport,tech
# Spot-check 20 consecutive cluster assignments from five different index ranges.
# NOTE(review): this assumes the documents are stored grouped by category in the
# order listed above, so each slice falls inside a single category; confirm
# against bbc.docs.
# print is written as a parenthesised single-argument call so the cell produces
# the same output on Python 2 and is also valid Python 3.
print("business: " + str(doc_cluster[20:40]))
print("entertain:" + str(doc_cluster[520:540]))
print("politics: " + str(doc_cluster[930:950]))
print("sport:    " + str(doc_cluster[1320:1340]))
print("tech:     " + str(doc_cluster[-40:-20]))

business: [4 2 1 1 1 1 4 4 1 1 1 4 4 1 1 4 1 1 4 1]
entertain:[1 2 2 2 2 2 2 2 4 2 1 1 1 2 2 1 1 2 2 2]
politics: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
sport:    [1 3 3 3 3 3 1 1 1 1 3 3 1 3 1 3 3 1 1 1]
tech:     [4 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4]
