# Implementation of Improved CMFS (Comprehensively Measure Feature Selection) on the 20 Newsgroups (20NG) dataset

In [58]:
"""
Authors: Abhirav Gholba
         Bhargav Srinivasa
         Devashish Deshpande
         Gauri Kholkar
         Mrunmayee Nasery
"""
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from pprint import pprint
from gensim.corpora import Dictionary
import numpy as np
import operator

In [59]:
# Fetch the 20 Newsgroups training split with headers, footers and quoted
# replies removed from every post.
newsgroups_train = fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
)

# Bag-of-words counts with English stop words dropped. The vectorizer output is
# transposed so that rows index terms and columns index documents.
vec = CountVectorizer(stop_words='english')
matrix = vec.fit_transform(newsgroups_train.data).T

# Corpus dimensions reused by the cells below.
terms = matrix.shape[0]
documents = len(newsgroups_train.filenames)
categories = len(newsgroups_train.target_names)

In [60]:
# Sanity-check the loaded corpus. Each print takes one parenthesised argument,
# which produces the same output on Python 2 and is also valid on Python 3.
print("No. of documents: %d\nNo. of categories: %d" % (documents, categories))
print("matrix.shape: {0}".format(matrix.shape))
# Category index of the document at position 10, as a spot check of the labels.
print(newsgroups_train.target[10])
print(type(matrix))

No. of documents: 11314
No. of categories: 20
matrix.shape: (101323, 11314)
8
<class 'scipy.sparse.csc.csc_matrix'>


### Create Term-category feature-appearance matrix

In [61]:
# term_category_mat[t, c] counts the documents of category c in which term t
# appears at least once (document appearance, not raw term frequency).
term_category_mat = np.zeros((terms, categories))
for doc_idx in range(documents):
    label = newsgroups_train.target[doc_idx]
    # Row indices of the terms with a non-zero count in this document's column.
    present_terms = matrix.getcol(doc_idx).nonzero()[0]
    for term_idx in present_terms:
        term_category_mat[term_idx, label] += 1

In [62]:
# Expect (number of terms, number of categories); parenthesised so the line
# runs on both Python 2 and Python 3.
print("Term-category matrix shape: {0}".format(term_category_mat.shape))

Term-category matrix shape: (101323, 20)


### Perform CMFS on term-category matrix

In [63]:
# CMFS(tk, ci) = P(tk|ci) * P(ci|tk) / P(ci), evaluated with add-one smoothing as
#   (n(tk, ci) + 1)^2 * documents
#   ---------------------------------------------------
#   (n(tk) + categories) * (n(ci) + terms) * n(ci)
# where n(tk, ci) is the term-category count, n(tk) its row total (the term
# across all categories) and n(ci) its column total (all terms in the category).
#
# NOTE(review): the P(ci) factor is estimated from the column total n(ci) of the
# term-category counts, not from the number of documents in the category --
# confirm this is the intended estimate.
#
# NOTE: this cell overwrites term_category_mat (counts -> CMFS scores), so it is
# not idempotent; re-run the cell that builds the counts before re-running it.

# Column totals, one per category. A plain sum avoids materialising the full
# cumulative-sum array just to read its last row.
term_freq_per_cat = term_category_mat.sum(axis=0)
# Row totals, kept as a column vector so they broadcast against the categories.
total_term_freq = term_category_mat.sum(axis=1)[:, np.newaxis]

# Same arithmetic and operation order as the per-element formula, applied to
# the whole matrix at once through broadcasting.
numerator = ((term_category_mat + 1) ** 2) * float(documents)
denominator = (total_term_freq + categories) * (term_freq_per_cat + terms) * term_freq_per_cat
term_category_mat = numerator / denominator

# Final CMFS matrix
print(term_category_mat)

[[  1.15911720e-08   2.55417642e-06   4.92091096e-07 ...,   1.10450006e-07
    8.09032111e-07   4.95452140e-08]
 [  5.29700119e-07   7.53707062e-07   6.06857550e-08 ...,   9.89292091e-06
    3.01809142e-06   5.66036501e-07]
 [  1.12906601e-07   9.71858214e-08   3.96143123e-08 ...,   1.72138380e-07
    7.88057204e-08   1.20651771e-07]
 ..., 
 [  1.45165630e-07   1.24953199e-07   5.09326872e-08 ...,   5.53301937e-08
    1.01321641e-07   1.55123706e-07]
 [  1.45165630e-07   1.24953199e-07   5.09326872e-08 ...,   5.53301937e-08
    1.01321641e-07   1.55123706e-07]
 [  1.45165630e-07   1.24953199e-07   5.09326872e-08 ...,   5.53301937e-08
    1.01321641e-07   1.55123706e-07]]


### Create term-cmfs dictionary

In [64]:
# Score each term by its best (maximum) CMFS value over all categories.
cmfs_max = np.max(term_category_mat, axis=1)
# Position i of this list holds the term string for row i of the matrix.
dictionary = vec.get_feature_names()
# Map every term string to its maximum CMFS score.
term_cmfs_dict = {dictionary[idx]: cmfs_max[idx] for idx in range(terms)}

### Extract top 10 features

In [65]:
# Rank every (term, score) pair by CMFS score, highest first, and keep the ten best.
ranked_terms = sorted(term_cmfs_dict.items(), key=operator.itemgetter(1), reverse=True)
sorted_feature_list = ranked_terms[:10]

# Parenthesised single-argument prints run on both Python 2 and Python 3.
print("-------Selected features are-------\n")
for term, cmfs in sorted_feature_list:
    print("Term: {0} \t CMFS: {1}".format(term, cmfs))

-------Selected features are-------

Term: bike 	 CMFS: 0.000538368944352
Term: sale 	 CMFS: 0.000443215660653
Term: car 	 CMFS: 0.000417589539819
Term: mac 	 CMFS: 0.000308104950966
Term: apple 	 CMFS: 0.000280403476571
Term: shipping 	 CMFS: 0.000253629460498
Term: hockey 	 CMFS: 0.000244798088796
Term: cars 	 CMFS: 0.000222399959724
Term: god 	 CMFS: 0.000214102332086
Term: encryption 	 CMFS: 0.000198331154092
