# Implementation of CMFS (Comprehensively Measure Feature Selection) on the 20 Newsgroups (20NG) dataset

In [98]:
"""
Authors: Abhirav Gholba
         Bhargav Srinivasa
         Devashish Deshpande
         Gauri Kholkar
         Mrunmayee Nasery
"""
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from pprint import pprint
from gensim.corpora import Dictionary
import numpy as np
import operator

In [2]:
# Load the 20 Newsgroups training split with headers, footers and quoted
# replies removed from every post.
newsgroups_train = fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
)

# Bag-of-words counts with English stop words dropped. The vectorizer output is
# transposed so that rows index terms and columns index documents.
vec = CountVectorizer(stop_words='english')
doc_term_counts = vec.fit_transform(newsgroups_train.data)
matrix = doc_term_counts.T

# Corpus dimensions reused by the later cells.
terms = matrix.shape[0]
categories = len(newsgroups_train.target_names)
documents = len(newsgroups_train.filenames)

In [76]:
# Sanity-check the loaded corpus: document/category counts, the shape of the
# term x document matrix, the label of one sample document, and the sparse
# container type. Single-argument print(...) is used so the cell parses on
# both Python 2 and Python 3 and prints the same text.
print("No. of documents: %d\nNo. of categories: %d" % (documents, categories))
print("matrix.shape: {0}".format(matrix.shape))
print(newsgroups_train.target[10])
print(type(matrix))

No. of documents: 11314
No. of categories: 20
matrix.shape: (101322, 11314)
8
<class 'scipy.sparse.csc.csc_matrix'>


### Create Term-category feature-appearance matrix

In [77]:
# term_category_mat[t, c] = number of documents of category c in which term t
# appears at least once (presence, not the in-document count).
term_category_mat = np.zeros((terms, categories))
labels = newsgroups_train.target
for doc_idx in range(documents):
    # Row indices of the terms with a non-zero count in this document.
    present_terms = matrix.getcol(doc_idx).nonzero()[0]
    # Unbuffered add: bump every present term once in this document's category column.
    np.add.at(term_category_mat, (present_terms, labels[doc_idx]), 1)

In [93]:
# Report the shape of the term-category matrix; print(...) with a single
# argument parses on both Python 2 and Python 3.
print("Term-category matrix shape: {0}".format(term_category_mat.shape))

Term-category matrix shape: (101322, 20)
101322


### Perform CMFS on term-category matrix

In [87]:
# NOTE: this cell replaces the count matrix stored in `term_category_mat` with
# CMFS scores, so it is not safe to re-run on its own; rebuild the count matrix
# first. The name is kept because the following cells read the scores from it.

# Column totals: summed counts of all terms within each category.
term_freq_per_cat = term_category_mat.sum(axis=0)
# Row totals: frequency of each term across all categories.
total_term_freq = term_category_mat.sum(axis=1)

# Add-one smoothed score for every (term, category) cell at once:
#   (count + 1)^2 / ((row_total + categories) * (column_total + terms))
numerator = (term_category_mat + 1) ** 2
denominator = np.outer(total_term_freq + categories, term_freq_per_cat + terms)
term_category_mat = numerator / denominator

# Final CMFS matrix
print(term_category_mat)

[[  2.92724117e-08   7.28602676e-06   2.78311080e-06 ...,   5.88311595e-07
    2.72664657e-06   1.18473278e-07]
 [  1.33770769e-06   2.15001978e-06   3.43219337e-07 ...,   5.26946106e-05
    1.01717453e-05   1.35351518e-06]
 [  2.85134973e-07   2.77231631e-07   2.24045956e-07 ...,   9.16894515e-07
    2.65595573e-07   2.88504370e-07]
 ..., 
 [  3.66602108e-07   3.56440669e-07   2.88059087e-07 ...,   2.94716094e-07
    3.41480022e-07   3.70934190e-07]
 [  3.66602108e-07   3.56440669e-07   2.88059087e-07 ...,   2.94716094e-07
    3.41480022e-07   3.70934190e-07]
 [  3.66602108e-07   3.56440669e-07   2.88059087e-07 ...,   2.94716094e-07
    3.41480022e-07   3.70934190e-07]]


### Create term-cmfs dictionary

In [96]:
# Integer-to-term mapping: position i holds the term for matrix row i.
dictionary = vec.get_feature_names()
# Best CMFS score of each term over all categories.
cmfs_max = np.max(term_category_mat, axis=1)
# Map every term to its maximum CMFS score.
term_cmfs_dict = dict(
    (dictionary[idx], cmfs_max[idx]) for idx in range(terms)
)

### Extract top 10 features

In [99]:
# Rank all (term, score) pairs by CMFS score, highest first, and keep the top 10.
ranked_features = sorted(
    term_cmfs_dict.items(),
    key=operator.itemgetter(1),
    reverse=True,
)
sorted_feature_list = ranked_features[:10]

# Single-argument print(...) parses on both Python 2 and Python 3.
print("-------Selected features are-------\n")
for term, cmfs in sorted_feature_list:
    print("Term: {0} \t CMFS: {1}".format(term, cmfs))

-------Selected features are-------

Term: bike 	 CMFS: 0.00120489196406
Term: sale 	 CMFS: 0.00112834000168
Term: car 	 CMFS: 0.000970606167276
Term: encryption 	 CMFS: 0.000957596432953
Term: god 	 CMFS: 0.000932343449918
Term: hockey 	 CMFS: 0.00085429558318
Term: clipper 	 CMFS: 0.000844342753261
Term: windows 	 CMFS: 0.000837428432144
Term: space 	 CMFS: 0.000751994970671
Term: israel 	 CMFS: 0.000747811750338
