Standard datasets are used to train natural language models... when crunched by a model, the output reflects quirks in the algorithm rather than the data

In [5]:
# Load the training split of the 20 newsgroups corpus, restricted to the four categories below.
from sklearn.datasets import fetch_20newsgroups
categories =['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
# shuffle=True with a fixed random_state: the documents are shuffled, but in the same order on every run.
twenty_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42)


In [6]:
# Show the raw text (headers, body and signature) of the first training post.
# print() with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare `print x` statement is a SyntaxError on Python 3.
print(twenty_train.data[0])

From: sd345@city.ac.uk (Michael Collier)
Subject: Converting images to HP LaserJet III?
Nntp-Posting-Host: hampton
Organization: The City University
Lines: 14

Does anyone know of a good way (standard PC application/PD utility) to
convert tif/img/tga files into LaserJet III format.  We would also like to
do the same, converting to HPGL (HP plotter) files.

Please email any response.

Is this the correct group?

Thanks in advance.  Michael.
-- 
Michael Collier (Programmer)                 The Computer Unit,
Email: M.P.Collier@uk.ac.city                The City University,
Tel: 071 477-8000 x3769                      London,
Fax: 071 477-8565                            EC1V 0HB.



In [7]:
# Integer class labels of the first ten training documents.
twenty_train.target[:10]

array([1, 1, 3, 3, 3, 3, 3, 2, 2, 2])

In [8]:
# Category names; each integer in `target` is an index into this list.
# Only four categories were loaded, so the [:10] slice returns all of them.
twenty_train.target_names[:10]

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [9]:
# Translate the first ten integer labels into their category names.
[twenty_train.target_names[t] for t in twenty_train.target[:10]]

['comp.graphics',
 'comp.graphics',
 'soc.religion.christian',
 'soc.religion.christian',
 'soc.religion.christian',
 'soc.religion.christian',
 'soc.religion.christian',
 'sci.med',
 'sci.med',
 'sci.med']

In [12]:
# Turn the raw posts into a sparse document-by-term count matrix.
from sklearn.feature_extraction.text import CountVectorizer
# stop_words='english' drops common English words, lowercase=True folds case,
# ngram_range=(1,2) builds features from single words and two-word sequences.
count_vect = CountVectorizer(stop_words='english', min_df = 3, lowercase=True, ngram_range=(1,2))
X_train_counts = count_vect.fit_transform(twenty_train.data)
# min_df=3: a term has to occur in at least 3 documents to be considered a feature

In [13]:
# Peek at ten (term, column index) pairs of the fitted vocabulary.
# vocabulary_ is a dictionary, so it has .items(); the list() wrapper is needed
# because on Python 3 .items() returns a view that cannot be sliced (on Python 2
# it is a harmless copy of the list that .items() already returns).
list(count_vect.vocabulary_.items())[:10]

[(u'say say', 24934),
 (u'woods', 30960),
 (u'mdbs', 18117),
 (u've thinking', 30041),
 (u'atheist posting', 3391),
 (u'things things', 28177),
 (u'usenet', 29742),
 (u'biochemistry chairman', 4295),
 (u'registered dietician', 23649),
 (u'dna', 9017)]

In [15]:
# Number of distinct terms in the fitted vocabulary.
len(count_vect.vocabulary_)

31594

In [16]:
# Row 0 of the count matrix: the sparse term-count vector of the first document.
X_train_counts[0]

<1x31594 sparse matrix of type '<type 'numpy.int64'>'
	with 65 stored elements in Compressed Sparse Row format>

because text data is high dimensional and sparse, a given word probably doesn't exist in a given document

NB about sparse matrices: they don't store 0s; they save only each non-zero value and its location and assume everything else is 0
Note: occasionally this fails, because an algorithm doesn't play well with sparse matrices

In [18]:
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer

TF*IDF = term frequency * inverse document frequency
term frequency = frequency of a word in a given document
inverse document frequency = inverse of the fraction of documents the word occurs in (typically log(number of documents / number of documents containing the word))

gives higher weights to infrequently occurring words

In [19]:
# Re-weight the raw term counts with TF-IDF.
transformer = TfidfTransformer()
# a transformer is fitted like a model/classifier (clf), but returns a transformed matrix instead of predictions
X_train_tfidf = transformer.fit_transform(X_train_counts)

In [26]:
# Print the TF-IDF matrix: each output line is a (document row, feature column)
# location in the sparse matrix followed by the tf-idf score stored there.
# print() with one argument behaves the same on Python 2 and Python 3.
print(X_train_tfidf)
# Invert the vocabulary (term -> column index) into column index -> term, so a
# feature column can be translated back into its word or bigram.
reversed_vocab = {v:k for (k,v) in count_vect.vocabulary_.items()}
# The string below holds the equivalent explicit loop (which would first need
# `reversed_vocab = {}`). It is a bare string expression, not a comment: as the
# last expression of the cell, its text is echoed as the cell's output.
"""
for key in count_vect.vocabulary_:
    reversed_vocab[count_vect.vocabulary_[key]] = key    
"""

  (0, 29187)	0.122184682794
  (0, 28033)	0.0833703282547
  (0, 13087)	0.127476183175
  (0, 10172)	0.127476183175
  (0, 28022)	0.127476183175
  (0, 12793)	0.118080276228
  (0, 16106)	0.113254460724
  (0, 9115)	0.0830937280025
  (0, 16891)	0.0828212256827
  (0, 29465)	0.0704820569062
  (0, 6072)	0.25495236635
  (0, 20365)	0.127476183175
  (0, 21979)	0.0398304739967
  (0, 19652)	0.0398304739967
  (0, 29198)	0.120018232264
  (0, 1785)	0.074142400285
  (0, 11307)	0.0729975420101
  (0, 17257)	0.120018232264
  (0, 1449)	0.118080276228
  (0, 1169)	0.269868268172
  (0, 27843)	0.0945193296801
  (0, 29391)	0.101977280591
  (0, 6793)	0.0543905896741
  (0, 22485)	0.0930470585213
  (0, 2136)	0.0751838238838
  :	:
  (2256, 13344)	0.0648682824133
  (2256, 5110)	0.066938392534
  (2256, 16298)	0.0606666781161
  (2256, 17646)	0.054633401601
  (2256, 25146)	0.0555599161106
  (2256, 10638)	0.0473951454191
  (2256, 10627)	0.0648682824133
  (2256, 4114)	0.137281116323
  (2256, 4112)	0.200815177602
  (2256, 1

'\nfor key in count_vect.vocabulary_:\n    reversed_vocab[count_vect.vocabulary_[key]] = key    \n'

In [27]:
# TfidfVectorizer does the counting and the TF-IDF weighting in a single estimator.
# NOTE(review): this instance is not used by any of the cells visible here - confirm whether it is still needed.
tfidfvect = TfidfVectorizer()

In [30]:
# cross_val_score lives in sklearn.model_selection in current scikit-learn; the
# old sklearn.cross_validation module has been removed there. Try the current
# location first and fall back, so the cell still runs on the older release
# this notebook was written with.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

# Note kept as a bare string expression; being the cell's last expression, its
# text is echoed as the cell's output.
"""
statistically you would want to normalize word counts 
between 0 and 1 but in practice TFIDF is a useful because gives
different weight to rare terms
"""


'\nstatistically you would want to normalize word counts \nbetween 0 and 1 but in practice TFIDF is a useful because gives\ndifferent weight to rare terms\n'

In [39]:
# Score a decision tree with 3-fold cross-validation on a dense copy of the
# TF-IDF matrix.
# NOTE(review): .toarray() materialises every zero; presumably required because
# this scikit-learn version's trees did not accept sparse input - confirm
# before removing.
X_train_tfidf_dense = X_train_tfidf.toarray()
tree = DecisionTreeClassifier()
# print() with one argument behaves the same on Python 2 and Python 3.
print(cross_val_score(tree, X_train_tfidf_dense, twenty_train.target, cv=3))

[ 0.80212483  0.82735724  0.76298269]


In [41]:
# Random forest of 10 trees, scored with 3-fold cross-validation on the dense matrix.
# No random_state is passed, so the scores can differ from run to run.
forest = RandomForestClassifier(n_estimators = 10)
cross_val_score(forest, X_train_tfidf_dense, twenty_train.target, cv=3)

array([ 0.87250996,  0.86719788,  0.84553928])

In [42]:
# Logistic regression with default settings, scored with 5-fold cross-validation
# directly on the sparse TF-IDF matrix (no dense copy needed here).
logreg = LogisticRegression()
cross_val_score(logreg,X_train_tfidf, twenty_train.target, cv = 5)

array([ 0.95575221,  0.97123894,  0.9579646 ,  0.96238938,  0.96659243])

In [43]:
# Multinomial naive Bayes with default settings, scored with 5-fold
# cross-validation on the sparse TF-IDF matrix.
nb = MultinomialNB()
cross_val_score(nb,X_train_tfidf, twenty_train.target, cv = 5)

array([ 0.96017699,  0.96902655,  0.96460177,  0.95575221,  0.9688196 ])

In [44]:
# Fit the forest on the whole training set so that its feature importances can be read afterwards.
forest.fit(X_train_tfidf_dense, twenty_train.target)

RandomForestClassifier(bootstrap=True, compute_importances=None,
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_density=None, min_samples_leaf=1,
            min_samples_split=2, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0)

In [46]:
# Importance scores of the fitted forest, kept in `x` for the filter in the following cell.
# presumably one value per feature column - TODO confirm
x= forest.feature_importances_

array([], dtype=float64)

In [48]:
# Boolean-mask filter: keep only the importances that are strictly greater than zero.
x[x>0.]

array([  1.81458819e-03,   9.44967792e-05,   1.28493909e-04, ...,
         5.72509123e-04,   2.25088761e-04,   1.80271434e-04])