In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

---
Example
=====
***

In [2]:
#A simple corpus of 3 documents, each containing a single sentenct
document_setA = ["this is any pen any pen any", 
                "this example has this example pen pen",
                "shines any pen any pen example shines any pen pen"]

In [3]:
#Use the count vectorizer to convert the text into a numerical matrix
vectorizerA = CountVectorizer()

#use the one-shot fit_transform method
freq_term_matrixA = vectorizerA.fit_transform(document_setA)

#print the matrix
print freq_term_matrixA

#print the vocabulary from the vectorizer
print vectorizerA.get_feature_names()

#compare this to the sparse array representation
freq_term_matrixA.todense()

  (0, 6)	1
  (0, 3)	1
  (0, 0)	3
  (0, 4)	2
  (1, 6)	2
  (1, 4)	2
  (1, 1)	2
  (1, 2)	1
  (2, 0)	3
  (2, 4)	4
  (2, 1)	1
  (2, 5)	2
[u'any', u'example', u'has', u'is', u'pen', u'shines', u'this']


matrix([[3, 0, 0, 1, 2, 0, 1],
        [0, 2, 1, 0, 2, 0, 2],
        [3, 1, 0, 0, 4, 2, 0]])

In [4]:
#since we have used a count vectorizer already we only need a tfidf transformer
tfidf = TfidfTransformer()

#fit and transform using the output of the vectorizer
tfidf_tm = tfidf.fit_transform(freq_term_matrixA)

print "This is the tfidf sparse matrix:"
print tfidf_tm

print "\n"
for j, sent in enumerate(document_setA):
    for i, f in enumerate(vectorizerA.get_feature_names()):
        print "Sentence = '", "{:50s}".format(sent),"'   " "??tf = {:5.2f}".format(tfidf_tm[j, i]), f

print "\nInverse Document Freq:"
for i, f in enumerate(vectorizerA.get_feature_names()):
    print "{:10s}".format(f), ":", "{:5.2f}".format(tfidf.idf_[i])
#for i in , tfidf.idf_

This is the tfidf sparse matrix:
  (0, 4)	0.41302572877
  (0, 0)	0.797768739598
  (0, 3)	0.349656674083
  (0, 6)	0.265922913199
  (1, 2)	0.377358350999
  (1, 1)	0.573981505035
  (1, 4)	0.445747842044
  (1, 6)	0.573981505035
  (2, 5)	0.510223460334
  (2, 1)	0.194019046412
  (2, 4)	0.602692389878
  (2, 0)	0.582057139237


Sentence = ' this is any pen any pen any                        '   ??tf =  0.80 any
Sentence = ' this is any pen any pen any                        '   ??tf =  0.00 example
Sentence = ' this is any pen any pen any                        '   ??tf =  0.00 has
Sentence = ' this is any pen any pen any                        '   ??tf =  0.35 is
Sentence = ' this is any pen any pen any                        '   ??tf =  0.41 pen
Sentence = ' this is any pen any pen any                        '   ??tf =  0.00 shines
Sentence = ' this is any pen any pen any                        '   ??tf =  0.27 this
Sentence = ' this example has this example pen pen              '   ??tf =  

---
Another Example
=====
***

In [5]:
document_set = ("Your father's lightsaber. This is the weapon of a Jedi Knight.",
        "A lightsaber is an interesting weapon, a blade unique in the history of warfare.",
        "A lightsaber is a fictional energy weapon featured in the Star Wars universe.",
        "Lightsabers are hand-built as part of a Jedi's or Sith's training regimen.",
        "The lightsaber is the weapon of a Jedi, an elegant weapon of a more civilized age. ")

In [6]:
def clean_and_stem(text):
    stop_words = ENGLISH_STOP_WORDS
    stemmer = nltk.stem.LancasterStemmer()
    tokens = nltk.word_tokenize(text.lower())
    clean_tokens = [word for word in tokens if word not in stop_words]
    stem_tokens = [stemmer.stem(token) for token in clean_tokens]
    return " ".join(stem_tokens)

In [7]:
clean_document_set = [clean_and_stem(doc) for doc in document_set]

In [8]:
for doc in clean_document_set:
    print doc

fath 's lightsab . weapon jed knight .
lightsab interest weapon , blad un hist warf .
lightsab fict energy weapon feat star war univers .
lightsab hand-built jed 's sith 's train regim .
lightsab weapon jed , eleg weapon civil ag .


#####Add a little "hand cleaning"!

In [9]:
clean_document_set2 = []
for doc in clean_document_set:
    v =[token for token in doc if token not in ["'", '.', ',', "s"]]
    clean_document_set2.append("".join(v))

In [10]:
for doc in clean_document_set2:
    print doc

fath  lightab  weapon jed knight 
lightab interet weapon  blad un hit warf 
lightab fict energy weapon feat tar war univer 
lightab hand-built jed  ith  train regim 
lightab weapon jed  eleg weapon civil ag 


In [11]:
tfidfV = TfidfVectorizer()
tfidfV_tm = tfidfV.fit_transform(clean_document_set2)

print "\nInverse Document Freq:"
for i, f in enumerate(tfidfV.get_feature_names()):
    print "{:14s}".format(f), ":", "{:5.2f}".format(tfidfV.idf_[i])


Inverse Document Freq:
ag             :  2.10
blad           :  2.10
built          :  2.10
civil          :  2.10
eleg           :  2.10
energy         :  2.10
fath           :  2.10
feat           :  2.10
fict           :  2.10
hand           :  2.10
hit            :  2.10
interet        :  2.10
ith            :  2.10
jed            :  1.41
knight         :  2.10
lightab        :  1.00
regim          :  2.10
tar            :  2.10
train          :  2.10
un             :  2.10
univer         :  2.10
war            :  2.10
warf           :  2.10
weapon         :  1.18


---
Cosine Similarity
=====
***

#####Which of the sentences are more like each other?

In [12]:
for i in range(5):
    print cosine_similarity(tfidfV_tm[i], tfidfV_tm)

[[ 1.          0.13365433  0.12302     0.1639136   0.34060603]]
[[ 0.13365433  1.          0.09038523  0.04047634  0.16459443]]
[[ 0.12302     0.09038523  1.          0.03725581  0.15149833]]
[[ 0.1639136   0.04047634  0.03725581  1.          0.12751921]]
[[ 0.34060603  0.16459443  0.15149833  0.12751921  1.        ]]
