# TF-IDF with scikit-learn


## Step 1 - Text

In [None]:
# Sample corpus: three short sentences that share some of their words.
documents = [
    "the brown dog likes the white cow",
    "the grass is brown",
    "the spotted cow likes green grass",
]

# Keep each document addressable by its own name as well.
d0, d1, d2 = documents

## Step 2 - TFIDF

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer parameters:
#   analyzer='word'    - tokens are whole words
#   ngram_range=(1, 1) - (min_n, max_n); here only single words
#   min_df=1           - keep terms found in at least one document; an
#                        integer 0 is rejected by recent scikit-learn
#                        releases, and 1 keeps the same vocabulary
#   stop_words=None    - keep every word; use 'english' to drop common ones
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 1), min_df=1, stop_words=None)
print(tf)

# Fit on the corpus and get the TF-IDF matrix (printed below as
# (document, term-index) -> weight entries).
tfidf_matrix = tf.fit_transform(documents)
print()
print("document-term matrix")
print(tfidf_matrix)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back on older installs. The result is kept as a
# plain list so it can be indexed and used as column labels later.
if hasattr(tf, "get_feature_names_out"):
    feature_names = list(tf.get_feature_names_out())
else:
    feature_names = tf.get_feature_names()
print()
print("feature vectors")
for i, feature in enumerate(feature_names):
    print(i, feature)



TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=0, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

document-term matrix
  (0, 1)	0.3357637111628959
  (0, 9)	0.4414889304215962
  (0, 6)	0.3357637111628959
  (0, 2)	0.4414889304215962
  (0, 0)	0.3357637111628959
  (0, 8)	0.5215009486364797
  (1, 5)	0.6317450542765208
  (1, 3)	0.4804583972923858
  (1, 0)	0.4804583972923858
  (1, 8)	0.3731188059313277
  (2, 4)	0.49482970636510465
  (2, 7)	0.49482970636510465
  (2, 3)	0.37633074615060896
  (2, 1)	0.37633074615060896
  (2, 6)	0.37633074615060896
  (2, 8)	0.29225439586501756

feature vectors
0 brown
1 cow
2 dog
3 grass
4 green
5 is
6 likes
7 spotted
8 the
9 white

## Step 3 - Pretty Print

In [2]:
## Let's pretty print
import pandas as pd

# Round every TF-IDF score to two decimals, one list per document.
rounded_rows = [
    [round(float(score), 2) for score in scores]
    for scores in tfidf_matrix.toarray()
]

# Build the frame in a single call (rows = documents, columns = terms)
# instead of growing an empty DataFrame one cell at a time.
df = pd.DataFrame(rounded_rows, columns=list(feature_names))

print("\ndocument term matrix")
print(df)

# Transposed view: rows = terms, columns = documents.
print("\nterm document matrix")
print(df.transpose())


document term matrix
   brown   cow   dog  grass  green    is  likes  spotted   the  white
0   0.34  0.34  0.44   0.00   0.00  0.00   0.34     0.00  0.52   0.44
1   0.48  0.00  0.00   0.48   0.00  0.63   0.00     0.00  0.37   0.00
2   0.00  0.38  0.00   0.38   0.49  0.00   0.38     0.49  0.29   0.00

term document matrix
            0     1     2
brown    0.34  0.48  0.00
cow      0.34  0.00  0.38
dog      0.44  0.00  0.00
grass    0.00  0.48  0.38
green    0.00  0.00  0.49
is       0.00  0.63  0.00
likes    0.34  0.00  0.38
spotted  0.00  0.00  0.49
the      0.52  0.37  0.29
white    0.44  0.00  0.00


## Step 4 - Experiment
- In Step 2, change `stop_words` from `None` to `'english'`, re-run the cells, and observe the DTM (document-term matrix) produced.
- Add your own text in Step 1, re-run the cells, and see how the DTM changes.