# Quiz 8
Consider the 4 documents given below and construct a term-document matrix. Then, for the query “speech systems”, rank the documents in descending order of cosine similarity.

Note: (1) Normalize TF by document length. (2) Each element of the matrix should be TF × IDF.


1. information extraction systems 
2. natural language processing 
3. speech signal systems 
4. speech processing

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd

In [2]:
# Corpus for the quiz: the four documents, kept in the order they are listed
# in the problem statement.
text = [
    "information extraction systems",
    "natural language processing",
    "speech signal systems",
    "speech processing",
]

# Compute Term Frequency Matrix

In [3]:
# Bag-of-words vectorizer with scikit-learn's default settings; used below to
# build the raw term-count (TF) matrix for the corpus.
vectorizer = CountVectorizer()

In [4]:
# Learn the vocabulary from `text` and count term occurrences per document.
# fit_transform returns a sparse matrix; todense() expands it to a dense
# document-by-term matrix (4 documents x 8 terms for this corpus).
tf_dtm = vectorizer.fit_transform(text).todense()

In [5]:
# Vocabulary terms in the column order of the count matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so
# .tolist() keeps `tokens` a plain list of str, as it was before.
tokens = vectorizer.get_feature_names_out().tolist()

In [6]:
# Show the vocabulary; these terms label the columns of every matrix below.
tokens

['extraction',
 'information',
 'language',
 'natural',
 'processing',
 'signal',
 'speech',
 'systems']

In [7]:
# Wrap the raw count matrix in a labelled frame: one row per document
# (in corpus order), one column per vocabulary term.
term_frequency = pd.DataFrame(tf_dtm, columns=tokens)

In [8]:
# Raw term counts per document (no normalisation applied yet).
term_frequency

Unnamed: 0,extraction,information,language,natural,processing,signal,speech,systems
0,1,1,0,0,0,0,0,1
1,0,0,1,1,1,0,0,0
2,0,0,0,0,0,1,1,1
3,0,0,0,0,1,0,1,0


# Compute Document Frequency Matrix

In [9]:
# Rebinds `vectorizer` to a new instance. binary=True records presence (0/1)
# instead of counts, so summing a column over documents yields that term's
# document frequency.
vectorizer = CountVectorizer(binary=True)

In [10]:
# Document frequency per term: refit on the same corpus with presence flags,
# then sum down the document axis (axis=0). The result stays two-dimensional
# (a single row with one entry per term).
df_dtm = vectorizer.fit_transform(text).todense().sum(axis=0)

In [11]:
# Label the single-row document-frequency matrix with the vocabulary terms.
# `tokens` comes from the first vectorizer; both vectorizers were fitted on
# the same corpus.
document_frequency = pd.DataFrame(df_dtm, columns=tokens)

In [12]:
# Number of documents that contain each term.
document_frequency

Unnamed: 0,extraction,information,language,natural,processing,signal,speech,systems
0,1,1,1,1,2,1,2,2


# Compute TF-IDF

In [13]:
# Weight each raw term count by 1/df: the 4 x 8 count matrix is divided
# element-wise by the 1 x 8 document-frequency row, which broadcasts over the
# document rows.
# NOTE(review): this is tf/df -- there is no logarithm in the IDF and TF is
# not divided by document length -- so it may not be the TF*IDF definition
# the quiz intends; confirm before relying on these weights.
tfidf = pd.DataFrame(data=tf_dtm/df_dtm, columns=tokens)

In [14]:
# Hand-built tf/df weights, one row per document.
tfidf

Unnamed: 0,extraction,information,language,natural,processing,signal,speech,systems
0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.5
1,0.0,0.0,1.0,1.0,0.5,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.5
3,0.0,0.0,0.0,0.0,0.5,0.0,0.5,0.0


# Normalize TF and Compute TF-IDF

In [15]:
# TF-IDF weighting as implemented by scikit-learn. Each option is written out
# so the weighting scheme is visible in the notebook.
# NOTE(review): norm="l2" rescales each document's TF-IDF vector to unit
# length; it is not TF divided by document length. Cosine similarity is
# unchanged by per-document rescaling, so the ranking is not affected by it.
vect = TfidfVectorizer(
    binary=False,        # keep term counts rather than 0/1 presence flags
    sublinear_tf=False,  # if True, use 1+log(tf)
    smooth_idf=True,     # add one to document frequencies when computing IDF
    norm="l2",           # squared weights sum to 1 by document
)

In [16]:
# Fit the TF-IDF vectorizer on the corpus and label the resulting
# document-by-term matrix with the learned vocabulary.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the replacement. The arguments are evaluated
# left to right, so the vectorizer is already fitted when the names are read.
tfidf2 = pd.DataFrame(vect.fit_transform(text).todense(),
                      columns=vect.get_feature_names_out())

In [17]:
# scikit-learn TF-IDF weights, one L2-normalised row per document.
tfidf2

Unnamed: 0,extraction,information,language,natural,processing,signal,speech,systems
0,0.617614,0.617614,0.0,0.0,0.0,0.0,0.0,0.486934
1,0.0,0.0,0.617614,0.617614,0.486934,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.667679,0.526405,0.526405
3,0.0,0.0,0.0,0.0,0.707107,0.0,0.707107,0.0


In [18]:
# The query to rank the documents against.
query_str = "speech systems"

In [19]:
# Project the query into the same TF-IDF space as the documents: transform()
# (not fit_transform) reuses the vocabulary and IDF weights already learned
# from the corpus.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the replacement.
query_tfidf = pd.DataFrame(vect.transform([query_str]).todense(),
                           columns=vect.get_feature_names_out())

In [20]:
# Query vector: non-zero only in the columns of the two query terms.
query_tfidf

Unnamed: 0,extraction,information,language,natural,processing,signal,speech,systems
0,0.0,0.0,0.0,0.0,0.0,0.0,0.707107,0.707107


In [21]:
# Cosine similarity between the query and every document under scikit-learn's
# TF-IDF weights. The result has one row (the query) and one column per
# document, in corpus order; sorting that row in descending order gives the
# requested ranking.
cosine_similarity(query_tfidf, tfidf2)

array([[0.34431452, 0.        , 0.7444497 , 0.5       ]])

In [22]:
# Same comparison against the hand-built tf/df matrix.
# NOTE(review): query_tfidf was weighted by `vect` (scikit-learn TF-IDF), not
# by the tf/df scheme used to build `tfidf`, so the two operands use different
# weightings. Both query terms have document frequency 2, so the query points
# in the same direction under either scheme for this query -- confirm before
# reusing this cell with a different query.
cosine_similarity(query_tfidf, tfidf)

array([[0.23570226, 0.        , 0.57735027, 0.5       ]])