<a href="https://colab.research.google.com/github/chunter3/Information_Retrieval_Projects/blob/master/TF%2C_IDF%2C_%26_Cosine_Similarity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import nltk
import re
import math
%matplotlib inline

In [2]:
# Problem 1 (start)

In [None]:
# Loading the state_union sample dataset from the nltk library

nltk.download('punkt')
nltk.download('state_union')
from nltk.corpus import state_union
state_union.fileids()  # 65 documents w/in the state_union corpus (N = 65)

In [4]:
# Term frequency (TF) auxiliary function; determines the TF of a word in a document

# tf(t,d) = count of t in d / number of words in d

def TermFrequency(term, doc):
  termCount = 0       # Final count of the number of times the given term appear in document
  wordCount = 0       # Total number of words in document
  for sentence in doc:
    joinTerms = ' '.join(sentence)        
    processSent = (re.sub('[^A-Za-z0-9]+',' ', joinTerms)).split()
    termCount += processSent.count(term)
    wordCount += len(processSent)
  return (termCount/wordCount)

In [5]:
# Inverse document frequency (IDF) auxiliary function; uses the inverted index to determine the IDF for a term and corpus

# idf(t) = log(N/(df + 1)); N = total number of documents in corpus; df(t) = number of documents in corpus that contain the term t

def InverseDF(term, index):
  df = len(index[term])
  return math.log10(65/(df + 1))    # Decided to use log base 10; shouldn't make much of a difference

In [6]:
# TF-IDF inverted index function

def TF_IDFIndex(corpus):
  invertedIndex = {}
  docID = 0
  for textFile in corpus:
    doc = state_union.sents(textFile)
    docID+=1
    for sentence in doc:   
      joinTerms = ' '.join(sentence)        
      processSent = (re.sub('[^A-Za-z0-9]+',' ', joinTerms)).split()
      for word in processSent:
         if word in invertedIndex and docID not in invertedIndex[word]:
           invertedIndex[word].append(docID)
         else:
           invertedIndex[word] = [docID]
  for key in invertedIndex.keys():
    docIDLst = invertedIndex[key]
    idf = InverseDF(key,invertedIndex)
    invertedIndex[key] = []
    for docID in docIDLst:
      text = state_union.sents(corpus[docID - 1])
      tf = TermFrequency(key,text)
      invertedIndex[key].append((docID,(tf*idf)))
  return invertedIndex

In [None]:
tf_idf_index = TF_IDFIndex(state_union.fileids())
tf_idf_index

In [8]:
# Problem 1 (End)

In [9]:
# Problem 2 (Start)

In [10]:
# Query vector function

def QueryVector(query, tf_idf_index):
  qvDict = {}           # Query vector dictionary; will be the final return value
  tokenizedQuery = list(query.split(" "))
  for token in tokenizedQuery:
    if token not in tf_idf_index:
      qvDict[token] = 0
      continue
    qvDict[token] = InverseDF(token, tf_idf_index)
  return qvDict

In [19]:
# Testing the query vector function

query = "the world ends with you"
QueryVector(query, tf_idf_index)

{'ends': 0.8586708472035307,
 'the': 1.5118833609788744,
 'with': 1.5118833609788744,
 'world': 1.5118833609788744,
 'you': 1.5118833609788744}

In [12]:
# Problem 2 (End)

In [13]:
# Problem 3 (Start)

In [14]:
# Document length auxiliary function; returns the euclidean length of a document

def DocLength(queryVector, docID, tf_idf_index):
  sum = 0
  for term in queryVector:
    if docID not in tf_idf_index[term]:
      continue
    for pair in tf_idf_index[term]:
      if pair[0] == docID:
        sum += pair[1] ** 2
        break
  return math.sqrt(sum)

In [15]:
# Cosine similarity search function

def CSSearch(query, tf_idf_index):
  scores = [(0,0)] * 65    # Remember that there are 65 documents in state_union corpus
  queryVector = QueryVector(query, tf_idf_index)
  for term, idf in queryVector.items():
    for (docID, tf_idf) in tf_idf_index[term]:
      if (docID - 1) in scores:
        scores[docID - 1][1] += idf*tf_idf
        continue
      scores[docID - 1] = (docID, (idf*tf_idf))
  scores = sorted(list(set([score for score in scores])))
  for score in scores:
    docLength = DocLength(queryVector, score[0], tf_idf_index)
    if docLength == 0:
      continue
    score[1] = score[1]/docLength 
  return scores

In [18]:
# Testing the cosine similiarity search function

query = "the world ends with you"
CSSearch(query, tf_idf_index)  # ignore starting (0,0); list is sorted by ascending order of document ID

[(0, 0),
 (19, 0.00026709495520276373),
 (23, 0.00013147568185399952),
 (26, 0.00017702656034507304),
 (49, 0.00025155770175272236),
 (52, 9.738682127027198e-05),
 (56, 9.84400031825406e-05),
 (60, 0.0002411104067486034),
 (65, 0.0057002276738273745)]

In [17]:
# Problem 3 (End)