In [12]:
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import jaccard_similarity_score
from sklearn.metrics.pairwise import cosine_similarity
from string import punctuation
from heapq import nlargest

In [2]:
# Shared English Snowball stemmer used by the helper functions defined in this notebook.
# Author's note: the official documentation says the Snowball stemmer is better than
# the other stemmers for English.
stemmer = SnowballStemmer(language='english')

In [43]:
def sort_dict(x, descending=True):
    '''
    Return a new dict whose items are ordered by their values. For example
    {'a':50,'b':2,'c':10} is returned as either {'a':50,'c':10,'b':2} (descending)
    or {'b':2,'c':10,'a':50} (ascending). Items with equal values keep their
    original relative order because the sort is stable.
    args:
        x: dictionary whose values can be compared with each other
        descending: whether to return a descending or ascending dict
    out:
        sorted dict based on values; an empty or single-element dictionary is
        simply returned as a copy
    '''
    # No minimum-size check: sorting zero or one item is well defined, and rejecting
    # such input made callers fail on perfectly valid (e.g. one-word) data.
    return dict(sorted(x.items(), key=lambda item: item[1], reverse=descending))


def get_vocab(sent_tok_list):
    '''
    Build a vocabulary in the form of a {'word': frequency} dict
    param:
        sent_tok_list: a list of token lists (one list of word tokens per document)
    out:
        vocab: a dictionary with stemmed words as keys and their total frequencies
               over all token lists as values, ordered by sort_dict
    '''
    counts = {}  # stem -> occurrences seen so far
    for tokens in sent_tok_list:
        for token in tokens:
            normalized = token.lower().strip()
            # stop words and punctuation are dropped before stemming
            if normalized in sw or normalized in punctuation:
                continue
            stem = stemmer.stem(normalized)
            counts[stem] = counts.get(stem, 0) + 1

    return sort_dict(counts)


def inverted_index(sent_token_list):
    '''
    Build an inverted index in the form of a {'word': {document number: count}} dict
    param:
        sent_token_list: a list of token lists (one list of word tokens per document)
    out:
        a dictionary with stemmed words as keys; each value maps the 1-based position
        of a document in sent_token_list to how often the word occurs in that document
    '''
    index = {}
    for doc_number, tokens in enumerate(sent_token_list, start=1):
        for token in tokens:
            normalized = token.lower().strip()
            # stop words and punctuation are not indexed
            if normalized in sw or normalized in punctuation:
                continue
            stem = stemmer.stem(normalized)
            postings = index.setdefault(stem, {})
            postings[doc_number] = postings.get(doc_number, 0) + 1
    return index


def cleaned_docs(sent_tok_list):
    '''
    Rebuild every document as one string of cleaned, stemmed words
    param:
        sent_tok_list: a list of token lists (one list of word tokens per document)
    out:
        a list with one string per document; each string holds the stems of the
        document's tokens that are neither stop words nor punctuation, joined by spaces
    '''
    def is_kept(token):
        return token not in sw and token not in punctuation

    rebuilt = []  # one cleaned string per input document, in input order
    for tokens in sent_tok_list:
        normalized = (token.lower().strip() for token in tokens)
        stems = [stemmer.stem(token) for token in normalized if is_kept(token)]
        rebuilt.append(' '.join(stems))
    return rebuilt


In [44]:
# stop words, lower-cased and without surrounding whitespace
sw = [
    entry.strip().lower()
    for entry in 'Is, An, That, Use, And, To, From, In, Both, Of, At, The'.split(',')
]

# the three documents that make up the corpus
doc_1 = 'data science is a field to use scientific method, process, algorithm, system to extract knowledge.'
doc_2 = 'data mining is the process to discover pattern in large data to involve method at the database system.'
doc_3 = 'information system is the study of network of hardware and software that people use to process data.'
DOC = [doc_1, doc_2, doc_3]

In [45]:
# list of lists of word tokens: one token list per document in DOC
DOC_TOKENS = list(map(word_tokenize, DOC))

## Q 1.1, 1.2 
Both are implemented via the functions defined above

In [46]:
# stemmed word -> frequency over the whole corpus, ordered by sort_dict
vocab = get_vocab(DOC_TOKENS)

In [55]:
# display the full vocabulary
vocab

{'data': 4,
 'process': 3,
 'system': 3,
 'method': 2,
 'scienc': 1,
 'a': 1,
 'field': 1,
 'scientif': 1,
 'algorithm': 1,
 'extract': 1,
 'knowledg': 1,
 'mine': 1,
 'discov': 1,
 'pattern': 1,
 'larg': 1,
 'involv': 1,
 'databas': 1,
 'inform': 1,
 'studi': 1,
 'network': 1,
 'hardwar': 1,
 'softwar': 1,
 'peopl': 1}

```data``` appears 4 times across all of the documents, ```process``` and ```system``` 3 times each, and ```method``` twice

In [47]:
# the 5 most frequent words of the vocabulary, ranked by their counts
nlargest(5, vocab, key=vocab.get)

['data', 'process', 'system', 'method', 'scienc']

## Q 1.3

In [48]:
# stemmed word -> {1-based document number: occurrences in that document}
IDX = inverted_index(DOC_TOKENS)

In [49]:
# display the full inverted index
IDX

{'data': {1: 1, 2: 2, 3: 1},
 'scienc': {1: 1},
 'a': {1: 1},
 'field': {1: 1},
 'scientif': {1: 1},
 'method': {1: 1, 2: 1},
 'process': {1: 1, 2: 1, 3: 1},
 'algorithm': {1: 1},
 'system': {1: 1, 2: 1, 3: 1},
 'extract': {1: 1},
 'knowledg': {1: 1},
 'mine': {2: 1},
 'discov': {2: 1},
 'pattern': {2: 1},
 'larg': {2: 1},
 'involv': {2: 1},
 'databas': {2: 1},
 'inform': {3: 1},
 'studi': {3: 1},
 'network': {3: 1},
 'hardwar': {3: 1},
 'softwar': {3: 1},
 'peopl': {3: 1}}

```data``` appears once in document 1, twice in document 2 and once in document 3

## Q. 1.4
Boolean Queries. Construct queries which appear in at least 2 documents using and, or, not

In [57]:
# Reuse the corpus built earlier in DOC instead of repeating the three string
# literals, so the two copies of the documents can never drift apart.
doc_1, doc_2, doc_3 = DOC

In [58]:
def _docs_with(term):
    '''Return the set of 1-based document numbers that IDX lists for `term`.'''
    # IDX keys are lower-cased stems, so the query term is normalised the same way
    # (e.g. 'database' is looked up as its stem).
    return set(IDX.get(stemmer.stem(term.lower().strip()), {}))


# All document numbers; needed to evaluate NOT as a set complement.
_ALL_DOCS = set(range(1, len(DOC_TOKENS) + 1))

# Applying and/or/not directly to non-empty strings never looks at the documents:
# it just evaluates to one of the strings. Each query below keeps the same boolean
# structure but runs it on the posting sets of the inverted index
# (and -> &, or -> |, not -> complement with respect to _ALL_DOCS).

# (data OR method) AND database -> with the index shown above: {2}
d_2 = (_docs_with('data') | _docs_with('method')) & _docs_with('database')

# NOT (pattern OR mine) OR (data OR method) -> with the index shown above: {1, 2, 3}
# NOTE(review): as written this matches every document, not only document 1;
# confirm the intended query.
d_1 = (_ALL_DOCS - (_docs_with('pattern') | _docs_with('mine'))) | (
    _docs_with('data') | _docs_with('method')
)

# NOT (data OR method) OR ((system OR process) OR (inform OR network))
# -> with the index shown above: {1, 2, 3}
d_3 = (_ALL_DOCS - (_docs_with('data') | _docs_with('method'))) | (
    (_docs_with('system') | _docs_with('process'))
    | (_docs_with('inform') | _docs_with('network'))
)

## Q 1.5 Vector Model (using TF-IDF)

In [50]:
# one string of cleaned words per document, as returned by cleaned_docs
CLEANED_DOC = cleaned_docs(DOC_TOKENS) # document with cleaned words 

In [51]:
# fit the vectorizer on the document data only, then turn the same documents into TF-IDF vectors
tf_vect = TfidfVectorizer()
tf_vect.fit(CLEANED_DOC)
tf_docs = tf_vect.transform(CLEANED_DOC)

In [52]:
query = 'data science and algorithm is fun while mining data to discover pattern in data and applying things'
# replace the raw text by the list of its distinct cleaned, stemmed terms
query = [term for term in get_vocab([word_tokenize(query)])]

In [53]:
# `query` is a list of single terms, so every term is vectorised as its own one-word
# document; summing over the term axis gives one total score per corpus document.
# The sparse matrices are passed as they are: cosine_similarity accepts sparse input,
# whereas the np.matrix objects produced by .todense() are rejected by recent
# scikit-learn versions.
cosine_similarity(tf_vect.transform(query), tf_docs).sum(axis=0)

array([0.93818073, 1.41990924, 0.22249441])

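Each score is the sum of the cosine similarities between the individual query terms and a document, which is why a score can exceed 1. The second score is the highest, which means that the given query is most relevant to the ```second``` document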