As a baseline, we implement a simple document retrieval system which
* processes the texts (cleaning + removal of stop words + lemmatizing), then
* represents the documents using the TF-IDF statistic, and
* performs the comparison using cosine similarity.

In [20]:
#!pip install ipynb
#from ipynb.fs.full.preprocessing import parseDocs, tokenize_and_clean, lemmatize
#from preprocessing import *

In [61]:
#from cosine_sim
import numpy as np
def cosine_sim(x, y):
    """Calculate the cosine similarity between two vectors.

    Parameters
    ----------
    x : numpy.ndarray
        vector representation (of query)
    y : numpy.ndarray
        vector representation (of document)

    Returns
    -------
    cos_sim : numpy.float64
        cosine similarity between vector x and y; 0.0 when either vector
        has zero length, since the angle is undefined in that case
    """
    norm_product = np.linalg.norm(x) * np.linalg.norm(y)

    # A zero vector (e.g. a query with no known terms) would otherwise give
    # 0/0 = NaN, and NaN scores cannot be ranked meaningfully.
    if norm_product == 0:
        return np.float64(0.0)

    return np.dot(x, y) / norm_product

In [62]:
#from cosine_sim
def get_k_relevant(k, query, D):
    """Rank the documents of a corpus by cosine similarity with a query.

    Parameters
    ----------
    k : int
        number of best-scoring documents to return; 0 returns every document
    query : numpy.ndarray
        vector representation of the query
    D : list of numpy.ndarray
        vector representations of all documents in the corpus

    Returns
    -------
    ranked_sims : list of tuples (cosine similarity, index of document)
        the top k (similarity, document index) pairs, highest similarity first
    """
    # Score every document, remembering its position in D.
    scored = [(cosine_sim(query, doc_vec), idx) for idx, doc_vec in enumerate(D)]

    # Highest similarity first.
    scored.sort(key=lambda pair: pair[0], reverse=True)

    # k == 0 is the "return everything" sentinel.
    if k == 0:
        return scored
    return scored[:k]

In [25]:
#from preprocessing file

import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.tokenize import sent_tokenize, TreebankWordTokenizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
import re
from bs4 import BeautifulSoup #to remove HTML tags

# Shared WordNet lemmatizer instance; lemmatize() below calls it for every token.
lemmatizer = WordNetLemmatizer()
# English stop-word list from the NLTK corpus; tokenize_and_clean() filters against it.
stop_list = stopwords.words('english')

[nltk_data] Downloading package punkt to /home/bayrakd1/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/bayrakd1/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/bayrakd1/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [26]:
#from preprocessing file
def parseDocs(filename):
    """Extract the text bodies from a file of ``.I``/``.W``-delimited records.

    A record starts at a line beginning with ``.I``; its body is every line
    after the following line beginning with ``.W``, up to the next ``.I``
    line or the end of the file. Everything between ``.I`` and ``.W`` is
    skipped.

    Parameters
    ----------
    filename : str
        path of the file to read

    Returns
    -------
    docs : list of strings
        the body text of each record, in file order. Records whose body is
        empty are omitted, so positions in this list do not necessarily
        match the ids on the ``.I`` lines.
    """
    docs = []
    body_lines = []
    in_body = False

    with open(filename, "r") as f:
        for line in f:
            # Markers are only recognised at the start of a line, so body
            # text that merely contains ".I" or ".W" is not cut short.
            if line.startswith(".I"):
                in_body = False
                if body_lines:
                    docs.append("".join(body_lines))
                    body_lines = []
            elif line.startswith(".W"):
                in_body = True
            elif in_body:
                body_lines.append(line)

    # The last record is not followed by another ".I" line.
    if body_lines:
        docs.append("".join(body_lines))

    return docs

In [22]:
#from preprocessing file
def tokenize_and_clean(docs):
    """Tokenize texts into lowercased, filtered tokens.

    Each document is lowercased, stripped of HTML tags with BeautifulSoup,
    split into sentences with sent_tokenize() and then into tokens with
    TreebankWordTokenizer().tokenize(). A token is dropped when it is in
    the module-level ``stop_list`` or when its first character is not an
    ASCII letter or digit (e.g. punctuation tokens). Characters after the
    first one are not inspected.

    Parameters
    ----------
    docs : list of strings
        list of document contents

    Returns
    -------
    tokens : list of list of strings
        each text as a list of lowercased tokens
    """
    # Loop-invariant helpers: build them once instead of per sentence/token.
    tokenizer = TreebankWordTokenizer()
    stop_words = set(stop_list)  # set membership instead of a list scan
    starts_with_symbol = re.compile('[^A-Za-z0-9]').match

    tokens = []

    for doc in docs:
        # lower-case first, then remove HTML tags
        txt = BeautifulSoup(doc.lower(), 'html.parser').get_text()

        # sentence split -> word tokenize -> filter, flattened into one list
        clean_tokens = [
            word
            for sent in sent_tokenize(txt)
            for word in tokenizer.tokenize(sent)
            if word not in stop_words and not starts_with_symbol(word)
        ]

        tokens.append(clean_tokens)

    return tokens

In [23]:
#from preprocessing file
def lemmatize(doc_tokens):
    """Lemmatize every token of every document with the shared WordNetLemmatizer.

    Parameters
    ----------
    doc_tokens : list of list of tokens
        one token list per document

    Returns
    -------
    doc_lemmas : list of list of lemmatized tokens
        same structure as the input, with each token replaced by its lemma
    """
    return [
        [lemmatizer.lemmatize(word) for word in tokens]
        for tokens in doc_tokens
    ]

In [28]:
# Load the document collection and run the preprocessing pipeline:
# parse the record bodies -> tokenize and filter -> lemmatize.
base_docs = parseDocs("cran/cran.all.1400")
base_docs = tokenize_and_clean(base_docs)
base_docs = lemmatize(base_docs)

### TF-IDF

In [33]:
import numpy as np
from collections import Counter

def create_term_doc_matrix(base_docs):
    """Construct a frequency term-document matrix.

    The rows are lemma types sorted alphabetically, the columns are the
    documents in the order given in base_docs. Each cell holds the number
    of times the lemma occurs in the document (zero if it is absent).

    Parameters
    ----------
    base_docs : a list of lists of strings [['a','a','b'], ['a','b','c']]
        a list of documents represented as a list of lemmas

    Returns
    -------
    matrix : numpy array
        a matrix where columns are documents and rows are lemma types,
        the cells of the matrix contain lemma counts in a document,
        the lemmas for rows are sorted alphabetically
        for the example above it will be:
            np.array([[2,1],
                      [1,1],
                      [0,1]])

    sorted_vocab : list of strings
        a list of all the lemma types used in all documents (the rows of our matrix)
        the words are strings sorted alphabetically
        for the example above it is ['a','b','c']
    """
    counts_per_doc = [Counter(doc) for doc in base_docs]

    # The vocabulary is the union of the lemma types of all documents.
    sorted_vocab = sorted(set().union(*counts_per_doc))
    row_of = {lemma: row for row, lemma in enumerate(sorted_vocab)}

    matrix = np.zeros((len(sorted_vocab), len(counts_per_doc)))

    # Only visit the lemmas that occur in a document; every other cell
    # keeps the zero it was initialised with.
    for col, counts in enumerate(counts_per_doc):
        for lemma, count in counts.items():
            matrix[row_of[lemma], col] = count

    return matrix, sorted_vocab


In [34]:
# Raw count matrix (rows: lemmas, columns: documents) together with its sorted vocabulary.
td_matrix, sorted_vocab = create_term_doc_matrix(base_docs)

In [36]:
import math

def tf_idf(td_matrix):
    """Weigh a term-document matrix of raw counts with the tf-idf scheme.

    For a term t and a document d the weight is

        tf_idf[t, d] = count[t, d] * log10(N / df_t)

    where N is the number of documents (columns) and df_t is the number of
    documents in which term t occurs at least once.

    Parameters
    ----------
    td_matrix : numpy array
        a matrix where columns are documents and
        rows are word counts in a document

    Returns
    -------
    tf_idf_matrix : numpy array
        a matrix where columns are documents and
        rows are word tf-idf values in a document

    idf_vector : numpy array of shape (vocabulary-size, 1)
        a vector of idf values for words in the collection.
        this vector will be used to weight new query documents.
        a term that occurs in no document gets an idf of 0.
    """
    td_matrix = np.asarray(td_matrix, dtype=float)
    n_terms, n_docs = td_matrix.shape

    # Document frequency: in how many documents does each term occur?
    doc_freq = np.count_nonzero(td_matrix > 0, axis=1)

    # Terms that never occur would divide by zero; leave their idf at 0 so
    # they contribute nothing instead of producing inf/NaN weights.
    idf_vector = np.zeros((n_terms, 1))
    seen = doc_freq > 0
    idf_vector[seen, 0] = np.log10(n_docs / doc_freq[seen])

    # Broadcasting multiplies every row of counts by that term's idf.
    tf_idf_matrix = td_matrix * idf_vector

    return tf_idf_matrix, idf_vector

In [37]:
# Weigh the raw counts with tf-idf; idf_vector is kept to weigh the queries later.
tf_idf_matrix, idf_vector = tf_idf(td_matrix)

In [38]:
def lsi(matrix, d):
    """Return truncated SVD components for latent semantic indexing.

    This function takes in a term-document matrix, where values can be
    either raw frequencies or weighted values (tf_idf), and returns the
    pieces of its truncated SVD ``matrix ~ T_d S_d DT_d`` that are needed
    to compare documents and queries in a d-dimensional space.

    Parameters
    ----------
    matrix : numpy array
        a numpy array where columns are documents and
        rows are lemmas
    d : int
        the number of features we will be reducing our matrix to;
        expected to be at most min(matrix.shape)

    Returns
    -------
    DT_d : numpy array
        a [d x n] matrix, where n is the number of documents (columns) in
        the original matrix and d is the number of features we want to keep.
        this is a matrix that represents documents with values for d hidden topics
    transformation_matrix : numpy array
        a [number of lemmas x d] matrix to transform queries into the same
        vector space as DT_d: T_d S_d^-1, where S_d^-1 is the inverse of S_d

    Raises
    ------
    numpy.linalg.LinAlgError
        if one of the first d singular values is exactly zero, so that
        S_d cannot be inverted
    """
    # The reduced SVD is enough: only the leading d singular triplets are
    # used, and the full one would build a (lemmas x lemmas) matrix.
    T, s, DT = np.linalg.svd(matrix, full_matrices=False)

    DT_d = DT[:d]
    T_d = T[:, :d]
    s_d = s[:d]

    if np.any(s_d == 0):
        raise np.linalg.LinAlgError("Singular matrix")

    # S_d is diagonal, so multiplying by its inverse is a column-wise
    # division by the singular values.
    transformation_matrix = T_d / s_d

    return DT_d, transformation_matrix

In [71]:
# Reduce the tf-idf matrix to dense 20-dimensional document vectors with LSI,
# and keep the matrix that maps queries into the same space.
tf_idf_matrix_dense, tf_idf_matrix_transform = lsi(tf_idf_matrix,20)

In [72]:
# Inspect the shape of the reduced document matrix.
tf_idf_matrix_dense.shape

(20, 1398)

### Transform queries

In [54]:
def create_term_doc_matrix_queries(normalized_queries, sorted_vocabulary):
    """Build a frequency term-document matrix for queries over a fixed vocabulary.

    The rows follow sorted_vocabulary in the order given, the columns follow
    normalized_queries in the order given. Each cell is the number of times
    the lemma occurs in the query, zero if it does not occur. Query lemmas
    that are not part of the vocabulary are ignored.

    Parameters
    ----------
    normalized_queries : a list of lists of strings [['a','a','b','d'], ['a','b','c']]
        a list of queries represented as a list of lemmas
    sorted_vocabulary : list of strings
        a list of all the lemma types used in all training documents (the rows of our matrix)
        for our example it will be ['a','b','c']

    Returns
    -------
    query_matrix : numpy array
        a matrix where columns are the queries in normalized_queries
        and rows are lemma types from sorted_vocabulary.
        for the example above it will be:
            np.array([[2,1],
                      [1,1],
                      [0,1]])
        'd' is not included in the matrix, because it is absent from sorted_vocabulary
    """
    per_query_counts = [Counter(query) for query in normalized_queries]
    n_terms = len(sorted_vocabulary)
    query_matrix = np.zeros((n_terms, len(per_query_counts)))

    for col, counts in enumerate(per_query_counts):
        # A Counter reports 0 for lemmas the query does not contain.
        query_matrix[:, col] = [counts[lemma] for lemma in sorted_vocabulary]

    return query_matrix

In [73]:
# Run the queries through the same preprocessing pipeline as the documents.
normalized_queries = parseDocs("cran/cran.qry")
normalized_queries = tokenize_and_clean(normalized_queries)
normalized_queries = lemmatize(normalized_queries)

# collect term-document matrix (rows follow the vocabulary of the documents)
td_queries = create_term_doc_matrix_queries(normalized_queries, sorted_vocab)
# weigh term-document matrix with tf-idf, reusing the idf values computed on the documents;
# idf_vector has a single column, so it broadcasts across the query columns
tf_idf_queries = td_queries*idf_vector 
# transform matrices with LSI: project every query into the reduced space of the documents
tf_idf_queries_dense = tf_idf_queries.T.dot(tf_idf_matrix_transform).T

In [74]:
# Inspect the shape of the reduced query matrix.
tf_idf_queries_dense.shape

(20, 225)

In [75]:
# Rank the documents against the first query and print the 10 best matches.
# Both matrices are transposed so that each row is one query / one document vector.
query = tf_idf_queries_dense.T[0] 
D = tf_idf_matrix_dense.T

print("get 10 top: ", get_k_relevant(10, query, D))

get 10 top:  [(0.7809579117962733, 50), (0.7676650546661836, 11), (0.7643710468981361, 252), (0.7618345019991778, 1166), (0.7376461569764691, 882), (0.7293633705409656, 1165), (0.729238984321086, 808), (0.7182758380584934, 1325), (0.7170035653923107, 99), (0.7074827920113113, 1302)]
