In [1]:
import re
import os
import numpy as np
from nltk.corpus import stopwords

# Irrelevant words (the NLTK English stop-word list) that are removed from
# document and query texts before indexing and scoring.
stop_words = stopwords.words('english')
# Token delimiter: one or more whitespace, digit or punctuation characters.
# Written as a raw string so that the regex escapes (\(, \), \", \') reach the
# regex engine unchanged instead of being parsed as (invalid) string escapes,
# which triggers a DeprecationWarning/SyntaxWarning on recent Python versions.
# The resulting character class is identical to the previous one.
DELIM = r'[ \n\t0123456789;:.,&$£/#~@><|!%^*\(\)\"\'-]+'


In [2]:
def readfile(path, docid):
    """Read one document from a directory and return its text.

    Documents are addressed by position: the directory listing of ``path``
    is sorted by file name and ``docid`` indexes into that sorted list.

    Parameters:
        path:  directory containing the document files.
        docid: integer index into the sorted directory listing.

    Returns:
        The full file contents as a string, decoded as latin-1.

    Raises:
        IndexError: if ``docid`` is outside the directory listing.
        OSError:    if the directory or the file cannot be read.
    """
    files = sorted(os.listdir(path))

    # The context manager guarantees the handle is closed even if read() fails
    with open(os.path.join(path, files[docid]), 'r', encoding='latin-1') as f:
        return f.read()


In [3]:
def tokenize(text):
    """Split ``text`` into lower-case tokens.

    The text is lower-cased first so that differently capitalised spellings
    of a word produce the same token, then split on the DELIM pattern.
    Note that the split can yield empty strings (e.g. when the text starts
    or ends with a delimiter); callers are expected to skip them.
    """
    lowered = text.lower()
    splitter = re.compile(DELIM)
    return splitter.split(lowered)


In [4]:
def indextextfiles_RR(path):
    """Build a tf-idf index over every document in the directory ``path``.

    Each document is read with ``readfile``, tokenised with ``tokenize`` and
    stripped of stop words. For every remaining non-empty term the function
    records which documents contain it, its normalised term frequency per
    document, and finally its tf-idf weight per document, where
    idf = log10(N / number of documents containing the term).

    Parameters:
        path: directory containing the document files.

    Returns:
        A 6-tuple ``(postings, N, final_docCollection, uniqueWords,
        specificDocWords, tf_in_all_docs)``:

        postings            -- dict: term -> set of docIDs containing the term
        N                   -- number of documents in ``path``
        final_docCollection -- list indexed by docID of dicts term -> tf-idf
        uniqueWords         -- dict whose keys are every token seen after
                               stop-word removal (values are always 1)
        specificDocWords    -- dict term -> raw count for the LAST document
                               processed (empty dict if there are no documents)
        tf_in_all_docs      -- list indexed by docID of dicts
                               term -> normalised term frequency
    """
    N = len(os.listdir(path))                     # The number of documents
    stop_set = set(stop_words)                    # Set gives constant-time stop-word checks
    postings = {}                                 # term -> set of docIDs it appears in
    uniqueWords = {}                              # every token seen in any document
    tf_in_all_docs = []                           # per-document normalised term frequencies

    # Bound up front so that an empty directory returns an empty dict
    # instead of failing on an unassigned local at the return statement.
    specificDocWords = {}

    for docID in range(N):
        # Sorting keeps the per-document dictionaries in alphabetical order
        tokens = sorted(tokenize(readfile(path, docID)))
        words = [w for w in tokens if w not in stop_set]
        specificDocWords = {}                     # raw term counts for this document

        for w in words:
            # query_RR validates every raw query token (including empty ones)
            # against this dict, so empty tokens are recorded here as well.
            uniqueWords.setdefault(w, 1)

            if w == '':
                continue                          # empty tokens are never indexed

            postings.setdefault(w, set()).add(docID)
            specificDocWords[w] = specificDocWords.get(w, 0) + 1

        # NOTE(review): the denominator is the length of the filtered token
        # list, which still includes any empty tokens left by the split. It is
        # kept unchanged so that scores match previously recorded results.
        words_in_doc_Count = len(words)
        tf_in_all_docs.append(
            {word: count / words_in_doc_Count for word, count in specificDocWords.items()}
        )

    # idf = log10(number of documents / document frequency of the term)
    idf_in_all_docs = {
        word: np.log10(N / len(docIDs)) for word, docIDs in postings.items()
    }

    # tf-idf = normalised term frequency * idf, one dictionary per document
    final_docCollection = [
        {word: tf * idf_in_all_docs[word] for word, tf in normTF.items()}
        for normTF in tf_in_all_docs
    ]

    return postings, N, final_docCollection, uniqueWords, specificDocWords, tf_in_all_docs

In [5]:
def query_RR(postings, qtext):
    """Rank documents against a free-text query and print the top 10.

    The query is tokenised with ``tokenize``; stop words and empty tokens are
    discarded. If any remaining query word has no entry in ``postings`` a
    message naming that word is printed and nothing is ranked. Otherwise every
    document containing at least one query word is scored as the sum of the
    tf-idf weights of the query words it contains (a word repeated in the
    query is counted once per occurrence), and the ten highest-scoring
    documents are printed as a ``{docID: score}`` dictionary.

    Parameters:
        postings: dict mapping term -> set of docIDs, as returned by
                  ``indextextfiles_RR``.
        qtext:    the query string.

    Returns:
        None. The result is printed.

    Note:
        The per-document tf-idf weights are read from the notebook-level
        variable ``final_docCollection``, which must have been built by
        ``indextextfiles_RR`` before this function is called.
    """
    # Empty tokens (left by the split around punctuation) are dropped before
    # validation so that a query such as 'christmas!' is not rejected for
    # containing the "word" ''.
    words = [w for w in tokenize(qtext) if w != '' and w not in stop_words]

    # A term is present in some document exactly when it has a postings entry,
    # so the postings argument itself is used for validation.
    for w in words:
        if w not in postings:
            print("The word \'" + w + "\' cannot be found in any of the documents! \nRemove this word from the query and try again.")
            return

    # docID -> score. Each candidate document is scored once, in the order in
    # which it is first reached through the query words' postings.
    score_of_documents = {}
    for w in words:
        for docID in postings[w]:
            if docID in score_of_documents:
                continue
            document = final_docCollection[docID]
            score_of_documents[docID] = sum(document.get(q, 0) for q in words)

    # Highest score first; only the top 10 documents are reported
    sorted_keys = sorted(score_of_documents, key=score_of_documents.get, reverse=True)
    final_rankings = {docID: score_of_documents[docID] for docID in sorted_keys[:10]}

    print("The top 10 relevent documents with their scores for the given query are: \n\n" + str(final_rankings))

In [7]:
# Build the index over the 'docs' directory and bind the results as notebook
# globals, then run a sample ranked query. query_RR reads final_docCollection
# (and uniqueWords) from these globals, so this cell must run before any query.
postings, N, final_docCollection, uniqueWords, specificDocWords, tf_in_all_docs = indextextfiles_RR('docs')
query_RR(postings,'christmas champions mourinho arsenal')

The top 10 relevent documents with their scores for the given query are: 

{372: 0.09078423131156231, 736: 0.08696477174635121, 446: 0.08347265659341374, 294: 0.07990606677483257, 462: 0.06852744216405984, 4: 0.06643204207432916, 36: 0.05810832979076925, 80: 0.05746696492744416, 91: 0.056897343847774895, 177: 0.05162035849800805}


# Testing of the system

In [8]:
# Manual check on document 736: show its raw text, then count its tokens
# after stop-word removal (the same preprocessing the indexer applies).
s = readfile('docs', 736)
print(s)
words = [w for w in sorted(tokenize(s)) if w not in stop_words]

# Any empty-string tokens produced by the split are still included in this count
print("\nThe number of words in the document when stop words are removed: \n", len(words), sep="")

No lack of Christmas spirit

It's that time of year when footballers and managers brace themselves for what I think is the most important period of the entire season.

I was thinking to myself last week that the last time I had a Christmas off was 39 years ago. I have never been out of work at Christmas as a player or manager since I was 17 when our youth team coach at Chesterfield, a chap called Reg Wright, gave us Christmans off. But only because there were no games. I think things have changed dramatically over the years in terms of discipline and looking after themselves. Players take a lot more responsibility these days, in particular the older ones - I'm talking about those 32 and over, here. They've changed their whole outlook in order to continue playing at this level. Managers as well need to trust players more than we have in the past. In my squad I haven't got anyone I have to warn regarding excess and over-eating, which is a massive bonus.

Over the years, there have been s

In [9]:
# specificDocWords holds the raw counts of the last document that was indexed
print("The raw term frequencies for the last document in the collection: \n\n", specificDocWords, sep="")

The raw term frequencies for the last document in the collection: 

{'accept': 1, 'accident': 1, 'ago': 1, 'allow': 1, 'also': 2, 'always': 2, 'amy': 1, 'anyone': 1, 'asked': 2, 'balance': 1, 'become': 1, 'bed': 1, 'behave': 2, 'behaviour': 1, 'big': 1, 'bit': 1, 'bonus': 1, 'booze': 1, 'boxing': 1, 'brace': 1, 'called': 1, 'came': 1, 'certain': 1, 'changed': 2, 'chap': 1, 'chesterfield': 1, 'children': 1, 'christmans': 1, 'christmas': 13, 'coach': 1, 'continue': 1, 'could': 1, 'coventry': 2, 'day': 5, 'days': 1, 'discipline': 1, 'dramatically': 1, 'dublin': 1, 'eating': 1, 'embarrassing': 1, 'entire': 1, 'eve': 1, 'eventually': 1, 'everybody': 1, 'excess': 1, 'fact': 1, 'feet': 1, 'festivities': 1, 'fight': 1, 'find': 2, 'football': 2, 'footballer': 1, 'footballers': 2, 'game': 2, 'games': 1, 'gave': 1, 'get': 1, 'getting': 1, 'go': 2, 'going': 3, 'got': 1, 'govern': 1, 'grotto': 1, 'ground': 1, 'happy': 1, 'holding': 1, 'holiday': 1, 'home': 1, 'hope': 1, 'hours': 1, 'idiot': 1, 'imp

In [10]:
# Normalised term frequencies of document 736, for comparison with the raw counts
print("The normalised versions of the terms in the same document: \n\n", tf_in_all_docs[736], sep="")

The normalised versions of the terms in the same document: 

{'accept': 0.0034965034965034965, 'accident': 0.0034965034965034965, 'ago': 0.0034965034965034965, 'allow': 0.0034965034965034965, 'also': 0.006993006993006993, 'always': 0.006993006993006993, 'amy': 0.0034965034965034965, 'anyone': 0.0034965034965034965, 'asked': 0.006993006993006993, 'balance': 0.0034965034965034965, 'become': 0.0034965034965034965, 'bed': 0.0034965034965034965, 'behave': 0.006993006993006993, 'behaviour': 0.0034965034965034965, 'big': 0.0034965034965034965, 'bit': 0.0034965034965034965, 'bonus': 0.0034965034965034965, 'booze': 0.0034965034965034965, 'boxing': 0.0034965034965034965, 'brace': 0.0034965034965034965, 'called': 0.0034965034965034965, 'came': 0.0034965034965034965, 'certain': 0.0034965034965034965, 'changed': 0.006993006993006993, 'chap': 0.0034965034965034965, 'chesterfield': 0.0034965034965034965, 'children': 0.0034965034965034965, 'christmans': 0.0034965034965034965, 'christmas': 0.0454545454

In [11]:
# Document frequency of 'years' = size of its postings set
print("The number of documents that contain the specific word \'years\' \n", len(postings['years']), sep="")

The number of documents that contain the specific word 'years' 
175


In [12]:
# Show the full postings set for 'years' so that docID 736 can be checked by eye
print(
    "All of the documents that contain the specific word \'years\' \n\n",
    postings['years'],
    "\n\nThe final document in the collection, \'document 736\' can be seen in here which is expected",
    sep="",
)

All of the documents that contain the specific word 'years' 

{0, 5, 517, 7, 12, 15, 16, 527, 20, 22, 537, 541, 543, 33, 546, 547, 36, 37, 40, 553, 43, 45, 558, 559, 49, 52, 53, 565, 568, 575, 576, 66, 578, 68, 579, 70, 75, 76, 79, 80, 591, 82, 597, 603, 604, 94, 607, 613, 615, 617, 106, 108, 622, 623, 624, 113, 123, 124, 125, 636, 127, 129, 645, 646, 135, 649, 139, 140, 141, 655, 145, 146, 659, 662, 151, 664, 158, 672, 161, 162, 675, 676, 165, 679, 681, 170, 682, 683, 685, 691, 692, 183, 695, 185, 696, 188, 700, 190, 701, 196, 709, 710, 711, 201, 715, 206, 719, 723, 212, 725, 726, 727, 729, 218, 731, 732, 733, 736, 233, 236, 238, 241, 257, 258, 264, 269, 270, 271, 274, 277, 283, 292, 293, 304, 306, 310, 315, 318, 325, 329, 335, 336, 339, 341, 342, 348, 351, 361, 367, 369, 370, 375, 376, 383, 385, 386, 388, 391, 392, 393, 394, 397, 401, 407, 414, 424, 446, 447, 471, 483, 485, 486, 494, 503, 510}

The final document in the collection, 'document 736' can be seen in here which is expected

In [13]:
# idf = log10(number of documents / document frequency). The document count
# comes from N (returned by indextextfiles_RR) rather than a hard-coded 737,
# so this check stays correct if the collection changes.
print("The idf value of the word \'years\' is: \n"+ str(np.log10(N / len(postings['years']))))

The idf value of the word 'years' is: 
0.6244294391727571


In [14]:
# tf-idf weights of document 736 (normalised tf multiplied by idf, per term)
print("The tfidf values of the words in the final document: \n", final_docCollection[736], sep="")

The tfidf values of the words in the final document: 
{'accept': 0.00613120327116159, 'accident': 0.0062527490972427505, 'ago': 0.003641233164888899, 'allow': 0.004811558720366359, 'also': 0.002679494045540319, 'always': 0.005947091947973924, 'amy': 0.008973557665017728, 'anyone': 0.005021341691258966, 'asked': 0.009254541562998432, 'balance': 0.006018669413219628, 'become': 0.004542887285986211, 'bed': 0.007921005232626186, 'behave': 0.017947115330035457, 'behaviour': 0.005913902897913882, 'big': 0.002930362622985461, 'bit': 0.0033530855558755305, 'bonus': 0.007305301529634293, 'booze': 0.010026110097409271, 'boxing': 0.007305301529634293, 'brace': 0.007582159033297317, 'called': 0.0041796140906410284, 'came': 0.002427556711352369, 'certain': 0.004763347935451558, 'changed': 0.010400393329702416, 'chap': 0.008973557665017728, 'chesterfield': 0.010026110097409271, 'children': 0.007305301529634293, 'christmans': 0.010026110097409271, 'christmas': 0.08696477174635121, 'coach': 0.00200350

In [15]:
# Single-term baseline query. The message printed afterwards states the score
# document 736 is expected to get once 'years' is added to the query; the
# 'christmas years' query is what actually produces that combined score.
query_RR(postings,'christmas')
print("\nThe document \'736\' is the most relevent for the word \'christmas\'. Therefore, one final test should show that the addition of both the scores of \'years\' and \'christmas\' should be equal to 0.093514730898513. It passes!")

The top 10 relevent documents with their scores for the given query are: 

{736: 0.08696477174635121, 61: 0.018942821568512146, 686: 0.014439433799394163, 462: 0.013015135907617187, 553: 0.01145643699652531, 409: 0.011320857860471755, 452: 0.0073023854138157505, 73: 0.005780135886464431, 76: 0.0036166823788652677}

The document '736' is the most relevent for the word 'christmas'. Therefore, one final test should show that the addition of both the scores of 'years' and 'christmas' should be equal to 0.093514730898513. It passes!


In [16]:
# Two-term query: each document's score is the sum of the tf-idf weights of
# the query words it contains.
query_RR(postings,'christmas years')

The top 10 relevent documents with their scores for the given query are: 

{736: 0.093514730898513, 61: 0.018942821568512146, 553: 0.018934633872845753, 731: 0.016432353662440974, 686: 0.014439433799394163, 407: 0.013723723937862793, 462: 0.013015135907617187, 385: 0.012614736144904184, 733: 0.012243714493583472, 604: 0.012164209854014748}


In [19]:
# Query containing a token that occurs in no document. tokenize lower-cases
# the query, so the word is reported in lower case and no ranking is printed.
query_RR(postings,'christmas years nonExistantWORD')
print("\nThe system is also robust to words which cannot be found in any of the documents!")

The word 'nonexistantword' cannot be found in any of the documents! 
Remove this word from the query and try again.

The system is also robust to words which cannot be found in any of the documents!
