### Import Library

In [1]:
import re
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet as wn
from collections import defaultdict
from nltk.stem import PorterStemmer 


ps = PorterStemmer() 

# Inisialisasi 
doc_ids = list(range(56))



#### Clean Text
- mengambil teks sebagai input dan membersihkannya dari karakter khusus, angka, dan kontraksi kata (e.g., "won't" menjadi "will not"). 

In [2]:
# Fungsi untuk membersihkan teks
def clean_text(text):
    text = re.sub('  ', ' ', text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "can not", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"'re", " are", text)
    text = re.sub(r"'s", " is", text)
    text = re.sub(r"'d", " would", text)
    text = re.sub(r"'ll", " will", text)
    text = re.sub(r"'t", " not", text)
    text = re.sub(r"'ve", " have", text)
    text = re.sub(r"'m", " am", text)
    text = re.sub(r'[0-9]+', '', text)
    text = re.sub(r'[^\w\s]', ' ', text)
    return text


### Inverted Index
 Fungsi ini membuat indeks teks terbalik (inverted index) dari dokumen-dokumen. Ini juga melakukan pemrosesan teks, menghilangkan stopwords, dan melakukan stemming sebelum membangun indeks.

In [3]:
# Fungsi untuk membuat inverted index
def inverted_index(stop_words):
    dictionary = defaultdict(list)
    documents = {}
    
    for i in range(56):
        doc_no = i
        with open("data/trump_speechs/speech_" + str(doc_no) + ".txt", 'r') as file:
            next(file)
            s = file.read().replace('\n', ' ')
        
        s = clean_text(s)
        
        key = 'speech_' + str(doc_no)
        documents.setdefault(key, [])
        documents[key].append(s)
        
        s = s.lower()
        s = [words if words not in stop_words else '' for words in s.split(' ')]
        doc = list(filter(None, s)) 
        stemmed = [ps.stem(word) for word in doc]
        
        for x in stemmed:
            dictionary[x].append(doc_no)
    
    return dictionary, documents


### Positional Index

Fungsi ini mirip dengan fungsi inverted_index, tetapi juga membangun indeks posisional yang menyimpan informasi tentang posisi kata dalam dokumen.

In [4]:
# Fungsi untuk membuat positional index
def positional_index(stop_words):
    dictionary = defaultdict(dict)
    documents = {}
    
    for i in range(56):
        doc_no = i
        with open("data/trump_speechs/speech_" + str(doc_no) + ".txt", 'r') as file:
            s = file.read().replace('\n', ' ')[1:]
        
        s = clean_text(s)
        
        key = 'speech_' + str(doc_no)
        documents.setdefault(key, [])
        documents[key].append(s)
        
        s = s.lower()
        s = s.split(' ')
        doc = list(filter(None, s)) 
        temp_dict = {}
        stemmed = [ps.stem(word) for word in doc]
        
        for a, x in enumerate(stemmed):
            temp_dict.setdefault(x, []).append(a)
        
        for x in temp_dict:
            dictionary[x][doc_no] = temp_dict[x]
    
    return dictionary, documents


In [5]:
# Mendapatkan stopwords dari file
stop_words = []
with open("data/stopword_en.txt", 'r') as file:
    stop_words = file.read().split()

# Mendapatkan inverted_index dan positional_index
dictionary_inverted, docu = inverted_index(stop_words)
dictionary_positional, docu = positional_index(stop_words)

In [8]:
print(f"Inverted Index: {dictionary_inverted}")

Inverted Index: defaultdict(<class 'list'>, {'trump': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 3, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 10, 11, 11, 12, 12, 13, 14, 16, 16, 16, 17, 19, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 21, 22, 24, 26, 29, 29, 30, 31, 31, 32, 32, 33, 33, 33, 34, 35, 36, 36, 37, 37, 37, 39, 39, 40, 40, 41, 41, 43, 43, 45, 45, 45, 45, 45, 45, 45, 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 48, 48, 48, 49, 49, 4

Fungsi postfix ini menerima satu argumen infix_tokens, yang merupakan daftar token dalam ekspresi infix yang akan dikonversi menjadi ekspresi postfix.

Fungsi ini berguna dalam konteks pencarian informasi atau ekspresi query dalam sistem pencarian teks, di mana ekspresi infix dapat dengan mudah diubah menjadi ekspresi postfix untuk evaluasi lebih lanjut.

In [9]:
#infix to postifix query
def postfix(infix_tokens):
    
    #precendence initialization
    precedence = {}
    precedence['NOT'] = 3
    precedence['AND'] = 2
    precedence['OR'] = 1
    precedence['('] = 0
    precedence[')'] = 0    

    output = []
    operator_stack = []
    
    #creating postfix expression
    for token in infix_tokens:
        if (token == '('):
            operator_stack.append(token)

        elif (token == ')'):
            operator = operator_stack.pop()
            while operator != '(':
                output.append(operator)
                operator = operator_stack.pop()
        
        elif (token in precedence):
            if (operator_stack):
                current_operator = operator_stack[-1]
                while (operator_stack and precedence[current_operator] > precedence[token]):
                    output.append(operator_stack.pop())
                    if (operator_stack):
                        current_operator = operator_stack[-1]

            operator_stack.append(token)

        else:
            output.append(token.lower())
    
    #while staack is not empty appending
    while (operator_stack):
        output.append(operator_stack.pop())
    return output



In [10]:
#AND two posting lists
def AND_op(word1,word2):
    if ((word1) and (word2)):
        return set(word1).intersection(word2)
    else:
        return set()
     
#OR two posting lists
def OR_op(word1, word2):
    if word1 is not None and word2 is not None:
        return set(word1).union(word2)
    else:
        return set()
   
#NOT two posting lists
def NOT_op(a,doc_ids):
    return set(doc_ids).symmetric_difference(a)



AND digunakan untuk mencari dokumen yang mengandung kedua kata, OR digunakan untuk mencari dokumen yang mengandung salah satu atau kedua kata, dan NOT digunakan untuk mengecualikan dokumen-dokumen yang mengandung suatu kata.


### Boolean query processing

In [11]:
#Boolean query processing
def process_query(q,dictionary_inverted):

    q = q.replace('(', '( ')
    q = q.replace(')', ' )')
    q = q.split(' ')
    query = []

    for i in q:
        query.append(ps.stem(i))
    for i in range(0,len(query)):
        if ( query[i]== 'and' or query[i]== 'or' or query[i]== 'not'):
            query[i] = query[i].upper()
    results_stack = []
    postfix_queue = postfix(query)

    #evaluating postfix query expression
    for i in postfix_queue:
        if ( i!= 'AND' and i!= 'OR' and i!= 'NOT'):
            i = i.replace('(', ' ')
            i = i.replace(')', ' ')
            i = i.lower()
            i = dictionary_inverted.get(i)
            results_stack.append(i)
        elif (i=='AND'):
            a = results_stack.pop()
            b = results_stack.pop()
            results_stack.append(AND_op(a,b))
        elif (i=='OR'):
            a = results_stack.pop()
            b = results_stack.pop()
            results_stack.append(OR_op(a,b))
        elif (i == 'NOT'):
            a = results_stack.pop()
            print(a)
            results_stack.append(NOT_op(a,doc_ids))
            
    return results_stack.pop()



### Evaluasi Positional Index

In [12]:
#Evaluating proximity query
def positional_query(q,dictionary_positional):
    
    q = re.sub(r"AND", "", q)
    q = re.sub(r"  ", " ", q)
    q = q.split(' ')
    query = []
    
    for i in q:
        query.append(ps.stem(i))
        
    word1 = dictionary_positional.get(query[0])
    word2 = dictionary_positional.get(query[1])
    anding = set(word1).intersection(word2)
    
    query[2] = re.sub(r"/", "", query[2])
    answer = []
    skip = int(query[2]) + 1
    for i in anding:
        pp1 = dictionary_positional.get(query[0])[i]
        pp2 = dictionary_positional.get(query[1])[i]
        plen1 = len(pp1)
        plen2 = len(pp2)
        ii = jj = 0 
        while ii != plen1:
            while jj != plen2:
                if (abs(pp1[ii] - pp2[jj]) == skip):
                    answer.append(i)
                elif pp2[jj] > pp1[ii]:
                    break    
                jj+=1
            ii+=1
    answer = list(dict.fromkeys(answer))
    return answer



In [13]:
#checking whether word is present within position
def doc_check(ii,jj,plen1,plen2,skip,pp1,pp2):
    while ii != plen1:
        while jj != plen2:
            if (abs(pp1[ii] - pp2[jj]) == skip):
                return 1
            elif pp2[jj] > pp1[ii]:
                break
            jj+=1
        ii+=1
    return 0



In [14]:
#Evaluating phrase query     
def phrase_query(q,dictionary_positional,dictionary_inverted):
    
    q = q.replace('"', '')
    q = q.split()

    query = []
    for i in q:
        query.append(ps.stem(i))
        query.append('AND')
    query.pop()
    query = " ".join(query)
    anding = process_query(query,dictionary_positional)
    print(anding)
    answer = []
    query = query.replace('AND','')
    query = query.split()
    print(query)

    for i in anding:
        pp1 = dictionary_positional.get(query[0].lower())[i]
        flag = []
        skip = 1
        for x in range(1,len(query)):
            pp2 = dictionary_positional.get(query[x].lower())[i]
            plen1 = len(pp1)
            plen2 = len(pp2)
            ii = jj = 0 
            flag.append(doc_check(ii,jj,plen1,plen2,skip,pp1,pp2))
            skip = skip + 1
        if(0 not in flag):
            answer.append(i)
    answer = list(dict.fromkeys(answer))
    
    return answer

### Deployment

In [15]:
from flask import Flask, render_template, request

import time

app = Flask(__name__)

#Getting stopwords from the file
stop_words = []
with open ("data/stopword_en.txt",'r') as file:
    s=file.read().replace('\n',' ')
stop_words = s.split()

#Getting inverted_index and positional_index
dictionary_inverted,docu = inverted_index(stop_words)
dictionary_positional,docu = positional_index(stop_words)

#Returning Relevant document retrieved
def documents_ret(a):
    documents = {}
    if(a):
        for i in a:
            speech = "speech_" + str(i)
            documents.setdefault(speech,[])
            documents[speech].append(docu.get(speech))
    else:
        documents = {}
    
    return documents
        

#Default page display/home_page
@app.route('/')
def dictionary():
    return render_template('home.html')

#Funtion will invoke whenever a query is posted
@app.route("/query", methods=['POST'])
def upload():
    #query processing start time
    start = time.time()
    #getting query from the HTML form
    query = request.form['query']
    #Checking for boolean,proximity and phrase queries
    if('/' in query):
        result = positional_query(query,dictionary_positional)
    elif('"' not in query):
        result = process_query(query,dictionary_inverted)
    else:
        result = phrase_query(query,dictionary_positional,dictionary_inverted)

    documents = documents_ret(result)
    print(result)
    end = time.time()
    #total time to process query
    times = end - start
    return render_template('dictionary.html',dictionary = documents, num_docs= len(documents), time = str(times) + " " + "seconds")

if __name__ == '__main__':
    app.run()


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [10/Oct/2023 05:08:57] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [10/Oct/2023 05:08:58] "GET /static/stylesheets/display_style.css HTTP/1.1" 304 -
127.0.0.1 - - [10/Oct/2023 05:09:12] "POST /query HTTP/1.1" 200 -


[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 12, 13, 14, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 21, 21, 21, 21, 23, 23, 23, 23, 24, 24, 24, 24, 24, 24, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 28, 28, 29, 29, 29, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 36, 36, 36, 36, 36, 37, 37, 37, 37, 37, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 41, 41, 41, 41, 41, 41, 41, 41

127.0.0.1 - - [10/Oct/2023 05:09:16] "GET /static/stylesheets/display_style.css HTTP/1.1" 304 -
127.0.0.1 - - [10/Oct/2023 05:09:37] "POST /query HTTP/1.1" 200 -


{3}


127.0.0.1 - - [10/Oct/2023 05:09:38] "GET /static/stylesheets/display_style.css HTTP/1.1" 304 -
127.0.0.1 - - [10/Oct/2023 05:09:49] "POST /query HTTP/1.1" 200 -


set()


127.0.0.1 - - [10/Oct/2023 05:09:50] "GET /static/stylesheets/display_style.css HTTP/1.1" 304 -
