In [1]:
from pathlib import WindowsPath, Path
from bs4 import BeautifulSoup
from urllib.request import urlopen
import json
from collections import Counter
import re
from nltk.stem import PorterStemmer
import os
from collections import Counter, OrderedDict

In [2]:
def frequency_word_per_url(content):
    #given a url returns a dictionary: key-> word, value-> frequency of word
    #words in dictionary are stemmed

    #https://docs.python.org/3/library/re.html used to with re.findall to find all valid words in url
    #https://www.geeksforgeeks.org/python-stemming-words-with-nltk/ used for learining PorterStemmer
    # resp = urlopen(url)
    # html = resp.read()

    soup = BeautifulSoup(content, 'html.parser')
    for cur in soup(['script', 'style']):
        cur.extract()

    text = soup.get_text(separator=" ")
    words = re.findall(r"\b\w+\b", text.lower())

    stemmer = PorterStemmer()
    stemmed_word_lis = [stemmer.stem(word) for word in words]

    frequency = Counter(stemmed_word_lis)

    return dict(frequency)

In [3]:
def write_to_file(myDict: dict, filename: str):
    with open(filename, 'a') as file:
        for key in myDict:
            data = {key: myDict[key]}
            json.dump(data, file)
            file.write("\n")
    
    print("Finished writing to file", filename)

In [4]:
def rewrite_to_file(myDict: dict, filename: str):

    with open(filename, 'w') as file:
        for key in myDict:
            data = {key: myDict[key]}
            json.dump(data, file)
            file.write("\n")
    
    print("Finished writing to file", filename)

In [33]:
def create_inverted_index(file_name, index_dir):

    key = 0 # for assigning document id
    url_id_map = dict({})   # map each url to an id
    inverted_index  = dict({})
    max_length = 100000
    n_index = 0
    n_id = 0
    
    # read each JSON file, and add url to the map
    with open(file_name, "r") as file:
        for file_path in file:
            file_path = WindowsPath(file_path.strip("\n"))
            print("Reading content from", file_path)
            
            with open(file_path, "r") as file:
                
                # get a json object
                jsonObj = json.load(file)
        
                # get the url and tokens (including its frequency). 
                url = jsonObj["url"]
                content = jsonObj["content"]
                tokens = frequency_word_per_url(content)    # key: token, val: frequency
                
                # map each url to a document id
                url_id_map[key] = url
        
                # update the posting list for each token
                for token in tokens:
                    if token not in inverted_index:
                        inverted_index[token] = {key: tokens[token]}
                    else:
                        inverted_index[token][key] = tokens[token]
                
                if len(inverted_index) >= max_length:
                    inverted_index = dict(sorted(inverted_index.items()))
                    filename = index_dir / f"inverted_index_{str(n_index)}.jsonl" 
                    write_to_file(inverted_index, filename)
                    n_index += 1
                    inverted_index.clear()

                if len(url_id_map) >= max_length:
                    filename = index_dir / f"url_id_{str(n_id)}.jsonl"
                    write_to_file(url_id_map, filename)
                    n_id += 1
                    url_id_map.clear()

        
                # update doc id
                key += 1

        if len(inverted_index) > 0:
            inverted_index = dict(sorted(inverted_index.items()))
            filename = index_dir / f"inverted_index_{str(n_index)}.jsonl"
            write_to_file(inverted_index, filename)
            n_index += 1

        if len(url_id_map) > 0:
            filename = index_dir / f"url_id_{str(n_id)}.jsonl"
            write_to_file(url_id_map, filename)
            n_id += 1
    
    return {"key": key, "n_id": n_id, "n_index": n_index}

In [5]:
def read_inverted_index(filename):
    inverted_index = dict({})
    
    with open(filename, "r") as file:
        for jsonObj in file:
            jsonObj = json.loads(jsonObj)
            token = list(jsonObj.keys())[0]
            inverted_index[token] = jsonObj[token]
    
    return inverted_index

In [19]:
def update_url_map(url_map: OrderedDict, filename):
    with open(filename, "r") as file:
        for jsonObj in file:
            jsonObj = json.loads(jsonObj)
            token = list(jsonObj.keys())[0]
            if len(url_map) > 100000:
                url_map.popitem()
            url_map[token] = jsonObj[token]
    return url_map

In [7]:
def merge_index(filename):
    inverted_index = dict({})
    
    with open(filename, "r") as file:
        for jsonObj in file:
            jsonObj = json.loads(jsonObj)
            token = list(jsonObj.keys())[0]
            if token not in inverted_index:
                inverted_index[token] = jsonObj[token]
            else:
                existed_postings = Counter(inverted_index[token])
                new_postings = Counter(jsonObj[token])
                updated_postings = dict(existed_postings + new_postings)
                inverted_index[token] = updated_postings
    
    return inverted_index

In [8]:
def group_index(filename):
    grouped_index = []
    alpha_list = [] # contain all the first letters of all current tokens
    temp_inverted_index = dict()
    last_token = ""

    # read inverted_index from file
    print("\n Read index from file", filename)
    inverted_index = read_inverted_index(filename)

    # given a sorted dict, group index by first letter
    for token in inverted_index:
        if last_token == "" or token[0] == last_token[0]:
            temp_inverted_index[token] = inverted_index[token]
        else:
            grouped_index.append(temp_inverted_index.copy())
            alpha_list.append(last_token[0])

            # reset
            temp_inverted_index = dict({})
            temp_inverted_index[token] = inverted_index[token]
        
        last_token = token

    # save the remaining to grouped_index
    if len(temp_inverted_index) > 0:
        grouped_index.append(temp_inverted_index.copy())
        alpha_list.append(last_token[0])
    
    # write index to new files
    index_dir = Path("inverted_index") # folder's name to save index files
    index_dir.mkdir(exist_ok=True)
    for i in range(len(grouped_index)):
        char = alpha_list[i]
        filename = index_dir / f"{char}.jsonl"
        write_to_file(grouped_index[i], filename)

    return alpha_list

In [20]:
def update_cache_index(cache_inverted_index: OrderedDict, token):
    filename = "final_inverted_index/" + token[0] + ".jsonl"
    with open(filename, "r") as file:
        for jsonObj in file:
            jsonObj = json.loads(jsonObj)
            term = list(jsonObj.keys())[0]

            if len(cache_inverted_index) > 100000:
                cache_inverted_index.popitem()
            cache_inverted_index[term] = jsonObj[term]
    return cache_inverted_index

In [None]:
# build a search engine
from nltk.stem import PorterStemmer
import math

def search_engine(queries, total_urls):

    stemmer = PorterStemmer()
    max_length = 100000
    cache_inverted_index = OrderedDict()    # recently searched items are moved to the back

    # perform the search
    # cristina lopes, machine learning, ACM, master of software engineering

    tokens = queries.split() # ex: "machine learning" => ['machine', 'learning']
    stemmed_tokens = [stemmer.stem(t) for t in tokens]
    posting_lists = []      # for saving all the url id 
    inverted_index = dict() # for calculating tf-idf score purpose

    # find the posting lists of each token
    for token in stemmed_tokens:
        if token not in cache_inverted_index:
            cache_inverted_index = update_cache_index(cache_inverted_index, token)

        if token in cache_inverted_index:
            posting_lists.append(cache_inverted_index[token])
            cache_inverted_index.move_to_end(token)
            inverted_index[token] = cache_inverted_index[token]

        # Boolean AND: intersect all posting lists
        url_id_set = set()
        for posting_list in posting_lists:
            doc_ids = set(posting_list.keys())
            if not url_id_set:
                url_id_set = doc_ids
            else:
                url_id_set &= doc_ids

        # compute tf-idf scores
        doc_scores = {}
        N = int(total_urls)

        for doc_id in url_id_set:
            score = 0.0
            for token in stemmed_tokens:
                posting = inverted_index.get(token, {})
                tf = posting.get(doc_id, 0)
                df = len(posting) if posting else 1
                idf = math.log(N / (1 + df))
                score += tf * idf
            doc_scores[doc_id] = score
            print(score)
            
        ranked_docs = sorted(doc_scores.items(), key=lambda x: x[1], reverse=True)

        return url_id_set, ranked_docs


In [40]:
# run collect_file_path.py to get the txt file (named "JSON_file_path.txt")
file_name = "JSON_file_path.txt"

# create inverted_index, return number of index files and url_id files
index_dir = Path("jsonl_files") # folder's name to save index files
index_dir.mkdir(exist_ok=True) 
return_dict = create_inverted_index(file_name, index_dir)
# return_dict = {"key": 55393, "n_id": 1, "n_index": 28}  # for testing mode

# save return values to a file
write_to_file(return_dict, "return_values.txt")

# read files in json_files, group index by first letters, return a list of letters
# new inverted_index files are under inverted_index folder
num_of_index_files = return_dict["n_index"]
chars = set([])
for i in range(num_of_index_files):
    filename = index_dir / f"inverted_index_{i}.jsonl"
    letters = group_index(filename)
    chars.update(letters)

# for each inverted_index/{char} file
# reorganzie, merge/update the token postings
index_dir = Path("final_inverted_index") # folder's name to save index files
index_dir.mkdir(exist_ok=True) 
for char in chars:
    filename = "inverted_index/" + char + ".jsonl"
    newFilename = "final_" + filename
    inverted_index = merge_index(filename)
    rewrite_to_file(inverted_index, newFilename)
    

Reading content from C:\Users\qnluo\Downloads\DEV\www_ics_uci_edu\5f157db1db4993d22495a5ce3a11b7409a2df1757bca84cc10edf0d6e0612472.json
Reading content from C:\Users\qnluo\Downloads\DEV\www_ics_uci_edu\c2043196b48f89fc440e64c598255282559fd15824e7861161a51c1375c1168b.json
Reading content from C:\Users\qnluo\Downloads\DEV\www_ics_uci_edu\8b6b9f7e100105222bf33942799cd340b7daadb3413e9f163c81b6d0eb0dfffa.json
Reading content from C:\Users\qnluo\Downloads\DEV\www_ics_uci_edu\bb176187e95d270d7428d4f07b5d42bd926d91ac8f0350bb9cbd77c232d01675.json
Reading content from C:\Users\qnluo\Downloads\DEV\grape_ics_uci_edu\88068279046899c25902ed6c00a356d7e8453d6c954ad2039ebaf40061f62f6b.json
Reading content from C:\Users\qnluo\Downloads\DEV\duttgroup_ics_uci_edu\dd2c0877f59a6aea363ba8adf664447256e2e9186428a7c32971eec17a735da7.json
Reading content from C:\Users\qnluo\Downloads\DEV\www_ics_uci_edu\7436c3ebe1631aad6cc55f97ec9584d111d5140be9fdd5e5cd5eab9a49a9cc8f.json
Reading content from C:\Users\qnluo\Down


Assuming this really is an XML document, what you're doing might work, but you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the Python package 'lxml' installed, and pass the keyword argument `features="xml"` into the BeautifulSoup constructor.




  soup = BeautifulSoup(content, 'html.parser')


Reading content from C:\Users\qnluo\Downloads\DEV\www_ics_uci_edu\2b0a532c41b72b9e7d02b472c2ba5c80c8756691b5564fa1cbe4923bf06f7b8b.json
Reading content from C:\Users\qnluo\Downloads\DEV\grape_ics_uci_edu\4702b8500d3b89c092362b2f462704fa8c30371095e74a877d8e100a5092780c.json
Reading content from C:\Users\qnluo\Downloads\DEV\cbcl_ics_uci_edu\9b3f5119a443626b10277d00244c2d290cd315c43a4a623251d191e50d6a4bae.json
Reading content from C:\Users\qnluo\Downloads\DEV\flamingo_ics_uci_edu\00445e86477c950178e06d38c57c3f85b07f75747a647c746c5cc6bb37fcf0ad.json
Reading content from C:\Users\qnluo\Downloads\DEV\www_ics_uci_edu\61fa71c4e7abf2fe079fadd34cbfef6dfb09d47996142304ab761d97416db214.json
Reading content from C:\Users\qnluo\Downloads\DEV\grape_ics_uci_edu\e6035a8b0a2c24cbf850e026a2521bc7bed407d3f821c1e41f8d37455460f242.json
Reading content from C:\Users\qnluo\Downloads\DEV\www_ics_uci_edu\b8a56ca0cfbd5a5c514e142b905c030dae7de476e91117ac11ac15b93850928d.json
Reading content from C:\Users\qnluo\Do


If you meant to use Beautiful Soup to parse the web page found at a certain URL, then something has gone wrong. You should use an Python package like 'requests' to fetch the content behind the URL. Once you have the content as a string, you can feed that string into Beautiful Soup.



    
  soup = BeautifulSoup(content, 'html.parser')


Reading content from C:\Users\qnluo\Downloads\DEV\www_ics_uci_edu\d90b9739da5300913a436a3a49c3287d84f566499c818bb071ee6dff300ccf67.json
Reading content from C:\Users\qnluo\Downloads\DEV\grape_ics_uci_edu\e7c818d8de38fabf8c924939f6f8c6de054de605001915fd160bede914679a70.json
Reading content from C:\Users\qnluo\Downloads\DEV\www_ics_uci_edu\e9ef007bc40a48b7bb683b1d4c6824d34a0b9df3702c8d97ced9815cbaa1da4c.json
Reading content from C:\Users\qnluo\Downloads\DEV\grape_ics_uci_edu\2d0653eb65d0cff4a9b7b53bb7634b4b1f5dd34d13ef19225811b17b850eab5e.json
Reading content from C:\Users\qnluo\Downloads\DEV\grape_ics_uci_edu\a5952269540dd1ef7d561b503ef90ffcacb8c0306de3797358974592ab3b37b6.json
Reading content from C:\Users\qnluo\Downloads\DEV\www_ics_uci_edu\c5f8eaf36d8fe6bc423eeb48d08b0799f172de9b34ab9ce17d2f4bd64ef9c1df.json
Reading content from C:\Users\qnluo\Downloads\DEV\www_ics_uci_edu\de937e15a9c0fab675d85100d0ec05ce4a0307c54dfbe8c98a5bf5b42a205a63.json
Reading content from C:\Users\qnluo\Downlo

In [None]:
filename = f"jsonl_files/url_id_0.jsonl"
url_map = OrderedDict()
update_url_map(url_map, filename)

OrderedDict([('0',
              'https://www.ics.uci.edu/~irus/wisen/wisen98/presentations/Aggarwal/tsld005.htm'),
             ('1', 'https://www.ics.uci.edu/~guix/'),
             ('2',
              'https://www.ics.uci.edu/~irus/wisen/wisen98/presentations/Mathon/sld001.htm'),
             ('3',
              'https://www.ics.uci.edu/faculty/profiles/view_faculty.php?ucinetid=bnan'),
             ('4',
              'https://grape.ics.uci.edu/wiki/asterix/wiki/cs122a-2018-spring?version=46'),
             ('5',
              'https://duttgroup.ics.uci.edu/group-members/rahmani2/#content'),
             ('6',
              'https://www.ics.uci.edu/~eppstein/pix/laguna/StaceyScott.html'),
             ('7',
              'https://www.ics.uci.edu/~eppstein/pix/j4p11/SuspenderedDrummer.html'),
             ('8',
              'https://www.ics.uci.edu/~dechter/courses/ics-275a/fall-99/slides/node279.html'),
             ('9',
              'https://grape.ics.uci.edu/wiki/public/wiki/cs

In [None]:
import time
# num_of_urls = return_dict["key"]
num_of_urls = 55393

# perform the search
while True:

    # ask for input/queries
    queries = input("Input queries (type exit to stop searching): ")
    if queries == "exit".lower():
        break
    else:
        startTime = time.process_time_ns()
        
        url_id_set, ranked_docs = search_engine(queries, num_of_urls)

        # get url_id map
        for id in url_id_set:
            if id not in url_map:
                filename = f"jsonl_files/url_id_{int(id) // 100000}.jsonl" 
                url_map = update_url_map(url_map, filename)

        endTime = time.process_time_ns()

        # Output up to 5 URLs (no ranking yet)
        print(f"\nURLs that contain: \"{queries}\"")
        for i, (doc_id, score) in enumerate(ranked_docs[:5]):
            url = url_map.get(doc_id, doc_id)
            print(f"{i+1}. {url} (tf-idf: {score:.4f})")
        print("Time Response:", (endTime-startTime) / 10**6, "ms")
        print()


URLs that contain: "professor cristina lopes"
1. https://www.ics.uci.edu/faculty/ (tf-idf: 272.3607)
2. https://www.ics.uci.edu/faculty/index.php (tf-idf: 272.3607)
3. https://www.ics.uci.edu/community/news/notes/notes_2007.php (tf-idf: 198.5433)
4. https://www.ics.uci.edu/community/news/notes/notes_2010.php (tf-idf: 152.7256)
5. https://www.cs.uci.edu/faculty/ (tf-idf: 147.6348)
Time Response: 187.5 ms


URLs that contain: "undergraduate informatics program requirements"
1. https://www.ics.uci.edu/~redmiles/ics125-FQ01/adp.html (tf-idf: 52.4341)
2. https://www.ics.uci.edu/~alspaugh/researchStudents.html (tf-idf: 43.6951)
3. https://www.informatics.uci.edu/professor-ruberg-honored-for-excellence-in-fostering-undergraduate-research/ (tf-idf: 39.3256)
4. https://www.informatics.uci.edu/professor-ruberg-honored-for-excellence-in-fostering-undergraduate-research/#content (tf-idf: 39.3256)
5. https://www.cs.uci.edu/undergraduate-programs/ (tf-idf: 37.1409)
Time Response: 62.5 ms


URLs tha