<a href="https://colab.research.google.com/github/ndvp39/CloudComputing-tirgul/blob/main/Project/Index/index.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:

!pip install requests beautifulsoup4
!pip install firebase
!pip install firebase-admin




In [20]:

import json
import os
import re
from collections import defaultdict
from collections import deque
from urllib.parse import urldefrag, urljoin

import firebase_admin
import requests
from bs4 import BeautifulSoup
from firebase_admin import credentials
from firebase_admin import firestore
from nltk.stem import PorterStemmer


# Run the clone below only when this notebook is executed directly, not when it is run from searchEngine.
!git clone "https://github.com/ndvp39/CloudComputing-tirgul.git"


fatal: destination path 'CloudComputing-tirgul' already exists and is not an empty directory.


In [21]:
# Function to extract text from a webpage
def get_page_text(url, timeout=10):
    """Download a web page and return its text content.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled host.

    Returns:
        The text of the page, with the text of neighbouring HTML elements
        separated by a single space, or an empty string if the page could
        not be fetched or parsed.
    """
    try:
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.text, 'html.parser')
        # Insert a separator between elements; without one, words that sit in
        # adjacent tags are glued together into a single bogus token.
        return soup.get_text(separator=' ')
    except Exception as e:
        # Best effort: report the failure (same format get_links uses) and
        # let the caller carry on with an empty page.
        print("Error:", e)
        return ""

In [22]:
# Function to extract links from a webpage
def get_links(url, timeout=10):
    """Download a web page and return the targets of its anchor tags.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled host.

    Returns:
        A list with the raw ``href`` value of every ``<a>`` tag that has one,
        in document order. The values are returned exactly as written in the
        page (they may be relative). An empty list is returned if the page
        could not be fetched or parsed.
    """
    try:
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.text, 'html.parser')
        # href=True skips anchors that have no href attribute at all.
        return [anchor.get('href') for anchor in soup.find_all('a', href=True)]
    except Exception as e:
        # Best effort: report the failure and treat the page as link-less.
        print("Error:", e)
        return []

In [23]:
# Removes stop words from the given list of words so that unimportant words are skipped.
def remove_stop_words(text):
    """Return the items of ``text`` that are not stop words.

    Args:
        text: Iterable of word strings (create_index passes ``text.split()``).

    Returns:
        A list with the surviving words in their original order and original
        casing. The comparison against the stop-word set is done on the
        lower-cased word.
    """
    ignored = {'ha','thi','skip','-','&', 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"}

    kept = []
    for token in text:
        if token.lower() in ignored:
            continue
        kept.append(token)
    return kept

In [24]:
# Trimming unnecessary chars.
def trim_words(words):
    """Strip trailing punctuation from every word.

    Only the end of each word is cleaned; leading characters are untouched.
    The characters removed are comma, period, backslash, slash, question
    mark and exclamation mark.

    Args:
        words: Iterable of word strings.

    Returns:
        A new list with the trimmed words, in the same order.
    """
    trailing_chars = ",.\\/?!"
    cleaned_words = []
    for token in words:
        cleaned_words.append(token.rstrip(trailing_chars))
    return cleaned_words

In [25]:
# Applies stemming to a list of words.
def apply_stemming(words):
    """Stem every word with a freshly created ``PorterStemmer``.

    Args:
        words: Iterable of word strings.

    Returns:
        A list holding the result of ``PorterStemmer.stem`` for each word,
        in the same order.
    """
    porter = PorterStemmer()
    return list(map(porter.stem, words))

In [26]:
# Function to create index database.
def create_index(url, max_pages=80):
    """Crawl a site breadth-first and build a word -> URLs index.

    Starting at ``url``, pages are fetched in FIFO order. The text of each
    page is split on whitespace, trailing punctuation is trimmed, stop words
    and empty tokens are dropped, and the remaining words are stemmed. Every
    occurrence of a stemmed word appends the page URL to that word's list, so
    a URL appears once per occurrence of the word on that page.

    Links are resolved against the page they were found on (so relative
    hrefs are followed), their ``#fragment`` is removed (so one page is not
    crawled again under several anchors), and only links that start with
    ``url`` are followed.

    Args:
        url: Root URL of the crawl; also the prefix a link must have to be
            followed.
        max_pages: Maximum number of pages to fetch. This bounds the number
            of pages crawled, not the link depth.

    Returns:
        A ``defaultdict(list)`` mapping each stemmed word to the list of page
        URLs it occurred on (with repetitions).
    """
    index = defaultdict(list)
    # Every URL that has ever been queued; prevents queuing a page twice so
    # duplicates can neither bloat the queue nor use up the page budget.
    seen = {url}
    queue = deque([url])
    pages_crawled = 0
    while queue and pages_crawled < max_pages:
        current_url = queue.popleft()
        pages_crawled += 1
        text = get_page_text(current_url)
        # Trim punctuation first so tokens such as "the," are recognised by
        # the stop-word filter.
        words = trim_words(text.split())
        words = remove_stop_words(words)
        # Tokens made only of punctuation become empty after trimming.
        words = [word for word in words if word]
        words = apply_stemming(words)
        for word in words:
            index[word].append(current_url)
        for link in get_links(current_url):
            # Turn relative hrefs into absolute URLs and drop the fragment.
            absolute_link = urldefrag(urljoin(current_url, link))[0]
            if absolute_link.startswith(url) and absolute_link not in seen:
                seen.add(absolute_link)
                queue.append(absolute_link)
    return index

In [27]:
# This creates a dict with an id for each link and word counter based on the index that was returned in create_index.
def update_dict(word, link):
    """Record one occurrence of ``word`` on ``link`` in the global ``index``.

    Reads and mutates two module-level objects:
      * ``link_ids`` - maps each link to a sequential id; a link seen for the
        first time gets ``len(link_ids) + 1``.
      * ``index`` - maps word -> link -> ``{'id': ..., 'counter': ...}``.

    Args:
        word: The indexed word.
        link: The URL on which the word occurred.
    """
    # setdefault stores the next id only when the link is not registered yet.
    link_id = link_ids.setdefault(link, len(link_ids) + 1)
    postings = index[word]
    entry = postings.setdefault(link, {'id': link_id, 'counter': 0})
    entry['counter'] += 1

In [28]:
# Ranking the words based on how many times they appeared.
def create_ranked_words(index):
    """Rank words by their total number of occurrences across all links.

    Args:
        index: Mapping word -> link -> dict containing a ``'counter'`` entry.

    Returns:
        A dict mapping word -> ``{'rank': r, 'counter': total}``, inserted in
        rank order. Rank 1 is the word with the largest total; words with
        equal totals keep the relative order they had in ``index``.
    """
    totals = {
        word: sum(entry['counter'] for entry in links.values())
        for word, links in index.items()
    }
    # sorted() is stable, so ties keep their original relative order.
    by_total = sorted(totals.items(), key=lambda pair: pair[1], reverse=True)
    return {
        word: {'rank': position, 'counter': total}
        for position, (word, total) in enumerate(by_total, start=1)
    }

In [29]:
# Creates a dict that can fit into the firestore format.
def create_data_for_db(final_index):
    """Convert the index mapping into a list of per-term records.

    Args:
        final_index: Mapping word -> value stored for that word.

    Returns:
        A list with one dict per word, shaped as
        ``{'term': word, 'DocId': value}``, in the iteration order of
        ``final_index``.
    """
    return [
        {'term': term, 'DocId': postings}
        for term, postings in final_index.items()
    ]

In [30]:
# In case we want to generate a json file and upload it to db and not directly from code. ## NOT IN USE ##
def generate_json_file(data_list, is_desktop):
    """Write ``data_list`` to ``indexDb.json`` as indented JSON.

    Args:
        data_list: JSON-serialisable data to dump.
        is_desktop: When truthy the file is written next to this source file
            (located through ``__file__``); otherwise it is written to a
            fixed folder under ``/content/drive/My Drive``.
    """
    if not is_desktop:
        filepath = "/content/drive/My Drive/CloudComputing/project/indexDb.json"
    else:
        script_dir = os.path.dirname(os.path.abspath(__file__))
        filepath = os.path.join(script_dir, "indexDb.json")
    with open(filepath, "w") as json_file:
        json.dump(data_list, json_file, indent=2)

In [31]:
# Prints the content from data base.
def print_collection_from_db(collection_name):
    """Print a numbered line for every document of a collection.

    Uses the module-level ``db`` client. Each line shows a running number
    starting at 0, the document id and the document's ``'term'`` field.

    Args:
        collection_name: Name of the collection to read.
    """
    snapshots = db.collection(collection_name).get()
    for position, snapshot in enumerate(snapshots):
        term = snapshot.to_dict()['term']
        print(f"{position} -- Document ID: {snapshot.id}, Data: {term}")

In [32]:
# Uploads the index to the db.
def upload_to_db(data_list, collection_name):
    """Add every record of ``data_list`` to a collection of the global ``db``.

    A success message is printed after the last record was added. Any
    exception stops the upload and is printed instead of being raised.

    Args:
        data_list: Iterable of records to add, one document per record.
        collection_name: Name of the target collection.
    """
    try:
        for record in data_list:
            db.collection(collection_name).add(record)
    except Exception as e:
        print("Error:", e)
    else:
        print("All documents uploaded successfully.")

In [33]:
# Gets the index from db, used in searchEngine and in chatbot files.
def get_index_from_db(db_con = None):
    """Load the stored index from the database into a dict.

    Reads the collection named by the module-level ``db_collection_name``.

    Args:
        db_con: Database client to read from. When ``None`` the module-level
            ``db`` client is used.

    Returns:
        A dict mapping each document's ``'term'`` field to its ``'DocId'``
        field. Documents whose ``exists`` attribute is falsy are skipped.
    """
    connection = db if db_con is None else db_con
    snapshots = connection.collection(db_collection_name).get()
    imported_dict = {}
    for snapshot in snapshots:
        # Ignore snapshots that do not correspond to a stored document.
        if not snapshot.exists:
            continue
        term = snapshot.to_dict()['term']
        postings = snapshot.to_dict()['DocId']
        imported_dict[term] = postings
    return imported_dict

Main section for index.ipynb

In [36]:
# Root URL handed to create_index in the disabled block below.
website_url = 'https://www.redhat.com/en'
# Name of the collection that get_index_from_db reads from.
db_collection_name = "PantherIndex"

# Creating the index, only once.
# The pipeline below is wrapped in a triple-quoted string so that it does NOT
# execute; remove the quotes to rebuild the index. Steps inside the string:
#   1. create_index crawls website_url and returns word -> list of URLs.
#   2. update_dict folds that into the global `index` (word -> link ->
#      {'id', 'counter'}) and assigns link ids through the global `link_ids`.
#   3. create_ranked_words orders the words by total count; only the first
#      110 are kept.
#   4. create_data_for_db turns the result into the list passed to upload_to_db.
'''
link_ids = {}
index_db = create_index(website_url)
index = defaultdict(dict)

for word, links in index_db.items():
    for link in links:
        update_dict(word, link)

ranked_words = create_ranked_words(index)
chosen_words = list(ranked_words.keys())[:110]
final_index = {word:index[word] for word in chosen_words}
data_list_for_db = create_data_for_db(final_index)
'''

# db connection.
# NOTE(review): the credentials file is read from inside the directory created
# by the `git clone` earlier in this notebook, i.e. the key appears to live in
# the repository itself - confirm it is not exposed publicly.
cred = credentials.Certificate("/content/CloudComputing-tirgul/Project/Json/sak.json")
# Initialise the Firebase app only when none is registered yet - presumably so
# that re-running this cell does not initialise it a second time.
# NOTE(review): `_apps` is a private attribute of firebase_admin.
if not firebase_admin._apps:
  firebase_admin.initialize_app(cred)
# Module-level client used by the database helper functions defined above.
db = firestore.client()

# One-off maintenance calls, kept disabled. upload_to_db needs data_list_for_db,
# which only exists when the index-building block above has been enabled.
#upload_to_db(data_list_for_db, db_collection_name)
#print_collection_from_db(db_collection_name)

0 -- Document ID: 0EFrD1jE9UHO6kYHOxxE, Data: across
1 -- Document ID: 0Qm3wbBUJ4aqALTibZg8, Data: us
2 -- Document ID: 0nOUNHPATVbLDxYiEs36, Data: resourc
3 -- Document ID: 1a7JnrApFoau0r5iZ5Gp, Data: stori
4 -- Document ID: 1rztxcrAHRq9wQ24GQFU, Data: commit
5 -- Document ID: 210VMFXH30D1dLUNiWqG, Data: customerscustom
6 -- Document ID: 2hjEe0Jue7yNMUyPU2QA, Data: manag
7 -- Document ID: 2pvsQ30iIVddZypLe1sS, Data: trial
8 -- Document ID: 3WMrUOYhKn919yHHk9vM, Data: make
9 -- Document ID: 475z4lczulz5mw69EZu3, Data: model
10 -- Document ID: 4r5PNs5P2N92T6nA1F8W, Data: search
11 -- Document ID: 53VPaz3ggyy71h1mtPEM, Data: featur
12 -- Document ID: 5HBJAb2PiIVIjPOGXZKf, Data: open
13 -- Document ID: 5kZxo2XwEazJA7GgDfaQ, Data: administr
14 -- Document ID: 6MkkzWroWukuMJiaZEou, Data: deliv
15 -- Document ID: 6SbAeEtSitWreIq5gRNG, Data: commun
16 -- Document ID: 6mu9EnbngwF3mUpwFp5b, Data: support
17 -- Document ID: 8ZzuZH7w0CEOfcU20FWy, Data: process
18 -- Document ID: 8e6MAnv3kkxki8XU1