<a href="https://colab.research.google.com/github/daniarmag/AdvancedShell/blob/main/Project/index_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install requests beautifulsoup4
!pip install firebase
!pip install firebase-admin

Collecting firebase
  Downloading firebase-4.0.1-py3-none-any.whl (12 kB)
Installing collected packages: firebase
Successfully installed firebase-4.0.1


In [2]:
import json
import os
import re
from collections import defaultdict, deque

import firebase_admin
import requests
from bs4 import BeautifulSoup
from firebase_admin import credentials
from firebase_admin import firestore
from nltk.stem import PorterStemmer

In [3]:
def get_page_text(url, timeout=10):
    """Download a web page and return the text of its parsed HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            single unresponsive host cannot stall the caller forever.

    Returns:
        The text extracted from the page, or an empty string when the
        request fails, the server answers with an HTTP error status, or
        the page cannot be parsed.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx answers as failures so error pages are not
        # returned as if they were real content.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup.get_text()
    except Exception as e:
        # Best-effort fetch: report the problem instead of hiding it,
        # then fall back to "no text".
        print("Error:", e)
        return ""

In [4]:
def get_links(url, timeout=10):
    """Download a web page and return the targets of its anchor tags.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            single unresponsive host cannot stall the caller forever.

    Returns:
        A list with the raw ``href`` value of every ``<a>`` element that
        has one, in document order. An empty list is returned when the
        request fails or the server answers with an HTTP error status.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Do not harvest links from 4xx/5xx error pages.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        # href=True restricts the search to anchors that carry an href.
        return [anchor.get('href') for anchor in soup.find_all('a', href=True)]
    except Exception as e:
        print("Error:", e)
        return []

In [5]:
def remove_stop_words(text):
    """Return the items of ``text`` that are not stop words.

    Args:
        text: Iterable of word tokens.

    Returns:
        A new list with the tokens whose lowercase form is not in the
        stop list, in their original order and original casing.
    """
    excluded = frozenset((
        'a', 'as', 'ha', 'thi', 'an', 'the', 'and', 'is', 'are', 'or',
        'in', 'on', 'at', 'skip', '-', 'them', 'they', 'than', 'she',
        ',', '.', '&', 'for', 'what',
    ))
    kept = []
    for token in text:
        # Comparison is case-insensitive; the token itself is kept as-is.
        if token.lower() in excluded:
            continue
        kept.append(token)
    return kept

In [6]:
def trim_words(words):
    """Strip trailing punctuation from every word.

    Args:
        words: Iterable of strings.

    Returns:
        A new list where each string has any run of trailing commas,
        periods, backslashes and forward slashes removed. Leading and
        inner characters are left untouched.
    """
    trailing_chars = ",.\\/"
    trimmed = []
    for token in words:
        trimmed.append(token.rstrip(trailing_chars))
    return trimmed

In [7]:
def apply_stemming(words):
    """Stem every word with NLTK's ``PorterStemmer``.

    Args:
        words: Iterable of strings.

    Returns:
        A new list holding the stem of each word, in the same order.
    """
    # One stemmer instance is shared by all words of this call.
    stem = PorterStemmer().stem
    return list(map(stem, words))

In [8]:
def create_index(url, max_pages=40):
    """Crawl the site rooted at ``url`` and build a word -> URLs index.

    Pages are visited breadth-first starting from ``url``. The text of
    each page is split on whitespace, trimmed, stemmed and filtered for
    stop words before being added to the index.

    Args:
        url: Start page. Only links that begin with this exact string
            are followed.
        max_pages: Maximum number of distinct pages to fetch and index.

    Returns:
        A ``defaultdict(list)`` mapping each processed word to the URLs
        it was found on, with one list entry per occurrence.
    """
    index = defaultdict(list)
    visited = set()
    queue = deque([url])
    # Every URL that was ever enqueued: keeps duplicates out of the queue
    # so the page budget is spent only on distinct pages.
    scheduled = {url}

    while queue and len(visited) < max_pages:
        current_url = queue.popleft()
        visited.add(current_url)

        text = get_page_text(current_url)
        # Trim and stem BEFORE filtering stop words: tokens such as "the,"
        # only match the stop list once trimmed, and the stop list holds
        # stemmed forms ('ha', 'thi') that only appear after stemming.
        words = trim_words(text.split())
        words = apply_stemming(words)
        words = remove_stop_words(words)

        for word in words:
            # Trimming turns pure-punctuation tokens into '' - skip those.
            if word:
                index[word].append(current_url)

        for link in get_links(current_url):
            if link.startswith(url) and link not in scheduled:
                scheduled.add(link)
                queue.append(link)
    return index

In [9]:
def update_dict(word, link):
    """Record one occurrence of ``word`` on ``link`` in the global index.

    Relies on two module-level objects defined outside this function:
    ``link_ids`` (link -> numeric id) and ``index`` (word -> per-link
    entries). NOTE(review): ``index[word]`` must already yield a dict
    for unseen words (e.g. a ``defaultdict(dict)``) - confirm where
    ``index`` is created.

    Args:
        word: Term whose counter is incremented.
        link: URL on which the term was seen.
    """
    # A link seen for the first time gets the next sequential id (1-based).
    link_id = link_ids.setdefault(link, len(link_ids) + 1)

    postings = index[word]
    entry = postings.setdefault(link, {'id': link_id, 'counter': 0})
    entry['counter'] += 1

In [10]:
def create_ranked_words(index):
    """Rank words by their total number of occurrences.

    Args:
        index: Mapping of word -> {link: {'counter': int, ...}}.

    Returns:
        A dict ordered from most to least frequent word. Each value is
        ``{'rank': position, 'counter': total}``, where ``rank`` starts
        at 1 and ``total`` is the sum of the word's per-link counters.
        Words with equal totals keep their order from ``index``.
    """
    totals = {
        term: sum(entry['counter'] for entry in postings.values())
        for term, postings in index.items()
    }
    # sorted() is stable, so ties stay in insertion order even with reverse.
    by_frequency = sorted(totals.items(), key=lambda pair: pair[1], reverse=True)
    return {
        term: {'rank': position, 'counter': total}
        for position, (term, total) in enumerate(by_frequency, start=1)
    }

In [11]:
def create_data_for_db(final_index):
    """Convert the index mapping into a list of database records.

    Args:
        final_index: Mapping of word -> document-id data for that word.

    Returns:
        A list with one ``{'term': word, 'DocId': doc_ids}`` dict per
        entry of ``final_index``, in the mapping's iteration order.
    """
    return [
        {'term': term, 'DocId': postings}
        for term, postings in final_index.items()
    ]

In [12]:
def generate_json_file(data_list, is_desktop):
    """Write ``data_list`` to ``indexDb.json`` as indented JSON.

    Args:
        data_list: JSON-serialisable data to store.
        is_desktop: When truthy, the file is written next to this script
            (resolved through ``__file__``); otherwise it is written to
            the fixed Google Drive path used below.
    """
    if not is_desktop:
        filepath = "/content/drive/My Drive/CloudComputing/project/indexDb.json"
    else:
        script_dir = os.path.dirname(os.path.abspath(__file__))
        filepath = os.path.join(script_dir, "indexDb.json")

    with open(filepath, "w") as out_file:
        json.dump(data_list, out_file, indent=2)

In [13]:
def print_collection_from_db(db, collection_name):
    """Print the id and ``term`` field of every document in a collection.

    Args:
        db: Client object exposing ``collection(name).get()``.
        collection_name: Name of the collection to list.

    Each document is printed on its own line, prefixed with a running
    number that starts at 0.
    """
    snapshots = db.collection(collection_name).get()
    for position, snapshot in enumerate(snapshots):
        term = snapshot.to_dict()['term']
        print(f"{position} -- Document ID: {snapshot.id}, Data: {term}")

In [14]:
def upload_to_db(db, collection_name, documents=None):
    """Add every document to a collection of the given database client.

    Args:
        db: Client object exposing ``collection(name).add(data)``.
        collection_name: Name of the target collection.
        documents: Iterable of dicts to upload. When omitted, the
            module-level ``data_list`` is used, which keeps existing
            two-argument calls working.

    Any exception raised while uploading is caught and printed; the
    remaining documents are then not uploaded.
    """
    try:
        if documents is None:
            # Backward-compatible fallback to the notebook-level variable.
            # Kept inside the try so a missing global is reported, not raised.
            documents = data_list
        collection_ref = db.collection(collection_name)
        for data in documents:
            collection_ref.add(data)
        print("All documents uploaded successfully.")
    except Exception as e:
        print("Error:", e)