In [None]:
import os
import re
import shutil
import time
from collections import defaultdict
from urllib.parse import urldefrag, urljoin

import requests
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [None]:
# Web Crawler

# Starting point of the crawl; discovered links are only queued when they
# begin with this prefix.
seed_url = "https://www.expedia.com/"

# Directory that receives one numbered .txt file per saved page.
output_dir = "crawled_pages"
# exist_ok=True makes the cell safe to re-run and avoids the
# check-then-create race of testing os.path.exists() first.
os.makedirs(output_dir, exist_ok=True)

# URLs whose content has been saved to output_dir.
visited_urls = set()

# Pending URLs; the crawler takes the next URL from the front of this list.
url_queue = [seed_url]

# Delay (in seconds) between requests to avoid overloading the server
delay = 2

def save_page_content(url, content):
    """Write one page's content to a numbered text file in ``output_dir``.

    The file is named after the current size of ``visited_urls``
    (``0.txt``, ``1.txt``, ...), so this must be called before the page's URL
    is added to that set.

    Args:
        url: Address of the page; accepted but not used for the file name.
        content: Text to write, encoded as UTF-8.
    """
    page_number = len(visited_urls)
    target_path = os.path.join(output_dir, f"{page_number}.txt")
    with open(target_path, mode="w", encoding="utf-8") as page_file:
        page_file.write(content)

def crawl_web(max_pages=500, timeout=10):
    """Breadth-first crawl of pages under ``seed_url``, saving each one to disk.

    URLs are taken from the front of ``url_queue``. A page answered with
    HTTP 200 is written via ``save_page_content`` and recorded in
    ``visited_urls``, and its links that start with ``seed_url`` are queued.
    The crawl ends once ``max_pages`` pages are saved or the queue is empty.

    Each distinct URL is requested at most once per call: URLs that fail or
    return a non-200 status are remembered and not retried, and fragment-only
    variants ("page#section") are treated as the same document as "page".

    Args:
        max_pages: Maximum number of pages to save.
        timeout: Seconds to wait for an HTTP response before giving up, so a
            stalled server cannot hang the crawl.
    """
    # URLs already requested (successfully or not) -- never fetch them again.
    attempted = set(visited_urls)
    # URLs already requested or waiting in the queue -- never enqueue twice.
    known = attempted | {urldefrag(pending).url for pending in url_queue}

    while len(visited_urls) < max_pages and url_queue:
        # Drop the fragment: it only addresses a position inside the page.
        url = urldefrag(url_queue.pop(0)).url
        if url in attempted:
            continue
        attempted.add(url)

        try:
            print(f"Crawling: {url}")
            response = requests.get(url, timeout=timeout)
            if response.status_code == 200:
                content = response.text
                # Save first: the file name is derived from the size of
                # visited_urls *before* this URL is added.
                save_page_content(url, content)
                visited_urls.add(url)
                soup = BeautifulSoup(content, "html.parser")
                for link in soup.find_all("a", href=True):
                    full_url = urldefrag(urljoin(url, link["href"])).url
                    if full_url.startswith(seed_url) and full_url not in known:
                        known.add(full_url)
                        url_queue.append(full_url)
            else:
                print(f"Skipping {url}: HTTP {response.status_code}")
        except Exception as e:
            # Best-effort crawl: report the failure and continue with the
            # next queued URL.
            print(f"Error crawling {url}: {e}")

        # Pause after every request, successful or not, so failing URLs do
        # not turn into a burst of back-to-back requests.
        time.sleep(delay)

crawl_web()


Crawling: https://www.expedia.com/
Crawling: https://www.expedia.com/#main_content
Crawling: https://www.expedia.com/Hotels
Crawling: https://www.expedia.com/Flights
Crawling: https://www.expedia.com/Cars
Crawling: https://www.expedia.com/Vacation-Packages
Crawling: https://www.expedia.com/Activities
Crawling: https://www.expedia.com/Cruises
Crawling: https://www.expedia.com/deals
Crawling: https://www.expedia.com/magazine?BRANDCID=EXPEDIA-US.DTI-HOMEPAGE.EDITORIAL.MAGAZINE.GENERIC
Crawling: https://www.expedia.com/one-key-cards
Crawling: https://www.expedia.com/helpcenter/
Crawling: https://www.expedia.com/trips
Crawling: https://www.expedia.com/inbox/notifications
Crawling: https://www.expedia.com/login?&uurl=e3id%3Dredr%26rurl%3D%2F
Crawling: https://www.expedia.com/welcome-one-key
Crawling: https://www.expedia.com/p/info-other/feedback.htm
Crawling: https://www.expedia.com/Hotels
Crawling: https://www.expedia.com/Flights
Crawling: https://www.expedia.com/Cars
Crawling: https://www.

In [None]:
# Indexing
import nltk

nltk.download('stopwords')

import os
import time
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from collections import defaultdict
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer


def tokenize_and_normalize(text):
    """Turn raw text into a list of stemmed, stop-word-free tokens.

    Steps, in order: lowercase the text, delete every character that is
    neither a word character nor whitespace, tokenize with NLTK's
    ``word_tokenize``, drop English stop words, and Porter-stem what is left.

    Args:
        text: The string to process.

    Returns:
        List of stemmed tokens in their original order (duplicates kept).
    """
    # Lowercase, then strip punctuation (anything outside \w and \s).
    cleaned = re.sub(r"[^\w\s]", "", text.lower())

    words = word_tokenize(cleaned)

    english_stopwords = set(stopwords.words("english"))
    kept_words = [term for term in words if term not in english_stopwords]

    porter = PorterStemmer()
    return [porter.stem(term) for term in kept_words]

# Maps each normalized token to the ids of the documents it occurs in. A
# document id is appended once per occurrence of the token.
inverted_index = defaultdict(list)

def build_inverted_index():
    """Fill ``inverted_index`` from the pages saved in ``output_dir``.

    Reads ``0.txt`` .. ``N-1.txt`` where N is ``len(visited_urls)``. Each
    file holds the raw HTML of a crawled page, so the visible text is
    extracted first; indexing the markup itself would fill the index with
    tag names, attribute values and inline script/style code. The text is
    then passed through ``tokenize_and_normalize`` and the document id is
    appended to the posting list of every resulting token.
    """
    for doc_id in range(len(visited_urls)):
        filename = os.path.join(output_dir, f"{doc_id}.txt")
        with open(filename, "r", encoding="utf-8") as file:
            content = file.read()

        soup = BeautifulSoup(content, "html.parser")
        # Script and style bodies are code, not page text -- remove them.
        for tag in soup(["script", "style"]):
            tag.decompose()
        # A space separator keeps words from adjacent elements apart.
        text = soup.get_text(separator=" ")

        for token in tokenize_and_normalize(text):
            inverted_index[token].append(doc_id)

# Populate the in-memory index from the saved pages.
build_inverted_index()

# Persist the index as one "token: id, id, ..." line per token.
with open("inverted_index.txt", "w", encoding="utf-8") as file:
    file.writelines(
        f"{token}: {', '.join(map(str, doc_ids))}\n"
        for token, doc_ids in inverted_index.items()
    )

print("Crawling and indexing completed.")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Crawling and indexing completed.


In [None]:
from google.colab import files

# Offer the index file as a browser download when it has been produced.
if os.path.exists("inverted_index.txt"):
    files.download("inverted_index.txt")
else:
    print("inverted_index.txt does not exist.")

# Bundle the crawled pages into a single archive and offer it for download.
if os.path.exists("crawled_pages"):
    # Pure-Python replacement for the `!zip -r` shell escape: works without
    # an external zip binary and rewrites the archive on every run instead
    # of adding to an existing one, so entries from earlier runs do not
    # linger. Produces crawled_pages.zip with paths rooted at crawled_pages/.
    shutil.make_archive("crawled_pages", "zip", root_dir=".", base_dir="crawled_pages")
    files.download("crawled_pages.zip")
else:
    print("crawled_pages directory does not exist.")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

  adding: crawled_pages/ (stored 0%)
  adding: crawled_pages/44.txt (deflated 84%)
  adding: crawled_pages/9.txt (deflated 85%)
  adding: crawled_pages/34.txt (deflated 82%)
  adding: crawled_pages/86.txt (deflated 71%)
  adding: crawled_pages/54.txt (deflated 83%)
  adding: crawled_pages/66.txt (deflated 86%)
  adding: crawled_pages/87.txt (deflated 71%)
  adding: crawled_pages/23.txt (deflated 82%)
  adding: crawled_pages/36.txt (deflated 82%)
  adding: crawled_pages/49.txt (deflated 92%)
  adding: crawled_pages/64.txt (deflated 79%)
  adding: crawled_pages/21.txt (deflated 78%)
  adding: crawled_pages/55.txt (deflated 83%)
  adding: crawled_pages/67.txt (deflated 83%)
  adding: crawled_pages/65.txt (deflated 79%)
  adding: crawled_pages/76.txt (deflated 88%)
  adding: crawled_pages/11.txt (deflated 78%)
  adding: crawled_pages/84.txt (deflated 85%)
  adding: crawled_pages/70.txt (deflated 84%)
  adding: crawled_pages/25.txt (deflated 82%)
  adding: crawled_pages/32.txt (deflated 82%

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>