# **Backend Section - Crawler**

**Imports**

In [1]:
# installations and imports
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re
from collections import defaultdict
import nltk
import numpy as np
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from collections import Counter

!pip install firebase
from firebase import firebase

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Collecting firebase
  Downloading firebase-4.0.1-py3-none-any.whl.metadata (6.5 kB)
Downloading firebase-4.0.1-py3-none-any.whl (12 kB)
Installing collected packages: firebase
Successfully installed firebase-4.0.1


**Dictionary creating function**

In [2]:
# count words from page
def create_count_dictionary(words):
  dictionary = defaultdict(int)
  print("--> create_count_dictionary",words)
  for word in words:
    if word not in dictionary:
      dictionary[word] = 1
    else:
      dictionary[word] += 1

  #print(dictionary.keys())

  #for key in dictionary:
  #  print(key, ": ", dictionary[key])
  return dictionary

**Crawler data upload to database function**

In [3]:
def upload_data_to_DB(dictionary, url):
  print("--> upload_data_to_DB")
  data_dict = dict(dictionary)
  length = len(data_dict)
  #i = 0
  FBconn = firebase.FirebaseApplication('https://vercelcrawler-1c167-default-rtdb.firebaseio.com/',None)

  for key in data_dict:
      clean_key = re.sub(r'[^\w\s]', ' ', key) # TODO: is this still neccesery

      # First, get existing data if any
      existing_data = FBconn.get('/', clean_key) # get any existing datawith this term

      if existing_data is None: # If no existing data, create new structure
          upload_data = {
              "docIDs": {
                  "0": {
                      "link": url,
                      "count": data_dict[key]
                  }
              }
          }
      else: # If existing data found, find the next available docID number
          #print(existing_data) # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ TODO: maybe add check if page already is in db to replace it (?)
          existing_doc_ids = existing_data.get('docIDs', {})
          next_id = len(existing_doc_ids)

          # Add new data to existing structure
          existing_data['docIDs'].append({
              "link": url,
              "count": data_dict[key]
          })
          upload_data = existing_data

      # Upload the merged data
      FBconn.put('/', clean_key, upload_data)
      #i += 1
      #print(i,"/",length)

**Top 10 terms from the website function**

In [4]:
# 10 most repeated words in the site
def get_top_terms(term_dict, n=10):
    """
    Returns the n most frequent terms and their counts from a dictionary.

    Args:
        term_dict (dict): Dictionary with terms as keys and counts as values
        n (int): Number of top terms to return (default 10)

    Returns:
        list: List of tuples [(term, count), ...] sorted by count in descending order
    """
    # Sort dictionary items by value (count) in descending order
    sorted_terms = sorted(term_dict.items(), key=lambda x: x[1], reverse=True)

    # Return first n items (or all if dict is smaller than n)
    return sorted_terms[:n]

# avg repetitions per word

def calculate_avg(term_dict):
  sum = np.sum(list(term_dict.values()))
  cnt = len(term_dict)
  return round(sum/cnt,2)

# Action vs Restriction words

def calculate_action_restriction_percentages(term_dict):
    action_terms = [
        "build", "deploy", "create", "configure", "integrate", "manage", "scale",
        "test", "update", "connect", "provision", "launch", "clone", "allocate",
        "enable", "query", "retrieve", "replicate", "execute", "optimize", "monitor",
        "share", "collaborate", "contribute", "invite", "access"
    ]
    restrictive_terms = [
        "prohibit", "terminate", "restrict", "limit", "revoke", "suspend", "forbid",
        "deny", "disallow", "cease", "throttle", "cap", "exceed", "disable", "block",
        "prevent", "restrictapi", "limitbandwidth", "resourcecap", "liable",
        "nontransferable", "breach", "violation", "infringement", "indemnify"
    ]

    # Convert lists to sets for faster lookup
    action_set = set(action_terms)
    restrictive_set = set(restrictive_terms)

    sum_total = np.sum(list(term_dict.values()))

    # Handle edge case of empty dictionary or zero sum
    if sum_total == 0:
        return [0.0, 0.0]

    action_count = 0
    restriction_count = 0

    for term, count in term_dict.items():
        if term in action_set:
            action_count += count
        if term in restrictive_set:
            restriction_count += count

    action_per = round((action_count/sum_total) * 100, 2)
    restriction_per = round((restriction_count/sum_total) * 100, 2)

    return [action_per, restriction_per]


**Crawler main code: Crawl main page and upload to Firebase**

In [5]:
# main
visited_links = set()  # saving a set of already visited links
word_count = 0
base_url = "https://vercel.com/home"
def crawl(url):
    global word_count  # TODO: check why dis global

    try:
        FBconn = firebase.FirebaseApplication('https://vercelcrawler-1c167-default-rtdb.firebaseio.com/',None)
        response = requests.get(url)
        if response.status_code != 200:
            return
        soup = BeautifulSoup(response.text, "html.parser")
        entire_text = preprocess_text(soup.get_text())
        dictionary = create_count_dictionary(entire_text)
        upload_data_to_DB(dictionary,url)
        top_ten = get_top_terms(dictionary)
        avg = calculate_avg(dictionary)
        act_vs_rest = calculate_action_restriction_percentages(dictionary)
        upload_data = {
            "_greatest10": top_ten,
            "_avg_term_appearance": avg,
            "_action_vs_restriction": act_vs_rest
        }
        FBconn.put('/', '_statistics', upload_data)
        print("FINISHED!")

    except Exception as e:
        print(f"Failed to crawl {url}: {e}")
    return entire_text

def preprocess_text(text):
  """
  Args:
    text: string
  Returns:
    list of preprocessed words
  """
  print("--> preproccess_text")
  # List of english stop words
  STOP_WORDS = set(stopwords.words('english'))
                                            #print("---> Original text: ", text)
  # Remove punctuation
  text = re.sub(r'[^\w\s]', ' ', text)
                                            #print("---> Remove punctuation: ", text)
  # split camelcase and pascalcase
  # Add space before capital letters that aren't at the start
  text = re.sub(r'(?<!^)(?<![\W\d_])([A-Z])', r' \1', text)
                                            #print("---> Remove camelPascal: ", text)
  # Convert to lowercase
  text = text.lower()
  # Split between digits and letters (both directions)
  text = re.sub(r'(\d+)([a-z])', r'\1 \2', text)  # digits followed by letters
  text = re.sub(r'([a-z])(\d+)', r'\1 \2', text)  # letters followed by digits
                                            #print("---> Convert to lowercased: ", text)
  # Tokenize
  words = nltk.word_tokenize(text)
                                            #print("---> Tokenize: ", words)
  # Remove stop words
  words = [word for word in words if word not in STOP_WORDS]
                                            #print("---> Remove stop words: ", words)
  # Lemmatize
  lemmatizer = WordNetLemmatizer()
  lemmas = [lemmatizer.lemmatize(word) for word in words]
                                            #print("---> Lemmatize: ", lemmas)
  return lemmas

# start crawler in first page
words = crawl(base_url)

--> preproccess_text
--> create_count_dictionary ['vercel', 'build', 'deploy', 'best', 'web', 'experience', 'frontend', 'cloud', 'product', 'x', 'platform', 'preview', 'helping', 'team', 'ship', '6', 'faster', 'powering', 'breakthrough', 'managed', 'infrastructure', 'rendering', 'fast', 'scalable', 'reliable', 'observability', 'trace', 'every', 'step', 'security', 'scale', 'without', 'compromising', 'open', 'source', 'next', 'j', 'native', 'next', 'j', 'platform', 'turborepo', 'speed', 'enterprise', 'scale', 'k', 'toolkit', 'type', 'script', 'solution', 'use', 'case', 'apps', 'deploy', 'speed', 'composable', 'commerce', 'power', 'storefront', 'convert', 'marketing', 'site', 'launch', 'campaign', 'fast', 'multi', 'tenant', 'platform', 'scale', 'apps', 'one', 'codebase', 'web', 'apps', 'ship', 'feature', 'infrastructure', 'user', 'platform', 'engineer', 'automate', 'away', 'repetition', 'design', 'engineer', 'deploy', 'every', 'idea', 'resource', 'tool', 'resource', 'center', 'today', 'b