In [None]:
import requests
from bs4 import BeautifulSoup
from multiprocessing import Pool, cpu_count
import time
from typing import List, Dict, Optional
import logging
from urllib.parse import urlparse
from geocode.geocode import Geocode

# Configure logging
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)


def scrape_about_page(url: str) -> Optional[Dict]:
    """
    Scrapes an about page and extracts its main content.

    Boilerplate elements (scripts, styles, nav, header, footer) are removed
    first. The text is then taken from the first matching content container
    (``main``, ``article``, ``.content``, ``#content``, ``.about``,
    ``#about``), falling back to ``<body>``.

    Args:
        url: The URL of the about page to scrape

    Returns:
        Dictionary with the keys ``url``, ``text`` and ``domain``, or None if
        the request or parsing failed or the document has no usable container
    """
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }

        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        # Parse the raw bytes instead of ``response.text``.  When a text/*
        # response does not declare a charset, requests decodes ``.text`` as
        # ISO-8859-1, which garbles UTF-8 pages.  Handing bytes to
        # BeautifulSoup lets it detect the encoding from the document itself;
        # a charset that the server did declare is still honoured.
        content_type = response.headers.get("Content-Type") or ""
        declared_encoding = (
            response.encoding if "charset" in content_type.lower() else None
        )
        soup = BeautifulSoup(
            response.content, "html.parser", from_encoding=declared_encoding
        )

        # Remove script/style elements and page chrome
        for element in soup(["script", "style", "nav", "header", "footer"]):
            element.decompose()

        # Extract main content - looking for common content containers
        content = None
        for selector in ["main", "article", ".content", "#content", ".about", "#about"]:
            content = soup.select_one(selector)
            if content is not None:
                break

        # If no main content found, use body
        if content is None:
            content = soup.body

        if content is None:
            return None

        text = " ".join(content.stripped_strings)
        return {"url": url, "text": text, "domain": urlparse(url).netloc}

    except Exception as e:
        # Deliberate best-effort: one bad page is logged and reported as None
        # so that a batch scrape can continue.
        logging.error(f"Error scraping {url}: {str(e)}")
        return None


def parallel_scrape(urls: List[str], num_processes: Optional[int] = None) -> List[Dict]:
    """
    Scrapes multiple URLs concurrently and returns the successful results.

    A thread pool is used instead of a process pool.  Process pools started
    with the "spawn"/"forkserver" methods need the worker function to be
    importable by the child, which a function defined in a notebook cell is
    not.  The work is dominated by waiting on the network, so threads suit it.

    Args:
        urls: List of URLs to scrape
        num_processes: Number of concurrent workers (defaults to CPU count);
            never more workers than URLs are started

    Returns:
        List of dictionaries containing scraped data, in the order of ``urls``,
        with failed scrapes (None results) left out

    Raises:
        ValueError: If ``num_processes`` is less than 1 (raised by the pool)
    """
    # Imported locally so this function does not depend on edits to the
    # notebook's import cell.
    from multiprocessing.pool import ThreadPool

    urls = list(urls)
    if not urls:
        # Nothing to do: avoid starting a pool at all.
        return []

    if num_processes is None:
        num_processes = cpu_count()
    num_workers = min(num_processes, len(urls))

    logging.info(f"Starting parallel scraping with {num_workers} worker threads")
    start_time = time.time()

    with ThreadPool(num_workers) as pool:
        results = pool.map(scrape_about_page, urls)

    # Filter out None results from failed scrapes
    valid_results = [r for r in results if r is not None]

    elapsed_time = time.time() - start_time
    logging.info(f"Scraped {len(valid_results)} pages in {elapsed_time:.2f} seconds")

    return valid_results


def process_locations(
    scraped_data: List[Dict], num_cpus: Optional[int] = None
) -> List[Dict]:
    """
    Processes scraped text to extract location information using parallel processing.
    This should be run after scraping due to memory requirements.

    The input dictionaries are not modified; each result is a shallow copy of
    the corresponding input with a ``"locations"`` key added.

    Args:
        scraped_data: List of dictionaries containing scraped text under the
            ``"text"`` key
        num_cpus: Number of CPU cores to use for parallel processing (defaults to all available)

    Returns:
        List of dictionaries with location information added, in input order

    Raises:
        ValueError: If the geocoder returns a different number of results than
            there are inputs
        Exception: Any error raised by the geocoder is logged and re-raised
    """
    # Nothing to geocode: skip loading the geonames data altogether.
    if not scraped_data:
        return []

    gc = Geocode()
    gc.load()  # load geonames data

    # Extract text content for parallel processing
    texts = [item["text"] for item in scraped_data]

    try:
        # Process locations in parallel
        all_locations = list(gc.decode_parallel(texts, num_cpus=num_cpus))

        # zip() would silently drop items on a length mismatch and pair the
        # remaining pages with the wrong locations; fail loudly instead.
        if len(all_locations) != len(scraped_data):
            raise ValueError(
                f"Expected {len(scraped_data)} location results, "
                f"got {len(all_locations)}"
            )

        # Merge results back with (copies of) the original data
        return [
            {**item, "locations": locations}
            for item, locations in zip(scraped_data, all_locations)
        ]

    except Exception as e:
        logging.error(f"Error in parallel location processing: {str(e)}")
        raise

In [None]:
import polars as pl

# Load the about-pages parquet file; the next cell reads its "url" column.
urls = pl.read_parquet("../data/output/about_pages.parquet")

In [None]:
# Scrape the page at the first value of the "url" column as a single working
# example.  scrape_about_page returns None on failure, so `about_page` may be
# None here and later cells that index it would then fail.
about_page = scrape_about_page(urls.select(pl.col("url").first()).item())

In [None]:
from flair.data import Sentence
from flair.models import SequenceTagger

# load tagger
tagger = SequenceTagger.load("flair/ner-english-fast")

# Split the page text into rough sentences on "." and skip blank fragments
# (e.g. the empty string after a trailing period), which carry no entities.
sentences = [
    Sentence(fragment)
    for fragment in about_page["text"].split(".")
    if fragment.strip()
]

# predict NER tags
tagger.predict(sentences)

# print predicted NER spans
print("The following NER tags are found:")
# iterate over entities and print
for sentence in sentences:
    for entity in sentence.get_spans("ner"):
        print(entity)


In [None]:
# Size of `urls` (the frame read from about_pages.parquet).
len(urls)

In [None]:
# Show the first NER span found across the tagged `sentences`.  This does not
# rely on a leftover `sentence` variable, and it evaluates to None instead of
# raising when nothing was tagged.
next(
    (span for tagged in sentences for span in tagged.get_spans("ner")),
    None,
)

In [None]:
import spacy 
# Load the small English pipeline and run it over the scraped page text, then
# try to print each predicted entity together with a score.
nlp = spacy.load("en_core_web_sm")
# Access the entity recognizer (NER) pipe
ner = nlp.get_pipe("ner")

# Get the raw entity predictions for the document
doc = nlp(about_page["text"])
# NOTE(review): the loop below assumes `ner.predict` returns, for each document,
# an iterable of (score, ents) pairs with ents as (start, end, label_id)
# triples.  Whether the installed spaCy version returns that shape is not
# established in this notebook — confirm before relying on this cell.
beams = ner.predict([doc])

# Get entity labels
entity_labels = ner.labels

# Process the beam outputs
for doc_idx, beam in enumerate(beams):
    for i, (score, ents) in enumerate(beam):
        if i == 0:  # Only look at the top-scoring analysis
            print(f"Document score: {score:.4f}")

            # Group spans by tokens
            # NOTE(review): `token_to_ents` is never read or filled in this cell.
            token_to_ents = {}
            for start, end, label_id in ents:
                # label_id is used as an index into `ner.labels`.
                entity_label = entity_labels[label_id]
                entity_text = doc[start:end].text
                # Every entity is printed with the same score: it is the score
                # of the whole analysis, not a per-entity confidence.
                entity_score = score  # This is the overall beam score

                print(
                    f"Entity: {entity_text}, Label: {entity_label}, Score: {entity_score:.4f}"
                )
                print(f"  Position: {start} to {end}")

    # Blank line between documents
    print()

In [52]:
# Exploratory cell: list the recognized entities, dump the public attributes of
# the NER pipe, try to obtain raw prediction output, and finally print
# token-level IOB tags.  It reuses `nlp` and `about_page` from other cells.

# Process the document
doc = nlp(about_page["text"])

# First, let's see what entities were recognized
print("Recognized entities:")
for ent in doc.ents:
    print(f"Entity: {ent.text}, Label: {ent.label_}, Span: ({ent.start}, {ent.end})")

# To get probability scores for entities, we need to access the NER pipe differently
ner = nlp.get_pipe("ner")

# Let's inspect what methods and attributes are available
print("\nAvailable NER methods:")
for method_name in dir(ner):
    # Skip private/dunder names
    if not method_name.startswith("_"):
        print(method_name)

# Try to access scores through the model
# This might need to be adjusted based on your spaCy version
docs = [doc]  # Wrap in list as required by predict
ner_scores = None
try:
    # Try different approaches based on spaCy version
    # The second branch is only tried when the pipe itself has no `predict`.
    if hasattr(ner, "predict"):
        ner_scores = ner.predict(docs)
    elif hasattr(ner.model, "predict"):
        ner_scores = ner.model.predict(docs)

    # The type is printed even when neither branch ran (ner_scores stays None).
    print("\nNER scores type:", type(ner_scores))
    if ner_scores is not None:
        print("NER scores structure:", ner_scores)
except Exception as e:
    # Exploration only: report the failure and carry on to the fallback below.
    print(f"Error accessing NER scores: {e}")

# As a fallback, examine the token-level entity information
# Prints one line per token in the document, so the output can be long.
print("\nToken-level entity information:")
for token in doc:
    print(f"Token: {token.text}, IOB: {token.ent_iob_}-{token.ent_type_}")

Recognized entities:
Entity: Privacy Statement and Cookie, Label: ORG, Span: (40, 44)
Entity: Graduate Degrees MA, Label: PERSON, Span: (55, 58)
Entity: Philosophy Ethics and Public Affairs, Label: ORG, Span: (59, 64)
Entity: Degrees BA, Label: PERSON, Span: (70, 72)
Entity: Philosophy Philosophy, Label: ORG, Span: (73, 75)
Entity: Economics, Label: ORG, Span: (83, 84)
Entity: Ethics, Label: GPE, Span: (87, 88)
Entity: Law Minor, Label: PERSON, Span: (97, 99)
Entity: Philosophy for Social Change Courses People Careers News Headlines In The Media Publications Events All Upcoming Past Events Events, Label: ORG, Span: (104, 123)
Entity: Graduate Degrees MA, Label: PERSON, Span: (128, 131)
Entity: Philosophy Ethics and Public Affairs, Label: ORG, Span: (132, 137)
Entity: Degrees BA, Label: PERSON, Span: (143, 145)
Entity: Philosophy Philosophy, Label: ORG, Span: (146, 148)
Entity: Economics, Label: ORG, Span: (156, 157)
Entity: Ethics, Label: GPE, Span: (160, 161)
Entity: Law Minor, Label:

In [None]:
# NOTE(review): `predictions` is not assigned in any cell visible here; on a
# fresh kernel this raises NameError unless it is defined elsewhere — confirm.
predictions

In [None]:
# Example usage:
# In a notebook `__name__` is "__main__", so this block runs when the cell is
# executed.
if __name__ == "__main__":
    # Example URLs.  Kept under their own name so that running this cell does
    # not overwrite the `urls` table that other cells in this notebook read.
    example_urls = [
        "http://example1.com/about",
        "http://example2.com/about",
        # ... more URLs ...
    ]

    # First phase: Parallel scraping
    scraped_data = parallel_scrape(example_urls)

    # Second phase: Location processing
    # Note: This should be done after scraping due to memory requirements
    results_with_locations = process_locations(scraped_data)

    # Example of accessing results
    for result in results_with_locations:
        print(f"URL: {result['url']}")
        print(f"Locations found: {result['locations']}")
        print("---")

In [None]:
import geograpy

# Ask geograpy for the place context of the scraped page, identified by its URL
# (presumably geograpy fetches the page itself — confirm).  `about_page` must
# be a successful scrape result, i.e. not None.
places = geograpy.get_geoPlace_context(url=about_page["url"])

In [None]:


# NOTE(review): `gc` must already be a loaded Geocode instance when this cell
# runs; in this notebook it is only created in a later cell, so a top-to-bottom
# run fails here unless that cell is moved up.

# a large number of items
mydata = ['Tel Aviv']
num_cpus = None # By default use all CPUs

# decode_parallel is given a collection of texts (process_locations passes a
# list of page texts), so the single page text is wrapped in a one-element list
# instead of being passed as a bare string.
locations = gc.decode_parallel([about_page["text"]], num_cpus=num_cpus)
print(locations)

In [None]:
from geocode.geocode import Geocode

# Build a geocoder, load its data, and decode a single place name as a quick
# check; the result of the last line is shown as the cell output.
gc = Geocode()
gc.load()  # load geonames data
gc.decode("Fairfax")