<a href="https://colab.research.google.com/github/emgeiger/my-gardyn/blob/Colab/Gardyn_Plant_Scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# prompt: import BeautifulSoup(4), for webscraping.

!pip install beautifulsoup4

# Now you can use the soup object to parse the HTML
# For example, to find all the links on the page:
# for link in soup.find_all("a"):
#     print(link.get("href"))




In [None]:
from bs4 import BeautifulSoup
import requests

In [None]:
import sys

In [None]:
# Load the webpage
url = "https://help.mygardyn.com/en/articles/1776961"
# Without a timeout, requests waits indefinitely on a stalled connection and
# the cell would hang.
response = requests.get(url, timeout=15)
# Fail loudly on 4xx/5xx so an error page is never parsed as if it were the article.
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# Main content section
# main_section = soup.find("div", attrs={"data-link": "all_plants_placement_guide"})
# if not main_section:
#    raise Exception("Could not find All Plants Placement Guide section.")

In [None]:
# Three independent empty lists, one per light zone (high / medium / low).
high_light_links, medium_light_links, low_light_links = [], [], []

In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import json # To pretty-print the results

def _find_article_container(soup):
    """Return the article body element, or None if no known selector matches.

    ``div.article__content`` is tried first and ``div.article-content`` second.
    Using the fallback is reported as informational only, so a successful
    lookup never prints an error message.
    """
    primary = soup.find('div', class_='article__content')
    if primary is not None:
        return primary

    fallback = soup.find('div', class_='article-content')
    if fallback is not None:
        print("Info: Found article content using fallback selector 'div.article-content'.")
        return fallback

    print("Error: Could not find the main article content container "
          "(tried div.article__content and div.article-content).")
    return None


def scrape_links_by_section(url):
    """
    Scrapes links from a specific URL and categorizes them based on
    preceding section headers (High, Medium, Low Light Zones).

    The article body is walked in document order. Each ``<h3>`` whose text
    contains "High Light Zone", "Medium Light Zone" or "Low Light Zone"
    switches the active zone; every ``<a>`` inside a following ``<p>`` is
    recorded under that zone. An ``<h3>`` that names no zone leaves the
    active zone unchanged.

    Args:
        url (str): The URL of the article to scrape. Also used as the base
            for resolving relative hrefs.

    Returns:
        dict: A dictionary containing three lists of links ('High', 'Medium', 'Low'),
              or None if the request fails or the article container cannot be
              located. Each link is a dict {'text': str, 'url': str}; trailing
              separator commas are removed from 'text'.
    """
    # Standard User-Agent to mimic a browser
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        response = requests.get(url, headers=request_headers, timeout=15)
        response.raise_for_status()  # Check for HTTP errors
        print(f"Successfully fetched URL: {url}")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL {url}: {e}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    article_content = _find_article_container(soup)
    if article_content is None:
        return None

    # One list per zone, keyed by the same labels the caller reads back.
    links_by_zone = {'High': [], 'Medium': [], 'Low': []}
    current_zone = None  # Possible values: 'High', 'Medium', 'Low', None

    # find_all() yields all matching descendants in document order, so each
    # paragraph is attributed to the most recent zone heading seen before it.
    for element in article_content.find_all(['h3', 'p']):
        if element.name == 'h3':
            heading_text = element.get_text(strip=True)
            if 'High Light Zone' in heading_text:
                current_zone = 'High'
                print(f"\nEntering Zone: {current_zone}")
            elif 'Medium Light Zone' in heading_text:
                current_zone = 'Medium'
                print(f"Entering Zone: {current_zone}")
            elif 'Low Light Zone' in heading_text:
                current_zone = 'Low'
                print(f"Entering Zone: {current_zone}")
            # Any other h3 deliberately keeps the current zone active.
            continue

        # Paragraphs that appear before the first zone heading are ignored.
        if current_zone is None:
            continue

        links_in_p = element.find_all('a')
        if not links_in_p:
            continue

        print(f"  Found paragraph in '{current_zone}' zone with links:")
        for link_tag in links_in_p:
            href = link_tag.get('href')
            if not href:
                continue

            # On the page the list separator can sit inside the anchor
            # ("Banana Pepper,"); drop it so the stored text is the bare name.
            link_text = link_tag.get_text(strip=True).rstrip(',').rstrip()

            # Resolve relative URLs to absolute ones
            absolute_url = urljoin(url, href.strip())

            # Basic validation: skip mailto:, javascript:, etc.
            if not absolute_url.startswith(('http://', 'https://')):
                print(f"    Skipping invalid or non-HTTP URL format: {absolute_url}")
                continue

            print(f"    - Link: '{link_text}' -> {absolute_url}")
            links_by_zone[current_zone].append({'text': link_text, 'url': absolute_url})

    return links_by_zone

# --- Configuration ---
TARGET_URL = 'https://help.mygardyn.com/en/articles/1776961'

# --- Execute the Scraper ---
categorized_links = scrape_links_by_section(TARGET_URL)

# --- Display Results ---
if not categorized_links:
    print("\nScraping failed to produce results.")
else:
    print("\n--- Scraping Results ---")

    # Same report for each zone, in High -> Medium -> Low order.
    for zone in ('High', 'Medium', 'Low'):
        print(f"\n{zone} Light Zone Links:")
        zone_links = categorized_links[zone]
        if zone_links:
            # Using json.dumps for readable list output
            print(json.dumps(zone_links, indent=2))
        else:
            print("No links found for this zone.")

    # You can now access the lists directly if needed elsewhere in your project
    # High_Light_Zone = categorized_links['High']
    # Medium_Light_Zone = categorized_links['Medium']
    # Low_Light_Zone = categorized_links['Low']
    # print(f"\nVariable 'High_Light_Zone' contains {len(High_Light_Zone)} links.")



Successfully fetched URL: https://help.mygardyn.com/en/articles/1776961
Error: Could not find the main article content container (div.article__content).
Info: Found article content using fallback selector 'div.article-content'.

Entering Zone: High
  Found paragraph in 'High' zone with links:
    - Link: 'Banana Pepper,' -> https://help.mygardyn.com/en/articles/1785345
    - Link: 'Bi-Color Sunflower,' -> https://help.mygardyn.com/en/articles/1782529
    - Link: 'Burrito Pepper,' -> https://help.mygardyn.com/en/articles/1785473
    - Link: 'Candy Cane Pepper' -> https://help.mygardyn.com/en/articles/3525697
    - Link: 'Cape Gooseberry' -> https://help.mygardyn.com/en/articles/1785281
    - Link: 'Celosia Mix' -> https://help.mygardyn.com/en/articles/2566209
    - Link: 'Cucumber' -> https://help.mygardyn.com/en/articles/1785537
    - Link: 'Fairytale Eggplant' -> https://help.mygardyn.com/en/articles/1785601
    - Link: 'Fuchsia Snapdragon' -> https://help.mygardyn.com/en/articles/186

¨