Crawler du site Langchain

In [1]:
import os
import json
import requests
from bs4 import BeautifulSoup
import json

In [2]:
def clean_text(text):
    """Collapse all whitespace in ``text`` down to single spaces.

    ``str.split()`` with no argument splits on every run of whitespace
    (spaces, newlines, tabs, carriage returns, ...) and discards leading and
    trailing whitespace, so re-joining the pieces with one space leaves no
    newline, tab or carriage-return characters in the result.

    Args:
        text: The string to normalise.

    Returns:
        The whitespace-separated words of ``text`` joined by single spaces;
        an empty string when ``text`` is empty or whitespace-only.
    """
    words = text.split()
    return ' '.join(words)

def get_page_data(url, timeout=10):
    """Download ``url`` and return the plain text of its main content element.

    The page is parsed with BeautifulSoup and the first element matching the
    CSS selector ``.theme-doc-markdown.markdown`` is taken as the content.
    Its text is extracted and passed through ``clean_text``.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait on the server before giving up, forwarded
            to ``requests.get``. Defaults to 10.

    Returns:
        A dict ``{'url': url, 'content': <cleaned text>}``, or ``None`` when
        the request or parsing fails or no element matches the selector.
        Failures are printed, never raised.
    """
    try:
        # requests applies no timeout unless one is given, so a single stalled
        # server would otherwise block the whole crawl indefinitely.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # select content
        main_content = soup.select_one('.theme-doc-markdown.markdown')
        if main_content is None:
            print(f" No content found in: {url}")
            return None

        # entire text
        content = main_content.get_text(separator="\n", strip=True)

        return {
            'url': url,
            'content': clean_text(content)
        }

    except Exception as err:
        # Deliberately broad: one bad page must not abort the caller's loop.
        print(f" Error scraping {url}: {err}")
        return None

   

In [3]:
def scrape_all_pages(urls):
    """Scrape every URL in ``urls`` and collect the successful results.

    Each URL is passed to ``get_page_data``. Truthy results are reported and
    kept; falsy ones are skipped. A progress line is printed after every URL
    whether or not it yielded data.

    Args:
        urls: Sized iterable of URLs to scrape.

    Returns:
        List of the truthy ``get_page_data`` results, in input order.
    """
    print("Starting web scraping...")
    url_count = len(urls)
    collected = []

    processed = 0
    for url in urls:
        processed += 1
        result = get_page_data(url)
        if result:
            size = len(result['content'])
            print(f"{url}: {size} characters scraped")
            collected.append(result)
        print(f"Progress: {processed}/{url_count} URLs scraped")

    return collected

def read_urls_from_file(file_path):
    """Read the list of URLs to scrape from a text file, one URL per line.

    Surrounding whitespace is stripped from every line and blank lines are
    dropped, so trailing newlines or padded entries do not become bogus URLs.

    Args:
        file_path: Path to a UTF-8 text file (a leading BOM is tolerated).

    Returns:
        List of the non-empty, stripped lines in file order. An empty list
        when ``file_path`` is not an existing regular file or reading fails;
        the error is printed rather than raised.
    """
    try:
        if not os.path.isfile(file_path):
            raise FileNotFoundError(f"{file_path} not found or is not a allowed file type.")

        # 'utf-8-sig' reads plain UTF-8 and also discards a leading byte-order
        # mark, which would otherwise be glued onto the first URL.
        with open(file_path, 'r', encoding='utf-8-sig') as file:
            stripped_lines = (line.strip() for line in file.read().splitlines())
            return [line for line in stripped_lines if line]
    except Exception as e:
        # Best-effort by design: callers treat an empty list as "nothing to do".
        print(f"Error reading file: {e}")
        return []

def results_to_file(data, output_dir='./out', output_file='result.json'):
    """Save scraped pages as a JSON array at ``output_dir/output_file``.

    Only the ``'url'`` and ``'content'`` keys of each item are written. The
    output directory is created if needed and an existing file of the same
    name is overwritten. A summary line is printed on success.

    Args:
        data: Iterable of mappings, each with ``'url'`` and ``'content'`` keys.
        output_dir: Directory to write into. Defaults to ``'./out'``.
        output_file: Name of the JSON file. Defaults to ``'result.json'``.

    Returns:
        None. Any failure is printed, never raised.
    """
    try:
        # Build the payload before touching the filesystem: a malformed item
        # then fails here and leaves any previous result file intact.
        formatted_data = [
            {'url': item['url'], 'content': item['content']}
            for item in data
        ]

        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(output_dir, exist_ok=True)
        file_path = os.path.join(output_dir, output_file)

        # Mode 'w' truncates an existing file, so no explicit removal is needed.
        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump(formatted_data, f, ensure_ascii=False, indent=4)

        # Count from the materialised list so one-shot iterators report correctly.
        total_characters = sum(len(entry['content']) for entry in formatted_data)
        print(f"Total of {total_characters} characters scraped and saved in [{output_file}] format to {file_path}!")
    except Exception as err:
        print(f"Error writing to file: {err}")


In [11]:

# Entry point: load the URL list, scrape every page, then save the results.
if __name__ == "__main__":
    # The URL list lives in a file named 'urls', resolved relative to the
    # current working directory.
    file_name = 'urls'
    file_path = file_name

    urls = read_urls_from_file(file_path)

    # An empty list means the file was missing, unreadable or empty.
    if urls:
        scraped_data = scrape_all_pages(urls)
        # Skip writing when no page produced any content.
        if scraped_data:
            results_to_file(scraped_data)

Starting web scraping...
https://python.langchain.com/docs/introduction/: 4637 characters scraped
Progress: 1/1 URLs scraped
Total of 4637 characters scraped and saved in [result.json] format to ./out/result.json!
