In [2]:
from langchain_community.document_loaders import WebBaseLoader  #  Load web content into LangChain workflows.
import bs4            # Parse and scrape HTML or XML data.
from fpdf import FPDF # Generate PDF files programmatically.
import os             # Manage file system and environment operations.

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [3]:
# Function to clean the content
def clean_content(content):
    """Normalize whitespace in every text entry and discard the empty ones.

    Each entry is split on whitespace and re-joined with single spaces, so
    leading/trailing whitespace is removed and every internal run of
    whitespace (spaces, tabs, newlines) becomes one space.

    Args:
        content: Iterable of strings to normalize.

    Returns:
        list: The normalized entries, in their original order, excluding any
        entry that is empty after normalization.
    """
    normalized_entries = (" ".join(entry.split()) for entry in content)
    return [entry for entry in normalized_entries if entry]

In [4]:
# Function to save content to a PDF file
def save_to_pdf(content, folder_name, file_name):
    """Write text entries to a single PDF file.

    Args:
        content: Iterable of strings; each entry is rendered as its own
            multi-line cell, in order.
        folder_name: Directory to write into. It is created if missing. An
            empty string means the current working directory.
        file_name: Name of the PDF file inside ``folder_name``.

    Side effects:
        Creates ``folder_name`` when needed, writes the PDF, and prints the
        path it was saved to.
    """
    # os.makedirs("") raises FileNotFoundError, so only create a directory
    # when one was actually named.
    if folder_name:
        os.makedirs(folder_name, exist_ok=True)

    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()
    pdf.set_font("Arial", size=12)

    for line in content:
        # Round-trip through Latin-1 so characters outside that charset are
        # replaced with '?'. With errors='replace' the encode step cannot
        # raise UnicodeEncodeError, so no exception handling is needed here.
        # NOTE(review): presumably required because the selected font only
        # supports Latin-1 - confirm against the installed fpdf version.
        printable = line.encode('latin-1', 'replace').decode('latin-1')
        pdf.multi_cell(0, 10, printable.strip())

    output_path = os.path.join(folder_name, file_name)
    pdf.output(output_path)
    print(f"PDF saved at {output_path}")
    

In [5]:
# Scrape the data for a specific dataset
def scrape_dataset(dataset_name, urls, classes, output_folder="rguktBasarDataset"):
    """Scrape a group of pages and save their combined text as one PDF.

    Each URL is loaded separately so that one failing page does not abort the
    whole dataset. Only elements matching ``classes`` are parsed.

    Args:
        dataset_name: Label used in log messages and as the PDF file name
            (``<dataset_name>.pdf``).
        urls: Iterable of page URLs to fetch.
        classes: CSS class name(s) passed to ``bs4.SoupStrainer(class_=...)``
            to restrict parsing to matching elements.
        output_folder: Directory the PDF is written to. Defaults to
            ``"rguktBasarDataset"``.

    Returns:
        None. Errors are reported via ``print`` rather than raised.
    """
    try:
        print(f"Scraping data for {dataset_name}...")
        all_content = []

        for url in urls:
            try:
                loader = WebBaseLoader(
                    web_paths=[url],
                    bs_kwargs=dict(parse_only=bs4.SoupStrainer(class_=classes))
                )
                documents = loader.load()
                # Keep only documents that carry real text. A document whose
                # page_content is empty or whitespace-only (e.g. nothing on
                # the page matched the class filter) would otherwise pass the
                # emptiness check below and lead to a blank PDF.
                raw_content = [
                    doc.page_content
                    for doc in documents
                    if doc.page_content and doc.page_content.strip()
                ]

                if not raw_content:
                    print(f"No content found for URL: {url}. Skipping.")
                    continue

                all_content.extend(raw_content)
            except Exception as url_error:
                # Best effort: report the failing page and move on to the next.
                print(f"Error scraping URL {url}: {url_error}. Skipping.")

        if not all_content:
            print(f"No valid content found for {dataset_name}. Skipping PDF generation.")
            return

        cleaned_content = clean_content(all_content)

        # Cleaning drops entries that normalize to nothing; never write a PDF
        # with no text in it.
        if not cleaned_content:
            print(f"No valid content found for {dataset_name}. Skipping PDF generation.")
            return

        # Save content to a PDF in the dataset's folder
        save_to_pdf(cleaned_content, output_folder, f"{dataset_name}.pdf")
    except Exception as e:
        print(f"Error scraping {dataset_name}: {e}")


In [6]:

# Main function to manage all datasets
def scrape_all_datasets():
    """Scrape every configured RGUKT dataset, one after another.

    Each spec pairs a dataset name with the page URLs to fetch and the CSS
    class names forwarded to ``scrape_dataset`` for that dataset. Specs are
    processed in the order they are listed.
    """
    # Each spec: (dataset name, page URLs, CSS classes)
    dataset_specs = [
        (
            "about_rgukt",
            [
                'http://www.rgukt.ac.in/about-introduction.html',
                'http://www.rgukt.ac.in/vision-mission.html',
                'http://www.rgukt.ac.in/stu-campuslife.html',
                'http://www.rgukt.ac.in/anti-ragging.html',
            ],
            ["page-row"],
        ),
        (
            "cse",
            [
                "http://www.rgukt.ac.in/cse.html",
                "http://www.rgukt.ac.in/cse-faculty.html",
            ],
            ['panel-body'],
        ),
    ]

    for name, page_urls, css_classes in dataset_specs:
        scrape_dataset(name, page_urls, css_classes)

# Entry point: kick off the full scrape when this code is executed directly
# (the captured output below shows it also runs when the notebook cell is executed).
if __name__ == "__main__":
    scrape_all_datasets()


Scraping data for about_rgukt...
PDF saved at rguktBasarDataset\about_rgukt.pdf
Scraping data for cse...
PDF saved at rguktBasarDataset\cse.pdf
