In [1]:
from langchain_community.document_loaders import WebBaseLoader  #  Load web content into LangChain workflows.
import bs4            # Parse and scrape HTML or XML data.
from fpdf import FPDF # Generate PDF files programmatically.
import os             # Manage file system and environment operations.

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [2]:
# Function to clean the content
def clean_content(content):
    cleaned_content = []
    for line in content:
        # Remove leading/trailing spaces and condense multiple spaces
        line = " ".join(line.split())
        if line:  
            cleaned_content.append(line)
    return cleaned_content

In [3]:
# Function to save content to a PDF file
def save_to_pdf(content, folder_name, file_name):
    os.makedirs(folder_name, exist_ok=True)

    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()
    pdf.set_font("Arial", size=12)

    for line in content:
        try:
            line = line.encode('latin-1', 'replace').decode('latin-1')
        except UnicodeEncodeError:
            line = "Invalid Characters Detected"
        pdf.multi_cell(0, 10, line.strip())

    output_path = os.path.join(folder_name, file_name)
    pdf.output(output_path)
    print(f"PDF saved at {output_path}")
    

In [4]:
def scrape_dataset(folder, dataset_name, urls, classes):
    try:
        print(f"Scraping data for {dataset_name} in folder {folder}...")
        all_content = []

        for url in urls:
            try:
                loader = WebBaseLoader(
                    web_paths=[url],
                    bs_kwargs=dict(parse_only=bs4.SoupStrainer(class_=classes))
                )
                documents = loader.load()
                raw_content = [doc.page_content for doc in documents]

                if not raw_content:
                    print(f"No content found for URL: {url}. Skipping.")
                    continue

                all_content.extend(raw_content)
            except Exception as url_error:
                print(f"Error scraping URL {url}: {url_error}. Skipping.")

        if not all_content:
            print(f"No valid content found for {dataset_name}. Skipping PDF generation.")
            return

        cleaned_content = clean_content(all_content)

        # Save content to a PDF in the dataset's folder
        save_to_pdf(cleaned_content, f"rguktBasarDataset/{folder}", f"{dataset_name}.pdf")
    except Exception as e:
        print(f"Error scraping {dataset_name} in folder {folder}: {e}")

In [5]:
def scrap_dataset(dataset):
    for folder, file in dataset.items():
        for dataset_name, details in file.items():
            scrape_dataset(folder, dataset_name, details["urls"], details["classes"])

about_rgukt = {
    "rgukt-info": {
        "about_rgukt": {
            "urls": [
                'http://www.rgukt.ac.in/about-introduction.html',
                'http://www.rgukt.ac.in/vision-mission.html',
                'http://www.rgukt.ac.in/stu-campuslife.html',
                'http://www.rgukt.ac.in/anti-ragging.html',
            ],
            "classes": ["page-row"]
        },
    }
}
scrap_dataset(about_rgukt)

bio_science_dept = {
    "departments": {
        "bio_science": {
            "urls": [
                'https://www.rgukt.ac.in/bio-sciences.html',
                'https://www.rgukt.ac.in/bio-sciences-goals.html',
                'https://www.rgukt.ac.in/bio-sciences-faculty.html',
            ],
            "classes": ["panel-body", "page-row", "panel"]
        },
    }
}
scrap_dataset(bio_science_dept)

chemical_dept = {
    "departments": {
        "chemical": {
            "urls": [
                'https://www.rgukt.ac.in/che.html',
                'https://www.rgukt.ac.in/che-faculty.html',
                'https://www.rgukt.ac.in/che-laboratories.html',
                'https://www.rgukt.ac.in/che-library.html',
                '',
            ],
            "classes": ["panel-body"]
        },
    }
}
scrap_dataset(chemical_dept)

chemistry_dept = {
    "departments": {
        "chemistry": {
            "urls": [
                'https://www.rgukt.ac.in/chemistry.html',
                'https://www.rgukt.ac.in/chemistry-faculty.html',
                'https://www.rgukt.ac.in/chemistry-staff.html',
            ],
            "classes": ["panel-body"]
        },
    }
}
scrap_dataset(chemistry_dept)

civil_dept = {
    "departments": {
        "civil": {
            "urls": [
                'https://www.rgukt.ac.in/ce.html',
                'https://www.rgukt.ac.in/civil-faculty.html',
                'https://www.rgukt.ac.in/civil-staff.html',
                'https://www.rgukt.ac.in/ce-library.html',
            ],
            "classes": ["panel-body", 'content-wrapper']
        },
    }
}
scrap_dataset(civil_dept)

cse_dept = {
    "departments": {
        "cse_scrapped": {
                "urls": [
                    "http://www.rgukt.ac.in/cse.html",
                    "https://www.rgukt.ac.in/cse-faculty.html",
                    '',
                ],
                "classes": ['panel-body']
            },
    }
}
scrap_dataset(cse_dept)

electrical_dept = {
    "departments": {
        "electrical": {
                "urls": [
                    "https://www.rgukt.ac.in/eee.html",
                    "https://www.rgukt.ac.in/eee-faculty.html",
                    'https://www.rgukt.ac.in/eee-staff.html',
                    'https://www.rgukt.ac.in/eee-laboratories.html',
                    'https://www.rgukt.ac.in/eee-events.html',
                    '',
                ],
                "classes": ['panel-body', 'content-wrapper']
            },
    }
}
scrap_dataset(electrical_dept)

ece_dept = {
    "departments": {
        "ece": {
                "urls": [
                    'https://www.rgukt.ac.in/ece.html',
                    'https://www.rgukt.ac.in/ece-faculty.html',
                    'https://www.rgukt.ac.in/ece-staff.html',
                    '',
                ],
                "classes": ['panel-body', 'content-wrapper']
            },
    }
}
scrap_dataset(ece_dept)

mme_dept = {
    "departments": {
        "mme": {
                "urls": [
                    'https://www.rgukt.ac.in/mme.html',
                    'https://www.rgukt.ac.in/mme-faculty.html',
                    'https://www.rgukt.ac.in/mme-staff.html',
                    '',
                ],
                "classes": ['panel-body', 'content-wrapper']
            },
    }
}
scrap_dataset(mme_dept)

mathematics = {
    "departments": {
        "mathematics": {
                "urls": [
                    'https://www.rgukt.ac.in/maths.html',
                    'https://www.rgukt.ac.in/maths-faculty.html',
                ],
                "classes": ['page-row', 'content-wrapper']
            },
    }
}
scrap_dataset(mathematics)

me_dept = {
    "departments": {
        "me_dept": {
                "urls": [
                    'https://www.rgukt.ac.in/me.html',
                    'https://www.rgukt.ac.in/me-faculty.html',
                    'https://www.rgukt.ac.in/me-staff.html',
                    'https://www.rgukt.ac.in/me-events.html',
                ],
                "classes": ['page-row', 'panel-body', 'content-wrapper']
            },
    }
}
scrap_dataset(me_dept)

administration = {
    "administration": {
        "administration": {
                "urls": [
                    'https://www.rgukt.ac.in/vc.html',
                    'https://www.rgukt.ac.in/gc.html',
                    'https://www.rgukt.ac.in/cd.html',
                    'https://www.rgukt.ac.in/administration-section.html',
                    'https://www.rgukt.ac.in/academic-office.html',
                    'https://www.rgukt.ac.in/deans-and-hods.html',
                    'https://www.rgukt.ac.in/establishment-section.html',
                    'https://www.rgukt.ac.in/finance-accounts.html',
                    'https://www.rgukt.ac.in/pro.html',
                    'https://www.rgukt.ac.in/rti.html',
                    'https://www.rgukt.ac.in/scholarship-section.html',
                    'https://www.rgukt.ac.in/security-unit.html',
                    'https://www.rgukt.ac.in/stores-and-purchase.html',
                    'https://www.rgukt.ac.in/student-affairs-hostels.html',
                    'https://www.rgukt.ac.in/system-network%20administration.html',
                    'https://www.rgukt.ac.in/sdcell.html',
                    'https://www.rgukt.ac.in/works-estate-and-maintenance.html',
                    'https://www.rgukt.ac.in/contactus.html'
                ],
                "classes": ['page-content','page-row', 'panel-body', 'content-wrapper']
            },
    }
}
scrap_dataset(administration)

academic_section = {
    "academics": {
        "academics": {
                "urls": [
                    'https://www.rgukt.ac.in/examination-staff.html',
                    'https://www.rgukt.ac.in/examination_educational_verification_procedure.html',
                    'https://www.rgukt.ac.in/fee_structure_of_Certificates.html',
                    'https://www.rgukt.ac.in/examination-fee_structure_for_various_exams.html',
                    'https://www.rgukt.ac.in/examination-guidelines.html',
                    'https://www.rgukt.ac.in/examination-reverification.html',
                    'https://www.rgukt.ac.in/examination-challenge-revalution.html',
                    'https://www.rgukt.ac.in/examination-faq.html',
                ],
                "classes": ['page-content','page-row', 'panel-body', 'content-wrapper']
            },
    }
}
scrap_dataset(academic_section)

facilities = {
    "facilities": {
        "facilities": {
                "urls": [
                   'https://www.rgukt.ac.in/hostels.html',
                   'https://www.rgukt.ac.in/counseling.html',
                   'https://www.rgukt.ac.in/student-counselling-cell.html', 
                   'https://www.rgukt.ac.in/medical-staff.html',
                   'https://www.rgukt.ac.in/physical-education-section.html',
                   'https://www.rgukt.ac.in/placement/campus-life.html',
                   'https://www.rgukt.ac.in/hospital.html',
                   'https://www.rgukt.ac.in/hospital-staff.html',
                   'https://www.rgukt.ac.in/medical-information.html',
                ],
                "classes": ['content','page-content','page-row', 'panel-body', 'content-wrapper']
            },
    }
}
scrap_dataset(facilities)

training_and_placements = {
    "training_and_placements": {
        "tnp": {
                "urls": [
                   'https://www.rgukt.ac.in/placement/contact_us.html',
                   'https://www.rgukt.ac.in/placement/about-rgukt.html',
                   'https://www.rgukt.ac.in/placement/our-recruiters.html',
                   'https://www.rgukt.ac.in/placement/why-recruit-at-rgukt.html',
                   'https://www.rgukt.ac.in/placement/rules-and-procedures.html',
                   'https://www.rgukt.ac.in/placement/placement-policy.html',
                   'https://www.rgukt.ac.in/placement/placement-policy.html',
                   'https://www.rgukt.ac.in/placement/facilities-and-boarding.html',
                   'https://www.rgukt.ac.in/placement/departmentandcourses.html',
                   'https://www.rgukt.ac.in/placement/placement-calender.html',
                   'https://www.rgukt.ac.in/placement/placement-records.html',
                   'https://www.rgukt.ac.in/shopping-complex.html',
                   '',
                   '',
                ],
                "classes": ['row','grid stackable','raised','segment','ui container','ui','ui grid','page-content','page-row', 'panel-body', 'content-wrapper']
            },
    }
}
scrap_dataset(training_and_placements)

Scraping data for about_rgukt in folder rgukt-info...
PDF saved at rguktBasarDataset/rgukt-info/about_rgukt.pdf
Scraping data for bio_science in folder departments...
PDF saved at rguktBasarDataset/departments/bio_science.pdf
Scraping data for chemical in folder departments...
Error scraping URL : Invalid URL '': No scheme supplied. Perhaps you meant https://?. Skipping.
PDF saved at rguktBasarDataset/departments/chemical.pdf
Scraping data for chemistry in folder departments...
PDF saved at rguktBasarDataset/departments/chemistry.pdf
Scraping data for civil in folder departments...
PDF saved at rguktBasarDataset/departments/civil.pdf
Scraping data for cse_scrapped in folder departments...
Error scraping URL : Invalid URL '': No scheme supplied. Perhaps you meant https://?. Skipping.
PDF saved at rguktBasarDataset/departments/cse_scrapped.pdf
Scraping data for electrical in folder departments...
Error scraping URL : Invalid URL '': No scheme supplied. Perhaps you meant https://?. Skippi