**Importing the Libraries**

In [4]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
from urllib.parse import urljoin
from collections import deque

Function to extract and clean readable text content from a given web page URL.

In [14]:
tags_list = ['script', 'style', 'nav', 'footer', 'header']

def clean_text(soup):
    try:
        for tag in soup(tags_list):
          tag.decompose()
        text = soup.get_text(separator=' ', strip=True)
        text = re.sub(r'©.*$|\s+', ' ', text).strip()
        return text
    except Exception as ex:
        print(f"Error cleaning text: {ex}")
        return ""

Function to filter out links that are non-HTML content (e.g., PDFs, Word docs, images), ensuring that only URLs with useful text content are processed during scraping.

In [28]:
extensions = (".pdf", ".doc", ".docx", ".xls", ".xlsx", ".png", ".jpeg", ".jpg")

def is_parsable(url):
  #Remove common extensions first
  if url.endswith(extensions):
    return False
  
  #Check MIME type
  response = requests.head(url, timeout=10)
  type = response.headers['content-type']
  if type is None:
    return False
  return type.startswith("text") or type.startswith("application/xhtml+xml")

In [16]:
base_url = "https://www.iitk.ac.in"

def process_a_tag(a):
    try:
        href = a.get("href")
        if not href:
            return None
    
        full_url = urljoin(current_url, href)
    
        if full_url.startswith(base_url) and full_url not in visited and is_parsable(full_url):
            return full_url
    except Exception as e:
        print(f"Error processing link {href}: {e}")
    return None

In [29]:
start = "https://www.iitk.ac.in/doaa/academic-departments"

data = {}
visited = set([start])
queue = deque([start])

In [61]:
start = "https://www.iitk.ac.in/doaa/academic-departments"

data = {}
visited = set([start])
queue = deque([start])

while queue:
    current_url = queue.popleft()

    try:
        response = requests.get(current_url, timeout=10)
        soup = BeautifulSoup(response.text, "html.parser")

        print ("Scrapping: ", current_url)
        #Extract all the links
        for a in soup.select("a[href]"):
            new_url = process_a_tag(a)
            if new_url:
                visited.add(new_url)
                queue.append(new_url)

        #Extract all the data
        data[current_url] = clean_text(soup)


    except Exception as e:
        print(f"Error scrapping {current_url}: {e}")
        continue

Scrapping:  https://www.iitk.ac.in/doaa/academic-departments
Scrapping:  https://www.iitk.ac.in/bsbe/
Scrapping:  https://www.iitk.ac.in/doaa/?Itemid=897
Scrapping:  https://www.iitk.ac.in/doaa/awards-medals
Scrapping:  https://www.iitk.ac.in/doaa/spgc-forms
Scrapping:  https://www.iitk.ac.in/doaa/dean-academic-affairs
Scrapping:  https://www.iitk.ac.in/doaa/guidelines-for-admission-of-sponsored-candidates
Scrapping:  https://www.iitk.ac.in/me/
Scrapping:  https://www.iitk.ac.in/doaa/courses-of-study
Scrapping:  https://www.iitk.ac.in/doaa/ug-resources
Scrapping:  https://www.iitk.ac.in/doaa/pg-manual
Scrapping:  https://www.iitk.ac.in/doaa/post-graduate-admission
Scrapping:  https://www.iitk.ac.in/ce/
Scrapping:  https://www.iitk.ac.in/doaa/external-circulars
Scrapping:  https://www.iitk.ac.in/design/
Scrapping:  https://www.iitk.ac.in/doaa/sopc-minutes
Scrapping:  https://www.iitk.ac.in/chm/
Scrapping:  https://www.iitk.ac.in/doaa/exam-schedule
Scrapping:  https://www.iitk.ac.in/doaa

In [64]:
data_store = [{"url": url, "description": text} for url, text in data.items() if text.strip() != ""]
df = pd.DataFrame(data_store)
df = df[df['description'].notnull()]
df = df[df['description'].str.strip() != ""]
df = df[df["description"].str.len() > 100]
df.head()

Unnamed: 0,url,description
0,https://www.iitk.ac.in/doaa/academic-departments,Academic Departments Home Time Table Core Time...
1,https://www.iitk.ac.in/bsbe/,BSBE | IIT Kanpur Search Result About Us Histo...
2,https://www.iitk.ac.in/doaa/?Itemid=897,404 - Error: 404 Jump to error message and sea...
3,https://www.iitk.ac.in/doaa/awards-medals,Awards & Medals Home Time Table Core Time Tabl...
4,https://www.iitk.ac.in/doaa/spgc-forms,SPGC- Forms Home Time Table Core Time Table Ex...


In [66]:
df.shape

(2656, 2)

In [67]:
df.to_csv("iitk_cleaned_data.csv", index=False)