In [19]:
# Import libraries required for web scraping and data manipulation
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import openpyxl  # Not used in this code, but might be for future Excel export

# Define the target URL for scraping engineering colleges data
url = "https://engineering.careers360.com/colleges/list-of-engineering-colleges-in-india"

# Probe the listing once so an unreachable site is noticed immediately.
# requests applies no timeout by default, so without one this call could
# block forever if the server never responds.
r = requests.get(url, timeout=30)
print(f"Status Code: {r.status_code} (200 indicates successful retrieval)")

# Accumulators for the scraped fields. Each list receives one entry per
# matching HTML element found on a results page.
collegenames_list = []
locations_list = []
ratings_list = []
ownerships_list = []
imp_list = []  # raw snippet text; fees and course counts are parsed from it

# Loop through result pages 1..178 (adjust the range if the page count changes)
for page_number in range(1, 179):
    # Construct the URL for the current page
    page_url = f"{url}?page={page_number}"

    # Fetch the page. A timeout prevents one stalled connection from hanging
    # the whole run; a failed page is reported and skipped so the pages
    # already scraped are not lost. Skipping appends nothing to any list, so
    # the lists stay in step with each other.
    try:
        r = requests.get(page_url, timeout=30)
    except requests.RequestException as exc:
        print(f"Page {page_number}: request failed ({exc}); skipping")
        continue
    if not r.ok:
        print(f"Page {page_number}: status {r.status_code}; skipping")
        continue

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(r.text, 'lxml')

    # NOTE(review): every field below is collected with its own page-wide
    # find_all, so rows only line up across lists if each selector matches
    # exactly once per college -- confirm against the live markup.

    # College names: one entry per "card_block", "NA" when the heading is absent
    for college_block in soup.find_all('div', class_="card_block"):
        name_element = college_block.find("h3", class_="college_name d-md-none")
        collegenames_list.append(name_element.text.strip() if name_element else "NA")

    # Location and ownership are read from the same blocks, so handle both in
    # a single pass instead of querying the page twice with the same selector.
    for info_block in soup.find_all('div', class_="content_block d-block d-md-none"):
        spans = info_block.find_all('span')

        # Location is the first <span> of the block
        locations_list.append(spans[0].text.strip() if spans else "NA")

        # Ownership is the 4th <span> when at least four are present, or the
        # 2nd when the block has exactly two; any other count is recorded as "NA"
        if len(spans) >= 4:
            ownerships_list.append(spans[3].text.strip())
        elif len(spans) == 2:
            ownerships_list.append(spans[1].text.strip())
        else:
            ownerships_list.append("NA")

    # Ratings: first <span> inside each "block_border" div
    for rating_block in soup.find_all('div', class_='block_border'):
        rating_element = rating_block.find('span')
        ratings_list.append(rating_element.text.strip() if rating_element else "NA")

    # Important information snippets (containing fees and courses).
    # The previous check looked for a <ul> anywhere on the page, which is
    # effectively always found, so "NA" was never recorded. Test the block's
    # own text instead so an empty snippet is recorded as "NA".
    for imp_block in soup.find_all('div', class_="snippet_block"):
        imp_text = imp_block.text.strip()
        imp_list.append(imp_text if imp_text else "NA")

# Process the "important information" list to extract fees and courses


def _first_match(regex, text, default='N/A'):
    """Return the first substring of *text* that matches *regex*.

    Args:
        regex: Regular expression pattern (string or compiled pattern).
        text: String to search.
        default: Value returned when nothing matches.

    Returns:
        The full text of the first match, or *default* if there is none.
    """
    found = re.search(regex, text)
    return found.group(0) if found else default


# Text of each snippet up to and including its first ')'. Snippets without a
# ')' (e.g. the "NA" placeholder) yield 'N/A' rather than a stray "NA)".
primary_courses = [
    snippet.split(')')[0] + ')' if ')' in snippet else 'N/A'
    for snippet in imp_list
]

# Regular expressions for extracting fees and course details
pattern = r'₹[\d\.]+ (?:Lakhs?|K)'  # Fee: rupee sign, digits/dots, a space, then "Lakh", "Lakhs" or "K"
pattern1 = r'B\.E \/B\.Tech \(\d+ Courses\)'  # B.E/B.Tech course info (including number of courses)
pattern2 = r'M\.E \/M\.Tech\. \(\d+ Courses\)'  # M.E/M.Tech course info (including number of courses)

# For every snippet keep only the first occurrence of each pattern, or 'N/A'
# when the snippet contains none. None of the patterns has a capturing group,
# so the first re.search match equals the first element re.findall would give.
fees_list = [_first_match(pattern, snippet) for snippet in imp_list]
B_tech_Course = [_first_match(pattern1, snippet) for snippet in imp_list]
M_tech_Course = [_first_match(pattern2, snippet) for snippet in imp_list]

# Column name -> scraped values, in the order the columns appear in the CSV
columns = {
    "College_Name": collegenames_list,
    "Locations": locations_list,
    "Ownership": ownerships_list,
    "Fees": fees_list,
    "B.Tech Courses": B_tech_Course,
    "M.Tech Courses": M_tech_Course,
    "Rating": ratings_list,
}

# Validate data list lengths. pandas refuses to build a DataFrame from lists
# of different lengths; checking first lets the error name the lists that
# are out of step instead of only reporting that a mismatch exists.
column_lengths = {name: len(values) for name, values in columns.items()}
if len(set(column_lengths.values())) > 1:
    raise ValueError(f"Inconsistent data list lengths: {column_lengths}")

# Create a Pandas DataFrame from the scraped data lists
df = pd.DataFrame(columns)

# Print a preview of the DataFrame
print(df.head())  # Print the first few rows (you can adjust with .head(n) for n rows)

# Save the DataFrame to a CSV file
df.to_csv('Engineering_colleges_new.csv', index=False)  # Exclude index column in CSV

# Additional comments:
# - The DataFrame creation step creates a tabular structure from the lists, 
#   making data organization and manipulation easier.
# - Saving to CSV allows you to export the data for further analysis or use 
#   in other tools.



Status Code: 200 (200 indicates successful retrieval)
                                        College_Name  \
0  AAA College of Engineering and Technology, Siv...   
1       Aadishwar College of Technology, Gandhinagar   
2  AAERT and SSB Faculty of Architecture, Sarvaja...   
3           Aakar Academy of Architecture, Bangalore   
4  Aalim Muhammed Salegh Academy of Architecture,...   

                  Locations Ownership         Fees            B.Tech Courses  \
0  Virudhunagar, Tamil Nadu   Private          N/A   B.E /B.Tech (8 Courses)   
1    Bhoyan Rathod, Gujarat   Private  ₹2.84 Lakhs  B.E /B.Tech (11 Courses)   
2            Surat, Gujarat   Private          N/A                       N/A   
3      Bangalore, Karnataka   Private          N/A                       N/A   
4       Chennai, Tamil Nadu   Private          N/A                       N/A   

             M.Tech Courses Rating  
0                       N/A  5.0/5  
1  M.E /M.Tech. (4 Courses)  2.6/5  
2                