In [None]:
# redBus landing pages of the state road-transport corporations, one per operator slug
state_links = [
    f"https://www.redbus.in/online-booking/{operator_slug}/?utm_source=rtchometile"
    for operator_slug in (
        "ksrtc-kerala",                                            # KERALA
        "apsrtc",                                                  # ANDHRA
        "tsrtc",                                                   # TELANGANA
        "ktcl",                                                    # KADAMBA
        "rsrtc",                                                   # RAJASTHAN
        "south-bengal-state-transport-corporation-sbstc",          # SOUTH BENGAL
        "hrtc",                                                    # HIMACHAL PRADESH
        "astc",                                                    # ASSAM
        "uttar-pradesh-state-road-transport-corporation-upsrtc",   # UTTAR PRADESH
        "wbtc-ctc",                                                # WEST BENGAL
        "pepsu",                                                   # PUNJAB
    )
]

# Extraction of Kerala transport bus routes and links

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import pandas as pd
import time


# Start a Chrome session and open the Kerala (ksrtc-kerala) route listing on redBus
driver = webdriver.Chrome()
driver.get("https://www.redbus.in/online-booking/ksrtc-kerala/?utm_source=rtchometile")
driver.maximize_window()

# Explicit-wait helper with a 30-second timeout. This line only constructs it; nothing
# is waited for here. capture_links() below reads this module-level `wait` by name.
wait = WebDriverWait(driver, 30)


def capture_links(path):
    """
    Collect the href and visible text of every element matching an XPath.

    Polls through the module-level ``wait`` object until at least one matching
    element is present. If the wait times out, a message is printed and whatever
    has been collected so far (normally nothing) is returned.

    Args:
        path (str): The XPath of the elements to capture.

    Returns:
        tuple: A tuple containing two lists - links (URLs) and routes (names).
    """
    hrefs, names = [], []
    locator = (By.XPATH, path)

    try:
        for anchor in wait.until(EC.presence_of_all_elements_located(locator)):
            hrefs.append(anchor.get_attribute("href"))
            names.append(anchor.text)
        print(f"Captured {len(hrefs)} links on this page.")
    except TimeoutException:
        print("Timeout while waiting for route links.")

    return hrefs, names


# Scrape both result pages. The finally clause closes the browser even when scraping
# raises, so a failed run does not leave a Chrome session behind.
try:
    # Page 1: Capture links
    links_kerala, route_kerala = capture_links("//a[@class='route']")

    # Attempt to navigate to page 2 by setting page state
    try:
        print("Navigating to page 2...")

        # Click the second pagination tab through JavaScript
        driver.execute_script("""
        document.querySelectorAll('.DC_117_pageTabs')[1].click();
    """)
        time.sleep(3)  # Wait for page 2 content to load

        # Page 2: Capture links
        links_page_2, routes_page_2 = capture_links("//a[@class='route']")
        links_kerala.extend(links_page_2)
        route_kerala.extend(routes_page_2)

    except Exception as e:
        # Best-effort: keep the page-1 results when page 2 cannot be reached
        print("Error navigating to page 2:", e)
finally:
    # Close the browser
    driver.quit()

# Create DataFrame and display the data (the lists hold plain strings, so the
# browser is no longer needed at this point)
df_kl = pd.DataFrame({"Route_name": route_kerala, "Route_link": links_kerala})
print(df_kl)

# Save dataframe to CSV
df_kl.to_csv("C:/Users/Dine24/RedBus/df_KL.csv", index=False)

Captured 10 links on this page.
Navigating to page 2...
Captured 7 links on this page.
                         Route_name  \
0            Bangalore to Kozhikode   
1            Kozhikode to Ernakulam   
2            Kozhikode to Bangalore   
3            Ernakulam to Kozhikode   
4               Mysore to Kozhikode   
5               Kozhikode to Mysore   
6             Kozhikode to Thrissur   
7   Kozhikode to Thiruvananthapuram   
8             Kozhikode to Kottayam   
9             Thrissur to Kozhikode   
10   Bangalore to Kalpetta (kerala)   
11     Bangalore to Kannur (Kerala)   
12   Kalpetta (kerala) to Bangalore   
13            Kottayam to Kozhikode   
14  Thiruvananthapuram to Kozhikode   
15     Kannur (Kerala) to Bangalore   
16               Kozhikode to Aluva   

                                           Route_link  
0   https://www.redbus.in/bus-tickets/bangalore-to...  
1   https://www.redbus.in/bus-tickets/kozhikode-to...  
2   https://www.redbus.in/bus-tickets/kozh

Extraction of Andhra transport buses Routes and Links

In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    TimeoutException,
    ElementClickInterceptedException,
    NoSuchElementException
)
import pandas as pd
import time

# Start a Chrome session and open the Andhra (apsrtc) route listing on redBus
driver_A = webdriver.Chrome()
driver_A.get("https://www.redbus.in/online-booking/apsrtc/?utm_source=rtchometile")
driver_A.maximize_window()

# Explicit-wait helper with a 20-second timeout. This line only constructs it; nothing
# is waited for here. andhra_link_route() below reads the module-level `wait` and
# `driver_A` by name.
wait = WebDriverWait(driver_A, 20)


def andhra_link_route(path, max_pages=5):
    """
    Scrape links and route names across multiple pages.

    Uses the module-level ``driver_A`` and ``wait`` objects created earlier in
    this cell.

    Args:
        path (str): XPath of the route links.
        max_pages (int | None): Maximum number of pages to scrape. Defaults to 5,
            the limit that used to be hard-coded; pass None to continue until no
            further page tab is found.

    Returns:
        tuple: Lists of route links and names (parallel, one entry per link).
    """
    links_andhra = []
    route_andhra = []

    page = 1
    while max_pages is None or page <= max_pages:
        print(f"Scraping page {page}...")

        # Capture links on the current page; on timeout keep what was collected so far
        try:
            paths = wait.until(EC.presence_of_all_elements_located((By.XPATH, path)))
        except TimeoutException:
            print(f"Timeout while waiting for route links on page {page}.")
            break

        for link in paths:
            links_andhra.append(link.get_attribute("href"))
            route_andhra.append(link.text)

        # Locate the tab that leads to the next page
        try:
            pagination = wait.until(EC.presence_of_element_located(
                (By.XPATH, '//*[@class="DC_117_paginationTable"]')
            ))
            # The leading "." scopes the search to `pagination`; an XPath that
            # starts with "//" is evaluated against the whole document.
            next_button = pagination.find_element(
                By.XPATH, f'.//div[@class="DC_117_pageTabs " and text()="{page + 1}"]'
            )
        except (NoSuchElementException, TimeoutException):
            print(f"No more pages to paginate at page {page}")
            break

        if max_pages is not None and page >= max_pages:
            # Make truncation visible instead of silently dropping later pages
            print(f"Stopped at max_pages={max_pages}; more pages are available.")
            break

        # Scroll into view to ensure visibility
        driver_A.execute_script("arguments[0].scrollIntoView(true);", next_button)
        time.sleep(1)  # Wait to ensure scroll completes

        try:
            next_button.click()
        except ElementClickInterceptedException:
            print("Element click intercepted. Checking for overlay...")
            # Close overlay if present
            try:
                close_overlay = driver_A.find_element(
                    By.XPATH, '//button[contains(@class, "close-overlay-button")]'
                )
            except NoSuchElementException:
                print("No overlay found to close.")
            else:
                close_overlay.click()
                print("Overlay closed.")
                time.sleep(1)
            # Retry through JavaScript: the click is dispatched on the element itself,
            # so it cannot be intercepted again. Without a successful click the next
            # iteration would re-scrape this page and record duplicate rows.
            driver_A.execute_script("arguments[0].click();", next_button)

        time.sleep(3)  # Wait for the next page to load (applies to the retry as well)
        page += 1

    return links_andhra, route_andhra


# Run the scraper. The finally clause closes the browser even if scraping fails
# part-way, so an error does not leave a Chrome session behind.
try:
    links_andhra, route_andhra = andhra_link_route("//a[@class='route']")
finally:
    # Close the browser
    driver_A.quit()

df_aa = pd.DataFrame({"Route_name": route_andhra, "Route_link": links_andhra})
print(df_aa)

# Optional: Save to CSV
df_aa.to_csv("C:/Users/Dine24/RedBus/df_AA.csv", index=False)


Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
No more pages to paginate at page 5
                                 Route_name  \
0                   Hyderabad to Vijayawada   
1                   Vijayawada to Hyderabad   
2                 Kakinada to Visakhapatnam   
3    Chittoor (Andhra Pradesh) to Bangalore   
4                     Bangalore to Tirupati   
5                     Tirupati to Bangalore   
6                       Kadapa to Bangalore   
7                       Hyderabad to Ongole   
8   Anantapur (andhra pradesh) to Bangalore   
9                       Bangalore to Kadapa   
10                      Ongole to Hyderabad   
11   Bangalore to Chittoor (Andhra Pradesh)   
12  Bangalore to Anantapur (andhra pradesh)   
13                Visakhapatnam to Kakinada   
14                      Kadiri to Bangalore   
15                   Vinukonda to Hyderabad   
16                      Bangalore to Kadiri   
17                Hyder

Extraction of Telangana (TSRTC) transport bus routes and links

In [3]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    TimeoutException,
    ElementClickInterceptedException,
    NoSuchElementException
)
import pandas as pd
import time

# Start a Chrome session and open the tsrtc route listing on redBus.
# The names `driver_A` and `wait` are the same ones the other state cells assign.
driver_A = webdriver.Chrome()
driver_A.get("https://www.redbus.in/online-booking/tsrtc/?utm_source=rtchometile")
driver_A.maximize_window()

# Explicit-wait helper with a 20-second timeout. This line only constructs it; nothing
# is waited for here. hyderabad_link_route() below reads the module-level `wait` and
# `driver_A` by name.
wait = WebDriverWait(driver_A, 20)


def hyderabad_link_route(path, max_pages=3):
    """
    Scrape links and route names across multiple pages for Hyderabad routes.

    Uses the module-level ``driver_A`` and ``wait`` objects created earlier in
    this cell.

    Args:
        path (str): XPath of the route links.
        max_pages (int | None): Maximum number of pages to scrape. Defaults to 3,
            the limit that used to be hard-coded; pass None to continue until no
            further page tab is found.

    Returns:
        tuple: Lists of route links and names (parallel, one entry per link).
    """
    links_hyderabad = []
    route_hyderabad = []

    page = 1
    while max_pages is None or page <= max_pages:
        print(f"Scraping page {page}...")

        # Capture links on the current page; on timeout keep what was collected so far
        try:
            paths = wait.until(EC.presence_of_all_elements_located((By.XPATH, path)))
        except TimeoutException:
            print(f"Timeout while waiting for route links on page {page}.")
            break

        for link in paths:
            links_hyderabad.append(link.get_attribute("href"))
            route_hyderabad.append(link.text)

        # Locate the tab that leads to the next page
        try:
            pagination = wait.until(EC.presence_of_element_located(
                (By.XPATH, '//*[@class="DC_117_paginationTable"]')
            ))
            # The leading "." scopes the search to `pagination`; an XPath that
            # starts with "//" is evaluated against the whole document.
            next_button = pagination.find_element(
                By.XPATH, f'.//div[@class="DC_117_pageTabs " and text()="{page + 1}"]'
            )
        except (NoSuchElementException, TimeoutException):
            print(f"No more pages to paginate at page {page}")
            break

        if max_pages is not None and page >= max_pages:
            # Make truncation visible instead of silently dropping later pages
            print(f"Stopped at max_pages={max_pages}; more pages are available.")
            break

        # Scroll into view to ensure visibility
        driver_A.execute_script("arguments[0].scrollIntoView(true);", next_button)
        time.sleep(1)  # Wait to ensure scroll completes

        try:
            next_button.click()
        except ElementClickInterceptedException:
            print("Element click intercepted. Checking for overlay...")
            # Close overlay if present
            try:
                close_overlay = driver_A.find_element(
                    By.XPATH, '//button[contains(@class, "close-overlay-button")]'
                )
            except NoSuchElementException:
                print("No overlay found to close.")
            else:
                close_overlay.click()
                print("Overlay closed.")
                time.sleep(1)
            # Retry through JavaScript: the click is dispatched on the element itself,
            # so it cannot be intercepted again. Without a successful click the next
            # iteration would re-scrape this page and record duplicate rows.
            driver_A.execute_script("arguments[0].click();", next_button)

        time.sleep(3)  # Wait for the next page to load (applies to the retry as well)
        page += 1

    return links_hyderabad, route_hyderabad


# Run the scraper. The finally clause closes the browser even if scraping fails
# part-way, so an error does not leave a Chrome session behind.
try:
    links_hyderabad, route_hyderabad = hyderabad_link_route("//a[@class='route']")
finally:
    # Close the browser
    driver_A.quit()

df_hy = pd.DataFrame({"Route_name": route_hyderabad, "Route_link": links_hyderabad})
print(df_hy)

# Optional: Save to CSV
df_hy.to_csv("C:/Users/Dine24/RedBus/df_HY.csv", index=False)

Scraping page 1...
Scraping page 2...
Scraping page 3...
No more pages to paginate at page 3
                                 Route_name  \
0                      Khammam to Hyderabad   
1                   Hyderabad to Vijayawada   
2                      Hyderabad to Khammam   
3                    Hyderabad to Srisailam   
4                   Karimnagar to Hyderabad   
5                     Hyderabad to Adilabad   
6                   Kothagudem to Hyderabad   
7                   Hyderabad to Mancherial   
8      Guntur (Andhra Pradesh) to Hyderabad   
9                Godavarikhani to Hyderabad   
10                       Kodad to Hyderabad   
11                      Hyderabad to Ongole   
12                    Jagityal to Hyderabad   
13                      Hyderabad to Nirmal   
14     Hyderabad to Guntur (Andhra Pradesh)   
15                  Hyderabad to Karimnagar   
16                  Hyderabad to Kothagudem   
17                Hyderabad to Bhadrachalam   
18            

Extraction of Kadamba transport buses Routes and Links

In [4]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    TimeoutException,
    ElementClickInterceptedException,
    NoSuchElementException
)
import pandas as pd
import time

# Start a Chrome session and open the Kadamba (ktcl) route listing on redBus.
# The names `driver_A` and `wait` are the same ones the other state cells assign.
driver_A = webdriver.Chrome()
driver_A.get("https://www.redbus.in/online-booking/ktcl/?utm_source=rtchometile")
driver_A.maximize_window()

# Explicit-wait helper with a 20-second timeout. This line only constructs it; nothing
# is waited for here. kadamba_link_route() below reads the module-level `wait` and
# `driver_A` by name.
wait = WebDriverWait(driver_A, 20)


def kadamba_link_route(path, max_pages=5):
    """
    Scrape links and route names across multiple pages for Kadamba routes.

    Uses the module-level ``driver_A`` and ``wait`` objects created earlier in
    this cell.

    Args:
        path (str): XPath of the route links.
        max_pages (int | None): Maximum number of pages to scrape. Defaults to 5,
            the limit that used to be hard-coded; pass None to continue until no
            further page tab is found.

    Returns:
        tuple: Lists of route links and names (parallel, one entry per link).
    """
    links_kadamba = []
    route_kadamba = []

    page = 1
    while max_pages is None or page <= max_pages:
        print(f"Scraping page {page}...")

        # Capture links on the current page; on timeout keep what was collected so far
        try:
            paths = wait.until(EC.presence_of_all_elements_located((By.XPATH, path)))
        except TimeoutException:
            print(f"Timeout while waiting for route links on page {page}.")
            break

        for link in paths:
            links_kadamba.append(link.get_attribute("href"))
            route_kadamba.append(link.text)

        # Locate the tab that leads to the next page
        try:
            pagination = wait.until(EC.presence_of_element_located(
                (By.XPATH, '//*[@class="DC_117_paginationTable"]')
            ))
            # The leading "." scopes the search to `pagination`; an XPath that
            # starts with "//" is evaluated against the whole document.
            next_button = pagination.find_element(
                By.XPATH, f'.//div[@class="DC_117_pageTabs " and text()="{page + 1}"]'
            )
        except (NoSuchElementException, TimeoutException):
            print(f"No more pages to paginate at page {page}")
            break

        if max_pages is not None and page >= max_pages:
            # Make truncation visible instead of silently dropping later pages
            print(f"Stopped at max_pages={max_pages}; more pages are available.")
            break

        # Scroll into view to ensure visibility
        driver_A.execute_script("arguments[0].scrollIntoView(true);", next_button)
        time.sleep(1)  # Wait to ensure scroll completes

        try:
            next_button.click()
        except ElementClickInterceptedException:
            print("Element click intercepted. Checking for overlay...")
            # Close overlay if present
            try:
                close_overlay = driver_A.find_element(
                    By.XPATH, '//button[contains(@class, "close-overlay-button")]'
                )
            except NoSuchElementException:
                print("No overlay found to close.")
            else:
                close_overlay.click()
                print("Overlay closed.")
                time.sleep(1)
            # Retry through JavaScript: the click is dispatched on the element itself,
            # so it cannot be intercepted again. Without a successful click the next
            # iteration would re-scrape this page and record duplicate rows.
            driver_A.execute_script("arguments[0].click();", next_button)

        time.sleep(3)  # Wait for the next page to load (applies to the retry as well)
        page += 1

    return links_kadamba, route_kadamba


# Run the scraper. The finally clause closes the browser even if scraping fails
# part-way, so an error does not leave a Chrome session behind.
try:
    links_kadamba, route_kadamba = kadamba_link_route("//a[@class='route']")
finally:
    # Close the browser
    driver_A.quit()

df_ka = pd.DataFrame({"Route_name": route_kadamba, "Route_link": links_kadamba})
print(df_ka)

# Optional: Save to CSV
df_ka.to_csv("C:/Users/Dine24/RedBus/df_KA.csv", index=False)

Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
No more pages to paginate at page 5
                         Route_name  \
0                       Pune to Goa   
1                       Goa to Pune   
2                     Mumbai to Goa   
3                     Goa to Mumbai   
4                   Belagavi to Goa   
5                  Bangalore to Goa   
6                  Goa to Bangalore   
7                 Pandharpur to Goa   
8                   Goa to Belagavi   
9      Goa to Kolhapur(Maharashtra)   
10                   Solapur to Goa   
11                Goa to Pandharpur   
12                   Goa to Solapur   
13   Calangute (goa) to Goa Airport   
14         Goa to Sangola (Solapur)   
15         Sangola (Solapur) to Goa   
16  Calangute (goa) to Mopa Airport   
17   Goa Airport to Calangute (goa)   
18  Mopa Airport to Calangute (goa)   
19                Shivamogga to Goa   
20               Marcel to Belagavi   
21         

Extraction of Rajasthan transport buses Routes and Links

In [5]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    TimeoutException,
    ElementClickInterceptedException,
    NoSuchElementException
)
import pandas as pd
import time

# Start a Chrome session and open the Rajasthan (rsrtc) route listing on redBus.
# The names `driver_A` and `wait` are the same ones the other state cells assign.
driver_A = webdriver.Chrome()
driver_A.get("https://www.redbus.in/online-booking/rsrtc/?utm_source=rtchometile")
driver_A.maximize_window()

# Explicit-wait helper with a 20-second timeout. This line only constructs it; nothing
# is waited for here. rajasthan_link_route() below reads the module-level `wait` and
# `driver_A` by name.
wait = WebDriverWait(driver_A, 20)


def rajasthan_link_route(path, max_pages=2):
    """
    Scrape links and route names across multiple pages for Rajasthan routes.

    Uses the module-level ``driver_A`` and ``wait`` objects created earlier in
    this cell.

    Args:
        path (str): XPath of the route links.
        max_pages (int | None): Maximum number of pages to scrape. Defaults to 2,
            the limit that used to be hard-coded; pass None to continue until no
            further page tab is found.

    Returns:
        tuple: Lists of route links and names (parallel, one entry per link).
    """
    links_rajasthan = []
    route_rajasthan = []

    page = 1
    while max_pages is None or page <= max_pages:
        print(f"Scraping page {page}...")

        # Capture links on the current page; on timeout keep what was collected so far
        try:
            paths = wait.until(EC.presence_of_all_elements_located((By.XPATH, path)))
        except TimeoutException:
            print(f"Timeout while waiting for route links on page {page}.")
            break

        for link in paths:
            links_rajasthan.append(link.get_attribute("href"))
            route_rajasthan.append(link.text)

        # Locate the tab that leads to the next page
        try:
            pagination = wait.until(EC.presence_of_element_located(
                (By.XPATH, '//*[@class="DC_117_paginationTable"]')
            ))
            # The leading "." scopes the search to `pagination`; an XPath that
            # starts with "//" is evaluated against the whole document.
            next_button = pagination.find_element(
                By.XPATH, f'.//div[@class="DC_117_pageTabs " and text()="{page + 1}"]'
            )
        except (NoSuchElementException, TimeoutException):
            print(f"No more pages to paginate at page {page}")
            break

        if max_pages is not None and page >= max_pages:
            # Make truncation visible instead of silently dropping later pages
            print(f"Stopped at max_pages={max_pages}; more pages are available.")
            break

        # Scroll into view to ensure visibility
        driver_A.execute_script("arguments[0].scrollIntoView(true);", next_button)
        time.sleep(1)  # Wait to ensure scroll completes

        try:
            next_button.click()
        except ElementClickInterceptedException:
            print("Element click intercepted. Checking for overlay...")
            # Close overlay if present
            try:
                close_overlay = driver_A.find_element(
                    By.XPATH, '//button[contains(@class, "close-overlay-button")]'
                )
            except NoSuchElementException:
                print("No overlay found to close.")
            else:
                close_overlay.click()
                print("Overlay closed.")
                time.sleep(1)
            # Retry through JavaScript: the click is dispatched on the element itself,
            # so it cannot be intercepted again. Without a successful click the next
            # iteration would re-scrape this page and record duplicate rows.
            driver_A.execute_script("arguments[0].click();", next_button)

        time.sleep(3)  # Wait for the next page to load (applies to the retry as well)
        page += 1

    return links_rajasthan, route_rajasthan


# Run the scraper. The finally clause closes the browser even if scraping fails
# part-way, so an error does not leave a Chrome session behind.
try:
    links_rajasthan, route_rajasthan = rajasthan_link_route("//a[@class='route']")
finally:
    # Close the browser
    driver_A.quit()

df_ra = pd.DataFrame({"Route_name": route_rajasthan, "Route_link": links_rajasthan})
print(df_ra)

# Optional: Save to CSV
df_ra.to_csv("C:/Users/Dine24/RedBus/df_RA.csv", index=False)

Scraping page 1...
Scraping page 2...
No more pages to paginate at page 2
                                       Route_name  \
0                                Jodhpur to Ajmer   
1        Beawar (Rajasthan) to Jaipur (Rajasthan)   
2                              Udaipur to Jodhpur   
3                   Jaipur (Rajasthan) to Jodhpur   
4                     Sikar to Jaipur (Rajasthan)   
5   Aligarh (uttar pradesh) to Jaipur (Rajasthan)   
6           Kota(Rajasthan) to Jaipur (Rajasthan)   
7                    Jaipur (Rajasthan) to Pilani   
8   Jaipur (Rajasthan) to Aligarh (uttar pradesh)   
9                Kishangarh to Jaipur (Rajasthan)   
10                               Sikar to Bikaner   
11                  Jodhpur to Beawar (Rajasthan)   
12                    Udaipur to Pali (Rajasthan)   
13          Jaipur (Rajasthan) to Kota(Rajasthan)   
14                     Kota(Rajasthan) to Udaipur   
15                Jaipur (Rajasthan) to Bharatpur   
16                  Jaipu

Extraction of South Bengal transport buses Routes and Links

In [6]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    TimeoutException,
    ElementClickInterceptedException,
    NoSuchElementException
)
import pandas as pd
import time

# Start a Chrome session and open the South Bengal (sbstc) route listing on redBus.
# The names `driver_A` and `wait` are the same ones the other state cells assign.
driver_A = webdriver.Chrome()
driver_A.get("https://www.redbus.in/online-booking/south-bengal-state-transport-corporation-sbstc/?utm_source=rtchometile")
driver_A.maximize_window()

# Explicit-wait helper with a 20-second timeout. This line only constructs it; nothing
# is waited for here. bengal_link_route() below reads the module-level `wait` and
# `driver_A` by name.
wait = WebDriverWait(driver_A, 20)


def bengal_link_route(path, max_pages=5):
    """
    Scrape links and route names across multiple pages for South Bengal routes.

    Uses the module-level ``driver_A`` and ``wait`` objects created earlier in
    this cell.

    Args:
        path (str): XPath of the route links.
        max_pages (int | None): Maximum number of pages to scrape. Defaults to 5,
            the limit that used to be hard-coded; pass None to continue until no
            further page tab is found.

    Returns:
        tuple: Lists of route links and names (parallel, one entry per link).
    """
    links_southbengal = []
    route_southbengal = []

    page = 1
    while max_pages is None or page <= max_pages:
        print(f"Scraping page {page}...")

        # Capture links on the current page; on timeout keep what was collected so far
        try:
            paths = wait.until(EC.presence_of_all_elements_located((By.XPATH, path)))
        except TimeoutException:
            print(f"Timeout while waiting for route links on page {page}.")
            break

        for link in paths:
            links_southbengal.append(link.get_attribute("href"))
            route_southbengal.append(link.text)

        # Locate the tab that leads to the next page
        try:
            pagination = wait.until(EC.presence_of_element_located(
                (By.XPATH, '//*[@class="DC_117_paginationTable"]')
            ))
            # The leading "." scopes the search to `pagination`; an XPath that
            # starts with "//" is evaluated against the whole document.
            next_button = pagination.find_element(
                By.XPATH, f'.//div[@class="DC_117_pageTabs " and text()="{page + 1}"]'
            )
        except (NoSuchElementException, TimeoutException):
            print(f"No more pages to paginate at page {page}")
            break

        if max_pages is not None and page >= max_pages:
            # Make truncation visible instead of silently dropping later pages
            print(f"Stopped at max_pages={max_pages}; more pages are available.")
            break

        # Scroll into view to ensure visibility
        driver_A.execute_script("arguments[0].scrollIntoView(true);", next_button)
        time.sleep(1)  # Wait to ensure scroll completes

        try:
            next_button.click()
        except ElementClickInterceptedException:
            print("Element click intercepted. Checking for overlay...")
            # Close overlay if present
            try:
                close_overlay = driver_A.find_element(
                    By.XPATH, '//button[contains(@class, "close-overlay-button")]'
                )
            except NoSuchElementException:
                print("No overlay found to close.")
            else:
                close_overlay.click()
                print("Overlay closed.")
                time.sleep(1)
            # Retry through JavaScript: the click is dispatched on the element itself,
            # so it cannot be intercepted again. Without a successful click the next
            # iteration would re-scrape this page and record duplicate rows.
            driver_A.execute_script("arguments[0].click();", next_button)

        time.sleep(3)  # Wait for the next page to load (applies to the retry as well)
        page += 1

    return links_southbengal, route_southbengal


# Run the scraper. The finally clause closes the browser even if scraping fails
# part-way, so an error does not leave a Chrome session behind.
try:
    links_southbengal, route_southbengal = bengal_link_route("//a[@class='route']")
finally:
    # Close the browser
    driver_A.quit()

df_sb = pd.DataFrame({"Route_name": route_southbengal, "Route_link": links_southbengal})
print(df_sb)

# Optional: Save to CSV
df_sb.to_csv("C:/Users/Dine24/RedBus/df_SB.csv", index=False)

Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
No more pages to paginate at page 5
                                           Route_name  \
0                   Durgapur (West Bengal) to Kolkata   
1                                  Kolkata to Burdwan   
2                   Kolkata to Durgapur (West Bengal)   
3                                   Haldia to Kolkata   
4                                   Kolkata to Haldia   
5                                Midnapore to Kolkata   
6                   Kolkata to Arambagh (West Bengal)   
7                                    Kolkata to Digha   
8                                    Digha to Kolkata   
9                                  Kolkata to Bankura   
10                   Kolkata to Asansol (West Bengal)   
11                               Kolkata to Midnapore   
12                                Jhargram to Kolkata   
13                         Kolkata to Contai (Kanthi)   
14            

Extraction of Himachal Pradesh transport bus routes and links

In [7]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    TimeoutException,
    ElementClickInterceptedException,
    NoSuchElementException
)
import pandas as pd
import time

# Start a Chrome session and open the Himachal Pradesh (hrtc) route listing on redBus.
# The names `driver_A` and `wait` are the same ones the other state cells assign.
driver_A = webdriver.Chrome()
driver_A.get("https://www.redbus.in/online-booking/hrtc/?utm_source=rtchometile")
driver_A.maximize_window()

# Explicit-wait helper with a 20-second timeout. This line only constructs it; nothing
# is waited for here. himachal_link_route() below reads the module-level `wait` and
# `driver_A` by name.
wait = WebDriverWait(driver_A, 20)


def himachal_link_route(path, max_pages=4):
    """
    Scrape links and route names across multiple pages for Himachal Pradesh routes.

    Uses the module-level ``driver_A`` and ``wait`` objects created earlier in
    this cell.

    Args:
        path (str): XPath of the route links.
        max_pages (int | None): Maximum number of pages to scrape. Defaults to 4,
            the limit that used to be hard-coded; pass None to continue until no
            further page tab is found.

    Returns:
        tuple: Lists of route links and names (parallel, one entry per link).
    """
    links_himachal = []
    route_himachal = []

    page = 1
    while max_pages is None or page <= max_pages:
        print(f"Scraping page {page}...")

        # Capture links on the current page; on timeout keep what was collected so far
        try:
            paths = wait.until(EC.presence_of_all_elements_located((By.XPATH, path)))
        except TimeoutException:
            print(f"Timeout while waiting for route links on page {page}.")
            break

        for link in paths:
            links_himachal.append(link.get_attribute("href"))
            route_himachal.append(link.text)

        # Locate the tab that leads to the next page
        try:
            pagination = wait.until(EC.presence_of_element_located(
                (By.XPATH, '//*[@class="DC_117_paginationTable"]')
            ))
            # The leading "." scopes the search to `pagination`; an XPath that
            # starts with "//" is evaluated against the whole document.
            next_button = pagination.find_element(
                By.XPATH, f'.//div[@class="DC_117_pageTabs " and text()="{page + 1}"]'
            )
        except (NoSuchElementException, TimeoutException):
            print(f"No more pages to paginate at page {page}")
            break

        if max_pages is not None and page >= max_pages:
            # Make truncation visible instead of silently dropping later pages
            print(f"Stopped at max_pages={max_pages}; more pages are available.")
            break

        # Scroll into view to ensure visibility
        driver_A.execute_script("arguments[0].scrollIntoView(true);", next_button)
        time.sleep(1)  # Wait to ensure scroll completes

        try:
            next_button.click()
        except ElementClickInterceptedException:
            print("Element click intercepted. Checking for overlay...")
            # Close overlay if present
            try:
                close_overlay = driver_A.find_element(
                    By.XPATH, '//button[contains(@class, "close-overlay-button")]'
                )
            except NoSuchElementException:
                print("No overlay found to close.")
            else:
                close_overlay.click()
                print("Overlay closed.")
                time.sleep(1)
            # Retry through JavaScript: the click is dispatched on the element itself,
            # so it cannot be intercepted again. Without a successful click the next
            # iteration would re-scrape this page and record duplicate rows.
            driver_A.execute_script("arguments[0].click();", next_button)

        time.sleep(3)  # Wait for the next page to load (applies to the retry as well)
        page += 1

    return links_himachal, route_himachal


# Run the scraper. The finally clause closes the browser even if scraping fails
# part-way, so an error does not leave a Chrome session behind.
try:
    links_himachal, route_himachal = himachal_link_route("//a[@class='route']")
finally:
    # Close the browser
    driver_A.quit()

df_hp = pd.DataFrame({"Route_name": route_himachal, "Route_link": links_himachal})
print(df_hp)

# Optional: Save to CSV
df_hp.to_csv("C:/Users/Dine24/RedBus/df_HP.csv", index=False)

Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
No more pages to paginate at page 4
                                       Route_name  \
0                                 Delhi to Shimla   
1       Hamirpur (Himachal Pradesh) to Chandigarh   
2       Chandigarh to Hamirpur (Himachal Pradesh)   
3                                 Shimla to Delhi   
4            Hamirpur (Himachal Pradesh) to Delhi   
5            Delhi to Hamirpur (Himachal Pradesh)   
6                            Kangra to Chandigarh   
7         Chamba (Himachal Pradesh) to Chandigarh   
8                             Delhi to Chandigarh   
9                          Palampur to Chandigarh   
10   Dharamshala (Himachal Pradesh) to Chandigarh   
11   Chandigarh to Dharamshala (Himachal Pradesh)   
12             Delhi to Chamba (Himachal Pradesh)   
13                         Chandigarh to Palampur   
14                              Delhi to Nalagarh   
15                           Shimla to C

Extraction of Assam transport buses Routes and Links

In [8]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    TimeoutException,
    ElementClickInterceptedException,
    NoSuchElementException
)
import pandas as pd
import time

# Start a Chrome session and open the Assam (astc) route listing on redBus.
# The names `driver_A` and `wait` are the same ones the other state cells assign.
driver_A = webdriver.Chrome()
driver_A.get("https://www.redbus.in/online-booking/astc/?utm_source=rtchometile")
driver_A.maximize_window()

# Explicit-wait helper with a 20-second timeout. This line only constructs it; nothing
# is waited for here. assam_link_route() below reads the module-level `wait` and
# `driver_A` by name.
wait = WebDriverWait(driver_A, 20)


def assam_link_route(path, max_pages=5):
    """
    Scrape links and route names across multiple pages for Assam routes.

    Uses the notebook-level ``driver_A`` browser session and its ``wait``
    (WebDriverWait) object, so both must exist before this is called.

    Args:
        path (str): XPath of the route links.
        max_pages (int): Maximum number of result pages to visit. Defaults
            to 5, the limit that used to be hard-coded in the loop.

    Returns:
        tuple: Lists of route links and names, index-aligned with each other.
    """
    links_assam = []
    route_assam = []

    for i in range(1, max_pages + 1):
        print(f"Scraping page {i}...")

        # Capture links on the current page
        paths = wait.until(EC.presence_of_all_elements_located((By.XPATH, path)))

        for link in paths:
            links_assam.append(link.get_attribute("href"))
            route_assam.append(link.text)

        # Find the tab for the following page; a missing tab means this was
        # the last page.
        try:
            pagination = wait.until(EC.presence_of_element_located(
                (By.XPATH, '//*[@class="DC_117_paginationTable"]')
            ))
            # The leading "." scopes the search to the pagination element;
            # a bare "//" would search the whole document instead.
            next_button = pagination.find_element(
                By.XPATH, f'.//div[@class="DC_117_pageTabs " and text()="{i + 1}"]'
            )
        except (NoSuchElementException, TimeoutException):
            print(f"No more pages to paginate at page {i}")
            break

        # Scroll into view to ensure visibility
        driver_A.execute_script("arguments[0].scrollIntoView(true);", next_button)
        time.sleep(1)  # Wait to ensure scroll completes

        try:
            next_button.click()
        except ElementClickInterceptedException:
            print("Element click intercepted. Checking for overlay...")
            # Close overlay if present
            try:
                close_overlay = driver_A.find_element(
                    By.XPATH, '//button[contains(@class, "close-overlay-button")]'
                )
                close_overlay.click()
                print("Overlay closed.")
                time.sleep(1)
            except NoSuchElementException:
                print("No overlay found to close.")
            # Retry through JavaScript so that nothing covering the tab can
            # intercept the click again. If the page did not advance, the
            # next iteration would re-scrape this page and append duplicates.
            driver_A.execute_script("arguments[0].click();", next_button)

        time.sleep(3)  # Wait for the next page to load (also after a retry)

    return links_assam, route_assam


# Run the scraper. The browser is closed in `finally` so that a failure while
# scraping does not leave an orphaned Chrome/chromedriver process behind.
try:
    links_assam, route_assam = assam_link_route("//a[@class='route']")
finally:
    # Close the browser
    driver_A.quit()

# One row per route: visible route name paired with its link
df_as = pd.DataFrame({"Route_name": route_assam, "Route_link": links_assam})
print(df_as)

# Optional: Save to CSV
df_as.to_csv("C:/Users/Dine24/RedBus/df_AS.csv", index=False)

Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
No more pages to paginate at page 5
                             Route_name  \
0                    Tezpur to Guwahati   
1                    Guwahati to Tezpur   
2            Nagaon (Assam) to Guwahati   
3            Guwahati to Nagaon (Assam)   
4                  Goalpara to Guwahati   
5                    Dhubri to Guwahati   
6   Sibsagar (Assam) to North Lakhimpur   
7   North Lakhimpur to Sibsagar (Assam)   
8                    Guwahati to Dhubri   
9             Jorhat to North Lakhimpur   
10               Dhekiajuli to Guwahati   
11                  Jorhat to Dibrugarh   
12            North Lakhimpur to Jorhat   
13                    Jorhat to Dhemaji   
14                    Dhemaji to Jorhat   
15                   Jorhat to Tinsukia   
16                  Tezpur to Dibrugarh   
17                   Tinsukia to Jorhat   
18                  Dibrugarh to Jorhat   
19       

Extraction of Uttar Pradesh transport buses Routes and Links

In [9]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    TimeoutException,
    ElementClickInterceptedException,
    NoSuchElementException
)
import pandas as pd
import time

# Open a Chrome session on the UPSRTC (Uttar Pradesh) operator page
UPSRTC_URL = "https://www.redbus.in/online-booking/uttar-pradesh-state-road-transport-corporation-upsrtc/?utm_source=rtchometile"
WAIT_TIMEOUT_SECONDS = 20

driver_A = webdriver.Chrome()
driver_A.get(UPSRTC_URL)
driver_A.maximize_window()

# Explicit-wait helper bound to this session; it only blocks when .until() is called
wait = WebDriverWait(driver_A, WAIT_TIMEOUT_SECONDS)


def uttar_pradesh_link_route(path, max_pages=5):
    """
    Scrape links and route names across multiple pages for Uttar Pradesh routes.

    Uses the notebook-level ``driver_A`` browser session and its ``wait``
    (WebDriverWait) object, so both must exist before this is called.

    Args:
        path (str): XPath of the route links.
        max_pages (int): Maximum number of result pages to visit. Defaults
            to 5, the limit that used to be hard-coded in the loop.

    Returns:
        tuple: Lists of route links and names, index-aligned with each other.
    """
    links_uttar_pradesh = []
    route_uttar_pradesh = []

    for i in range(1, max_pages + 1):
        print(f"Scraping page {i}...")

        # Capture links on the current page
        paths = wait.until(EC.presence_of_all_elements_located((By.XPATH, path)))

        for link in paths:
            links_uttar_pradesh.append(link.get_attribute("href"))
            route_uttar_pradesh.append(link.text)

        # Find the tab for the following page; a missing tab means this was
        # the last page.
        try:
            pagination = wait.until(EC.presence_of_element_located(
                (By.XPATH, '//*[@class="DC_117_paginationTable"]')
            ))
            # The leading "." scopes the search to the pagination element;
            # a bare "//" would search the whole document instead.
            next_button = pagination.find_element(
                By.XPATH, f'.//div[@class="DC_117_pageTabs " and text()="{i + 1}"]'
            )
        except (NoSuchElementException, TimeoutException):
            print(f"No more pages to paginate at page {i}")
            break

        # Scroll into view to ensure visibility
        driver_A.execute_script("arguments[0].scrollIntoView(true);", next_button)
        time.sleep(1)  # Wait to ensure scroll completes

        try:
            next_button.click()
        except ElementClickInterceptedException:
            print("Element click intercepted. Checking for overlay...")
            # Close overlay if present
            try:
                close_overlay = driver_A.find_element(
                    By.XPATH, '//button[contains(@class, "close-overlay-button")]'
                )
                close_overlay.click()
                print("Overlay closed.")
                time.sleep(1)
            except NoSuchElementException:
                print("No overlay found to close.")
            # Retry through JavaScript so that nothing covering the tab can
            # intercept the click again. If the page did not advance, the
            # next iteration would re-scrape this page and append duplicates.
            driver_A.execute_script("arguments[0].click();", next_button)

        time.sleep(3)  # Wait for the next page to load (also after a retry)

    return links_uttar_pradesh, route_uttar_pradesh


# Run the scraper. The browser is closed in `finally` so that a failure while
# scraping does not leave an orphaned Chrome/chromedriver process behind.
try:
    links_uttar_pradesh, route_uttar_pradesh = uttar_pradesh_link_route("//a[@class='route']")
finally:
    # Close the browser
    driver_A.quit()

# One row per route: visible route name paired with its link
df_up = pd.DataFrame({"Route_name": route_uttar_pradesh, "Route_link": links_uttar_pradesh})
print(df_up)

# Optional: Save to CSV
df_up.to_csv("C:/Users/Dine24/RedBus/df_UP.csv", index=False)

Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
No more pages to paginate at page 5
                                           Route_name  \
0                   Delhi to Bareilly (Uttar Pradesh)   
1                                    Delhi to Lucknow   
2                   Bareilly (Uttar Pradesh) to Delhi   
3                Lucknow to Gorakhpur (uttar pradesh)   
4                    Aligarh (uttar pradesh) to Delhi   
5                 Lucknow to Prayagraj(Uttar Pradesh)   
6                    Delhi to Aligarh (uttar pradesh)   
7                                    Lucknow to Delhi   
8                 Prayagraj(Uttar Pradesh) to Lucknow   
9                Delhi to Farrukhabad (Uttar Pradesh)   
10                   Delhi to Sitapur (Uttar Pradesh)   
11                                    Agra to Lucknow   
12                 Delhi to Gorakhpur (uttar pradesh)   
13               Farrukhabad (Uttar Pradesh) to Delhi   
14            

Extraction of Chandigarh transport buses Routes and Links

In [10]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    TimeoutException,
    ElementClickInterceptedException,
    NoSuchElementException
)
import pandas as pd
import time

# Open a Chrome session on the CTU (Chandigarh) operator page
CTU_URL = "https://www.redbus.in/online-booking/chandigarh-transport-undertaking-ctu"
WAIT_TIMEOUT_SECONDS = 20

driver_A = webdriver.Chrome()
driver_A.get(CTU_URL)
driver_A.maximize_window()

# Explicit-wait helper bound to this session; it only blocks when .until() is called
wait = WebDriverWait(driver_A, WAIT_TIMEOUT_SECONDS)


def chandigarh_link_route(path, max_pages=3):
    """
    Scrape links and route names across multiple pages for Chandigarh routes.

    Uses the notebook-level ``driver_A`` browser session and its ``wait``
    (WebDriverWait) object, so both must exist before this is called.

    Args:
        path (str): XPath of the route links.
        max_pages (int): Maximum number of result pages to visit. Defaults
            to 3, the limit that used to be hard-coded in the loop.

    Returns:
        tuple: Lists of route links and names, index-aligned with each other.
    """
    links_chandigarh = []
    route_chandigarh = []

    for i in range(1, max_pages + 1):
        print(f"Scraping page {i}...")

        # Capture links on the current page
        paths = wait.until(EC.presence_of_all_elements_located((By.XPATH, path)))

        for link in paths:
            links_chandigarh.append(link.get_attribute("href"))
            route_chandigarh.append(link.text)

        # Find the tab for the following page; a missing tab means this was
        # the last page.
        try:
            pagination = wait.until(EC.presence_of_element_located(
                (By.XPATH, '//*[@class="DC_117_paginationTable"]')
            ))
            # The leading "." scopes the search to the pagination element;
            # a bare "//" would search the whole document instead.
            next_button = pagination.find_element(
                By.XPATH, f'.//div[@class="DC_117_pageTabs " and text()="{i + 1}"]'
            )
        except (NoSuchElementException, TimeoutException):
            print(f"No more pages to paginate at page {i}")
            break

        # Scroll into view to ensure visibility
        driver_A.execute_script("arguments[0].scrollIntoView(true);", next_button)
        time.sleep(1)  # Wait to ensure scroll completes

        try:
            next_button.click()
        except ElementClickInterceptedException:
            print("Element click intercepted. Checking for overlay...")
            # Close overlay if present
            try:
                close_overlay = driver_A.find_element(
                    By.XPATH, '//button[contains(@class, "close-overlay-button")]'
                )
                close_overlay.click()
                print("Overlay closed.")
                time.sleep(1)
            except NoSuchElementException:
                print("No overlay found to close.")
            # Retry through JavaScript so that nothing covering the tab can
            # intercept the click again. If the page did not advance, the
            # next iteration would re-scrape this page and append duplicates.
            driver_A.execute_script("arguments[0].click();", next_button)

        time.sleep(3)  # Wait for the next page to load (also after a retry)

    return links_chandigarh, route_chandigarh


# Run the scraper. The browser is closed in `finally` so that a failure while
# scraping does not leave an orphaned Chrome/chromedriver process behind.
try:
    links_chandigarh, route_chandigarh = chandigarh_link_route("//a[@class='route']")
finally:
    # Close the browser
    driver_A.quit()

# One row per route: visible route name paired with its link
df_ch = pd.DataFrame({"Route_name": route_chandigarh, "Route_link": links_chandigarh})
print(df_ch)

# Optional: Save to CSV
df_ch.to_csv("C:/Users/Dine24/RedBus/df_CH.csv", index=False)

Scraping page 1...
Scraping page 2...
Scraping page 3...
No more pages to paginate at page 3
                                      Route_name  \
0                            Delhi to Chandigarh   
1                            Chandigarh to Delhi   
2                     Yamuna Nagar to Chandigarh   
3                         Ludhiana to Chandigarh   
4                     Chandigarh to Yamuna Nagar   
5      Chandigarh to Hamirpur (Himachal Pradesh)   
6                         Chandigarh to Baijnath   
7      Hamirpur (Himachal Pradesh) to Chandigarh   
8                         Chandigarh to Ludhiana   
9                         Chandigarh to Dehradun   
10     Chandigarh to Sujanpur (himachal pradesh)   
11                       Chandigarh to Vrindavan   
12                       Chandigarh to Pathankot   
13                        Dehradun to Chandigarh   
14  Chandigarh to Dharamshala (Himachal Pradesh)   
15     Sujanpur (himachal pradesh) to Chandigarh   
16              Chandig

Extraction of Punjab transport buses Routes and Links

In [11]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    TimeoutException,
    ElementClickInterceptedException,
    NoSuchElementException
)
import pandas as pd
import time

# Open a Chrome session on the PEPSU (Punjab) operator page
PEPSU_URL = "https://www.redbus.in/online-booking/pepsu/?utm_source=rtchometile"
WAIT_TIMEOUT_SECONDS = 20

driver_A = webdriver.Chrome()
driver_A.get(PEPSU_URL)
driver_A.maximize_window()

# Explicit-wait helper bound to this session; it only blocks when .until() is called
wait = WebDriverWait(driver_A, WAIT_TIMEOUT_SECONDS)


def punjab_link_route(path, max_pages=3):
    """
    Scrape links and route names across multiple pages for Punjab routes.

    Uses the notebook-level ``driver_A`` browser session and its ``wait``
    (WebDriverWait) object, so both must exist before this is called.

    Args:
        path (str): XPath of the route links.
        max_pages (int): Maximum number of result pages to visit. Defaults
            to 3, the limit that used to be hard-coded in the loop.

    Returns:
        tuple: Lists of route links and names, index-aligned with each other.
    """
    links_punjab = []
    route_punjab = []

    for i in range(1, max_pages + 1):
        print(f"Scraping page {i}...")

        # Capture links on the current page
        paths = wait.until(EC.presence_of_all_elements_located((By.XPATH, path)))

        for link in paths:
            links_punjab.append(link.get_attribute("href"))
            route_punjab.append(link.text)

        # Find the tab for the following page; a missing tab means this was
        # the last page.
        try:
            pagination = wait.until(EC.presence_of_element_located(
                (By.XPATH, '//*[@class="DC_117_paginationTable"]')
            ))
            # The leading "." scopes the search to the pagination element;
            # a bare "//" would search the whole document instead.
            next_button = pagination.find_element(
                By.XPATH, f'.//div[@class="DC_117_pageTabs " and text()="{i + 1}"]'
            )
        except (NoSuchElementException, TimeoutException):
            print(f"No more pages to paginate at page {i}")
            break

        # Scroll into view to ensure visibility
        driver_A.execute_script("arguments[0].scrollIntoView(true);", next_button)
        time.sleep(1)  # Wait to ensure scroll completes

        try:
            next_button.click()
        except ElementClickInterceptedException:
            print("Element click intercepted. Checking for overlay...")
            # Close overlay if present
            try:
                close_overlay = driver_A.find_element(
                    By.XPATH, '//button[contains(@class, "close-overlay-button")]'
                )
                close_overlay.click()
                print("Overlay closed.")
                time.sleep(1)
            except NoSuchElementException:
                print("No overlay found to close.")
            # Retry through JavaScript so that nothing covering the tab can
            # intercept the click again. If the page did not advance, the
            # next iteration would re-scrape this page and append duplicates.
            driver_A.execute_script("arguments[0].click();", next_button)

        time.sleep(3)  # Wait for the next page to load (also after a retry)

    return links_punjab, route_punjab


# Run the scraper. The browser is closed in `finally` so that a failure while
# scraping does not leave an orphaned Chrome/chromedriver process behind.
try:
    links_punjab, route_punjab = punjab_link_route("//a[@class='route']")
finally:
    # Close the browser
    driver_A.quit()

# One row per route: visible route name paired with its link
df_pu = pd.DataFrame({"Route_name": route_punjab, "Route_link": links_punjab})
print(df_pu)

# Optional: Save to CSV
df_pu.to_csv("C:/Users/Dine24/RedBus/df_PU.csv", index=False)

Scraping page 1...
Scraping page 2...
Scraping page 3...
No more pages to paginate at page 3
                     Route_name  \
0              Delhi to Patiala   
1              Patiala to Delhi   
2             Ludhiana to Delhi   
3             Delhi to Ludhiana   
4     Ludhiana to Delhi Airport   
5            Delhi to Jalandhar   
6      Delhi Airport to Patiala   
7         Chandigarh to Patiala   
8            Jalandhar to Delhi   
9     Delhi Airport to Ludhiana   
10   Jalandhar to Delhi Airport   
11       Chandigarh to Bathinda   
12            Phagwara to Delhi   
13   Delhi Airport to Jalandhar   
14            Delhi to Amritsar   
15    Phagwara to Delhi Airport   
16            Delhi to Phagwara   
17            Amritsar to Delhi   
18    Delhi Airport to Phagwara   
19    Amritsar to Delhi Airport   
20  Delhi Airport to Kapurthala   
21          Amritsar to Patiala   
22          Patiala to Amritsar   

                                           Route_link  
0   https:

In [None]:
# Extraction of West Bengal transport buses Routes and Links

In [12]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    TimeoutException,
    ElementClickInterceptedException,
    NoSuchElementException
)
import pandas as pd
import time

# Open a Chrome session on the WBTC/CTC (West Bengal) operator page
WBTC_URL = "https://www.redbus.in/online-booking/wbtc-ctc/?utm_source=rtchometile"
WAIT_TIMEOUT_SECONDS = 20

driver_A = webdriver.Chrome()
driver_A.get(WBTC_URL)
driver_A.maximize_window()

# Explicit-wait helper bound to this session; it only blocks when .until() is called
wait = WebDriverWait(driver_A, WAIT_TIMEOUT_SECONDS)


def west_bengal_link_route(path, max_pages=4):
    """
    Scrape links and route names across multiple pages for West Bengal routes.

    Uses the notebook-level ``driver_A`` browser session and its ``wait``
    (WebDriverWait) object, so both must exist before this is called.

    Args:
        path (str): XPath of the route links.
        max_pages (int): Maximum number of result pages to visit. Defaults
            to 4, the limit that used to be hard-coded in the loop.

    Returns:
        tuple: Lists of route links and names, index-aligned with each other.
    """
    links_west_bengal = []
    route_west_bengal = []

    for i in range(1, max_pages + 1):
        print(f"Scraping page {i}...")

        # Capture links on the current page
        paths = wait.until(EC.presence_of_all_elements_located((By.XPATH, path)))

        for link in paths:
            links_west_bengal.append(link.get_attribute("href"))
            route_west_bengal.append(link.text)

        # Find the tab for the following page; a missing tab means this was
        # the last page.
        try:
            pagination = wait.until(EC.presence_of_element_located(
                (By.XPATH, '//*[@class="DC_117_paginationTable"]')
            ))
            # The leading "." scopes the search to the pagination element;
            # a bare "//" would search the whole document instead.
            next_button = pagination.find_element(
                By.XPATH, f'.//div[@class="DC_117_pageTabs " and text()="{i + 1}"]'
            )
        except (NoSuchElementException, TimeoutException):
            print(f"No more pages to paginate at page {i}")
            break

        # Scroll into view to ensure visibility
        driver_A.execute_script("arguments[0].scrollIntoView(true);", next_button)
        time.sleep(1)  # Wait to ensure scroll completes

        try:
            next_button.click()
        except ElementClickInterceptedException:
            print("Element click intercepted. Checking for overlay...")
            # Close overlay if present
            try:
                close_overlay = driver_A.find_element(
                    By.XPATH, '//button[contains(@class, "close-overlay-button")]'
                )
                close_overlay.click()
                print("Overlay closed.")
                time.sleep(1)
            except NoSuchElementException:
                print("No overlay found to close.")
            # Retry through JavaScript so that nothing covering the tab can
            # intercept the click again. If the page did not advance, the
            # next iteration would re-scrape this page and append duplicates.
            driver_A.execute_script("arguments[0].click();", next_button)

        time.sleep(3)  # Wait for the next page to load (also after a retry)

    return links_west_bengal, route_west_bengal


# Run the scraper. The browser is closed in `finally` so that a failure while
# scraping does not leave an orphaned Chrome/chromedriver process behind.
try:
    links_west_bengal, route_west_bengal = west_bengal_link_route("//a[@class='route']")
finally:
    # Close the browser
    driver_A.quit()

# One row per route: visible route name paired with its link
df_wb = pd.DataFrame({"Route_name": route_west_bengal, "Route_link": links_west_bengal})
print(df_wb)

# Optional: Save to CSV
df_wb.to_csv("C:/Users/Dine24/RedBus/df_WB.csv", index=False)

Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
No more pages to paginate at page 4
                                           Route_name  \
0                      Digha to Barasat (West Bengal)   
1                   Durgapur (West Bengal) to Kolkata   
2                                    Digha to Kolkata   
3                      Barasat (West Bengal) to Digha   
4                   Kolkata to Durgapur (West Bengal)   
5                                    Kolkata to Digha   
6            Barasat (West Bengal) to Contai (Kanthi)   
7                  Barasat (West Bengal) to Midnapore   
8   Barasat (West Bengal) to Nandakumar (west bengal)   
9                   Barasat (West Bengal) to Kolaghat   
10                                    Kolkata to Suri   
11                               Midnapore to Kolkata   
12                 Midnapore to Barasat (West Bengal)   
13    Barasat (West Bengal) to Durgapur (West Bengal)   
14                     Barasat (W

# Concatenate the per-state route DataFrames scraped with Selenium into one consolidated CSV file

In [13]:
import pandas as pd

# Per-state route tables built by the scraping cells earlier in the notebook,
# listed in the order they are stacked in the combined table.
state_frames = [
    df_kl, df_aa, df_hy, df_ka, df_ra, df_hp,
    df_sb, df_as, df_up, df_ch, df_pu, df_wb,
]

# Stack them vertically and renumber rows 0..N-1 across all states
df_combined = pd.concat(state_frames, ignore_index=True)

# Show the consolidated table
print(df_combined)

# Persist the consolidated table as a single CSV file
df_combined.to_csv("C:/Users/Dine24/RedBus/df_combined.csv", index=False)


                                         Route_name  \
0                            Bangalore to Kozhikode   
1                            Kozhikode to Ernakulam   
2                            Kozhikode to Bangalore   
3                            Ernakulam to Kozhikode   
4                               Mysore to Kozhikode   
..                                              ...   
418                 Habra to Durgapur (West Bengal)   
419                              Kolkata to Purulia   
420  Barasat (West Bengal) to Asansol (West Bengal)   
421                               Habra to Kolaghat   
422                               Kolkata to Haldia   

                                            Route_link  
0    https://www.redbus.in/bus-tickets/bangalore-to...  
1    https://www.redbus.in/bus-tickets/kozhikode-to...  
2    https://www.redbus.in/bus-tickets/kozhikode-to...  
3    https://www.redbus.in/bus-tickets/ernakulam-to...  
4    https://www.redbus.in/bus-tickets/mysore-to-ko...

In [2]:
from pptx import Presentation
from docx import Document

# Creating the Word Document
doc_path = "C:/Users/Dine24/RedBus/BusTransport_Project_Documentation.docx"
doc = Document()
doc.add_heading("Bus Transport Data Analysis Project Documentation", level=1)

doc.add_heading("1. Project Overview", level=2)
doc.add_paragraph(
    "The Bus Transport Data Analysis Project involves scraping bus route and transport data, "
    "storing it in a structured SQL database, and creating an interactive Streamlit application "
    "for data exploration and filtering."
)

doc.add_heading("2. Key Features", level=2)
doc.add_paragraph(
    "1. Accurate scraping of bus data, ensuring completeness and correctness.\n"
    "2. Efficient SQL database schema for storage and retrieval.\n"
    "3. User-friendly Streamlit application for data filtering and analysis.\n"
    "4. Responsive and effective data filtering functionality.\n"
    "5. Adherence to coding standards and best practices."
)

doc.add_heading("3. Database Design", level=2)
doc.add_paragraph(
    "The database schema includes a table named `bus_routes` with the following columns:\n"
    "- id: INT, Primary Key, Auto Increment.\n"
    "- route_name: TEXT, Bus route details.\n"
    "- route_link: TEXT, Link to the route details.\n"
    "- bus_name: TEXT, Name of the bus.\n"
    "- bus_type: TEXT, Type of the bus.\n"
    "- departing_time: TIME, Departure time.\n"
    "- duration: TEXT, Journey duration.\n"
    "- reaching_time: TIME, Arrival time.\n"
    "- star_rating: FLOAT, Rating of the bus.\n"
    "- price: DECIMAL(10, 2), Price of the ticket.\n"
    "- seats_available: INT, Available seats."
)

doc.add_heading("4. Application Features", level=2)
doc.add_paragraph(
    "The Streamlit application enables users to:\n"
    "- View the complete bus route data.\n"
    "- Filter data based on route name, bus type, and price range.\n"
    "- Analyze data using visualizations such as bar charts."
)

doc.add_heading("5. Challenges and Solutions", level=2)
# python-docx does not interpret Markdown, so "**label**" would end up in the
# document with literal asterisks. Each label is written as a bold run instead.
challenge_items = [
    ("Data Scraping Errors:",
     " Corrected invalid syntax and ensured valid JSON-like structures were removed."),
    ("Database Integration:",
     " Debugged SQL syntax issues and used parameterized queries for secure data insertion."),
    ("Application Functionality:",
     " Addressed Streamlit errors and ensured smooth data filtering and visualization."),
]
challenges_paragraph = doc.add_paragraph()
for item_number, (item_label, item_detail) in enumerate(challenge_items, start=1):
    if item_number > 1:
        # Line break inside the same paragraph, matching the other sections
        challenges_paragraph.add_run().add_break()
    challenges_paragraph.add_run(f"{item_number}. ")
    challenges_paragraph.add_run(item_label).bold = True
    challenges_paragraph.add_run(item_detail)

doc.add_heading("6. Instructions for Use", level=2)
doc.add_paragraph(
    "1. Run the Streamlit application using `streamlit run <file_path>`.\n"
    "2. Use the sidebar to apply filters and analyze data.\n"
    "3. Ensure the database is populated with the required data before running the application."
)

doc.add_heading("7. Future Enhancements", level=2)
doc.add_paragraph(
    "1. Incorporate advanced analytics such as predictive modeling.\n"
    "2. Add more user-friendly features like downloadable reports.\n"
    "3. Optimize database queries for faster performance."
)

doc.save(doc_path)


def _add_bullet_slide(presentation, heading, bullets):
    """
    Append a "Title and Content" slide (layout index 1) to ``presentation``.

    Args:
        presentation: The python-pptx Presentation to extend.
        heading (str): Text for the slide title.
        bullets (list): Body lines in display order. Each entry is either a
            string (top-level bullet) or a ``(text, level)`` tuple for an
            indented bullet.

    Returns:
        The newly added slide.
    """
    new_slide = presentation.slides.add_slide(presentation.slide_layouts[1])
    new_slide.shapes.title.text = heading
    body_frame = new_slide.placeholders[1].text_frame
    for position, entry in enumerate(bullets):
        bullet_text, bullet_level = entry if isinstance(entry, tuple) else (entry, 0)
        # A text frame always starts with one empty paragraph; reuse it for
        # the first bullet and append a new paragraph for each later one.
        paragraph = body_frame.paragraphs[0] if position == 0 else body_frame.add_paragraph()
        paragraph.text = bullet_text
        paragraph.level = bullet_level
    return new_slide


# Creating the PowerPoint Presentation
ppt_path = "C:/Users/Dine24/RedBus/BusTransport_Project_Presentation.pptx"
ppt = Presentation()

# Title slide (layout 0: title + subtitle)
slide = ppt.slides.add_slide(ppt.slide_layouts[0])
slide.shapes.title.text = "Bus Transport Data Analysis Project"
slide.placeholders[1].text = "An end-to-end solution for bus route data management and visualization"

# Content slides. The body placeholder of the default "Title and Content"
# layout is already bulleted, so the text carries no "•" prefix of its own;
# a typed bullet would be displayed in addition to the layout's bullet.
_add_bullet_slide(ppt, "Project Overview", [
    "Data scraping of bus routes and transport information.",
    "SQL database storage for structured data management.",
    "Interactive Streamlit application for data exploration.",
])

_add_bullet_slide(ppt, "Key Features", [
    "Accurate data scraping ensuring completeness.",
    "Efficient database design and storage.",
    "User-friendly Streamlit interface.",
    "Responsive data filtering and analysis.",
    "Visualizations for better insights.",
])

_add_bullet_slide(ppt, "Database Schema", [
    "Table: bus_routes",
    "Columns:",
    ("id: Primary Key", 1),
    ("route_name, route_link, bus_name, bus_type", 1),
    ("departing_time, duration, reaching_time", 1),
    ("star_rating, price, seats_available", 1),
])

_add_bullet_slide(ppt, "Streamlit Application", [
    "View complete bus data.",
    "Filter data by route name, bus type, and price range.",
    "Analyze data through interactive charts.",
])

_add_bullet_slide(ppt, "Challenges and Solutions", [
    "Data Scraping: Fixed JSON syntax errors.",
    "SQL Integration: Debugged syntax and used parameterized queries.",
    "Application: Resolved Streamlit runtime errors.",
])

_add_bullet_slide(ppt, "Future Enhancements", [
    "Advanced analytics (e.g., predictive modeling).",
    "User-friendly downloadable reports.",
    "Optimized database queries for faster performance.",
])

ppt.save(ppt_path)

# Show where both files were written
doc_path, ppt_path


('C:/Users/Dine24/RedBus/BusTransport_Project_Documentation.docx',
 'C:/Users/Dine24/RedBus/BusTransport_Project_Presentation.pptx')

In [None]:
import shutil

# Paths for the generated files
local_doc_path = "C:/Users/Dine24/RedBus/BusTransport_Project_Documentation.docx"
local_ppt_path = "C:/Users/Dine24/RedBus/BusTransport_Project_Presentation.pptx"

# Moving files to download-accessible location
doc_download_path = "C:/Users/Dine24/RedBus/BusTransport_Project_Documentation.docx"
ppt_download_path = "C:/Users/Dine24/RedBus/BusTransport_Project_Presentation.pptx"

# The download paths are currently identical to the source paths, and
# shutil.copy raises SameFileError when asked to copy a file onto itself.
# That case means the file is already where it should be, so it is skipped;
# the copy still happens if the download paths are later pointed elsewhere.
for source_path, target_path in (
    (local_doc_path, doc_download_path),
    (local_ppt_path, ppt_download_path),
):
    try:
        shutil.copy(source_path, target_path)
    except shutil.SameFileError:
        pass  # already at its destination

doc_download_path, ppt_download_path


In [3]:
!pip install python-pptx python-docx

Collecting python-pptx
  Downloading python_pptx-1.0.2-py3-none-any.whl.metadata (2.5 kB)
Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting XlsxWriter>=0.5.7 (from python-pptx)
  Downloading XlsxWriter-3.2.0-py3-none-any.whl.metadata (2.6 kB)
Downloading python_pptx-1.0.2-py3-none-any.whl (472 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
Downloading XlsxWriter-3.2.0-py3-none-any.whl (159 kB)
Installing collected packages: XlsxWriter, python-docx, python-pptx
Successfully installed XlsxWriter-3.2.0 python-docx-1.1.2 python-pptx-1.0.2


In [4]:
!pip show python-pptx
!pip show python-docx

Name: python-pptx
Version: 1.0.2
Summary: Create, read, and update PowerPoint 2007+ (.pptx) files.
Home-page: https://github.com/scanny/python-pptx
Author: 
Author-email: Steve Canny <stcanny@gmail.com>
License: MIT
Location: C:\Users\Dine24\anaconda3\Lib\site-packages
Requires: lxml, Pillow, typing-extensions, XlsxWriter
Required-by: 
Name: python-docx
Version: 1.1.2
Summary: Create, read, and update Microsoft Word .docx files.
Home-page: https://github.com/python-openxml/python-docx
Author: 
Author-email: Steve Canny <stcanny@gmail.com>
License: MIT
Location: C:\Users\Dine24\anaconda3\Lib\site-packages
Requires: lxml, typing-extensions
Required-by: 
