In [1]:
!pip install requests
!pip install beautifulsoup4
!pip install pandas
!pip install selenium
!pip install webdriver-manager


Collecting selenium
  Downloading selenium-4.36.0-py3-none-any.whl.metadata (7.5 kB)
Collecting urllib3<3.0,>=2.5.0 (from urllib3[socks]<3.0,>=2.5.0->selenium)
  Downloading urllib3-2.5.0-py3-none-any.whl.metadata (6.5 kB)
Collecting trio<1.0,>=0.30.0 (from selenium)
  Downloading trio-0.31.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket<1.0,>=0.12.2 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting certifi>=2025.6.15 (from selenium)
  Downloading certifi-2025.10.5-py3-none-any.whl.metadata (2.5 kB)
Collecting typing_extensions<5.0,>=4.14.0 (from selenium)
  Downloading typing_extensions-4.15.0-py3-none-any.whl.metadata (3.3 kB)
Collecting websocket-client<2.0,>=1.8.0 (from selenium)
  Downloading websocket_client-1.9.0-py3-none-any.whl.metadata (8.3 kB)
Collecting sortedcontainers (from trio<1.0,>=0.30.0->selenium)
  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio<1.0,

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
weasel 0.4.1 requires smart-open<8.0.0,>=5.2.1, but you have smart-open 0.0.0 which is incompatible.


Collecting webdriver-manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Collecting python-dotenv (from webdriver-manager)
  Downloading python_dotenv-1.2.1-py3-none-any.whl.metadata (25 kB)
Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl (27 kB)
Downloading python_dotenv-1.2.1-py3-none-any.whl (21 kB)
Installing collected packages: python-dotenv, webdriver-manager
Successfully installed python-dotenv-1.2.1 webdriver-manager-4.0.2


In [7]:
# Q1 — Books to Scrape (Static Site)
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
}

base_url = "https://books.toscrape.com/catalogue/page-{}.html"
books_data = []
page = 1

while True:
    url = base_url.format(page)
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        break  

    soup = BeautifulSoup(response.text, "html.parser")
    books = soup.find_all("article", class_="product_pod")
    if not books:
        break

    for book in books:
        title = book.h3.a["title"]
        price = book.find("p", class_="price_color").text.strip()
        availability = book.find("p", class_="instock availability").text.strip()
        star = book.find("p")["class"][1]  

        books_data.append({
            "Title": title,
            "Price": price,
            "Availability": availability,
            "Star Rating": star
        })

    page += 1
    time.sleep(1)

books_df = pd.DataFrame(books_data)
books_df.to_csv("books.csv", index=False)
print("books.csv saved")
books_df.head()


books.csv saved


Unnamed: 0,Title,Price,Availability,Star Rating
0,A Light in the Attic,Â£51.77,In stock,Three
1,Tipping the Velvet,Â£53.74,In stock,One
2,Soumission,Â£50.10,In stock,One
3,Sharp Objects,Â£47.82,In stock,Four
4,Sapiens: A Brief History of Humankind,Â£54.23,In stock,Five


In [4]:
# Q2 — IMDB Top 250 (Dynamic Site)
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get("https://www.imdb.com/chart/top/")
time.sleep(3)  # Allow dynamic content to load

movies_data = []

rows = driver.find_elements(By.CSS_SELECTOR, "li.ipc-metadata-list-summary-item")

for row in rows:
    try:
        rank = row.find_element(By.CSS_SELECTOR, "span.ipc-metadata-list-summary-item__rank").text.strip(".")
        title = row.find_element(By.CSS_SELECTOR, "h3.ipc-title__text").text
        year = row.find_element(By.CSS_SELECTOR, "span.sc-479faa3c-8").text.strip("()")
        rating = row.find_element(By.CSS_SELECTOR, "span.ipc-rating-star--rating").text

        movies_data.append({
            "Rank": rank,
            "Movie Title": title,
            "Year of Release": year,
            "IMDB Rating": rating
        })
    except Exception:
        pass

driver.quit()

imdb_df = pd.DataFrame(movies_data)
imdb_df.to_csv("imdb_top250.csv", index=False)
print("imdb_top250.csv saved")
imdb_df.head()

imdb_top250.csv saved


In [6]:
# Q3 — Weather Information
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
url = "https://www.timeanddate.com/weather/"
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, "html.parser")

cities_data = []
city_links = soup.select("table.zebra.tb-theme tbody tr td a")[:20]  

for link in city_links:
    city_name = link.text.strip()
    city_url = "https://www.timeanddate.com" + link["href"]

    try:
        city_page = requests.get(city_url, headers=headers)
        city_soup = BeautifulSoup(city_page.text, "html.parser")
        qlook = city_soup.find("div", id="qlook")

        if qlook:
            temp = qlook.find("div", class_="h2").text.strip()
            condition = qlook.find("p").text.strip()
        else:
            temp, condition = "N/A", "N/A"

        cities_data.append({
            "City Name": city_name,
            "Temperature": temp,
            "Weather Condition": condition
        })
        time.sleep(1)
    except Exception:
        continue

weather_df = pd.DataFrame(cities_data)
weather_df.to_csv("weather.csv", index=False)
print("weather.csv saved")
weather_df.head()


weather.csv saved
