# Web Scraping Tool for Business-Related Data

# Implement the bellow for small-scale scraping.

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random

# Define headers for user-agent rotation
HEADERS = [
    {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0"},
    {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}
]




In [2]:
# Scraping IMDb Top Movies
def scrape_imdb(url):
    response = requests.get(url, headers=random.choice(HEADERS))
    soup = BeautifulSoup(response.text, 'html.parser')

    movies = []
    for movie in soup.select('td.titleColumn'):
        title = movie.a.text
        year = movie.span.text.strip('()')
        movies.append({"Title": title, "Year": year})
    return movies




In [3]:
# Scraping Data.gov Datasets
def scrape_data_gov(url):
    response = requests.get(url, headers=random.choice(HEADERS))
    soup = BeautifulSoup(response.text, 'html.parser')

    datasets = []
    for dataset in soup.find_all('div', class_='dataset-content'):
        title = dataset.find('h3').text.strip() if dataset.find('h3') else "N/A"
        description = dataset.find('div', class_='notes').text.strip() if dataset.find('div', class_='notes') else "N/A"
        datasets.append({"Dataset Title": title, "Description": description})
    return datasets



In [4]:
# Main Script
if __name__ == "__main__":
    imdb_url = "https://www.imdb.com/chart/top/"
    data_gov_url = "https://catalog.data.gov/dataset"

    all_data = []

    try:
        print("Scraping IMDb...")
        imdb_data = scrape_imdb(imdb_url)
        all_data.extend(imdb_data)
        time.sleep(random.uniform(1, 3))  # Respectful delay
    except Exception as e:
        print(f"Error scraping IMDb: {e}")

    try:
        print("Scraping Data.gov...")
        data_gov_data = scrape_data_gov(data_gov_url)
        all_data.extend(data_gov_data)
        time.sleep(random.uniform(1, 3))  # Respectful delay
    except Exception as e:
        print(f"Error scraping Data.gov: {e}")

   

Scraping IMDb...
Scraping Data.gov...


In [5]:
 if all_data:
    # Create DataFrame
    df = pd.DataFrame(all_data)
    
    # Save as CSV
    df.to_csv("scraped_data.csv", index=False)
    print("Data saved to scraped_data.csv.")
    
    # Save as JSON
    df.to_json("scraped_data.json", orient="records", indent=4)
    print("Data saved to scraped_data.json.")
    
    # Save as TXT
    with open("scraped_data.txt", "w") as file:
        for index, row in df.iterrows():
            file.write(f"Row {index + 1}: {row.to_dict()}\n")
    print("Data saved to scraped_data.txt.")
else:
    print("No data scraped.")


Data saved to scraped_data.csv.
Data saved to scraped_data.json.
Data saved to scraped_data.txt.


# Implement the bellow for large-scale scraping.

In [6]:
!pip install selenium


Defaulting to user installation because normal site-packages is not writeable




In [7]:
driver_path = r"C:\Users\User\Downloads\chromedriver-win64\chromedriver.exe"


In [8]:
driver_path = "C:/Users/User/Downloads/chromedriver-win64/chromedriver.exe"


In [9]:
driver_path = "C:\\Users\\User\\Downloads\\chromedriver-win64\\chromedriver.exe"


In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
import time
import pandas as pd

def scrape_internshala():
    # Set up the Selenium driver
    service = Service("C:\Users\User\chromedriver-win64")  # Replace with your ChromeDriver path
    driver = webdriver.Chrome(service=service)
    
    # Open the Internshala page
    url = "https://internshala.com/internships"
    driver.get(url)
    time.sleep(5)  # Allow the page to load

    # Scrape job postings
    internships = []
    try:
        # Find internship cards
        cards = driver.find_elements(By.CLASS_NAME, 'internship_meta')
        for card in cards:
            title = card.find_element(By.CLASS_NAME, 'profile').text
            company = card.find_element(By.CLASS_NAME, 'company_name').text
            location = card.find_element(By.CLASS_NAME, 'location_link').text
            stipend = card.find_element(By.CLASS_NAME, 'stipend').text

            internships.append({
                "Title": title,
                "Company": company,
                "Location": location,
                "Stipend": stipend
            })
    except Exception as e:
        print(f"Error scraping: {e}")

    # Save results to a CSV
    driver.quit()
    df = pd.DataFrame(internships)
    df.to_csv("internshala_jobs.csv", index=False)
    print("Data saved to internshala_jobs.csv")

scrape_internshala()


In [14]:
service = Service(r"C:\Users\User\chromedriver-win64\chromedriver.exe")  # Raw string for the correct path


In [15]:
service = Service("C:\\Users\\User\\chromedriver-win64\\chromedriver.exe")  # Escaping backslashes


In [19]:
!pip install selenium
!pip install pandas


Defaulting to user installation because normal site-packages is not writeable




Defaulting to user installation because normal site-packages is not writeable




In [20]:
# Import necessary libraries
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time

def scrape_indeed():
    # Set up ChromeDriver
    service = Service(r"C:\Users\User\chromedriver-win64\chromedriver.exe")  # Adjust path as necessary
    options = Options()
    driver = webdriver.Chrome(service=service, options=options)

    # Open the Indeed login page
    url = "https://in.indeed.com/auth?dest=%2F%3Fr%3Dus"
    driver.get(url)
    
    try:
        # Wait for the email field to be visible and interactable
        wait = WebDriverWait(driver, 10)
        username = wait.until(EC.visibility_of_element_located((By.NAME, "email")))
        password = driver.find_element(By.NAME, "password")
        
        # Send login credentials
        username.send_keys("your_email")
        password.send_keys("your_password")
        password.send_keys(Keys.RETURN)
        
        time.sleep(5)  # Wait for login to complete
        
        # Scrape the job listings after login
        job_listings = driver.find_elements(By.CLASS_NAME, 'job_seen_beacon')
        jobs = []
        for job in job_listings:
            title = job.find_element(By.CLASS_NAME, 'jobTitle').text
            company = job.find_element(By.CLASS_NAME, 'companyName').text
            location = job.find_element(By.CLASS_NAME, 'companyLocation').text
            jobs.append({"Title": title, "Company": company, "Location": location})

        # Save to CSV
        df = pd.DataFrame(jobs)
        df.to_csv("indeed_jobs.csv", index=False)
        print("Data saved to indeed_jobs.csv")
    except Exception as e:
        print(f"Error: {e}")
    
    driver.quit()

# Run the scraping function
scrape_indeed()


Error: Message: 
Stacktrace:
	GetHandleVerifier [0x00007FF6F8A06CF5+28821]
	(No symbol) [0x00007FF6F8973880]
	(No symbol) [0x00007FF6F881578A]
	(No symbol) [0x00007FF6F88691BE]
	(No symbol) [0x00007FF6F88694AC]
	(No symbol) [0x00007FF6F88B2647]
	(No symbol) [0x00007FF6F888F33F]
	(No symbol) [0x00007FF6F88AF412]
	(No symbol) [0x00007FF6F888F0A3]
	(No symbol) [0x00007FF6F885A778]
	(No symbol) [0x00007FF6F885B8E1]
	GetHandleVerifier [0x00007FF6F8D3FCED+3408013]
	GetHandleVerifier [0x00007FF6F8D5745F+3504127]
	GetHandleVerifier [0x00007FF6F8D4B63D+3455453]
	GetHandleVerifier [0x00007FF6F8ACBDFB+835995]
	(No symbol) [0x00007FF6F897EB9F]
	(No symbol) [0x00007FF6F897A854]
	(No symbol) [0x00007FF6F897A9ED]
	(No symbol) [0x00007FF6F896A1D9]
	BaseThreadInitThunk [0x00007FFAD5A5259D+29]
	RtlUserThreadStart [0x00007FFAD6BCAF38+40]



In [22]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import time

def scrape_data_gov():
    # Set up the Selenium driver
    service = Service(r"C:\Users\User\chromedriver-win64\chromedriver.exe")
    driver = webdriver.Chrome(service=service)
    
    # Open the Data.gov page
    url = "https://data.gov/"
    driver.get(url)
    time.sleep(5)  # Wait for the page to load
    
    # Example: Scrape dataset titles or other information
    datasets = driver.find_elements(By.CLASS_NAME, 'dataset-heading')
    data = []
    for dataset in datasets:
        title = dataset.text
        data.append({"Dataset Title": title})

    # Save data to CSV
    import pandas as pd
    df = pd.DataFrame(data)
    df.to_csv("data_gov_datasets.csv", index=False)
    print("Data saved to data_gov_datasets.csv")
    
    driver.quit()

scrape_data_gov()


Data saved to data_gov_datasets.csv


In [23]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import time

def scrape_opencorporates():
    # Set up the Selenium driver
    service = Service(r"C:\Users\User\chromedriver-win64\chromedriver.exe")
    driver = webdriver.Chrome(service=service)
    
    # Open the OpenCorporates page
    url = "https://opencorporates.com/"
    driver.get(url)
    time.sleep(5)  # Wait for the page to load
    
    # Example: Scrape company names or other data
    companies = driver.find_elements(By.CLASS_NAME, 'company_name')
    data = []
    for company in companies:
        name = company.text
        data.append({"Company Name": name})

    # Save data to CSV
    import pandas as pd
    df = pd.DataFrame(data)
    df.to_csv("opencorporates_data.csv", index=False)
    print("Data saved to opencorporates_data.csv")
    
    driver.quit()

scrape_opencorporates()


Data saved to opencorporates_data.csv


In [27]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time

def scrape_indeed_jobs():
    # Set up ChromeDriver
    service = Service(r"C:\Users\User\chromedriver-win64\chromedriver.exe")  # Adjust path as necessary
    options = Options()
    driver = webdriver.Chrome(service=service, options=options)
    
    # Open the specific Indeed job listings page
    url = "https://in.indeed.com/jobs?q=fresher&l=Mumbai%2C+Maharashtra&from=searchOnHP%2Cwhatautocomplete&vjk=2d15813d41bc295a"
    driver.get(url)

    # Wait for the job cards to load (using WebDriverWait for explicit waiting)
    try:
        WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'job_seen_beacon')))
    except Exception as e:
        print(f"Error waiting for page elements: {e}")
        driver.quit()
        return

    job_listings = []
    try:
        # Scrape the job cards
        job_cards = driver.find_elements(By.CLASS_NAME, 'job_seen_beacon')
        
        for card in job_cards:
            try:
                # Extract the job title, company name, and location using more reliable locators
                title = card.find_element(By.CSS_SELECTOR, 'h2.jobTitle').text
                company = card.find_element(By.CSS_SELECTOR, 'span.companyName').text
                location = card.find_element(By.CSS_SELECTOR, 'div.companyLocation').text
                
                job_listings.append({
                    "Title": title,
                    "Company": company,
                    "Location": location
                })
            except Exception as e:
                print(f"Error extracting data from a job card: {e}")

        # Save the data to a CSV file
        df = pd.DataFrame(job_listings)
        df.to_csv("indeed_fresher_jobs.csv", index=False)
        print("Data saved to indeed_fresher_jobs.csv")

    except Exception as e:
        print(f"Error scraping Indeed: {e}")

    driver.quit()

# Run the scraping function
scrape_indeed_jobs()


Error extracting data from a job card: Message: no such element: Unable to locate element: {"method":"css selector","selector":"span.companyName"}
  (Session info: chrome=131.0.6778.109); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x00007FF6F8A06CF5+28821]
	(No symbol) [0x00007FF6F8973880]
	(No symbol) [0x00007FF6F881578A]
	(No symbol) [0x00007FF6F88691BE]
	(No symbol) [0x00007FF6F88694AC]
	(No symbol) [0x00007FF6F885C52C]
	(No symbol) [0x00007FF6F888F33F]
	(No symbol) [0x00007FF6F885C3F6]
	(No symbol) [0x00007FF6F888F510]
	(No symbol) [0x00007FF6F88AF412]
	(No symbol) [0x00007FF6F888F0A3]
	(No symbol) [0x00007FF6F885A778]
	(No symbol) [0x00007FF6F885B8E1]
	GetHandleVerifier [0x00007FF6F8D3FCED+3408013]
	GetHandleVerifier [0x00007FF6F8D5745F+3504127]
	GetHandleVerifier [0x00007FF6F8D4B63D+3455453]
	GetHandleVerifier [0x00007FF6F8ACBDFB+835995]
	(No symbo

In [30]:
import requests

def get_gists():
    url = "https://api.github.com/gists/public"
    response = requests.get(url)
    gists = response.json()
    
    # Print gist information
    for gist in gists:
        print(f"Gist Description: {gist.get('description', 'No description')}")
        print(f"Gist URL: {gist['html_url']}")
        print("-" * 30)

get_gists()


Gist Description: freeSSHD v1.3.1 - Failed - Package Tests Results
Gist URL: https://gist.github.com/choco-bot/5f2f789b23daa6737925dbb34f4ce942
------------------------------
Gist Description: 
Gist URL: https://gist.github.com/GrahamcOfBorg/4902e2c875d2f05ca08ea832869df2b4
------------------------------
Gist Description: 
Gist URL: https://gist.github.com/GrahamcOfBorg/1a63f2a6b5bb0a7457ec595a6ece3509
------------------------------
Gist Description: ssdt12 v12.0.50512.0 - Passed - Package Tests Results
Gist URL: https://gist.github.com/choco-bot/8cf9b1c1ba6432a166e56993b964997e
------------------------------
Gist Description: Http Status
Gist URL: https://gist.github.com/ing-reyes/9cd1d669726318315389e8910f80dd95
------------------------------
Gist Description: TryCF Gist
Gist URL: https://gist.github.com/trycf/280380ee3457063a18b43f06bbdd1e9f
------------------------------
Gist Description: TheBat v6.8.8 - Failed - Package Tests Results
Gist URL: https://gist.github.com/choco-bot/e40

In [37]:
!pip install IMDbPY


Defaulting to user installation because normal site-packages is not writeable
Collecting IMDbPY
  Downloading IMDbPY-2022.7.9-py3-none-any.whl (1.2 kB)
Collecting cinemagoer
  Downloading cinemagoer-2023.5.1-py3-none-any.whl (297 kB)
     ------------------------------------ 297.2/297.2 kB 834.1 kB/s eta 0:00:00
Installing collected packages: cinemagoer, IMDbPY
Successfully installed IMDbPY-2022.7.9 cinemagoer-2023.5.1




In [48]:
import csv
import imdb  # Assuming you're using the IMDbPY library

def get_movie_data():
    # Initialize the IMDb object
    ia = imdb.IMDb()
    
    # Search for movies (example: top 5)
    movies = ia.search_movie('Inception')  # Search for a specific movie
    
    # Open a CSV file for writing
    with open('movies_data.csv', 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["Title", "Year", "IMDb ID"])
        
        for movie in movies:
            # Use .get() to safely access the keys and handle missing values
            title = movie.get('title', 'N/A')
            year = movie.get('year', 'N/A')
            imdbID = movie.get('imdbID', 'N/A')  # Use .get() to handle missing key
            
            # Write movie data to CSV
            writer.writerow([title, year, imdbID])

# Run the function
get_movie_data()



In [45]:
import requests
import csv

def get_weather():
    # Your OpenWeatherMap API key
    api_key = "YOUR_API_KEY"
    city = "Mumbai"  # Example city
    
    # API endpoint
    url = f"http://api.openweathermap.org/data/2.5/weather?q={city}&appid={api_key}&units=metric"
    
    # Make the API request
    response = requests.get(url)
    
    # Check if the response is successful
    if response.status_code == 200:
        weather = response.json()
        
        # Debugging: Print the full response
        print(weather)
        
        # Extracting relevant data
        city_name = weather.get('name', 'N/A')
        weather_description = weather['weather'][0].get('description', 'N/A')
        temperature = weather['main'].get('temp', 'N/A')
        
        # Save to CSV
        with open('weather_data.csv', 'w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(["City", "Weather", "Temperature"])
            writer.writerow([city_name, weather_description, temperature])
    else:
        print(f"Error: Unable to fetch data (status code: {response.status_code})")

get_weather()



Error: Unable to fetch data (status code: 401)
