# Internshala Scraper

In [2]:
url = 'https://internshala.com/internships/work-from-home-artificial-intelligence-ai-internships/stipend-10000/'

## Using BeautifulSoup

In [3]:
from bs4 import BeautifulSoup
import requests

response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

print(soup.title.text)

Top 36 Work From Home Artificial Intelligence (AI) Internships


## Using Selenium

### Scraping role, company name

In [None]:
from selenium import webdriver
from selenium.webdriver.firefox.service import Service as FirefoxService
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from webdriver_manager.firefox import GeckoDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException

import pandas as pd
import time

# Set up the Firefox driver
options = FirefoxOptions()
options.headless = True  # Run in headless mode
driver = webdriver.Firefox(service=FirefoxService(GeckoDriverManager().install()), options=options)

# Get the URL
url = 'https://internshala.com/internships/work-from-home-artificial-intelligence-ai-internships/stipend-10000/'
driver.get(url)

# Wait for the page to load
wait = WebDriverWait(driver, 10)
wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'internship_meta')))

# Handle the popup if it appears
try:
    close_button = driver.find_element(By.ID, 'close_popup')
    close_button.click()
    time.sleep(1)  # Give it a moment to close
except NoSuchElementException:
    pass  # No popup appeared

# Scroll down to load more items if necessary
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    try:
        wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'loading')))
        wait.until(EC.staleness_of(driver.find_element(By.CLASS_NAME, 'loading')))
    except:
        break

# Find all elements with the class name 'individual_internship'
internships = driver.find_elements(By.CLASS_NAME, 'individual_internship')

# Define lists to store data
roles = []
companies = []

# Print the text of each internship element, skipping ads and elements without job-internship-name
for internship in internships:
    try:
        role = internship.find_element(By.CLASS_NAME, 'job-internship-name').text
        company = internship.find_element(By.CLASS_NAME, 'company-name').text
        
        print(f"Role: {role}")
        print(f"Company: {company}")
        print("-" * 50)

        # Append data to lists
        roles.append(role)
        companies.append(company)

    except NoSuchElementException:
        # Skip this element if it doesn't have the required information
        continue

# Quit the driver
driver.quit()

# Create a DataFrame
data = {
    'Internship Role': roles,
    'Company Name': companies,
}
df = pd.DataFrame(data)

# Save DataFrame to CSV
df.index = df.index + 1
df.to_csv('internships.csv', index=True)

print("Data saved to internships.csv")

### Scraping role, company name, hiring status, stipend, posted date

In [18]:
from selenium import webdriver
from selenium.webdriver.firefox.service import Service as FirefoxService
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from webdriver_manager.firefox import GeckoDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
import pandas as pd
import time

# Set up the Firefox driver
options = FirefoxOptions()
options.headless = True  # Run in headless mode
driver = webdriver.Firefox(service=FirefoxService(GeckoDriverManager().install()), options=options)

# Get the URL
url = 'https://internshala.com/internships/work-from-home-artificial-intelligence-ai-internships/stipend-10000/'
driver.get(url)

# Wait for the page to load
wait = WebDriverWait(driver, 10)
wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'internship_meta')))

# Handle the popup if it appears
try:
    # Find and click the close button on the popup
    close_button = driver.find_element(By.ID, 'close_popup')
    close_button.click()
    time.sleep(1)  # Give it a moment to close
except:
    pass  # No popup appeared

# Scroll down to load more items if necessary
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    try:
        wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'loading')))
        wait.until(EC.staleness_of(driver.find_element(By.CLASS_NAME, 'loading')))
    except:
        break

# Find all elements with the class name 'individual_internship'
internships = driver.find_elements(By.CLASS_NAME, 'individual_internship')

# Define lists to store data
roles = []
companies = []
hiring_status = []
stipends = []
post_dates = []

# Add this within the loop where you process each internship
for index, internship in enumerate(internships):  # Skipping the first element
    try:
        role = internship.find_element(By.CLASS_NAME, 'job-internship-name').text
        company = internship.find_element(By.CLASS_NAME, 'company_name').text
        hiring = 'Actively hiring' if 'actively hiring' in internship.text.lower() else 'Not actively hiring'
        stipend = internship.find_element(By.CLASS_NAME, 'stipend').text
        
        # Try to find post date with 'status-info', 'status-inactive', or 'status-success'
        try:
            post_date = internship.find_element(By.CLASS_NAME, 'status-info').text
        except NoSuchElementException:
            try:
                post_date = internship.find_element(By.CLASS_NAME, 'status-inactive').text
            except NoSuchElementException:
                post_date = internship.find_element(By.CLASS_NAME, 'status-success').text
        
        # Append data to lists
        roles.append(role)
        companies.append(company)
        hiring_status.append(hiring)
        stipends.append(stipend)
        post_dates.append(post_date)

    except Exception as e:
        internship_html = internship.get_attribute('outerHTML')
        print(f"Error processing internship at index {index}: {e}\nHTML content: {internship_html}")

# Quit the driver
driver.quit()

# Create a DataFrame
data = {
    'Internship Role': roles,
    'Company Name': companies,
    'Hiring Status': hiring_status,
    'Stipend': stipends,
    'Posted': post_dates,
}
df = pd.DataFrame(data)

# Save DataFrame to CSV
df.index = df.index + 1
df.to_csv('internships.csv', index=True)

print("Data saved to internships.csv")

Error processing internship at index 0: Message: Unable to locate element: .job-internship-name; For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.sys.mjs:8:8
WebDriverError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:193:5
NoSuchElementError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:511:5
dom.find/</<@chrome://remote/content/shared/DOM.sys.mjs:136:16

HTML content: <div class="container-fluid individual_internship">
    <div class="internship_meta">
        <div class="individual_internship_header contest">
            <div class="main_heading heading_4_5">
               <span>Get Internship and Job Preparation training FREE!</span>
            </div>
            <div class="logo_container">
                <div class="logo">OFFER</div>
            </div>
        </div>
        <div class="individua

### Scraping all useful information

In [None]:
from selenium import webdriver
from selenium.webdriver.firefox.service import Service as FirefoxService
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from webdriver_manager.firefox import GeckoDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException

import pandas as pd
import time

# Set up the Firefox driver
options = FirefoxOptions()
options.headless = True  # Run in headless mode
driver = webdriver.Firefox(service=FirefoxService(GeckoDriverManager().install()), options=options)

# Get the URL
url = 'https://internshala.com/internships/work-from-home-artificial-intelligence-ai-internships/stipend-10000/'
driver.get(url)

# Wait for the page to load
wait = WebDriverWait(driver, 10)
wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'internship_meta')))

# Handle the popup if it appears
try:
    close_button = driver.find_element(By.ID, 'close_popup')
    close_button.click()
    time.sleep(1)  # Give it a moment to close
except NoSuchElementException:
    pass  # No popup appeared

# Scroll down to load more items if necessary
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    try:
        wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'loading')))
        wait.until(EC.staleness_of(driver.find_element(By.CLASS_NAME, 'loading')))
    except:
        break

# Find all elements with the class name 'individual_internship'
internships = driver.find_elements(By.CLASS_NAME, 'individual_internship')

# Define lists to store data
roles = []

# Print the text of each internship element, skipping ads and elements without job-internship-name
for internship in internships:
    try:
        role = internship.find_element(By.CLASS_NAME, 'job-internship-name').text
        
        # Append data to lists
        roles.append(role)

    except NoSuchElementException:
        # Skip this element if it doesn't have the required information
        continue

# Quit the driver
driver.quit()

# Create a DataFrame
data = {
    'Internship Role': roles,
}
df = pd.DataFrame(data)

# Save DataFrame to CSV
df.index = df.index + 1
df.to_csv('internships.csv', index=True)

print("Data saved to internships.csv")