In [1]:
import pandas as pd
import numpy as np
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time

# Chrome configuration shared by every browser session started in this notebook.
options = Options()
options.add_argument('--headless')  # don't show browser
options.add_argument('--disable-gpu')
options.add_argument('--no-sandbox')
options.add_argument('--window-size=1920,1080')
options.add_argument('--disable-dev-shm-usage')
options.add_argument('--remote-debugging-port=9222')
# Send a regular desktop Chrome user-agent string with every request.
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36")

driver = webdriver.Chrome(options=options)

url = "https://www.ambitionbox.com/list-of-companies?page=1"
try:
  driver.get(url)
  # Parse the browser-rendered markup so later cells can query it offline.
  soup = BeautifulSoup(driver.page_source, 'html.parser')
finally:
  # Always shut the browser down, even when loading or parsing fails;
  # otherwise the headless Chrome process is left running.
  driver.quit()


In [None]:
# Display the BeautifulSoup document parsed from the page source (renders the whole page's HTML).
soup

## Display the HTML in a structured (pretty-printed) form

In [None]:
# Re-indent the parsed document so the tag nesting is readable, then print it.
pretty_html = soup.prettify()
print(pretty_html)

### Get the text of the first h1 element

In [None]:
# Text of the first <h1> on the page. `find` returns None when there is no
# <h1>, so the cell shows nothing in that case instead of raising IndexError.
first_h1 = soup.find('h1')
first_h1.text.strip() if first_h1 is not None else None

## Find company names from the h2 tags

In [None]:
# Collect the stripped text of every <h2> first, then print one per line.
h2_texts = [heading.text.strip() for heading in soup.find_all('h2')]
for heading_text in h2_texts:
  print(heading_text)

## Find all company cards

In [2]:
# Every <div> carrying the `companyCardWrapper` class is one company card.
companies = soup.find_all(name='div', attrs={'class': 'companyCardWrapper'})
# Number of cards found on this page.
len(companies)

20

## Create a DataFrame with the necessary data

In [9]:
# Column order of the resulting frame.
COMPANY_COLUMNS = ['name', 'isVerified', 'rating', 'reviews', 'company_type', 'salaries', 'jobs']


def parse_company_card(card, missing='N/A'):
  """Extract one company's fields from a `companyCardWrapper` element.

  Args:
    card: BeautifulSoup tag for a single company card.
    missing: Placeholder stored when an expected element is absent from the
      card, so one incomplete card does not abort the whole parse.

  Returns:
    dict keyed by COMPANY_COLUMNS. Values are the stripped text as found in
    the markup (no numeric conversion), except `isVerified`, which is 1 when
    the verified badge element is present and 0 otherwise.
  """
  def text_of(tag, css_class):
    # Stripped text of the first matching element, or the placeholder.
    node = card.find(tag, class_=css_class)
    return node.text.strip() if node is not None else missing

  # Look the action counters up once; index 1 is stored as `salaries` and
  # index 3 as `jobs`.
  action_counts = card.find_all('span', class_='companyCardWrapper__ActionCount')

  def action_count(position):
    # Guard the positional lookup: a card may expose fewer counters.
    if len(action_counts) > position:
      return action_counts[position].text.strip()
    return missing

  # Only the part before the first '|' is kept as the company type.
  # NOTE(review): in the saved output one row shows a location string
  # ("New Delhi +113 other locations") in this column, so the first segment
  # is not always an industry -- confirm against the page markup.
  inter_linking = card.find('span', class_='companyCardWrapper__interLinking')
  if inter_linking is not None:
    company_type = inter_linking.text.strip().split('|')[0].strip()
  else:
    company_type = missing

  verified_badge = card.find('span', class_='companyCardWrapper__companyVerified')

  return {
    'name': text_of('a', 'companyCardWrapper__companyName'),
    'isVerified': 1 if verified_badge is not None else 0,
    'rating': text_of('div', 'rating_text'),
    'reviews': text_of('span', 'companyCardWrapper__companyRatingCount'),
    'company_type': company_type,
    'salaries': action_count(1),
    'jobs': action_count(3),
  }


# One record per card; passing `columns` keeps the schema even when no cards were found.
df = pd.DataFrame([parse_company_card(company) for company in companies], columns=COMPANY_COLUMNS)

df

Unnamed: 0,name,isVerified,rating,reviews,company_type,salaries,jobs
0,IntouchCX,1,2.9,(2k),BPO,8.4k,104
1,Radisson Hotels,0,4.1,(1.9k),New Delhi +113 other locations,5.5k,131
2,Xoriant,1,4.1,(2k),IT Services & Consulting,11.1k,45
3,Nokia Networks,0,4.2,(1.9k),Hardware & Networking,11.7k,--
4,Canara HSBC Life Insurance,1,3.6,(2k),Insurance,6.3k,9
5,Star Health & Allied Insurance,0,3.6,(2k),Insurance,8.3k,12
6,DBS Bank,0,3.8,(2k),Banking,12.5k,98
7,Extramarks Education,1,3.4,(1.9k),EdTech,8k,23
8,Adani Power,0,3.9,(1.9k),Power,8.2k,--
9,Cloudnine Hospital,1,4.4,(2k),Healthcare,2.6k,248


# Creating the full DataFrame for all the pages

In [None]:
# Listing pages to scrape (1 through 20 inclusive).
PAGE_NUMBERS = range(1, 21)
# Column order of every per-page frame and of the combined frame.
PAGE_COLUMNS = ['name', 'isVerified', 'rating', 'reviews', 'company_type', 'salaries', 'jobs']


def scrape_companies_page(page_number, missing='N/A'):
  """Fetch one company-listing page and return its cards as a DataFrame.

  A fresh Chrome session (configured by the notebook-level `options`) is
  started for the page and always closed again, even when loading fails.

  Args:
    page_number: Value substituted into the `page` query parameter.
    missing: Placeholder stored when an expected element is absent from a
      card, so one incomplete card does not discard the whole page.

  Returns:
    DataFrame with PAGE_COLUMNS, one row per `companyCardWrapper` element.
    Values are the stripped text as found in the markup, except
    `isVerified`, which is 1 or 0.

  Raises:
    Whatever the browser session raises while starting or loading the page;
    the caller decides whether to skip the page.
  """
  page_url = f"https://www.ambitionbox.com/list-of-companies?page={page_number}"
  driver = webdriver.Chrome(options=options)
  try:
    driver.get(page_url)
    page_soup = BeautifulSoup(driver.page_source, 'html.parser')
  finally:
    # Release the browser process regardless of how the fetch ended.
    driver.quit()

  cards = page_soup.find_all('div', class_='companyCardWrapper')
  print(len(cards))  # progress: number of cards found on this page

  def text_of(card, tag, css_class):
    # Stripped text of the first matching element, or the placeholder.
    node = card.find(tag, class_=css_class)
    return node.text.strip() if node is not None else missing

  records = []
  for card in cards:
    # Index 1 of the action counters is stored as `salaries`, index 3 as `jobs`.
    action_counts = card.find_all('span', class_='companyCardWrapper__ActionCount')
    # Only the part before the first '|' is kept as the company type.
    inter_linking = card.find('span', class_='companyCardWrapper__interLinking')
    verified_badge = card.find('span', class_='companyCardWrapper__companyVerified')
    records.append({
      'name': text_of(card, 'a', 'companyCardWrapper__companyName'),
      'isVerified': 1 if verified_badge is not None else 0,
      'rating': text_of(card, 'div', 'rating_text'),
      'reviews': text_of(card, 'span', 'companyCardWrapper__companyRatingCount'),
      'company_type': inter_linking.text.strip().split('|')[0].strip() if inter_linking is not None else missing,
      'salaries': action_counts[1].text.strip() if len(action_counts) > 1 else missing,
      'jobs': action_counts[3].text.strip() if len(action_counts) > 3 else missing,
    })

  return pd.DataFrame(records, columns=PAGE_COLUMNS)


all_dataframes = []

for j in PAGE_NUMBERS:
  try:
    all_dataframes.append(scrape_companies_page(j))
  except Exception as e:
    # Best effort: report the failing page and carry on with the next one.
    print(f"Error on page {j}: {e}")

# pd.concat raises ValueError on an empty list, so fall back to an empty
# frame with the expected columns when every page failed.
if all_dataframes:
  final_df = pd.concat(all_dataframes, ignore_index=True)
else:
  final_df = pd.DataFrame(columns=PAGE_COLUMNS)

In [None]:
final_df
# To export the dataset as CSV, uncomment the line below
# (index=False keeps the row index from being written as an extra unnamed column):
# final_df.to_csv('companies_data.csv', index=False)