**Import Libraries:**

In [1]:
import os

import pandas as pd
import psycopg2
import requests
from bs4 import BeautifulSoup
from psycopg2 import sql


**Define a Function to Detect CMS/MVC:**

In [2]:
#Create a function to detect the CMS/MVC from the HTTP headers or meta tags:

def detect_cms(response):
    """Guess the CMS / framework behind an HTTP response.

    Two signals are checked, in order:

    1. the ``X-Powered-By`` response header;
    2. the ``content`` attribute of a ``<meta name="generator">`` tag in the
       response body.

    Parameters
    ----------
    response
        Object exposing ``headers`` (a mapping) and ``content`` (the body),
        e.g. the result of ``requests.get``.

    Returns
    -------
    str
        The first non-empty signal found, or ``'Unknown'`` when neither is
        present. A generator tag without a usable ``content`` attribute is
        treated as "not present" so callers never receive ``None`` or ``''``.
    """
    powered_by = response.headers.get('x-powered-by')
    if powered_by:
        return powered_by

    soup = BeautifulSoup(response.content, 'html.parser')
    generator = soup.find('meta', {'name': 'generator'})
    if generator is not None:
        # The tag can exist without a content attribute (or with an empty
        # one); fall through to the sentinel instead of returning None/''.
        content = generator.get('content')
        if content:
            return content

    return 'Unknown'


**Define a Function to Scrape Data:**

In [3]:
#Create a function that takes a URL and returns the required information:

def scrape_website(url, timeout=10):
    """Download ``url`` and extract basic company / contact details.

    Parameters
    ----------
    url : str
        Page to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get``.
        Without a timeout a single unresponsive host would block the whole
        scraping loop indefinitely.

    Returns
    -------
    dict or None
        A dict with the keys ``'Company Name'``, ``'Contact Email'``,
        ``'Contact Address'``, ``'Contact Number'``, ``'Company Website URL'``
        and ``'CMS/MVC'`` (individual values are ``None`` when the page does
        not expose them), or ``None`` when the response status is not 200 or
        any error occurs while fetching / parsing. Errors are printed, not
        raised, so one bad site does not abort a batch run.
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            return None

        soup = BeautifulSoup(response.content, 'html.parser')

        company_name = soup.find('meta', {'property': 'og:site_name'})
        contact_email = soup.find('a', {'href': lambda x: x and 'mailto:' in x})
        contact_address = soup.find('address')
        contact_number = soup.find('a', {'href': lambda x: x and 'tel:' in x})
        cms_mvc = detect_cms(response)

        return {
            # .get() rather than ['content']: a meta tag without the attribute
            # should yield None for this field, not discard the whole site.
            'Company Name': company_name.get('content') if company_name is not None else None,
            'Contact Email': contact_email.get('href').replace('mailto:', '') if contact_email is not None else None,
            'Contact Address': contact_address.get_text().strip() if contact_address is not None else None,
            'Contact Number': contact_number.get('href').replace('tel:', '') if contact_number is not None else None,
            'Company Website URL': url,
            'CMS/MVC': cms_mvc
        }
    except Exception as e:
        # Deliberately broad: scraping is best-effort across many sites.
        print(f"Failed to scrape {url}: {e}")
        return None


**Define a List of URLs:**

In [4]:
#Create the list of URLs you want to scrape, grouped by category:

# Sites to scrape, grouped by category.
# Several sites appear in more than one category (and a few are repeated
# within one), so the literal is passed through dict.fromkeys to drop repeats
# while keeping first-seen order; otherwise those sites would be fetched and
# stored more than once.
urls = list(dict.fromkeys([
    # Travel and Tourism
    "https://www.booking.com",
    "https://www.agoda.com",
    "https://www.trivago.com",
    "https://www.airbnb.co.in",
    "https://www.skyscanner.co.in/?previousCultureSource=GEO_LOCATION&redirectedFrom=www.skyscanner.com",
    "https://www.kayak.co.in/?ispredir=true",
    "https://www.expedia.co.in",
    "https://www.google.com/travel/flights",
    "https://www.momondo.in",
    # News Websites
    "https://www.cnn.com",
    "https://www.bbc.com",
    "https://www.nytimes.com",
    "https://www.aljazeera.com",
    "https://www.theguardian.com",
    "https://www.usatoday.com",
    "https://www.foxnews.com",
    "https://www.nbcnews.com",
    "https://www.huffpost.com",
    "https://www.forbes.com",
    # E-commerce Websites
    "https://www.amazon.com",
    "https://www.bestbuy.com",
    "https://www.walmart.com",
    "https://www.bestbuy.com",
    "https://www.alibaba.com",
    "https://www.target.com",
    "https://www.bestbuy.com",
    "https://www.overstock.com",
    # NOTE(review): possibly meant "https://www.macys.com" — confirm.
    "https://www.macy.com",
    "https://www.shopify.com",
    # Technology and Software
    "https://www.techcrunch.com",
    "https://www.wired.com",
    "https://www.theverge.com",
    "https://www.cnet.com",
    "https://www.gizmodo.com",
    "https://www.androidcentral.com",
    "https://www.appleinsider.com",
    "https://www.engadget.com",
    "https://www.zdnet.com",
    "https://www.slashdot.org",
    # Education and Research
    "https://www.coursera.org",
    "https://www.edx.org",
    "https://www.khanacademy.org",
    "https://www.academia.edu",
    "https://ocw.mit.edu",
    "https://www.jstor.org",
    "https://www.medium.com",
    "https://www.hbr.org",
    "https://www.springer.com",
    "https://pubmed.ncbi.nlm.nih.gov",
    # Entertainment
    "https://www.netflix.com",
    "https://www.hulu.com",
    "https://www.disneyplus.com",
    "https://www.hbo.com",
    "https://www.spotify.com",
    "https://www.imdb.com",
    "https://www.rottentomatoes.com",
    "https://www.metacritic.com",
    "https://www.tmz.com",
    "https://www.rollingstone.com",
    # Health and Fitness
    "https://www.webmd.com",
    "https://www.mayoclinic.org",
    "https://www.healthline.com",
    "https://www.medicalnewstoday.com",
    "https://www.who.int",
    "https://www.nih.gov",
    "https://www.fitbit.com",
    "https://www.bodybuilding.com",
    "https://www.menshealth.com",
    "https://allheartweb.com",

    # Finance and Business
    "https://www.bloomberg.com",
    "https://finance.yahoo.com",
    "https://www.cnbc.com",
    "https://www.ft.com",
    "https://www.economist.com",
    "https://www.investopedia.com",
    "https://www.fool.com",
    "https://www.nerdwallet.com",
    "https://www.forbes.com",
    "https://www.businessinsider.com",
    # Blogs and Personal Websites
    "https://www.wordpress.com",
    "https://www.blogger.com",
    "https://www.medium.com",
    "https://www.squarespace.com",
    "https://www.tumblr.com",
    "https://www.wix.com",
    "https://www.weebly.com",
    "https://www.joomla.org",
    "https://www.brainofbriancomics.com",
    "https://www.brainofbriancomics.com",
    # Social Media
    "https://www.facebook.com",
    "https://www.twitter.com",
    "https://www.instagram.com",
    "https://www.linkedin.com",
    "https://www.pinterest.com",
    "https://www.telegram.org",
    "https://www.reddit.com",
    "https://www.snapchat.com",
    "https://www.tumblr.com",
    "https://www.quora.com",
]))

**Scrape the Data:**

In [5]:
#Loop through the list of URLs and scrape the data:

# Visit every URL in list order; a falsy result from scrape_website
# (non-200 response or an error) is simply left out of the collection.
scraped = (scrape_website(target) for target in urls)
data = [record for record in scraped if record]

# One row per successfully scraped site
df = pd.DataFrame(data)


**Save the Data to a CSV File:**

In [6]:
#Save the scraped data into a CSV file:

# Write the scraped rows to disk without the DataFrame index column.
output_csv = 'scraped_data.csv'
df.to_csv(output_csv, index=False)
print("Data saved to scraped_data.csv")

Data saved to scraped_data.csv


In [7]:
# Reload the saved CSV, replacing the in-memory frame built during scraping.
# With pandas' default parsing, empty CSV fields come back as NaN.
df = pd.read_csv('scraped_data.csv')

In [8]:
# Preview the first rows of the reloaded data.
df.head()

Unnamed: 0,Company Name,Contact Email,Contact Address,Contact Number,Company Website URL,CMS/MVC
0,Booking.com,,,,https://www.booking.com,Unknown
1,,,,,https://www.agoda.com,Unknown
2,Airbnb,,,,https://www.airbnb.co.in,Unknown
3,,,,,https://www.skyscanner.co.in/?previousCultureS...,Unknown
4,KAYAK,,,,https://www.kayak.co.in/?ispredir=true,Unknown


**Connect to PostgreSQL, Create Table and Insert Data:**

In [9]:
# PostgreSQL connection settings.
# Each value must be a plain string: a trailing comma after the literal would
# turn it into a one-element tuple, which psycopg2.connect cannot use.
# The standard libpq environment variables take precedence so the settings can
# be supplied without editing the notebook.
database = os.environ.get("PGDATABASE", "project")
username = os.environ.get("PGUSER", "postgres")
# NOTE(review): the literal fallback keeps the notebook behaving as before,
# but a real password should not be stored in the notebook — set PGPASSWORD
# and remove this default.
pwd = os.environ.get("PGPASSWORD", "721507")
hostname = os.environ.get("PGHOST", "localhost")
port_id = os.environ.get("PGPORT", "5432")

# Placeholders, populated by the connection cell.
conn = None
cursor = None

In [None]:
# Database connection, table creation and data load.
#
# All work that needs the connection happens before it is closed, and the
# INSERT targets the same table that the CREATE statement defines.
# NOTE(review): re-running this cell appends the same rows again, because the
# table has no uniqueness constraint on the scraped columns.

# Create table
create_script = """ CREATE TABLE IF NOT EXISTS companies_info (
                          id  SERIAL PRIMARY KEY,
                          company_name VARCHAR(255),
                          contact_email   VARCHAR(255),
                          contact_address  TEXT,
                          contact_number  VARCHAR(50),
                          company_website_url   VARCHAR(255),
                          cms_mvc  TEXT)"""

# Insert data
insert_query = '''
INSERT INTO companies_info (company_name, contact_email, contact_address, contact_number, company_website_url, cms_mvc)
VALUES (%s, %s, %s, %s, %s, %s);
'''

# DataFrame columns, in the same order as the INSERT column list.
insert_columns = [
    'Company Name',
    'Contact Email',
    'Contact Address',
    'Contact Number',
    'Company Website URL',
    'CMS/MVC',
]

# Missing values are NaN after the CSV round trip; convert them to None so
# they are stored as SQL NULL rather than as a NaN value.
records = [
    tuple(None if pd.isna(value) else value for value in row)
    for row in df[insert_columns].itertuples(index=False, name=None)
]

# Start from a clean state so a re-run never reuses a closed connection.
conn = None
cursor = None

try:
    conn = psycopg2.connect(
        dbname=database,
        user=username,
        password=pwd,
        host=hostname,
        port=port_id
    )
except Exception as error:
    print(f"Error connecting to database: {error}") # Print the actual error message for debugging

# Check that the connection was created successfully before touching the database
if conn is not None:
    try:
        cursor = conn.cursor()
        cursor.execute(create_script)
        cursor.executemany(insert_query, records)
        # Single commit: table creation and all rows succeed or fail together.
        conn.commit()
        print(f"Inserted {len(records)} rows into companies_info")
    except Exception as error:
        # Nothing was committed; closing the connection below discards the
        # pending transaction.
        print(f"Error writing to database: {error}")
    finally:
        if cursor is not None:
            cursor.close()
        conn.close()
else:
    print("Error: Database connection or cursor not established.")