In [25]:
import requests
from bs4 import BeautifulSoup
import json
import urllib.parse
import random

In [26]:
def clean_text(text):
    """Normalize whitespace in ``text``.

    The string is split on runs of whitespace and the resulting pieces are
    re-joined with single spaces, so leading/trailing whitespace disappears
    and every internal run (spaces, tabs, newlines) becomes one space.
    """
    tokens = text.split()
    return ' '.join(tokens)

In [27]:

def get_random_user_agent():
    """Return one User-Agent header value chosen at random from a fixed pool.

    The pool is a hard-coded collection of browser User-Agent strings; the
    pick is made with ``random.choice``, so it follows the global ``random``
    state.
    """
    # Extend this pool with further User-Agent strings as needed.
    agent_pool = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.37 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Firefox/89.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.37',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36',
    )
    chosen = random.choice(agent_pool)
    return chosen

In [28]:
def scrape_page(url, timeout=30):
    """Download ``url`` and extract basic SEO metadata from its HTML.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Forwarded to ``requests.get`` as its ``timeout`` argument (seconds).
        Defaults to 30 so that a server which never answers cannot stall the
        caller indefinitely.

    Returns
    -------
    dict
        Always contains the keys ``url``, ``title``, ``description``,
        ``keywords``, ``views``, ``date_published``, ``h_tags``,
        ``url_keywords`` and ``error``.

        On success ``error`` is ``None``; ``keywords``, ``h_tags`` and
        ``url_keywords`` are lists of strings; ``views`` and
        ``date_published`` are ``None`` when the corresponding element (or
        attribute) is absent from the page.

        When the request fails (connection error, timeout, non-2xx status)
        the failure is printed and a placeholder record is returned with
        ``error`` set to the exception text, empty strings/lists for the text
        fields, ``views`` set to ``'0'`` and ``date_published`` set to ``''``.

    Notes
    -----
    Only ``requests.exceptions.RequestException`` is handled here; other
    exceptions propagate to the caller.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        }

        # Non-empty path segments of the URL, e.g. "/a/b/" -> ["a", "b"].
        parsed_url = urllib.parse.urlparse(url)
        path = parsed_url.path.strip("/")
        url_keywords = [keyword for keyword in path.split("/") if keyword]

        # A timeout is required: without one requests waits forever on a
        # server that accepts the connection but never responds.
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        title = clean_text(soup.title.text) if soup.title else ''
        h_tags = [clean_text(h.text) for h in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])]

        meta_keywords = soup.find('meta', attrs={'name': 'keywords'})
        meta_description = soup.find('meta', attrs={'name': 'description'})

        # Use .get(): a <meta> tag may exist without a "content" attribute,
        # and tag['content'] would raise KeyError, which is not caught below.
        keywords = (meta_keywords.get('content') or '') if meta_keywords else ''
        description = (meta_description.get('content') or '') if meta_description else ''

        # Drop empty entries so a missing/blank keyword list yields [] rather
        # than [''].
        keyword_words = [word.strip() for word in keywords.split(',') if word.strip()]

        view_count_tag = soup.find('span', class_='view-read-row__view')
        # Same reasoning as above: the span may lack the data attribute.
        view_count = view_count_tag.get('data-articleviews') if view_count_tag else None

        date_published_tag = soup.find('span', class_='card-article-details__heading__date')
        date_published = clean_text(date_published_tag.text) if date_published_tag else None

        return {
            'url': url,
            'title': title,
            'description': description,
            'keywords': keyword_words,
            'views': view_count,
            'date_published': date_published,
            'h_tags': h_tags,
            'url_keywords': url_keywords,
            'error': None
            }

    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return {
            'url': url,
            'title': '',
            'description': '',
            # List, to match the type returned on the success path.
            'keywords': [],
            'views': '0',
            'date_published': '',
            'h_tags': [],
            'url_keywords': [],
            'error': str(e)
            }

In [29]:
import json

def main(input_path='sitemap_data.json', output_path='output_XML.json'):
    """Scrape every URL listed in a JSON file and write the results as JSON.

    Parameters
    ----------
    input_path : str, optional
        Path of the UTF-8 JSON file to read. It must contain an array of
        objects, each having a ``"URL"`` key. Defaults to
        ``'sitemap_data.json'``.
    output_path : str, optional
        Path of the UTF-8 JSON file to write. Defaults to
        ``'output_XML.json'``.

    The output document has two keys: ``results`` (one ``scrape_page`` record
    per input entry, in input order) and ``unreachable_count`` (the number of
    records whose ``error`` field is truthy). Progress is printed once per
    URL. The output file is written only after all URLs have been processed.
    """
    with open(input_path, 'r', encoding='utf-8') as input_file:
        urls = json.load(input_file)

    total = len(urls)  # loop-invariant; used only for the progress message
    result_list = []
    unreachable_count = 0

    for i, url_info in enumerate(urls, start=1):
        url = url_info["URL"]
        print(f"Processing link {i}/{total}: {url}")
        result = scrape_page(url)
        result_list.append(result)

        if result['error']:
            unreachable_count += 1

    with open(output_path, 'w', encoding='utf-8') as output_file:
        json.dump({'results': result_list, 'unreachable_count': unreachable_count}, output_file, ensure_ascii=False, indent=2)


In [30]:
# Entry point: run the full scrape when this code is executed directly
# (as a script, or as a notebook cell where __name__ is also "__main__").
if __name__ == "__main__":
    main()

Processing link 1/11012: https://www.bajajfinserv.in/
Processing link 2/11012: https://www.bajajfinserv.in/bajaj-pay
Processing link 3/11012: https://www.bajajfinserv.in/bajaj-pay-wallet
Processing link 4/11012: https://www.bajajfinserv.in/gift-card
Processing link 5/11012: https://www.bajajfinserv.in/upi
Processing link 6/11012: https://www.bajajfinserv.in/investments
Processing link 7/11012: https://www.bajajfinserv.in/investments/fixed-deposit
Processing link 8/11012: https://www.bajajfinserv.in/investments/fixed-deposit-for-wedding-expenses
Processing link 9/11012: https://www.bajajfinserv.in/investments/fixed-deposit-for-retirement-fund
Processing link 10/11012: https://www.bajajfinserv.in/investments/fixed-deposit-for-home-expenses
Processing link 11/11012: https://www.bajajfinserv.in/investments/fixed-deposit-for-higher-education
Processing link 12/11012: https://www.bajajfinserv.in/investments/fixed-deposit-interest-rates
Processing link 13/11012: https://www.bajajfinserv.in/in