In [6]:
import requests
from bs4 import BeautifulSoup
import json
import urllib.parse

In [7]:
def clean_text(text):
    # Remove unwanted characters like "\n" and "\t"
    return ' '.join(text.split())

In [8]:
def scrape_page(url):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

        parsed_url = urllib.parse.urlparse(url)

        path = parsed_url.path.strip("/")
        # Split path by "/" and remove empty elements
        url_keywords = [keyword for keyword in path.split("/") if keyword]

        response = requests.get(url, headers=headers)
        response.raise_for_status()  # Check if the request was successful

        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract title and h tags, cleaning up the text
        title = clean_text(soup.title.text) if soup.title else ''
        h_tags = [clean_text(h.text) for h in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])]

        # Extract meta tags for keywords and description
        meta_keywords = soup.find('meta', attrs={'name': 'keywords'})
        meta_description = soup.find('meta', attrs={'name': 'description'})

        keywords = meta_keywords['content'] if meta_keywords else ''
        description = meta_description['content'] if meta_description else ''

        # Extract individual words from comma-separated keywords
        keyword_words = [word.strip() for word in keywords.split(',')]

        return {
            'url': url,
            'meta': {
                'title': title,
                'h_tags': h_tags,
                'url_keywords': url_keywords,
                'keywords': keyword_words,
                'description': description
            },
            'error': None
        }

    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return {
            'url': url,
            'meta': {
                'title': '',
                'h_tags': [],
                'url_keywords': [],
                'keywords': '',
                'description': ''
            },
            'error': str(e)
        }

In [9]:
def main():
    # Load URLs from a JSON file
    with open('filtered_links101.json', 'r', encoding='utf-8') as input_file:
        urls = json.load(input_file)

    result_list = []
    unreachable_count = 0

    for url in urls:
        result = scrape_page(url)
        result_list.append(result)

        if result['error']:
            unreachable_count += 1

    # Save the results and unreachable count to another JSON file
    with open('output.json', 'w', encoding='utf-8') as output_file:
        json.dump({'results': result_list, 'unreachable_count': unreachable_count}, output_file, ensure_ascii=False, indent=2)

In [10]:
if __name__ == "__main__":
    main()

Error fetching https://bajajfinserv.in/finance-investor-relations-financial-results: 404 Client Error: Not Found for url: https://www.bajajfinserv.in/finance-investor-relations-financial-results
Error fetching https://bajajfinserv.in/Complaints/Add: 404 Client Error: Not Found for url: https://www.bajajfinserv.in/Complaints/Add
Error fetching https://bajajfinserv.in/insta-personal-loan-for-home-improvement
: 404 Client Error: Not Found for url: https://www.bajajfinserv.in/insta-personal-loan-for-home-improvement%0D%0A
Error fetching https://bajajfinserv.in/partners/india-shelter.html: 404 Client Error: Not Found for url: https://www.bajajfinserv.in/partners/india-shelter.html
Error fetching https://bajajfinserv.in/company: 404 Client Error: Not Found for url: https://www.bajajfinserv.in/company
Error fetching https://bajajfinserv.in/stock/ongc: 404 Client Error: Not Found for url: https://www.bajajfinserv.in/stock/ongc
Error fetching https://bajajfinserv.in/tel:75072 45858: 404 Client 