# Web Scraping

* The following code retrieves only new data; pages that were already scraped are not scraped again. It can be scheduled with cron on Linux or with Task Scheduler on Windows.
* The scraped content is saved to a .json file, and code for reading that .json file is provided as well.

In [1]:
import requests
from bs4 import BeautifulSoup
import json

In [2]:
# Root URL of the site; this page is downloaded and scanned for article links
url = "https://indiacorplaw.in"

In [3]:
# Download the landing page: r
# The timeout keeps a scheduled run from hanging on a stalled connection, and
# raise_for_status() aborts on an HTTP error instead of silently parsing an
# error page that contains no article links.
r = requests.get(url, timeout=30)
r.raise_for_status()
# Decoded HTML body of the response: text
text = r.text

In [4]:
# Build a parse tree from the downloaded HTML
soup = BeautifulSoup(text, 'html.parser')

# Every <a> element found on the page
anchor_tags = soup.find_all('a')

# Collect each distinct, non-empty href that ends with '.html'
candidate_hrefs = (anchor.get('href') for anchor in anchor_tags)
final_url = {link for link in candidate_hrefs if link and link.endswith('.html')}


In [5]:
# Show the collected '.html' links as the cell output
final_url

{'https://indiacorplaw.in/2018/04/supreme-court-rules-mandatory-procedure-sarfaesi-act.html',
 'https://indiacorplaw.in/2020/04/a-stamp-paper-what-good-is-it-beyond-six-months.html',
 'https://indiacorplaw.in/2021/12/supreme-court-on-the-regularization-of-temporary-employees.html',
 'https://indiacorplaw.in/2023/04/denying-input-tax-credit-to-bona-fide-recipients-where-gst-is-not-paid-by-the-supplier.html',
 'https://indiacorplaw.in/2023/07/revival-of-time-barred-debts.html',
 'https://indiacorplaw.in/2023/09/a-banks-post-merger-liability-for-pre-merger-crimes.html',
 'https://indiacorplaw.in/2023/09/an-exercise-in-smoke-and-mirrors-imposing-a-goods-tax-on-actionable-claims.html',
 'https://indiacorplaw.in/2023/09/analysing-the-tax-implications-of-bonus-shares.html',
 'https://indiacorplaw.in/2023/09/capital-reduction-tax-conundrums.html',
 'https://indiacorplaw.in/2023/09/companys-interest-vs-duty-to-protect-the-environment-a-directors-balancing-act.html',
 'https://indiacorplaw.in/20

In [6]:

# URLs to consider in this run (the links collected from the landing page)
urls = final_url  # Replace with your list of URLs

# URLs handled by earlier runs are stored one per line in a plain text file.
# On the very first run that file does not exist yet, so fall back to an
# empty set instead of failing with FileNotFoundError.
already_scraped_urls = set()

try:
    with open('scraped_urls.txt', 'r') as file:
        already_scraped_urls = set(file.read().splitlines())
except FileNotFoundError:
    # Nothing has been scraped before; keep the empty set.
    pass

# Data of the pages that are scraped for the first time in this run
new_page_data_list = []

# Function to scrape a single page and extract heading and content
def scrape_page(url):
    """Scrape one article page and return its heading and body text.

    Args:
        url: Address of the page to download.

    Returns:
        A dict with the keys ``'url'``, ``'heading'`` (text of the first
        ``<h1>``) and ``'content'`` (list with the text of every paragraph
        and heading inside the ``entry-content`` div), or ``None`` when the
        URL was already scraped or the page could not be scraped.

    Side effects:
        Adds ``url`` to the module-level ``already_scraped_urls`` set after a
        successful extraction and prints a message when a URL is skipped or
        fails.
    """
    if url in already_scraped_urls:
        print(f"Skipping already scraped URL: {url}")
        return None

    try:
        # The timeout keeps one stalled connection from blocking the whole run.
        response = requests.get(url, timeout=30)
        # Raises for 4xx/5xx responses, so no separate status check is needed.
        response.raise_for_status()

        # Parse once, from the raw bytes, so BeautifulSoup can do its own
        # encoding detection.
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract the main heading (adjust the element as needed)
        heading_tag = soup.find('h1')
        if heading_tag is None:
            raise ValueError("no <h1> heading found on the page")
        main_heading = heading_tag.get_text()

        # Find the main content element (adjust this based on the actual
        # structure of the page)
        main_content = soup.find('div', class_='entry-content')
        if main_content is None:
            raise ValueError("no 'entry-content' container found on the page")

        # Text of every paragraph and heading inside the main content
        text_and_headings = [
            element.get_text()
            for element in main_content.find_all(
                ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
            )
        ]

        # Mark the URL as done only once the extraction has succeeded
        already_scraped_urls.add(url)

        return {
            'url': url,
            'heading': main_heading,
            'content': text_and_headings,
        }
    except Exception as e:
        # Per-URL boundary of a batch job: report the failure and move on so
        # a single bad page does not abort the remaining URLs.
        print(f"Error scraping {url}: {str(e)}")
        return None

# Scrape every candidate URL and keep the pages that are new in this run.
# The loop variable is deliberately not named `url`: at notebook/module level
# that would overwrite the site root URL assigned to `url` earlier, so
# re-running the landing-page request afterwards would fetch the wrong page.
for page_url in urls:
    page_data = scrape_page(page_url)
    if page_data:
        new_page_data_list.append(page_data)

# Append the new pages to the output file.
# NOTE: objects are pretty-printed and appended one after another, so the file
# is a sequence of JSON objects rather than a single valid JSON document.
output_filename = 'scraped_data.json'
with open(output_filename, 'a', encoding='utf-8') as json_file:
    for page_data in new_page_data_list:
        json.dump(page_data, json_file, ensure_ascii=False, indent=4)
        json_file.write('\n')  # Add a newline to separate entries


# Save the updated set of already scraped URLs, one per line. Sorting keeps
# the file content stable between runs instead of following set order.
with open('scraped_urls.txt', 'w') as file:
    file.write('\n'.join(sorted(already_scraped_urls)))

print(f"Scraped data saved to {output_filename}")


Skipping already scraped URL: https://indiacorplaw.in/2021/12/supreme-court-on-the-regularization-of-temporary-employees.html
Skipping already scraped URL: https://indiacorplaw.in/2018/04/supreme-court-rules-mandatory-procedure-sarfaesi-act.html
Skipping already scraped URL: https://indiacorplaw.in/2023/09/restoring-equity-supreme-court-on-liquidator-discretion-in-ibc-bidding-process.html
Skipping already scraped URL: https://indiacorplaw.in/2023/09/critiquing-sebis-narrow-interpretation-of-the-inter-se-transfer-exemption.html
Skipping already scraped URL: https://indiacorplaw.in/2023/04/denying-input-tax-credit-to-bona-fide-recipients-where-gst-is-not-paid-by-the-supplier.html
Skipping already scraped URL: https://indiacorplaw.in/2023/09/sc-expands-scope-of-enquiry-under-section-11-of-the-arbitration-act.html
Skipping already scraped URL: https://indiacorplaw.in/2023/09/capital-reduction-tax-conundrums.html
Skipping already scraped URL: https://indiacorplaw.in/2023/09/rajasthans-gig-w

# Reading .json file

In [7]:
import json

# Specify the path to your JSON file
json_file_path = 'scraped_data.json'

# Echo the stored data to the output, one line of the file at a time.
# The lines are printed as raw text; nothing is decoded as JSON here.
with open(json_file_path, 'r', encoding='utf-8') as json_file:
    for line in json_file:
        # Each line already ends with '\n'; suppress print's own newline so
        # the output is not double-spaced.
        print(line, end='')
       


{

    "url": "https://indiacorplaw.in/2023/10/rbis-draft-directions-on-wilful-defaulters-implications-and-concerns.html",

    "heading": "RBI’s Draft Directions on Wilful Defaulters: Implications and Concerns",

    "content": [

        "[Aamir Kapadia and Tejas Venkatesh are penultimate year BBA L.L.B. (Hons.) students at Jindal Global Law School]",

        "On September 21, 2023, the Reserve Bank of India (“RBI”) released the Draft Master Direction on treatment of Wilful Defaulters and Large Defaulters. The purpose behind the draft is to solicit public comments on proposed regulations to tighten the norms applicable to wilful defaulters. While the draft regulations significantly strengthen the wilful defaulter norms, they also raise certain vital concerns.",

        "Applicable Legal Regime",

        "The master circular on wilful defaulters establishes a system to identify and disseminate credit information regarding wilful defaulters to banks and financial institutions, to re

In [10]:
# IPython shell escape: write the output of `pip freeze` to requirements.txt
!pip freeze > requirements.txt


In [11]:
# Show a link to the generated requirements.txt as the cell output
from IPython.display import FileLink
FileLink("requirements.txt")
