## Data Extraction

In [8]:
import pandas as pd

# Read the spreadsheet that lists the articles to fetch.
input_file = "Input.xlsx"
df = pd.read_excel(input_file)

# Keep the two columns used by the later cells as separate Series:
# 'URL' holds the address to fetch, 'URL_ID' the identifier used to
# name the saved file.
urls = df["URL"]
url_ids = df["URL_ID"]


In [10]:
import requests
from bs4 import BeautifulSoup
import os

def extract_article(url, timeout=30):
    """Download a web page and return its title and paragraph text.

    The title is taken from the first ``<h1>`` element and the article
    text is the whitespace-stripped content of every ``<p>`` element,
    joined with single spaces.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout a single unresponsive host would block the whole
            run indefinitely.

    Returns:
        A ``(title, article_text)`` tuple on success, or ``(None, None)``
        if the request fails, the server returns an HTTP error status,
        the page has no ``<h1>``, or parsing raises. Failures are
        reported with ``print`` rather than raised, so a batch run can
        continue with the next URL.
    """
    try:
        # Send an HTTP request; raise_for_status turns 4xx/5xx into exceptions
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        # Parse the HTML content
        soup = BeautifulSoup(response.content, 'html.parser')

        # soup.find returns None when no <h1> exists; report that clearly
        # instead of surfacing an opaque AttributeError on NoneType.
        heading = soup.find('h1')
        if heading is None:
            print(f"Error extracting article from {url}: no <h1> title found")
            return None, None

        title = heading.get_text(strip=True)
        article_text = " ".join(
            p.get_text(strip=True) for p in soup.find_all('p')
        )

        return title, article_text

    except Exception as e:
        # Deliberately broad: this is the per-URL boundary of a best-effort
        # scrape, so any failure (network, timeout, malformed input) is
        # logged and signalled to the caller via (None, None).
        print(f"Error extracting article from {url}: {e}")
        return None, None


In [12]:
# Folder that receives one text file per successfully extracted article.
output_dir = "Articles"
os.makedirs(output_dir, exist_ok=True)

for url, url_id in zip(urls, url_ids):
    title, article_text = extract_article(url)

    # Guard clause: a missing or empty title/body means extraction failed.
    if not (title and article_text):
        print(f"Skipping URL_ID {url_id} due to extraction error.")
        continue

    # File is named after the URL_ID; title first, blank line, then the body.
    article_path = os.path.join(output_dir, f"{url_id}.txt")
    with open(article_path, "w", encoding="utf-8") as file:
        file.write(f"{title}\n\n{article_text}")
