## <span style= 'color: cyan;'> CODE FOR SCRAPING DATA FROM BUSINESS DAY NEWS ARTICLE </span>

In [15]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from rake_nltk import Rake
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
import time
import os

# === Download required NLTK resources ===
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('vader_lexicon')

# === STEP 1: Load URLs from Excel ===
input_excel = "MY_BUSINESSDAY_ARTICLE.xlsx"

if not os.path.exists(input_excel):
    raise FileNotFoundError(f"❌ File not found: {input_excel}")

df_urls = pd.read_excel(input_excel)

# === STEP 2: Set headers and sentiment analyzer ===
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
}
results = []
analyzer = SentimentIntensityAnalyzer()

print(f"🚀 Starting scrape on {len(df_urls)} article URLs...\n")

# === STEP 3: Scrape and analyze each article ===
for idx, row in df_urls.iterrows():
    url = row.get("url") or row.get("URL")
    if not isinstance(url, str) or not url.startswith("http"):
        continue

    # ✅ Remove AMP version if present
    url = url.replace("?amp", "")

    try:
        response = requests.get(url, headers=headers, timeout=20)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        # === Extract Title ===
        title_tag = soup.find(class_="post-title")
        title = title_tag.get_text(strip=True) if title_tag else row.get("title", "")

        # === Robust Author Extraction ===
        author = ""

        # Try CSS selector first
        author_tag = soup.select_one("span.post-author a")
        if author_tag:
            author = author_tag.get_text(strip=True)

        # Fallback: meta tag
        if not author:
            author_meta = soup.find("meta", attrs={"name": "author"})
            if author_meta and author_meta.has_attr("content"):
                author = author_meta["content"].strip()

        if not author:
            print(f"[DEBUG] Author not found on: {url}")

        # === Extract Published Date ===
        date_tag = soup.find(class_="post-date")
        pub_date = date_tag.get_text(strip=True) if date_tag else row.get("published_date", "")

        # === Extract Content ===
        content_tag = soup.find(class_="post-content")
        content = content_tag.get_text(separator=" ", strip=True) if content_tag else ""

        # === Sentiment Analysis using VADER ===
        sentiment_scores = analyzer.polarity_scores(content)
        polarity = sentiment_scores['compound']
        sentiment = (
            "Positive" if polarity >= 0.05 else
            "Negative" if polarity <= -0.05 else
            "Neutral"
        )

        # === Simple Summary: First 5 sentences ===
        sentences = nltk.sent_tokenize(content)
        summary = " ".join(sentences[:5])

        # === Keyword Extraction using RAKE ===
        try:
            rake = Rake()
            rake.extract_keywords_from_text(content)
            keyword_list = rake.get_ranked_phrases()[:10]
            top_keywords = ", ".join(keyword_list)
        except:
            top_keywords = ""

        # === Save result ===
        results.append({
            "URL": url,
            "Title": title,
            "Author": author,
            "Published Date": pub_date,
            "Content": content,
            "Summary": summary,
            "Keywords": top_keywords,
            "Sentiment": sentiment,
            "Polarity Score": polarity
        })

        print(f"{idx+1}/{len(df_urls)} ✅ {title[:60]}...")
        time.sleep(1)

    except Exception as e:
        print(f"{idx+1}/{len(df_urls)} ❌ Error scraping {url}: {e}")
        continue

# === STEP 4: Save results to CSV ===
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_csv = f"businessday_articles_vader_{timestamp}.csv"
pd.DataFrame(results).to_csv(output_csv, index=False)

print(f"\n✅ Done! All data saved to: {output_csv}")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Gracious\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Gracious\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Gracious\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


🚀 Starting scrape on 5903 article URLs...

1/5903 ✅ $1trn economy: CBN insists banking sector essential to power...
2/5903 ✅ Five things to know as Fitch upgrades Nigeria’s rating to st...
3/5903 ✅ Diagana, World Bank’s regional vice president to visit Niger...
4/5903 ✅ MAN advocates five-year tax free policy to boost manufacturi...
5/5903 ✅ Fitch upgrades Nigeria to B on back of string of reform meas...
6/5903 ✅ Nigeria risks missing out on the Trump tariff opportunity...
7/5903 ✅ African nations must avoid retaliatory tariffs- Akinwumi Ade...
8/5903 ✅ Fitch upgrades Nigeria’s credit rating to B following reform...


KeyboardInterrupt: 

In [5]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# === NLTK Downloads ===
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('vader_lexicon')

# === Load Existing CSV ===
input_csv = "updated_scraped.csv"  
df = pd.read_csv(input_csv)

# === Preprocessing Setup ===
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = str(text).lower()  # Ensure string and lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    tokens = word_tokenize(text)
    clean_tokens = [
        lemmatizer.lemmatize(word)
        for word in tokens
        if word not in stop_words and word.isalpha()
    ]
    return " ".join(clean_tokens)

# === Apply Preprocessing ===
df["Cleaned Content"] = df["Content"].astype(str).apply(preprocess_text)

# === Re-run VADER Sentiment Analysis ===
analyzer = SentimentIntensityAnalyzer()

def analyze_sentiment(text):
    scores = analyzer.polarity_scores(text)
    polarity = scores['compound']
    sentiment = (
        "Positive" if polarity >= 0.05 else
        "Negative" if polarity <= -0.05 else
        "Neutral"
    )
    return pd.Series([sentiment, polarity])

df[["Cleaned Sentiment", "Cleaned Polarity"]] = df["Cleaned Content"].apply(analyze_sentiment)

# === Save to New CSV ===
output_csv = input_csv.replace(".csv", "_cleaned_with_sentiment.csv")
df.to_csv(output_csv, index=False)

print(f"✅ All done! Cleaned data with new sentiment saved to: {output_csv}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Gracious\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Gracious\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Gracious\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Gracious\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Gracious\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


✅ All done! Cleaned data with new sentiment saved to: updated_scraped_cleaned_with_sentiment.csv
