In [None]:
!pip install scrapy

Collecting scrapy
  Downloading scrapy-2.13.0-py3-none-any.whl.metadata (5.4 kB)
Collecting cssselect>=0.9.1 (from scrapy)
  Downloading cssselect-1.3.0-py3-none-any.whl.metadata (2.6 kB)
Collecting itemadapter>=0.1.0 (from scrapy)
  Downloading itemadapter-0.11.0-py3-none-any.whl.metadata (18 kB)
Collecting itemloaders>=1.0.1 (from scrapy)
  Downloading itemloaders-1.3.2-py3-none-any.whl.metadata (3.9 kB)
Collecting parsel>=1.5.0 (from scrapy)
  Downloading parsel-1.10.0-py2.py3-none-any.whl.metadata (11 kB)
Collecting protego>=0.1.15 (from scrapy)
  Downloading Protego-0.4.0-py2.py3-none-any.whl.metadata (6.2 kB)
Collecting pydispatcher>=2.0.5 (from scrapy)
  Downloading PyDispatcher-2.0.7-py3-none-any.whl.metadata (2.4 kB)
Collecting queuelib>=1.4.2 (from scrapy)
  Downloading queuelib-1.8.0-py3-none-any.whl.metadata (6.1 kB)
Collecting service-identity>=18.1.0 (from scrapy)
  Downloading service_identity-24.2.0-py3-none-any.whl.metadata (5.1 kB)
Collecting tldextract (from scrapy)


In [None]:


class NewsSpider(scrapy.Spider):
    """Collect headline links from the section pages of several news sites.

    The crawl has two levels. Every URL in ``start_urls`` is treated as a
    homepage whose navigation links are matched against the keywords in
    ``SECTION_MAPPING``. Each matching section page is then fetched and up to
    ``MAX_ARTICLES_PER_SECTION`` article links are exported from it.
    """

    name = 'enhanced_news_spider'
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'DOWNLOAD_DELAY': 1,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 2,
        # FEEDS replaces the deprecated FEED_URI/FEED_FORMAT pair and is the
        # only place where 'overwrite' can be requested. Without it a local
        # file is opened in append mode, so re-runs pile up in the same CSV.
        'FEEDS': {
            'enhanced_news_articles.csv': {'format': 'csv', 'overwrite': True},
        },
        'DEPTH_LIMIT': 1,
        'FEED_EXPORT_FIELDS': ['title', 'url', 'section_name', 'category', 'source', 'timestamp', 'section_url']
    }

    # Upper bound on the number of items exported from a single section page.
    MAX_ARTICLES_PER_SECTION = 10

    # Category -> keywords. Keywords are matched as plain substrings of link
    # URLs, link texts and (in detect_category) page-level signals.
    SECTION_MAPPING = {
        'business': ['business', 'economy', 'finance', 'markets', 'money', 'invest', 'stocks', 'companies', 'corporate', 'trade'],
        'politics': ['politics', 'government', 'election', 'parliament', 'congress', 'democracy'],
        'sports': ['sports', 'football', 'soccer', 'basketball', 'tennis', 'golf', 'olympics'],
        'culture': ['culture', 'arts', 'entertainment', 'movies', 'music', 'television', 'theatre', 'gallery', 'celebrities']
    }

    # Fallback section URLs, used when a homepage's navigation exposes no
    # link for a category. Only entries of the same source are requested.
    ALTERNATIVE_CHANNELS = {
        'business': ['https://www.bbc.com/news/business', 'https://edition.cnn.com/business', 'https://www.aljazeera.com/economy/'],
        'politics': ['https://www.aljazeera.com', 'https://www.theguardian.com/international'],
        'sports': ['https://www.bbc.com/sport', 'https://edition.cnn.com/sport'],
        'culture': ['https://www.bbc.com/culture', 'https://www.theguardian.com/uk/culture']
    }

    start_urls = [
        'https://www.bbc.com',
        'https://edition.cnn.com',
        'https://www.aljazeera.com',
        'https://www.theguardian.com/international',
        'https://www.newsday.co.zw'
    ]

    def parse(self, response):
        """Route a response to section discovery or to article extraction.

        Section pages are recognised by the ``is_section_page`` meta flag set
        in ``discover_sections``; homepages by ``_is_start_page``. Anything
        else is still scraped for articles after the missing meta values
        have been filled in.
        """
        if response.meta.get('is_section_page'):
            self.logger.info(f"Extracting articles from section page: {response.url}")
            yield from self.extract_articles(response)
        elif self._is_start_page(response):
            self.logger.info(f"Discovering sections from homepage: {response.url}")
            yield from self.discover_sections(response)
        else:
            self.logger.warning(f"Unexpected URL in parse(): {response.url}")
            if 'source' not in response.meta:
                response.meta['source'] = self.get_source_name(response.url)
            if 'category' not in response.meta:
                response.meta['category'] = self.detect_category(response)
            if 'potential_section_name' not in response.meta:
                response.meta['potential_section_name'] = ''
            yield from self.extract_articles(response)

    def _is_start_page(self, response):
        """Return True when the response stems from one of ``start_urls``.

        A bare ``response.url in self.start_urls`` test fails as soon as the
        site redirects the homepage or a trailing slash is added, so the
        URLs recorded in ``meta['redirect_urls']`` are checked as well and
        trailing slashes are ignored.
        """
        start_pages = {url.rstrip('/') for url in self.start_urls}
        visited = [response.url] + list(response.meta.get('redirect_urls', []))
        return any(url.rstrip('/') in start_pages for url in visited)

    def discover_sections(self, response):
        """Yield requests for the section pages linked from a homepage.

        Navigation links are assigned to the first category whose keywords
        occur in the link URL or text. For categories without any matching
        link, the ``ALTERNATIVE_CHANNELS`` entries that belong to the same
        source as ``response`` are requested instead.
        """
        link_extractor = LinkExtractor(
            restrict_css='nav, ul.menu, div.navigation, header nav, footer nav, .primary-nav',
            deny_extensions=['jpg', 'png', 'pdf']
        )
        source = self.get_source_name(response.url)
        found_categories = set()

        for link in link_extractor.extract_links(response):
            url = link.url.lower()
            label = link.text.strip() if link.text else ''
            text = label.lower()

            for category, keywords in self.SECTION_MAPPING.items():
                if any(kw in url or kw in text for kw in keywords):
                    found_categories.add(category)
                    yield scrapy.Request(
                        url=link.url,
                        callback=self.parse,
                        meta={
                            'category': category,
                            'source': source,
                            'potential_section_name': label,
                            'is_section_page': True
                        }
                    )
                    # A link belongs to the first category it matches.
                    break

        # Normalised source label, e.g. "aljazeera" for "Al Jazeera".
        current_source_domain = source.lower().replace(' ', '')

        # Walk the mapping (not a set difference) so the order is stable.
        missing_categories = [c for c in self.SECTION_MAPPING if c not in found_categories]
        for category in missing_categories:
            self.logger.info(f"No '{category}' section found on {response.url} via navigation links. Checking ALTERNATIVE_CHANNELS.")
            for alt_url in self.ALTERNATIVE_CHANNELS.get(category, []):
                alt_source = self.get_source_name(alt_url)
                # Only fall back to URLs of the site currently being crawled,
                # e.g. BBC's business page while on the BBC homepage.
                if alt_source.lower().replace(' ', '') != current_source_domain:
                    continue

                self.logger.info(f"Attempting to use alternative URL for '{category}' on {current_source_domain}: {alt_url}")
                yield scrapy.Request(
                    url=alt_url,
                    callback=self.parse,
                    meta={
                        'category': category,
                        'source': alt_source,
                        # No link text is available here; use the category.
                        'potential_section_name': category.capitalize(),
                        'is_section_page': True
                    }
                )

    def extract_articles(self, response):
        """Yield one item per article link found on a section page.

        Each item carries the keys listed in ``FEED_EXPORT_FIELDS``. Links
        with a missing or implausibly short/long title, links that are not
        http(s), and URLs already emitted for this page are skipped. At most
        ``MAX_ARTICLES_PER_SECTION`` items are produced per page.
        """
        meta = response.meta
        # Resolve fallbacks lazily: detect_category() runs several CSS
        # queries and is only needed when the request carried no category.
        source = meta['source'] if 'source' in meta else self.get_source_name(response.url)
        category = meta['category'] if 'category' in meta else self.detect_category(response)
        potential_section_name = meta.get('potential_section_name', '')

        section_name = self.extract_section_name(response, potential_section_name)

        article_selectors = {
            'bbc.com': 'a[href*="/news/"], a[href*="/sport/"], a[href*="/culture/"], a[href*="/business/"]',
            'cnn.com': 'a[href*="/article/"], a[href*="/202"], .container__link, a[href*="/business/"]',
            'aljazeera.com': 'a.u-clickable-card__link, a.gc__title, h3 a, a[href*="/economy/"]',
            'theguardian.com': 'a[data-link-name="article"], .fc-item__link, a[href*="/business/"]',
            'newsday.co.zw': 'a.story-link, h2 a'
        }
        generic_selector = 'a[href*="/article/"], a[href*="/news/"], a[href*="/story/"], a[href*="/post/"], a[href*="/business/"]'

        # First site-specific selector whose domain occurs in the URL,
        # otherwise the generic one.
        selector_str = next(
            (sel for domain, sel in article_selectors.items() if domain in response.url),
            generic_selector
        )

        seen_urls = set()
        article_count = 0
        for article in response.css(selector_str):
            if article_count >= self.MAX_ARTICLES_PER_SECTION:
                break

            title_parts = article.css('::text').getall()
            title = " ".join(part.strip() for part in title_parts if part.strip()).strip()
            title = re.sub(r'\s+', ' ', title)

            href = article.css('::attr(href)').get()

            if not title or not href:
                continue

            if len(title) < 15 or len(title) > 200:
                self.logger.debug(f"Skipping article with title length issue: '{title}' from {response.url}")
                continue

            url = response.urljoin(href)
            # The same story is often linked several times on a page, and
            # javascript:/mailto: links are not articles.
            if url in seen_urls or not url.startswith(('http://', 'https://')):
                continue
            seen_urls.add(url)

            yield {
                'title': title,
                'url': url,
                'section_name': section_name,
                'category': category,
                'source': source,
                'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                'section_url': response.url
            }
            article_count += 1

        if article_count == 0:
            self.logger.info(f"No articles found on {response.url} with selector '{selector_str}'")

    def extract_section_name(self, response, fallback_name):
        """Return a cleaned section title for the page (at most 100 chars).

        The first non-blank heading matched by the selectors wins; otherwise
        ``fallback_name`` is used. If the cleaned result is still empty, a
        name is derived from one of the last three URL path parts.
        """
        selectors = [
            'h1.section-title::text', '.section-header h1::text', '.page-title::text',
            'h1.title::text', 'h1.headline::text', '.content__label::text',
            'header h1::text',
            'h1::text'
        ]
        section_name_str = ''
        for sel in selectors:
            section_name_str = response.css(sel).get('')
            if section_name_str.strip():
                break

        if not section_name_str.strip():
            section_name_str = fallback_name

        # Keep word characters, whitespace and hyphens only.
        section_name_str = re.sub(r'[^\w\s-]', '', section_name_str).strip()

        if not section_name_str:
            self.logger.debug(f"Section name not found via selectors/fallback for {response.url}. Trying URL parsing.")
            path_parts = [part for part in response.url.split('/') if part]
            generic_terms = {'www', 'com', 'co', 'uk', 'org', 'news', 'article', 'category'}
            for part in reversed(path_parts[-3:]):
                if part and len(part) > 3 and part.lower() not in generic_terms and not part.isdigit():
                    section_name_str = part.replace('-', ' ').replace('_', ' ').title()
                    break

        return section_name_str[:100]

    def get_source_name(self, url):
        """Return a display name for the site ``url`` belongs to.

        Known domains map to fixed labels. Any other URL yields its host
        part without ``www.``, or ``'Unknown Source'`` when no host can be
        read from it.
        """
        domain_map = {
            'bbc.com': 'BBC',
            'cnn.com': 'CNN',
            'aljazeera.com': 'Al Jazeera',
            'theguardian.com': 'The Guardian',
            'newsday.co.zw': 'NewsDay Zimbabwe'
        }
        for domain, name in domain_map.items():
            if domain in url:
                return name
        # str.split() always returns at least one element, so indexing cannot
        # fail; an empty host is the only "unknown" case.
        host = url.split('//')[-1].split('/')[0].replace('www.', '')
        return host or 'Unknown Source'

    def detect_category(self, response):
        """Guess the category of a page from its URL and metadata.

        Signals are checked in order (URL, <title>, keywords meta tag,
        breadcrumbs, section meta tags). The first signal that contains a
        ``SECTION_MAPPING`` keyword decides; without any hit the category
        is ``'general'``.
        """
        url_lower = response.url.lower()
        title_lower = response.css('title::text').get('').lower()
        meta_keywords_lower = response.css('meta[name="keywords"]::attr(content)').get('').lower()

        breadcrumb_texts = response.css('.breadcrumb a::text, .breadcrumbs a::text, [itemtype*="BreadcrumbList"] span[itemprop="name"]::text').getall()
        breadcrumb_str = ' '.join(text.lower().strip() for text in breadcrumb_texts)

        signals_text = [
            url_lower,
            title_lower,
            meta_keywords_lower,
            breadcrumb_str,
            response.css('meta[property="og:section"]::attr(content)').get('').lower(),
            response.css('meta[name="section"]::attr(content)').get('').lower(),
            response.css('meta[name="parsely-section"]::attr(content)').get('').lower()
        ]

        for text_signal in signals_text:
            if not text_signal:
                continue
            for category, keywords in self.SECTION_MAPPING.items():
                if any(kw in text_signal for kw in keywords):
                    return category
        return 'general'

if __name__ == "__main__":
    import asyncio

    # CrawlerProcess.start() runs the Twisted reactor and blocks until the
    # crawl ends. Scrapy's asyncio-based default reactor cannot be started
    # while an asyncio event loop is already running in this thread (as in
    # Jupyter/Colab); it fails with
    # "RuntimeError: This event loop is already running".
    # In that situation select Twisted's own default reactor instead.
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        process_settings = None  # plain script: keep Scrapy's defaults
    else:
        process_settings = {'TWISTED_REACTOR': None}

    # NOTE: a Twisted reactor can be started only once per interpreter;
    # restart the kernel before running this cell a second time.
    process = CrawlerProcess(settings=process_settings)
    process.crawl(NewsSpider)
    process.start()

INFO:scrapy.utils.log:Scrapy 2.13.0 started (bot: scrapybot)
2025-05-12 13:06:22 [scrapy.utils.log] INFO: Scrapy 2.13.0 started (bot: scrapybot)
INFO:scrapy.utils.log:Versions:
{'lxml': '5.4.0',
 'libxml2': '2.13.8',
 'cssselect': '1.3.0',
 'parsel': '1.10.0',
 'w3lib': '2.3.1',
 'Twisted': '24.11.0',
 'Python': '3.11.12 (main, Apr  9 2025, 08:55:54) [GCC 11.4.0]',
 'pyOpenSSL': '24.2.1 (OpenSSL 3.3.2 3 Sep 2024)',
 'cryptography': '43.0.3',
 'Platform': 'Linux-6.1.123+-x86_64-with-glibc2.35'}
2025-05-12 13:06:22 [scrapy.utils.log] INFO: Versions:
{'lxml': '5.4.0',
 'libxml2': '2.13.8',
 'cssselect': '1.3.0',
 'parsel': '1.10.0',
 'w3lib': '2.3.1',
 'Twisted': '24.11.0',
 'Python': '3.11.12 (main, Apr  9 2025, 08:55:54) [GCC 11.4.0]',
 'pyOpenSSL': '24.2.1 (OpenSSL 3.3.2 3 Sep 2024)',
 'cryptography': '43.0.3',
 'Platform': 'Linux-6.1.123+-x86_64-with-glibc2.35'}
INFO:scrapy.addons:Enabled addons:
[]
2025-05-12 13:06:22 [scrapy.addons] INFO: Enabled addons:
[]
DEBUG:scrapy.utils.log:Us

RuntimeError: This event loop is already running

Collecting flask-ngrok
  Downloading flask_ngrok-0.0.25-py3-none-any.whl.metadata (1.8 kB)
Downloading flask_ngrok-0.0.25-py3-none-any.whl (3.1 kB)
Installing collected packages: flask-ngrok
Successfully installed flask-ngrok-0.0.25


DEBUG:scrapy.core.engine:Crawled (200) <GET https://www.newsday.co.zw/category/4/business> (referer: https://www.newsday.co.zw)
2025-05-12 13:06:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.newsday.co.zw/category/4/business> (referer: https://www.newsday.co.zw)


INFO:numexpr.utils:NumExpr defaulting to 2 threads.
2025-05-12 13:06:45 [numexpr.utils] INFO: NumExpr defaulting to 2 threads.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

NGROK_AUTHTOKEN has been set to: <redacted>


In [None]:
pip install scrapy



In [None]:
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.linkextractors import LinkExtractor
from datetime import datetime
import re
import logging

class NewsSpider(scrapy.Spider):
    """Collect headline links from the section pages of several news sites.

    Every URL in ``start_urls`` is a homepage. Its navigation links are
    matched against ``SECTION_MAPPING`` and the first link found for each
    category is followed. Up to ``MAX_ARTICLES_PER_SECTION`` article links
    are exported from every section page.
    """

    name = 'enhanced_news_spider'
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'DOWNLOAD_DELAY': 1,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 2,
        # Deprecated pair, kept because the runner below this class reads
        # both keys. The FEEDS entry for the same file takes precedence.
        'FEED_FORMAT': 'csv',
        'FEED_URI': 'enhanced_news_articles.csv', # Output file
        # Scrapy has no FEED_OVERWRITE setting; overwriting is a per-feed
        # option of FEEDS. Without it the local CSV is appended to.
        'FEEDS': {
            'enhanced_news_articles.csv': {'format': 'csv', 'overwrite': True},
        },
        'DEPTH_LIMIT': 1,
        'FEED_EXPORT_FIELDS': ['title', 'url', 'section_name', 'category', 'source', 'timestamp', 'section_url'],
        'LOG_LEVEL': 'INFO',
    }

    # Upper bound on the number of items exported from a single section page.
    MAX_ARTICLES_PER_SECTION = 10

    # Category -> keywords, matched as substrings of link URLs and texts.
    SECTION_MAPPING = {
        'business': ['business', 'economy', 'finance', 'markets', 'money', 'invest', 'stocks', 'companies', 'corporate', 'trade'],
        'politics': ['politics', 'government', 'election', 'parliament', 'congress', 'democracy'],
        'sports': ['sports', 'football', 'soccer', 'basketball', 'tennis', 'golf', 'olympics'],
        'culture': ['culture', 'arts', 'entertainment', 'movies', 'music', 'television', 'theatre', 'gallery', 'celebrities']
    }

    # Fallback section URLs for categories missing from a homepage's
    # navigation; only entries on the homepage's own host are used.
    ALTERNATIVE_CHANNELS = {
        'business': ['https://www.bbc.com/news/business', 'https://edition.cnn.com/business', 'https://www.aljazeera.com/economy/'],
        'politics': ['https://www.aljazeera.com/politics/', 'https://www.theguardian.com/politics'],
        'sports': ['https://www.bbc.com/sport', 'https://edition.cnn.com/sport'],
        'culture': ['https://www.bbc.com/culture', 'https://www.theguardian.com/uk/culture']
    }

    start_urls = [
        'https://www.bbc.com',
        'https://edition.cnn.com',
        'https://www.aljazeera.com',
        'https://www.theguardian.com/international',
        # 'https://www.newsday.co.zw' # You can uncomment this if needed
    ]

    def __init__(self, *args, **kwargs):
        """Initialise the spider and set its logger to INFO."""
        super().__init__(*args, **kwargs)
        self.logger.setLevel(logging.INFO)

    def parse(self, response):
        """Route a response to section discovery or to article extraction.

        Section pages carry the ``is_section_page`` meta flag; homepages are
        recognised by ``_is_start_page``. Anything else is still scraped for
        articles after the missing meta values have been filled in.
        """
        if response.meta.get('is_section_page'):
            self.logger.info(f"Extracting articles from section page: {response.url} (Category: {response.meta.get('category')})")
            yield from self.extract_articles(response)
        elif self._is_start_page(response):
            self.logger.info(f"Discovering sections from homepage: {response.url}")
            yield from self.discover_sections(response)
        else:
            self.logger.warning(f"Unexpected URL in parse(): {response.url}. Attempting article extraction.")
            if 'source' not in response.meta:
                response.meta['source'] = self.get_source_name(response.url)
            if 'category' not in response.meta:
                response.meta['category'] = self.detect_category(response)
            if 'potential_section_name' not in response.meta:
                response.meta['potential_section_name'] = response.meta.get('category', 'Unknown Section').capitalize()
            yield from self.extract_articles(response)

    def _is_start_page(self, response):
        """Return True when the response stems from one of ``start_urls``.

        A bare ``response.url in self.start_urls`` test fails as soon as the
        site redirects the homepage or a trailing slash is added, so the
        URLs recorded in ``meta['redirect_urls']`` are checked as well and
        trailing slashes are ignored.
        """
        start_pages = {url.rstrip('/') for url in self.start_urls}
        visited = [response.url] + list(response.meta.get('redirect_urls', []))
        return any(url.rstrip('/') in start_pages for url in visited)

    def discover_sections(self, response):
        """Yield one section-page request per category found on a homepage.

        Only the first navigation link matching a category is followed. For
        categories without a link, ``ALTERNATIVE_CHANNELS`` URLs that contain
        the homepage's host are requested instead.
        """
        link_extractor = LinkExtractor(
            restrict_css='nav, ul.menu, div.navigation, header nav, footer nav, .primary-nav, #orb-nav-links',
            deny_extensions=['jpg', 'png', 'pdf', 'mp4', 'mp3', 'zip', 'gz', 'css', 'js'],
            unique=True
        )
        found_categories_on_page = set()
        # Host part of the homepage URL, e.g. "www.bbc.com".
        current_source_main_domain = response.url.split('//')[-1].split('/')[0]

        for link in link_extractor.extract_links(response):
            url_lower = link.url.lower()
            text_lower = link.text.lower().strip() if link.text else ''
            # Text-less fragment links are skipped.
            if not text_lower and '#' in link.url:
                continue

            for category, keywords in self.SECTION_MAPPING.items():
                if not any(kw in url_lower or kw in text_lower for kw in keywords):
                    continue
                if category in found_categories_on_page:
                    # Category already covered; the link may still match a
                    # later category, so keep looking.
                    continue

                self.logger.info(f"Found potential section link for '{category}': '{link.text}' ({link.url}) on {response.url}")
                found_categories_on_page.add(category)
                yield scrapy.Request(
                    url=response.urljoin(link.url),
                    callback=self.parse,
                    meta={
                        'category': category,
                        'source': self.get_source_name(response.url),
                        'potential_section_name': link.text.strip() if link.text else category.capitalize(),
                        'is_section_page': True
                    }
                )
                break

        # Walk the mapping (not a set difference) so the order is stable.
        missing_categories = [c for c in self.SECTION_MAPPING if c not in found_categories_on_page]
        for category in missing_categories:
            self.logger.info(f"No direct link for '{category}' found in navigation of {response.url}. Checking ALTERNATIVE_CHANNELS.")
            for alt_url in self.ALTERNATIVE_CHANNELS.get(category, []):
                if current_source_main_domain not in alt_url:
                    continue
                self.logger.info(f"Attempting alternative URL for '{category}' from {current_source_main_domain}: {alt_url}")
                yield scrapy.Request(
                    url=alt_url,
                    callback=self.parse,
                    meta={
                        'category': category,
                        'source': self.get_source_name(alt_url),
                        'potential_section_name': category.capitalize(),
                        'is_section_page': True
                    }
                )

    def extract_articles(self, response):
        """Yield one item per distinct article link found on a section page.

        Site-specific selectors are tried first, then generic ones, until
        ``MAX_ARTICLES_PER_SECTION`` items have been produced. Links without
        an http(s) URL or with a title outside 10-200 characters are skipped.
        """
        meta = response.meta
        # Resolve fallbacks lazily so they are only computed when the
        # request did not carry the value.
        source = meta['source'] if 'source' in meta else self.get_source_name(response.url)
        category = meta['category'] if 'category' in meta else self.detect_category(response)
        if 'potential_section_name' in meta:
            potential_section_name = meta['potential_section_name']
        else:
            potential_section_name = category.capitalize()

        section_name = self.extract_section_name(response, potential_section_name)
        if not section_name:
            section_name = category.capitalize()

        article_selectors_map = {
            'bbc.com': ['a[type="article"]', 'a[href*="/news/articles/"]', '.gs-c-promo-heading[class*="__title"]'],
            'cnn.com': ['a[data-link-type="article"]', '.card a', 'a[href*="/videos/"]'],
            'aljazeera.com': ['a.u-clickable-card__link', 'article h3 a', 'a.article-trending__title-link'],
            'theguardian.com': ['a[data-link-name="article"]', '.fc-item__link', 'a[aria-label*="article"]']
        }
        selector_list_for_domain = next(
            (sels for domain_key, sels in article_selectors_map.items() if domain_key in response.url),
            []
        )
        generic_selectors = ['article a[href]', 'div[class*="article"] a[href]', 'a[href*="/article/"]']
        combined_selectors = selector_list_for_domain + generic_selectors

        articles_found_on_page = set()
        article_count = 0
        for selector_str in combined_selectors:
            if article_count >= self.MAX_ARTICLES_PER_SECTION:
                break
            for article_element in response.css(selector_str):
                if article_count >= self.MAX_ARTICLES_PER_SECTION:
                    break

                href = article_element.css('::attr(href)').get()
                if not href:
                    continue
                full_url = response.urljoin(href)
                if full_url in articles_found_on_page:
                    continue
                if not full_url.startswith(('http://', 'https://')):
                    continue

                title = " ".join(part.strip() for part in article_element.css('::text').getall() if part.strip()).strip()
                title = re.sub(r'\s+', ' ', title).strip()
                if len(title) < 10:
                    # Too little link text: try nested headings/title spans.
                    h_texts = article_element.css('h1::text, h2::text, h3::text, h4::text, span[class*="title"]::text').getall()
                    if h_texts:
                        title = " ".join(h.strip() for h in h_texts if h.strip()).strip()
                if not title or len(title) < 10 or len(title) > 200:
                    continue

                articles_found_on_page.add(full_url)
                yield {
                    'title': title,
                    'url': full_url,
                    'section_name': section_name,
                    'category': category,
                    'source': source,
                    'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                    'section_url': response.url
                }
                article_count += 1

        if article_count == 0:
            self.logger.info(f"No articles extracted from {response.url} (Cat: {category})")

    def extract_section_name(self, response, fallback_name):
        """Return a cleaned section title for the page (at most 100 chars).

        The first non-blank text matched by the heading selectors wins, else
        ``fallback_name``. If cleaning leaves nothing, a name is derived from
        one of the last three URL path parts; if that fails too,
        ``fallback_name`` is returned unchanged.
        """
        selectors = ['h1.section-header__title::text', 'h1[class*="PageTitle"]::text', '.page-title ::text', 'h1::text']
        section_name_str = ''
        for sel in selectors:
            candidates = (name_part.strip() for name_part in response.css(sel).getall())
            section_name_str = next((name for name in candidates if name), '')
            if section_name_str:
                break

        if not section_name_str.strip():
            section_name_str = fallback_name

        # Keep word characters, whitespace and hyphens only.
        section_name_str = re.sub(r'[^\w\s-]', '', section_name_str).strip()

        if not section_name_str:
            path_parts = [p for p in response.url.split('/') if p]
            generic = {'www', 'com', 'news', 'article', 'category'}
            for part in reversed(path_parts[-3:]):
                cleaned = part.split('.')[0]
                if cleaned and len(cleaned) > 3 and cleaned.lower() not in generic and not cleaned.isdigit():
                    section_name_str = cleaned.replace('-', ' ').replace('_', ' ').title()
                    break

        return section_name_str[:100] if section_name_str else fallback_name

    def get_source_name(self, url):
        """Return a display name for the site ``url`` belongs to.

        Known domains map to fixed labels. Any other URL yields the
        capitalised first label of its host (``www.`` removed), or
        ``'Unknown Source'`` when no host can be read from it.
        """
        domain_map = {'bbc.com': 'BBC', 'cnn.com': 'CNN', 'aljazeera.com': 'Al Jazeera', 'theguardian.com': 'The Guardian'}
        for domain, name in domain_map.items():
            if domain in url:
                return name
        # The split/replace chain cannot raise for a str, so no try/except
        # is needed; an empty result is the only "unknown" case.
        host = url.split('//')[-1].split('/')[0].replace('www.', '')
        return host.split('.')[0].capitalize() or 'Unknown Source'

    def detect_category(self, response):
        """Return the category of a page, or ``'general'`` if none fits.

        A category already present in ``response.meta`` wins. Otherwise URL
        path segments are compared with the category names and keywords
        (exact match), and finally the URL, <title>, h1/h2 texts and the
        og:section meta tag are searched for keyword substrings.
        """
        if response.meta.get('category'):
            return response.meta.get('category')

        path_segments = [seg for seg in response.url.lower().split('/') if seg]
        for seg in path_segments:
            for cat, kws in self.SECTION_MAPPING.items():
                if seg in kws or cat == seg:
                    return cat

        # The CSS-based signals are only gathered when the URL path alone
        # did not decide.
        signals = [
            response.url.lower(),
            response.css('title::text').get('').lower(),
            ' '.join(response.css('h1::text, h2::text').getall()).lower(),
            response.css('meta[property="og:section"]::attr(content)').get('').lower()
        ]
        for signal in signals:
            if not signal:
                continue
            for cat, kws in self.SECTION_MAPPING.items():
                if any(kw in signal for kw in kws):
                    return cat
        return 'general'

if __name__ == "__main__":
    import asyncio

    print("Starting Scrapy spider (NewsSpider)...")
    print("Output will be 'enhanced_news_articles.csv'")

    spider_settings = NewsSpider.custom_settings
    process_settings = {
        'LOG_LEVEL': 'INFO', # Can be DEBUG for more verbosity
        'USER_AGENT': spider_settings['USER_AGENT'],
        'DOWNLOAD_DELAY': spider_settings['DOWNLOAD_DELAY'],
        'CONCURRENT_REQUESTS_PER_DOMAIN': spider_settings['CONCURRENT_REQUESTS_PER_DOMAIN'],
        'DEPTH_LIMIT': spider_settings['DEPTH_LIMIT'],
        'FEED_EXPORT_FIELDS': spider_settings['FEED_EXPORT_FIELDS'],
        # Scrapy has no FEED_OVERWRITE setting; overwriting an existing file
        # is a per-feed option of FEEDS, so request it there.
        'FEEDS': {
            spider_settings['FEED_URI']: {
                'format': spider_settings['FEED_FORMAT'],
                'overwrite': True,
            },
        },
    }

    # CrawlerProcess.start() runs the Twisted reactor. Scrapy's asyncio-based
    # default reactor cannot be started while an asyncio event loop is
    # already running in this thread (as in Jupyter/Colab); it fails with
    # "RuntimeError: This event loop is already running".
    # In that situation select Twisted's own default reactor instead.
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        pass  # plain script: keep Scrapy's default reactor
    else:
        process_settings['TWISTED_REACTOR'] = None

    # NOTE: a Twisted reactor can be started only once per interpreter;
    # restart the kernel before running this cell a second time.
    process = CrawlerProcess(settings=process_settings)
    process.crawl(NewsSpider)
    process.start() # This will block until the crawl is finished
    print("Scrapy spider (NewsSpider) finished.")

INFO:scrapy.utils.log:Scrapy 2.13.0 started (bot: scrapybot)
2025-05-12 13:08:01 [scrapy.utils.log] INFO: Scrapy 2.13.0 started (bot: scrapybot)
INFO:scrapy.utils.log:Versions:
{'lxml': '5.4.0',
 'libxml2': '2.13.8',
 'cssselect': '1.3.0',
 'parsel': '1.10.0',
 'w3lib': '2.3.1',
 'Twisted': '24.11.0',
 'Python': '3.11.12 (main, Apr  9 2025, 08:55:54) [GCC 11.4.0]',
 'pyOpenSSL': '24.2.1 (OpenSSL 3.3.2 3 Sep 2024)',
 'cryptography': '43.0.3',
 'Platform': 'Linux-6.1.123+-x86_64-with-glibc2.35'}
2025-05-12 13:08:01 [scrapy.utils.log] INFO: Versions:
{'lxml': '5.4.0',
 'libxml2': '2.13.8',
 'cssselect': '1.3.0',
 'parsel': '1.10.0',
 'w3lib': '2.3.1',
 'Twisted': '24.11.0',
 'Python': '3.11.12 (main, Apr  9 2025, 08:55:54) [GCC 11.4.0]',
 'pyOpenSSL': '24.2.1 (OpenSSL 3.3.2 3 Sep 2024)',
 'cryptography': '43.0.3',
 'Platform': 'Linux-6.1.123+-x86_64-with-glibc2.35'}
INFO:scrapy.addons:Enabled addons:
[]
2025-05-12 13:08:01 [scrapy.addons] INFO: Enabled addons:
[]
DEBUG:scrapy.utils.log:Us

Starting Scrapy spider (NewsSpider)...
Output will be 'enhanced_news_articles.csv'


INFO:scrapy.middleware:Enabled downloader middlewares:
['scrapy.downloadermiddlewares.offsite.OffsiteMiddleware',
 'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
 'scrapy.downloadermiddlewares.stats.DownloaderStats']
2025-05-12 13:08:01 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.offsite.OffsiteMiddleware',
 'scrapy.downloadermiddlewares.httpauth.HttpA

RuntimeError: This event loop is already running

In [None]:
pip install



INFO:scrapy.utils.log:Scrapy 2.13.0 started (bot: scrapybot)
2025-05-12 13:12:52 [scrapy.utils.log] INFO: Scrapy 2.13.0 started (bot: scrapybot)
INFO:scrapy.utils.log:Versions:
{'lxml': '5.4.0',
 'libxml2': '2.13.8',
 'cssselect': '1.3.0',
 'parsel': '1.10.0',
 'w3lib': '2.3.1',
 'Twisted': '24.11.0',
 'Python': '3.11.12 (main, Apr  9 2025, 08:55:54) [GCC 11.4.0]',
 'pyOpenSSL': '24.2.1 (OpenSSL 3.3.2 3 Sep 2024)',
 'cryptography': '43.0.3',
 'Platform': 'Linux-6.1.123+-x86_64-with-glibc2.35'}
2025-05-12 13:12:52 [scrapy.utils.log] INFO: Versions:
{'lxml': '5.4.0',
 'libxml2': '2.13.8',
 'cssselect': '1.3.0',
 'parsel': '1.10.0',
 'w3lib': '2.3.1',
 'Twisted': '24.11.0',
 'Python': '3.11.12 (main, Apr  9 2025, 08:55:54) [GCC 11.4.0]',
 'pyOpenSSL': '24.2.1 (OpenSSL 3.3.2 3 Sep 2024)',
 'cryptography': '43.0.3',
 'Platform': 'Linux-6.1.123+-x86_64-with-glibc2.35'}
INFO:scrapy.addons:Enabled addons:
[]
2025-05-12 13:12:52 [scrapy.addons] INFO: Enabled addons:
[]
DEBUG:scrapy.utils.log:Us

Starting Scrapy spider (NewsSpider)...
Output will be 'enhanced_news_articles.csv'


INFO:scrapy.middleware:Enabled downloader middlewares:
['scrapy.downloadermiddlewares.offsite.OffsiteMiddleware',
 'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
 'scrapy.downloadermiddlewares.stats.DownloaderStats']
2025-05-12 13:12:52 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.offsite.OffsiteMiddleware',
 'scrapy.downloadermiddlewares.httpauth.HttpA

Scrapy spider (NewsSpider) finished.


In [None]:
# prompt: convert the above csv into excel

import pandas as pd

# Re-export the scraped CSV as an Excel workbook. Failures are reported on
# stdout rather than raised, so the notebook keeps running either way.
try:
    df = pd.read_csv('enhanced_news_articles.csv')
    # index=False keeps the DataFrame's row index out of the sheet.
    df.to_excel('enhanced_news_articles.xlsx', index=False)
except FileNotFoundError:
    print("Error: 'enhanced_news_articles.csv' not found. Please make sure the file exists.")
except Exception as err:
    print(f"An error occurred: {err}")
else:
    # Only reached when both the read and the write succeeded.
    print("CSV file successfully converted to Excel.")


INFO:numexpr.utils:NumExpr defaulting to 2 threads.
2025-05-12 13:20:21 [numexpr.utils] INFO: NumExpr defaulting to 2 threads.


CSV file successfully converted to Excel.


Successfully converted 'enhanced_news_articles.csv' to 'enhanced_news_articles.xlsx'


--- Starting Clustering Process ---
Processing 888 articles for clustering...
Performing K-Means with 5 clusters...
Silhouette Score: 0.08
Model saved to kmeans_model.pkl, Vectorizer to tfidf_vectorizer.pkl
Clustered articles saved to 'clustered_news_articles.csv'

Cluster Distribution:
 cluster_label
0     35
1     14
2     13
3    749
4     77
Name: count, dtype: int64
--- Clustering Process Finished ---


Flask templates created in 'templates/' directory.


In [None]:
pip install pyngrok

Collecting pyngrok
  Downloading pyngrok-7.2.8-py3-none-any.whl.metadata (10 kB)
Downloading pyngrok-7.2.8-py3-none-any.whl (25 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.8


Flask App Cell: NGROK_AUTHTOKEN from env: None
CRITICAL: NGROK_AUTHTOKEN was not found in the environment. ngrok will likely fail.
Ngrok authtoken not set. Flask app will run locally only.
Starting Flask app...
Flask app will run locally on http://127.0.0.1:5000 but (potentially) NOT via ngrok.
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://172.28.0.12:5000
 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://172.28.0.12:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
2025-05-12 13:27:39 [werkzeug] INFO: [33mPress CTRL+C to quit[0m


Flask App Cell: NGROK_AUTHTOKEN from env: None
CRITICAL: NGROK_AUTHTOKEN was not found in the environment. ngrok will likely fail.
Starting Flask app with ngrok...
Ensure 'clustered_news_articles.csv' exists (created by Cell 5).
Your public URL from ngrok will appear shortly below.
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
2025-05-12 13:28:05 [werkzeug] INFO: [33mPress CTRL+C to quit[0m
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): bin.equinox.io:443
DEBUG:urllib3.connectionpool:https://bin.equinox.io:443 "GET /c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip HTTP/1.1" 200 13921656
DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): localhost:4040
Exception in thread Thread-8:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/urllib3/connection.py", line 198, in _new_conn
    sock = connection.create_connection(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/urllib3/util/connection.py", line 85, in create_connection
    raise err
  File "/usr/local/lib/python3.11/dist-packages/urllib3/util/connection.py", line 73, in create_connection
    sock.connect(sa)
ConnectionRefusedError: [Errno 111] C

In [None]:
import logging
import re
from datetime import datetime
from urllib.parse import urlparse

import nest_asyncio
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.linkextractors import LinkExtractor

nest_asyncio.apply()

class NewsSpider(scrapy.Spider):
    """Collect article links from section pages of a fixed set of news sites.

    Each homepage in ``start_urls`` is scanned for navigation links whose URL
    or text contains a keyword from ``SECTION_MAPPING``. One section page per
    category is requested, and up to ``MAX_ARTICLES_PER_SECTION`` article
    links are yielded from it as flat dict items (title, url, section_name,
    category, source, timestamp, section_url).
    """

    name = 'enhanced_news_spider'

    # Registrable domains the spider may crawl. All in-class domain checks
    # compare against the URL's hostname, never the raw URL string.
    allowed_domains = [
        'bbc.com',
        'bbc.co.uk',  # Important for BBC
        'cnn.com',
        'aljazeera.com',
        'theguardian.com',
    ]

    # The settings other than FEED_URI are restored from the earlier
    # definition of this spider in the same notebook; only the output file
    # name differs. FEED_FORMAT must be present because the entry-point
    # script reads it from here.
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'DOWNLOAD_DELAY': 1,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 2,
        'FEED_FORMAT': 'csv',
        'FEED_URI': 'enhanced_news_articles_filtered.csv',  # Changed output name
        'DEPTH_LIMIT': 1,
        'FEED_EXPORT_FIELDS': ['title', 'url', 'section_name', 'category', 'source', 'timestamp', 'section_url'],
    }

    # category -> keywords looked for in link URLs and link texts.
    SECTION_MAPPING = {
        'business': ['business', 'economy', 'finance', 'markets', 'money', 'invest', 'stocks', 'companies', 'corporate', 'trade'],
        'politics': ['politics', 'government', 'election', 'parliament', 'congress', 'democracy'],
        'sports': ['sports', 'football', 'soccer', 'basketball', 'tennis', 'golf', 'olympics'],
        'culture': ['culture', 'arts', 'entertainment', 'movies', 'music', 'television', 'theatre', 'gallery', 'celebrities'],
    }

    # category -> known section URLs, used when a homepage's navigation does
    # not expose a link for that category.
    ALTERNATIVE_CHANNELS = {
        'business': ['https://www.bbc.com/news/business', 'https://edition.cnn.com/business', 'https://www.aljazeera.com/economy/'],
        'politics': ['https://www.aljazeera.com/politics/', 'https://www.theguardian.com/politics'],
        'sports': ['https://www.bbc.com/sport', 'https://edition.cnn.com/sport'],
        'culture': ['https://www.bbc.com/culture', 'https://www.theguardian.com/uk/culture'],
    }

    start_urls = [
        'https://www.bbc.com',
        'https://edition.cnn.com',
        'https://www.aljazeera.com',
        'https://www.theguardian.com/international',
    ]

    # registrable domain -> human-readable source label.
    SOURCE_NAMES = {
        'bbc.com': 'BBC',
        'bbc.co.uk': 'BBC',
        'cnn.com': 'CNN',
        'aljazeera.com': 'Al Jazeera',
        'theguardian.com': 'The Guardian',
    }

    # Site-specific CSS selectors for article anchors, tried before the
    # generic ones.
    ARTICLE_SELECTORS = {
        'bbc.com': ['a[type="article"]', 'a[href*="/news/articles/"]', '.gs-c-promo-heading[class*="__title"] a'],
        'cnn.com': ['a[data-link-type="article"]', '.card a', 'a[href*="/videos/"]'],
        'aljazeera.com': ['a.u-clickable-card__link', 'article h3 a', 'a.article-trending__title-link'],
        'theguardian.com': ['a[data-link-name="article"]', '.fc-item__link', 'a[aria-label*="article"]'],
    }
    GENERIC_ARTICLE_SELECTORS = [
        'article a[href]', 'div[class*="article"] a[href]', 'div[class*="post"] a[href]',
        'div[class*="item"] a[href]', 'a[href*="/article/"]', 'a[href*="/story/"]',
        'a[href*="/news/"]', 'h2 a[href]', 'h3 a[href]',
    ]

    MAX_ARTICLES_PER_SECTION = 10
    MIN_TITLE_LENGTH = 10
    MAX_TITLE_LENGTH = 200

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.logger.setLevel(logging.INFO)

    # ------------------------------------------------------------------
    # URL helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _hostname(url):
        """Return the lower-cased hostname of *url*, or '' if it has none."""
        try:
            return urlparse(url).hostname or ''
        except ValueError:
            # urlparse rejects some malformed netlocs (e.g. a broken IPv6
            # literal); treat those as "no host".
            return ''

    @staticmethod
    def _host_in_domain(hostname, domain):
        """Return True if *hostname* is *domain* or one of its subdomains."""
        return hostname == domain or hostname.endswith('.' + domain)

    def _is_allowed_url(self, url):
        """Return True if *url*'s hostname belongs to ``allowed_domains``.

        The hostname is compared, not the raw URL, so a foreign URL that only
        mentions an allowed domain in its path or query string is rejected.
        """
        hostname = self._hostname(url)
        return any(self._host_in_domain(hostname, domain) for domain in self.allowed_domains)

    def _links_to_other_category(self, url, category):
        """Return True if *url*'s path contains a keyword of another category."""
        url_lower_path = '/'.join(url.lower().split('/')[3:])
        return any(
            other != category and any(kw in url_lower_path for kw in keywords)
            for other, keywords in self.SECTION_MAPPING.items()
        )

    # ------------------------------------------------------------------
    # Callbacks
    # ------------------------------------------------------------------

    def parse(self, response):
        """Route a response to section discovery or article extraction.

        Section pages (flagged via ``meta['is_section_page']``) go to
        ``extract_articles``; start URLs go to ``discover_sections``; anything
        else gets its missing metadata filled in and is treated as a section
        page.
        """
        if not self._is_allowed_url(response.url):
            self.logger.debug(f"Ignoring response from disallowed domain: {response.url}")
            return

        # Compare without the trailing slash so "https://site" and
        # "https://site/" are recognised as the same homepage.
        start_pages = {url.rstrip('/') for url in self.start_urls}

        if response.meta.get('is_section_page'):
            self.logger.info(f"Extracting articles from section page: {response.url} (Category: {response.meta.get('category')})")
            yield from self.extract_articles(response)
        elif response.url.rstrip('/') in start_pages:
            self.logger.info(f"Discovering sections from homepage: {response.url}")
            yield from self.discover_sections(response)
        else:
            # Can happen e.g. when a start URL redirects to a different URL.
            self.logger.warning(f"Unexpected URL in parse() from allowed domain: {response.url}. Attempting article extraction.")
            meta = response.meta
            if 'source' not in meta:
                meta['source'] = self.get_source_name(response.url)
            if 'category' not in meta:
                meta['category'] = self.detect_category(response)
            if 'potential_section_name' not in meta:
                meta['potential_section_name'] = meta.get('category', 'Unknown Section').capitalize()
            yield from self.extract_articles(response)

    def discover_sections(self, response):
        """Yield one section-page request per category found on a homepage.

        Navigation links are matched against ``SECTION_MAPPING``; the first
        matching link per category wins. For categories with no matching
        link, the first ``ALTERNATIVE_CHANNELS`` URL on the same site (if
        any) is requested instead.
        """
        # LinkExtractor knows nothing about the spider's allowed_domains, so
        # the restriction has to be passed explicitly.
        link_extractor = LinkExtractor(
            restrict_css='nav, ul.menu, div.navigation, header nav, footer nav, .primary-nav, #orb-nav-links',
            deny_extensions=['jpg', 'png', 'pdf', 'mp4', 'mp3', 'zip', 'gz', 'css', 'js'],
            unique=True,
            allow_domains=self.allowed_domains,
        )
        found_categories_on_page = set()

        for link in link_extractor.extract_links(response):
            url_lower = link.url.lower()
            text_lower = link.text.lower().strip() if link.text else ''
            # Text-less fragment links are in-page anchors, not sections.
            if not text_lower and '#' in link.url:
                continue
            if not self._is_allowed_url(link.url):
                continue

            for category, keywords in self.SECTION_MAPPING.items():
                if category in found_categories_on_page:
                    continue
                if not any(kw in url_lower or kw in text_lower for kw in keywords):
                    continue
                self.logger.info(f"Found potential section link for '{category}': '{link.text}' ({link.url}) on {response.url}")
                found_categories_on_page.add(category)
                yield scrapy.Request(
                    url=response.urljoin(link.url),
                    callback=self.parse,
                    meta={
                        'category': category,
                        'source': self.get_source_name(link.url),
                        'potential_section_name': link.text.strip() if link.text else category.capitalize(),
                        'is_section_page': True,
                    },
                )
                break  # Move to the next link

        current_hostname = self._hostname(response.url)
        for category in self.SECTION_MAPPING:
            if category in found_categories_on_page:
                continue
            for alt_url in self.ALTERNATIVE_CHANNELS.get(category, []):
                alt_hostname = self._hostname(alt_url)
                # Only use an alternative URL that lives on the same allowed
                # domain as the homepage being processed.
                is_same_source = any(
                    self._host_in_domain(current_hostname, domain)
                    and self._host_in_domain(alt_hostname, domain)
                    for domain in self.allowed_domains
                )
                if not is_same_source:
                    continue
                self.logger.info(f"Attempting alternative URL for '{category}' from {current_hostname}: {alt_url}")
                yield scrapy.Request(
                    url=alt_url,
                    callback=self.parse,
                    meta={
                        'category': category,
                        'source': self.get_source_name(alt_url),
                        'potential_section_name': category.capitalize(),
                        'is_section_page': True,
                    },
                )
                break  # Found an alt URL for this source, move to next category

    def extract_articles(self, response):
        """Yield up to ``MAX_ARTICLES_PER_SECTION`` article items from a page.

        Anchors are collected with site-specific selectors first, then the
        generic ones. A link is skipped if it is off-site, already seen, not
        http(s), has no usable title, or points into another category.
        """
        meta = response.meta
        # Fallbacks are computed lazily: only when the request carried no
        # (or an empty) value.
        source = meta.get('source') or self.get_source_name(response.url)
        category = meta.get('category') or self.detect_category(response)
        potential_section_name = meta.get('potential_section_name') or category.capitalize()
        section_name = self.extract_section_name(response, potential_section_name)
        if not section_name:
            section_name = category.capitalize()

        page_hostname = self._hostname(response.url)
        site_selectors = []
        for domain, selectors in self.ARTICLE_SELECTORS.items():
            if self._host_in_domain(page_hostname, domain):
                site_selectors = selectors
                break
        combined_selectors = site_selectors + self.GENERIC_ARTICLE_SELECTORS

        articles_found_on_page = set()
        article_count = 0

        for selector_str in combined_selectors:
            if article_count >= self.MAX_ARTICLES_PER_SECTION:
                break
            for article_element in response.css(selector_str):
                if article_count >= self.MAX_ARTICLES_PER_SECTION:
                    break

                href = article_element.css('::attr(href)').get()
                if not href or href.startswith(('#', 'javascript:')):
                    continue

                full_url = response.urljoin(href)
                if not self._is_allowed_url(full_url):
                    continue
                if full_url in articles_found_on_page:
                    continue
                if not full_url.startswith(('http://', 'https://')):
                    continue

                title = self._extract_title(article_element)
                if not title or not (self.MIN_TITLE_LENGTH <= len(title) <= self.MAX_TITLE_LENGTH):
                    continue

                if self._links_to_other_category(full_url, category):
                    continue

                articles_found_on_page.add(full_url)
                # Label the item by the article's own URL; fall back to the
                # page it was found on when that yields nothing.
                final_source = self.get_source_name(full_url)
                if final_source == 'Unknown Source':
                    self.logger.warning(f"Could not determine source for allowed URL: {full_url}. Falling back to response source: {source}")
                    final_source = source

                yield {
                    'title': title,
                    'url': full_url,
                    'section_name': section_name,
                    'category': category,
                    'source': final_source,
                    'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                    'section_url': response.url,
                }
                article_count += 1

        if article_count == 0:
            self.logger.info(f"No valid articles extracted from allowed domain page {response.url} (Category: {category}, Section: {section_name})")

    def _extract_title(self, article_element):
        """Return a whitespace-normalised title for an article anchor.

        The anchor's own text is used when it is at least
        ``MIN_TITLE_LENGTH`` characters long; otherwise nested headings, a
        title-like span, the ``title`` / ``aria-label`` attributes and the
        element's string value are tried in that order. May return a short or
        empty string; the caller applies the length limits.
        """
        text_nodes = article_element.xpath(".//text()[normalize-space()]").getall()
        title = re.sub(r'\s+', ' ', " ".join(text_nodes)).strip()
        if len(title) >= self.MIN_TITLE_LENGTH:
            return title

        candidates = (
            article_element.css('h1::text, h2::text, h3::text, h4::text').get(),
            article_element.css('span[class*="title"]::text').get(),
            article_element.css('::attr(title)').get(),
            article_element.css('::attr(aria-label)').get(),
            article_element.xpath('string(.)').get(),
        )
        for candidate in candidates:
            if candidate and len(candidate.strip()) >= self.MIN_TITLE_LENGTH:
                return re.sub(r'\s+', ' ', candidate.strip()).strip()
        return title

    def detect_category(self, response):
        """Guess a category for *response* from keywords in its URL path.

        Returns the first ``SECTION_MAPPING`` category with a keyword in the
        part of the URL after the host, or ``'general'`` when none matches.
        NOTE(review): this method was referenced but not defined in this
        class; the ``'general'`` default is a placeholder - confirm against
        the intended behaviour.
        """
        url_lower_path = '/'.join(response.url.lower().split('/')[3:])
        for category, keywords in self.SECTION_MAPPING.items():
            if any(kw in url_lower_path for kw in keywords):
                return category
        return 'general'

    def extract_section_name(self, response, fallback_name):
        """Return a cleaned section title for the page, at most 100 characters.

        The first non-blank match among a list of heading selectors is used,
        else *fallback_name*. Punctuation is stripped and whitespace is
        collapsed. If nothing is left, a name is derived from the last URL
        segments, and finally *fallback_name* is returned as is.
        """
        selectors = [
            'h1.section-header__title::text', 'h1[class*="PageTitle"]::text',
            'h1[class*="section-title"]::text', '.page-title ::text',
            'h1[itemprop="name"]::text', 'header h1::text', 'h1::text',
        ]
        section_name_str = ''
        for sel in selectors:
            extracted_name = response.css(sel).get()
            if extracted_name and extracted_name.strip():
                section_name_str = extracted_name.strip()
                break
        if not section_name_str:
            section_name_str = fallback_name

        section_name_str = re.sub(r'[^\w\s-]', '', section_name_str).strip()
        section_name_str = re.sub(r'\s+', ' ', section_name_str)

        if not section_name_str:
            # Last resort: the most specific URL segment that is not a
            # generic word, a number, or very short.
            generic = {
                'www', 'com', 'co', 'uk', 'org', 'net', 'html', 'htm', 'php', 'asp',
                'news', 'article', 'articles', 'category', 'categories', 'section',
                'sections', 'world', 'international', 'us', 'europe', 'asia', 'africa',
            }
            path_parts = [p for p in response.url.split('/') if p]
            for part in reversed(path_parts[-3:]):
                cleaned = part.split('.')[0].lower()
                if cleaned and len(cleaned) > 3 and cleaned not in generic and not cleaned.isdigit():
                    section_name_str = cleaned.replace('-', ' ').replace('_', ' ').title()
                    break

        return section_name_str[:100] if section_name_str else fallback_name[:100]

    def get_source_name(self, url):
        """Return a display name for the site *url* belongs to.

        Known sites come from ``SOURCE_NAMES`` (matched on hostname, including
        subdomains). Other hosts get a capitalised guess taken from the
        hostname. ``'Unknown Source'`` is returned when *url* has no hostname.
        """
        hostname = self._hostname(url)
        if not hostname:
            self.logger.error(f"Failed to parse domain for URL: {url}")
            return 'Unknown Source'

        for domain, source_name in self.SOURCE_NAMES.items():
            if self._host_in_domain(hostname, domain):
                return source_name

        if hostname.startswith('www.'):
            hostname = hostname[len('www.'):]
        parts = hostname.split('.')
        # A short second-to-last label (e.g. "co" in example.co.uk) is taken
        # as part of the suffix, so the site name is one label further left.
        if len(parts) > 2 and len(parts[-2]) <= 3:
            source_guess = parts[-3]
        else:
            source_guess = parts[0]
        return source_guess.capitalize()


# Script entry point: run NewsSpider once and export the scraped items to the
# file named by the spider's FEED_URI setting.
if __name__ == "__main__":
    spider_settings = NewsSpider.custom_settings
    output_file = spider_settings['FEED_URI']
    # The spider's custom_settings may not define FEED_FORMAT; default to CSV
    # instead of failing with KeyError before the crawl even starts.
    feed_format = spider_settings.get('FEED_FORMAT', 'csv')
    print(f"Starting Scrapy spider (NewsSpider)... Output: {output_file}")
    process = CrawlerProcess(settings={
        'LOG_LEVEL': 'INFO',
        'TWISTED_REACTOR': 'twisted.internet.asyncioreactor.AsyncioSelectorReactor',
        # Overwriting is a per-feed option of FEEDS; there is no
        # FEED_OVERWRITE setting, and local-file feeds append by default.
        # A FEEDS entry takes precedence over a FEED_URI feed for the same
        # path, so this also fixes the format of the spider's own feed.
        'FEEDS': {
            output_file: {
                'format': feed_format,
                'overwrite': True,
            },
        },
    })
    process.crawl(NewsSpider)
    process.start()
    print(f"Scrapy spider (NewsSpider) finished. Output saved to {output_file}")