In [1]:
import subprocess as sub
import sys, time

# Make sure scrapy is importable; install it on the fly when it is missing.
try:
    import scrapy
except ModuleNotFoundError:
    import importlib

    # Run pip through the interpreter backing this kernel so the package lands
    # in the environment that the imports below will actually use.
    install = sub.run([sys.executable, "-m", "pip", 'install', "scrapy"], stderr=sub.PIPE, stdout=sub.PIPE)
    if install.returncode == 0:
        print('scrapy installed successfull!\n')
        # A package installed while the interpreter is running may be missed by
        # cached import finders; refresh them instead of sleeping for a fixed time.
        importlib.invalidate_caches()
    else:
        print('Unsuccessful installation!,\nHint: Check your internet.')
        # Show what pip reported so the real cause is visible, not only the hint.
        print(install.stderr.decode(errors="replace"), file=sys.stderr)
        sys.exit(1)
        
from scrapy import Request, Selector, Spider
from scrapy.crawler import CrawlerProcess
from collections import defaultdict
from pathlib import Path


def create_remove_f_news():
    """
    Toggle the news file in the current working directory.

    When 'Top 15 news.txt' is absent an empty file is created; when it is
    already present (left over from an earlier run) it is removed.
    """
    news_file = Path('Top 15 news.txt')
    if not news_file.exists():
        # Nothing left over from a previous run: start with an empty file.
        news_file.touch()
        return
    # A stale file is present: remove it.
    news_file.unlink()


class PunchScraper(Spider):
    """
    Spider that saves the latest Punch (punchng.com) news articles to a text file.

    The home page is requested first; every link found in its latest-news
    section is followed, and each article is appended to ``output_file`` as
    its URL, its title and its paragraphs (one paragraph per line).
    """
    name = "Punch_scraper"
    output_file = 'Top 15 news.txt'  # Same file that create_remove_f_news() deletes/creates.
    first_news = 0 # 0 until the first article has been written; later articles get a leading blank line.

    def __init__(self, *args, **kwargs):
        # Let scrapy.Spider run its own initialisation and accept spider arguments.
        super().__init__(*args, **kwargs)
        # Kept for backward compatibility; parse() now builds its paragraph list locally.
        self.all_paragraph = []

    def start_requests(self):
        """Yield the initial request for the Punch home page."""
        urls = ["https://punchng.com/"]
        for url in urls:
            yield Request(url, callback=self.link_follow)

    def link_follow(self, response):
        """
        Follow every article link listed in the latest-news section of the home page.
        """
        section = response.xpath("//*/section[@class='col-md-12 col-lg-6 latest-news-wraper']")
        # extract() returns the list of all hrefs. extract_first() returned a single
        # string (or None), so the loop below iterated over its characters.
        links = section.css("div.row > ul li a::attr(href)").extract()
        for link in links:
            yield response.follow(url=link, callback=self.parse)

    def parse(self, response):
        """
        Extract the title and paragraphs of one article and append them to ``output_file``.
        """
        # Fall back to an empty title so a page without the heading cannot raise TypeError.
        title = response.css("h1.post_title::text").extract_first(default="")
        content = response.css("div.entry-content")  # Container of the article body
        page_sum = content.xpath(".//*[@style='text-align: justify;']")  # Justified elements, when the page uses them
        if not page_sum:
            page_sum = content.xpath(".//p")  # Otherwise fall back to every <p> element

        paragraphs = [" ".join(pg.xpath('.//text()').extract()) for pg in page_sum]
        news = "\n".join(paragraphs)  # Separating paragraphs with newline character

        # Every article after the first one is preceded by a blank line.
        separator = "" if PunchScraper.first_news == 0 else "\n"
        # Explicit UTF-8: articles contain non-ASCII punctuation that a platform
        # default encoding may not be able to write.
        with open(self.output_file, 'a', encoding='utf-8') as doc:
            doc.write(f"{separator}NEWS URL: {response.url}\n{title}\n{news}\n")
        PunchScraper.first_news = 1
            
                    
if __name__ == '__main__':
    # Delete a leftover news file (or create an empty one) before crawling.
    create_remove_f_news()
    # Run PunchScraper inside a Scrapy-managed crawler process.
    process = CrawlerProcess()
    process.crawl(PunchScraper)
    # NOTE(review): start() runs the Twisted reactor, which presumably cannot be
    # restarted in the same kernel - re-running this cell likely needs a kernel restart.
    process.start()

2020-12-03 13:48:08 [scrapy.utils.log] INFO: Scrapy 2.4.1 started (bot: scrapybot)
2020-12-03 13:48:08 [scrapy.utils.log] INFO: Versions: lxml 4.2.6.0, libxml2 2.9.8, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 20.3.0, Python 3.6.9 (default, Oct  8 2020, 12:12:24) - [GCC 8.4.0], pyOpenSSL 20.0.0 (OpenSSL 1.1.1h  22 Sep 2020), cryptography 3.2.1, Platform Linux-4.19.112+-x86_64-with-Ubuntu-18.04-bionic
2020-12-03 13:48:08 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.epollreactor.EPollReactor
2020-12-03 13:48:08 [scrapy.crawler] INFO: Overridden settings:
{}
2020-12-03 13:48:08 [scrapy.extensions.telnet] INFO: Telnet Password: dd5965c85a06e5e3
2020-12-03 13:48:08 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.logstats.LogStats']
2020-12-03 13:48:08 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloa

In [2]:
!pip install requests



In [2]:
import requests

In [4]:
# Fetch one page for interactive inspection. The timeout keeps the cell from hanging
# forever, and raise_for_status() surfaces HTTP errors here instead of in later cells.
html = requests.get("https://punchng.com/d/", timeout=30)
html.raise_for_status()

2020-12-03 13:39:31 [urllib3.connectionpool] DEBUG: Starting new HTTPS connection (1): punchng.com:443
2020-12-03 13:39:31 [urllib3.connectionpool] DEBUG: https://punchng.com:443 "GET /d/ HTTP/1.1" 200 None


In [5]:
# Preview only the beginning of the page; the full HTML would flood the notebook output.
html.text[:500]

'<!DOCTYPE html><html lang="en-US"><head> <script data-cfasync=\'false\' data-mrf-script="garda" data-mrf-dt="1" data-mrf-host="bc.marfeelcache.com" src="https://bc.marfeelcache.com/statics/marfeel/gardac-sync.js"></script> <meta charset="UTF-8"><link rel="stylesheet" href="https://use.fontawesome.com/releases/v5.1.0/css/all.css" integrity="sha384-lKuwvrZot6UHsBSfcMvOkWwlCMgc0TaWr+30HWe3a4ltaBwTZhyTEggF5tJv8tbt" crossorigin="anonymous"><link data-optimized="2" rel="stylesheet" href="https://punchng.com/wp-content/litespeed/cssjs/e8237.css?98678" /><meta http-equiv="x-dns-prefetch-control" content="on"><meta name="viewport" content="width=device-width, initial-scale=1"><link rel="profile" href="https://gmpg.org/xfn/11"> <script src="data:text/javascript;base64, KGZ1bmN0aW9uKCl7J3VzZSBzdHJpY3QnO3ZhciBnPWZ1bmN0aW9uKGEpe3ZhciBiPTA7cmV0dXJuIGZ1bmN0aW9uKCl7cmV0dXJuIGI8YS5sZW5ndGg/e2RvbmU6ITEsdmFsdWU6YVtiKytdfTp7ZG9uZTohMH19fSxsPXRoaXN8fHNlbGYsbT0vXltcdysvXy1dK1s9XXswLDJ9JC8scD1udWxsLHE9ZnVuY

In [6]:
# `html` is a requests.Response, not a Scrapy response, so build the selector
# from the page text through the documented `text=` keyword.
sel = Selector(text=html.text)

In [9]:
 # True when no element carries the inline "text-align: justify;" style.
 len(sel.xpath("*//*[@style='text-align: justify;']")) == 0

True

In [16]:
# Same selectors that PunchScraper.parse uses, applied to the manually fetched page.
title = sel.css("h1.post_title::text").get()  # Text of the first matching headline, or None
content = sel.css("div.entry-content")  # Container of the article body
page_sum = content.xpath(".//*[@style='text-align: justify;']")  # Elements styled as justified text

In [7]:
# Display the value currently bound to `title`.
title

'Maradona’s son wants Messi’s jersey retired'

In [17]:
# Serialise every selected element to its HTML string for inspection.
page_sum.getall()

['<p style="text-align: justify;">Diego Maradona’s eldest son has said he wants to see the No. 10 jersey retired at Barcelona and the other clubs his father played for.</p>',
 '<p style="text-align: justify;">Argentina legend Maradona, who died on November 25 from a heart attack, played for several clubs throughout his career – most notably Boca Juniors, Barcelona and Napoli.</p>',
 '<p style="text-align: justify;">When asked if all No. 10 jerseys should be retired, as suggested by Marseille manager Andre Villas-Boas, Diego Maradona Jr. told <em>Marca</em>, “In the teams that he (Maradona) played for, I believe so, including Barcelona. Without a doubt.”</p>',
 '<p style="text-align: justify;">In 2000, Napoli retired the No. 10 shirt in honour of Maradona, who scored 115 goals in 259 games and helped the Italian club clinch their only two Serie A titles in 1987 and 1990, as well as the UEFA Cup in 1989.</p>',
 '<p style="text-align: justify;">After scoring, the Argentina captain took of

In [10]:
a = []