In [7]:
import requests
from bs4 import BeautifulSoup

In [8]:
class ElComercioScrapper():
    diary = "El Comercio"
    base_url_elComercio = "https://elcomercio.pe"
    url_coronavirus_news = "https://elcomercio.pe/noticias/coronavirus-peru/"
    soup = []
    articles = []
    articles_soup = []
    
    def __init__(self, limit = 5):
        self.limit = limit
    
    def fetch_news(self):
        page = requests.get(self.url_coronavirus_news)
        self.soup = BeautifulSoup(page.content, 'html.parser')
    
    def fetch_articles(self):
        articles_soup = self.soup.findAll("div", {"class": "story-item"})
        self.articles_soup = articles_soup[:self.limit]
        
    def get_data_from_articles(self, story):
        time = story.find("p", {"class": "story-item__date"})
        title = story.find("a", {"class": "story-item__title"})
        img_src = story.find("img")
        subtitle = story.find("p", {"class": "story-item__subtitle"})
        
        return {
            "time": time.get_text(),
            "title": title.get_text(),
            "img_src": img_src.get('data-src'),
            "subtitle": subtitle.get_text(),
            "article_url": self.base_url_elComercio + title.get('href')
        }
    
    def build_json_articles(self):
        json_list = []
        for article in self.articles_soup:
            article_json = self.get_data_from_articles(article)
            json_list.append(article_json)
        
        self.articles = json_list
    
    def get_articles(self):
        self.fetch_news()
        self.fetch_articles()
        self.build_json_articles()
        
        return {
            "diary": self.diary,
            "articles": self.articles
        }

In [9]:
class RPPScrapper():
    diary = "RPP"
    base_url_rpp = "https://rpp.pe"
    url_coronavirus_news = "https://rpp.pe/noticias/coronavirus?ref=rpp"
    soup = []
    articles = []
    articles_soup = []
    
    def __init__(self, limit = 5):
        self.limit = limit
    
    def fetch_news(self):
        page = requests.get(self.url_coronavirus_news)
        self.soup = BeautifulSoup(page.content, 'html.parser')
    
    def fetch_articles(self):
        articles_soup = self.soup.findAll("article")
        self.articles_soup = articles_soup[:self.limit]
        
    def get_data_from_articles(self, story):
        time = story.find("time", {"class": "x-ago"})
        title = story.find("h2").find("a")
        img_src = story.find("figure")
        subtitle = story.find("p")
        #print(img_src)
        return {
            "time": time.get("data-x"),
            "title": title.get_text(),
            "img_src": self.get_internal_image(title.get('href')),
            "subtitle": subtitle.get_text(),
            "article_url": title.get('href')
        }
    
    def get_internal_image(self, article_url):
        article = requests.get(article_url)
        article_soup = BeautifulSoup(article.content, 'html.parser')
        picture = article_soup.find("div", {"class": "cover"})
        img = picture.find("img")
        return img.get('src')
    
    def build_json_articles(self):
        json_list = []
        for article in self.articles_soup:
            article_json = self.get_data_from_articles(article)
            json_list.append(article_json)
        
        self.articles = json_list
    
    def get_articles(self):
        self.fetch_news()
        self.fetch_articles()
        self.build_json_articles()
        
        return {
            "diary": self.diary,
            "articles": self.articles
        }

In [57]:
class LaRepublicaScrapper():
    diary = "La República"
    base_url_laRepublica = "https://larepublica.pe"
    url_coronavirus_news = "https://larepublica.pe/tag/coronavirus-en-peru/"
    soup = []
    articles = []
    articles_soup = []
    
    def __init__(self, limit = 5):
        self.limit = limit
    
    def fetch_news(self):
        page = requests.get(self.url_coronavirus_news)
        self.soup = BeautifulSoup(page.content, 'html.parser')
    
    def fetch_articles(self):
        articles_soup = self.soup.findAll("article", {"class": "PostSection"})
        self.articles_soup = articles_soup[:self.limit]
        
    def get_data_from_articles(self, story):
        time = story.find("span", {"class": "PostSectionListSPAN"})
        title = story.find("a", {"class": "PostSectionListA"})
        img_src = story.find("img")
        subtitle = story.find("p", {"class": "PostSectionContent"})
        
        article_url = self.base_url_laRepublica + title.get('href')
        
        return {
            "time": time.get_text(),
            "title": title.get_text(),
            "img_src": self.get_internal_image(article_url),
            "subtitle": subtitle.get_text(),
            "article_url": article_url
        }
    
    def get_internal_image(self, article_url):
        article = requests.get(article_url)
        article_soup = BeautifulSoup(article.content, 'html.parser')
        try:
            picture = article_soup.find("picture")
            img = picture.find("source", {"media": "(max-width: 767px)"})
            return img.get('srcset')
        except:
            return ""
    
    def build_json_articles(self):
        json_list = []
        for article in self.articles_soup:
            article_json = self.get_data_from_articles(article)
            json_list.append(article_json)
        
        self.articles = json_list
    
    def get_articles(self):
        self.fetch_news()
        self.fetch_articles()
        self.build_json_articles()
        
        return {
            "diary": self.diary,
            "articles": self.articles
        }

In [58]:
class Peru21Scrapper():
    diary = "Perú21"
    base_url_peru21 = "https://peru21.pe"
    url_coronavirus_news = "https://peru21.pe/noticias/coronavirus/"
    soup = []
    articles = []
    articles_soup = []
    
    def __init__(self, limit = 5):
        self.limit = limit
    
    def fetch_news(self):
        page = requests.get(self.url_coronavirus_news)
        self.soup = BeautifulSoup(page.content, 'html.parser')
    
    def fetch_articles(self):
        articles_soup = self.soup.findAll("div", {"class": "story-item"})
        self.articles_soup = articles_soup[:self.limit]
        
    def get_data_from_articles(self, story):
        time = story.find("p", {"class": "story-item__date"})
        title = story.find("a", {"class": "story-item__title"})
        img_src = story.find("img")
        subtitle = story.find("p", {"class": "story-item__subtitle"})
        
        return {
            "time": time.get_text(),
            "title": title.get_text(),
            "img_src": img_src.get('data-src'),
            "subtitle": subtitle.get_text(),
            "article_url": self.base_url_peru21 + title.get('href')
        }
    
    def build_json_articles(self):
        json_list = []
        for article in self.articles_soup:
            article_json = self.get_data_from_articles(article)
            json_list.append(article_json)
        
        self.articles = json_list
    
    def get_articles(self):
        self.fetch_news()
        self.fetch_articles()
        self.build_json_articles()
        
        return {
            "diary": self.diary,
            "articles": self.articles
        }

In [63]:
class CorreoScrapper():
    diary = "Correo"
    base_url_correo = "https://diariocorreo.pe"
    url_coronavirus_news = "https://diariocorreo.pe/noticias/coronavirus/"
    soup = []
    articles = []
    articles_soup = []
    
    def __init__(self, limit = 5):
        self.limit = limit
    
    def fetch_news(self):
        page = requests.get(self.url_coronavirus_news)
        self.soup = BeautifulSoup(page.content, 'html.parser')
    
    def fetch_articles(self):
        articles_soup = self.soup.findAll("div", {"class": "story-item"})
        self.articles_soup = articles_soup[:self.limit]
        
    def get_data_from_articles(self, story):
        title = story.find("a", {"class": "story-item__title"})
        subtitle = story.find("p", {"class": "story-item__subtitle"})
        article_url = self.base_url_correo + title.get('href')
        img_src, time = self.get_internal_data(article_url)
        
        return {
            "time": time,
            "title": title.get_text(),
            "img_src": img_src,
            "subtitle": subtitle.get_text(),
            "article_url": self.base_url_correo + title.get('href')
        }
    
    def get_internal_data(self, article_url):
        article = requests.get(article_url)
        article_soup = BeautifulSoup(article.content, 'html.parser')
        try:
            picture = article_soup.find("picture")
            img = picture.find("source", {"media": "(max-width: 767px)"}).get('srcset')
        except:
            img = ""
        time = article_soup.find("time").get("datetime")

        return img, time
    
    def build_json_articles(self):
        json_list = []
        for article in self.articles_soup:
            article_json = self.get_data_from_articles(article)
            json_list.append(article_json)
        
        self.articles = json_list
    
    def get_articles(self):
        self.fetch_news()
        self.fetch_articles()
        self.build_json_articles()
        
        return {
            "diary": self.diary,
            "articles": self.articles
        }

In [64]:
class Extractor():
    
    articles = []
    
    def select_daily(self, diary):
        newspapers = {
            "el_comercio": 'el_comercio',
            "la_republica": 'la_republica',
            "rpp": 'rpp',
            "peru21": "peru21",
            "correo": "correo"
        }
        
        if diary == "all":
            return newspapers
        
        return newspapers.get(diary, "Diario seleccionado invalido")
    
    
    def scrape_articles(self, selected_daily, limit):
        if selected_daily == "el_comercio":
            elComercio_scrapper = ElComercioScrapper(limit)
            elComercio_data = elComercio_scrapper.get_articles()
            self.articles.append(elComercio_data)
        if selected_daily == "la_republica":
            laRepublica_scrapper = LaRepublicaScrapper(limit)
            laRepublica_data = laRepublica_scrapper.get_articles()
            self.articles.append(laRepublica_data)
        if selected_daily == "rpp":
            rpp_scrapper = RPPScrapper(limit)
            rpp_data = rpp_scrapper.get_articles()
            self.articles.append(rpp_data)
        if selected_daily == "peru21":
            peru21_scrapper = Peru21Scrapper(limit)
            peru21_data = peru21_scrapper.get_articles()
            self.articles.append(peru21_data)
        if selected_daily == "correo":
            correo_scrapper = CorreoScrapper(limit)
            correo_data = correo_scrapper.get_articles()
            self.articles.append(correo_data)


    def get_articles(self, selected_daily, limit):
        print(selected_daily, limit)
        self.scrape_articles(selected_daily, limit)
        return self.articles

In [65]:
class NewScrapper(Extractor):
    
    def __init__(self, diary, limit= 5):
        self.diary = diary
        self.limit = limit
        self.selected_daily = self.select_daily(diary)

    def get_news(self):
        articles = self.get_articles(self.selected_daily, self.limit)
        return articles

In [69]:
x = NewScrapper("rpp")
x.get_news()

rpp 5


[{'diary': 'La República',
  'articles': [{'time': '29 Mar 2020 | 3:51 h',
    'title': 'Coronavirus en Perú: Infarto económico y recuperación',
    'img_src': 'https://larepublica.pe/resizer/xO-UB8vkOl7vVEFFDTtszXgWm9Q=/482x290/top/smart/arc-anglerfish-arc2-prod-gruporepublica.s3.amazonaws.com/public/DWJ3LXLOVVCWTFISD6U3TVZIW4.png',
    'subtitle': 'Para atacar la crisis económica desatada por el Covid-19 es necesario distinguir entre dos etapas. La primera estará dominada por la paralización económica interna y el congelamiento masivo de flujos de ingresos en hogares y empresas, y requerirá políticas excepcionales de alivio. La segunda estará marcada por un menor gasto agregado y la persistencia del shock externo, y demandará de medidas más tradicionales de estímulo. Ambas exigirán actuar decididamente para evitar quiebras y la ruptura de la cadena de pagos.',
    'article_url': 'https://larepublica.pe/economia/2020/03/29/coronavirus-en-peru-infarto-economico-y-recuperacion/'},
   {'

In [143]:
gaa

''