# Scraping News Data

In [1]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd


In [2]:
class NewsScraper():
    """Scrape article metadata and body text from a news site's section pages.

    Attributes:
        url: Base URL of the site; section paths and relative article links
            are appended to it.
        sections: Iterable of section paths (appended to ``url``) to scrape.
        news_df: DataFrame accumulating one row per scraped article with the
            columns ``Titles``, ``Content``, ``Original Summary``, ``Link``
            and ``Section``.
    """

    # Seconds to wait on each HTTP request before giving up; without a
    # timeout a stalled connection would block the scrape forever.
    REQUEST_TIMEOUT = 10

    def __init__(self, url, sections):
        self.url = url
        self.sections = sections
        self.news_df = pd.DataFrame()

    @staticmethod
    def retrieve_news_articles_info(url, section):
        """Collect link, title and summary for each promo on a section page.

        Fetches ``url + section`` and inspects every ``div`` with class
        ``gs-c-promo``. Only promos whose link is relative (contains neither
        ``https://`` nor ``http://``) are kept; the link is made absolute by
        prefixing ``url``. Promos without an anchor, an ``href`` or an ``h3``
        heading are skipped instead of raising.

        Args:
            url: Base URL of the site.
            section: Section path appended to ``url``.

        Returns:
            A ``(links, titles, summaries)`` tuple of equally long lists.
            A summary is ``np.nan`` when the promo has no ``p`` element.
        """
        response = requests.get(url + section, timeout=NewsScraper.REQUEST_TIMEOUT)
        soup = BeautifulSoup(response.text, features='lxml')

        links, titles, summaries = [], [], []
        for article in soup.find_all('div', {'class': 'gs-c-promo'}):
            anchor = article.find('a')
            heading = article.find('h3')
            if anchor is None or heading is None:
                continue

            link = anchor.get('href')
            if not link or "https://" in link or "http://" in link:
                continue

            summary = article.find('p')

            # Append all three together so the lists can never get out of step.
            links.append(url + link)
            titles.append(heading.text)
            summaries.append(np.nan if summary is None else summary.text)

        return links, titles, summaries

    @staticmethod
    def retrieve_news_articles_content(links):
        """Download each article and return its concatenated body text.

        Args:
            links: Iterable of absolute article URLs.

        Returns:
            A list with one string per link, built from every ``div`` whose
            ``data-component`` attribute is ``text-block``. The string is
            empty when the page has no such blocks.
        """
        contents = []
        for link in links:
            response = requests.get(link, timeout=NewsScraper.REQUEST_TIMEOUT)
            soup = BeautifulSoup(response.text, features='lxml')
            blocks = soup.find_all('div', {"data-component": "text-block"})
            contents.append(''.join(block.getText() for block in blocks))
        return contents

    def preprocess_data(self, section):
        """Scrape one section, clean the rows and append them to ``news_df``.

        Rows with missing or empty content are dropped, exact duplicates on
        (``Titles``, ``Content``, ``Link``) are removed, and a ``Section``
        column holding ``section`` is added.

        Args:
            section: Section path appended to ``self.url``.
        """
        link, title, summary = self.retrieve_news_articles_info(self.url, section)
        content = self.retrieve_news_articles_content(link)
        df = pd.DataFrame({"Titles": title, "Content": content, "Original Summary": summary, "Link": link})

        has_content = df["Content"].notna() & (df["Content"] != "")
        df = (
            df[has_content]
            .drop_duplicates(["Titles", "Content", "Link"])
            .reset_index(drop=True)
            .assign(Section=section)
        )

        # Concatenating empty frames is deprecated in recent pandas, so only
        # non-empty pieces are passed to concat.
        frames = [frame for frame in (self.news_df, df) if not frame.empty]
        if frames:
            self.news_df = pd.concat(frames, ignore_index=True)
        else:
            # Nothing scraped so far: keep the empty frame for its columns.
            self.news_df = df

    def get_news_data(self):
        """Scrape every section in ``self.sections`` into ``self.news_df``."""
        for section in self.sections:
            self.preprocess_data(section)

In [3]:
# Section paths to scrape; each one is appended to the base URL below.
topics = [
    "/news/business",
    "/news/technology",
    "/news/science_and_environment",
    "/news/world",
    "/news/stories",
    "/news/entertainment_and_arts",
]

# Build the scraper for the BBC site and collect all sections into news_df.
bbc_news = NewsScraper("https://www.bbc.com", topics)
bbc_news.get_news_data()

# Last expression of the cell, so the notebook renders the resulting frame.
bbc_news.news_df

Unnamed: 0,Titles,Content,Original Summary,Link,Section
0,UK orders Chinese firm to sell Welsh tech plant,The takeover of Britain's largest microchip pl...,The UK government says Nexperia's takeover of ...,https://www.bbc.com/news/uk-wales-63656816,/news/business
1,"Work long hours or leave, Musk tells Twitter s...",Elon Musk has told Twitter staff that they mus...,Elon Musk says workers at the social media fir...,https://www.bbc.com/news/business-63648505,/news/business
2,UK Chancellor to unveil spending cuts and tax ...,Chancellor Jeremy Hunt will pledge to face int...,Pensions and benefits will rise with prices bu...,https://www.bbc.com/news/uk-politics-63656522,/news/business
3,What is behind the big tech companies' job cuts?,The first sign of job cuts at Amazon came from...,Thousands of redundancies have been announced ...,https://www.bbc.com/news/technology-63635821,/news/business
4,UK food delivery firm Deliveroo quits Australia,UK-based food delivery app Deliveroo says it i...,The firm struggled to compete with rivals as w...,https://www.bbc.com/news/business-63645765,/news/business
...,...,...,...,...,...
81,Wagatha play won't 'punch down' at Rooney and ...,"""The Wagatha Christie case has mystery, suspen...",,https://www.bbc.com/news/entertainment-arts-63...,/news/entertainment_and_arts
82,The Bollywood actress caught up in a 'gifts sc...,A court in India has granted bail to Bollywood...,,https://www.bbc.com/news/world-asia-india-6362...,/news/entertainment_and_arts
83,Michelle Obama: Being kind to myself is a chal...,Michelle Obama has admitted she struggles with...,,https://www.bbc.com/news/entertainment-arts-63...,/news/entertainment_and_arts
84,BBC marks centenary of first radio broadcast,The BBC is celebrating the centenary of its fi...,,https://www.bbc.com/news/entertainment-arts-63...,/news/entertainment_and_arts


In [4]:
# Persist the scraped articles to a CSV file in the working directory.
bbc_news.news_df.to_csv(path_or_buf="bbc_news.csv")