# Data Extraction on Guardian news
1. From the returned NewsAPI dataset, we only get the dataset with published date, headline, and url of news.
2. With beautiful soup, scrape the informative content of the whole news article
3. The news source include CNN, Reuters, Washington news, BBC, Bloomberg, The Wall Street Journal
3. all extracted news are stored in respective dataframe

In [2]:
import os
import json
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import re

def process_date(date):
    
    if len(date.split(' ')) < 3:
        return date+' 2019'
    elif len(date.split(' ')) > 3:
        return ' '.join(date.split(' ')[-3:])
    else:
        return date
    
def process_time(time):
    if 'Updated' in time:
        return ' '.join(time.strip().split(' ')[1:4])
    else:
        return ' '.join(time.strip().split(' ')[:3])

def date_format(date):
    DATE_FORMATS = ['%B %d, %Y', '%b. %d, %Y', '%b %d, %Y', '%B %d %Y']
    for a in DATE_FORMATS:
        try:
            date_time = pd.to_datetime(date, format = a)
        except:
            continue
    return date_time

def several_news(soup):
    regex = re.compile('.*timeDelta.*')
    lst1, lst2, lst3 = ([] for i in range(3))
    headline = soup.find_all("h2")[2:]
    time = soup.find_all("span", {'class': regex})
    for aa in headline:
        content = aa.findNext('div')
        content_1 = content.find_all("p")
        content_2 = ' '.join([a.text for a in content_1])
        lst3.append(content_2)
    for a, b in zip(headline, time):
        lst1.append(a.text)
        lst2.append(''.join(b.text.split('ET, ')[1:]))
    return lst1, lst2, lst3

def cnn_crawl(cnn_lst, path):
    print('start crawling cnn news')
    headline_lst, date_lst, all_content_lst = ([] for i in range(3))
    for file in os.listdir(path):
        if file in cnn_lst:
            with open(path+file, 'r') as news:
                data = news.read()
            soup = BeautifulSoup(data, features="html.parser")
            try:
                headline = soup.find("h1", {"class": "pg-headline"}).get_text()
            except:
                lst1, lst2, lst3 = several_news(soup)
                headline_lst = lst1+headline_lst
                date_lst = lst2+date_lst
                all_content_lst = lst3+all_content_lst
                continue
            time = soup.find("p", {"class": "update-time"}).get_text()
            time1 = ''.join(time.split(',')[-2:])
            time_date = ' '.join(time1.split(' ')[2:-1])
            time_date = process_date(time_date)
            regex = re.compile('^zn-body__paragraph')
            try:
                content = soup.find("div", {"class": "pg-rail-tall__body"})
                content_head = content.find("p").get_text()
                content_rr = content.find_all("div", {"class": regex})
            except:
                content = soup.find("div", {"class": "pg-special-article__body"})
                try:
                    content_head = content.find("p", {"class": regex}).get_text()
                    content_rr = content.find_all("div", {"class": regex})
                except:
                    content = soup.find("div", {"class": "pg-special-article__wrapper"})
                    content_head = content.find("div", {"class": 'pg-special-article__body'}).get_text()
                    content_rr = content.find_all("p", {"class": regex})

            content_lst = [a.text for a in content_rr]
            all_content = ' '.join(content_lst)
            cnn_content = content_head+' '+all_content
            headline_lst.append(headline)
            date_lst.append(time_date)
            all_content_lst.append(cnn_content)
            
    print('finish crawling cnn news')        
    CNN_news = pd.DataFrame({
            'headline': headline_lst,
            'date': date_lst,
            'content': all_content_lst})
    CNN = CNN_news.loc[(CNN_news['date'].str.len() > 3) & (CNN_news['headline'].str.len() > 3)]
    CNN['date'] = CNN['date'].apply(date_format)
    return CNN
        
def reuters_crawl(reu_news, path):
    print('start crawling reuters news')
    headline_lst, date_lst, all_content_lst = ([] for i in range(3))
    for file in os.listdir(path):
        if file in reu_news:

            with open(path+file, 'r') as news:
                data = news.read()
            soup = BeautifulSoup(data, features="html.parser")
            try:
                headline = soup.find("h1", {"class": "ArticleHeader_headline"}).get_text()
            except:
                continue
            time = soup.find("div", {"class": "ArticleHeader_date"}).get_text()
            time_date = time.split('/')[0]
            content_div = soup.find("div", {"class": "StandardArticleBody_body"})
            content = content_div.find_all("p")
            content_lst = [tag.text for tag in content if len(tag.attrs) < 1]
            all_content = ' '.join(content_lst)
            headline_lst.append(headline)
            date_lst.append(time_date[:-1])
            all_content_lst.append(all_content)
    Reuters_news1 = pd.DataFrame({
            'headline': headline_lst,
            'date': date_lst,
            'content': all_content_lst})
    Reuters_news = Reuters_news1.loc[Reuters_news1['content'].str.len() > 3]
    Reuters_news['date'] = pd.to_datetime(Reuters_news['date'], format='%B %d, %Y')
    print('finish crawling reuters news')
    return Reuters_news
            
def washington_crawl(washin_news, path):
    print('start crawling washington news')
    headline_lst, date_lst, all_content_lst = ([] for i in range(3))
    filter_string=['RELATED LINKS', 'And here are a few more good reads', 'Coming Up', 'Email', 'https://', 'http://', 'AP Photo', 'Coming soon']
    for file in os.listdir(path):
        if file in washin_news:

            with open(path+file, 'r') as news:
                data = news.read()
            soup = BeautifulSoup(data, features="html.parser")
            try:
                headline_div = soup.find("div", {"class": "topper-headline"})
                headline = headline_div.find('h1').get_text()
            except:
                continue
            time_div = soup.find("div", {"id": "article-body"})
            time = time_div.find('span', {"class": "author-timestamp"})['content']
            time_date = time.split('T')[0]
            content_div = soup.find("div", {"id": "article-body"})
            content = content_div.find_all("p")
            content_lst = [tag.text for tag in content if (len(tag.attrs) <= 1) and (not tag.text.startswith("Copyright")) and not any (e in tag.text for e in filter_string)]
            all_content = ' '.join(content_lst)

            headline_lst.append(headline)
            date_lst.append(time_date)
            all_content_lst.append(all_content)
    Washington_news = pd.DataFrame({
            'headline': headline_lst,
            'date': date_lst,
            'content': all_content_lst})
    print('finish crawling washington news')
    Washington_news['date'] = pd.to_datetime(Washington_news['date'])
    return Washington_news        
            
def BBC_crawl(bbc_news, path):
    print('start crawling BBC news')
    headline_lst, date_lst, all_content_lst = ([] for i in range(3))
    filter_string = 'Text by'

    for file in os.listdir(path):
        if file in bbc_news:

            with open(path+file, 'r') as news:
                data = news.read()
            soup = BeautifulSoup(data, features="html.parser")
            try:
                headline = soup.find("h1", {"class": "story-body__h1"}).get_text()
            except:
                continue
            time = soup.find("div", {"class": "date date--v2"}).get_text()
            content_div = soup.find("div", {"class": "story-body__inner"})
            content_p = content_div.find_all("p")
            content= ''.join([tag.text for tag in content_p if not filter_string in tag.text])
            headline_lst.append(headline)
            date_lst.append(time)
            all_content_lst.append(content)
    BBC_news = pd.DataFrame({
            'headline': headline_lst,
            'date': date_lst,
            'content': all_content_lst})
    print('finish crawling BBC news')
    BBC_news['date'] = pd.to_datetime(BBC_news['date'], format='%d %B %Y')
    return BBC_news

def bloomberg_crawl(bloom_news, path):
    print('start crawling bloomberg news')
    headline_lst, date_lst, all_content_lst = ([] for i in range(3))
    filter_string = ['Illustration', 'Photographer', "@bloomberg.net"]

    for file in os.listdir(path):
        if file in bloom_news:

            with open(path+file, 'r') as news:
                data = news.read()
            soup = BeautifulSoup(data, features="html.parser")
            try:
                time = soup.find("time", {"class": "article-timestamp"})['datetime']
            except:
                continue
            time_date = time.split('T')[0]
            try:
                headline = soup.find("h1", {"class": "lede-text-v2__hed"}).get_text()
                content_div = soup.find("div", {"class": "body-copy-v2 fence-body"})
                content = content_div.find_all("p")
                content_head = ''.join([tag.text for tag in content if not any (e in tag.text for e in filter_string)])
                try:
                    header_div = soup.find_all("div", {"class": "abstract-v2__item-text"})
                    sub_header = '. '.join([tag.text.strip() for tag in header_div])
                except:
                    sub_header=''
                try:
                    sub_content_li = content_div.find_all("li")
                    sub_content = ''.join([tag.text for tag in sub_content_li])
                except:
                    sub_content=''
                cont = sub_header+' '+content_head+' '+sub_content

            except:
                try:
                    headline = soup.find("h1", {"class": "lede-text-only__hed"}).get_text()
                    content_div = soup.find("div", {"class": "body-columns"})
                    content = content_div.find_all("p")
                    content_head = ''.join([tag.text for tag in content if not any (e in tag.text for e in filter_string)])
                    try:
                        bottom_content = content_div.find("span", {"class": "bottom-line__text"}).get_text()
                    except:
                        bottom_content=''
                    cont = content_head+' '+bottom_content
                except:
                    continue
            headline_lst.append(headline)
            date_lst.append(time_date)
            all_content_lst.append(cont)
    print('finish crawling bloomberg news')       
    Bloomberg_news = pd.DataFrame({
            'headline': headline_lst,
            'date': date_lst,
            'content': all_content_lst})
    Bloomberg_news['date'] = pd.to_datetime(Bloomberg_news['date'])
    return Bloomberg_news

def WSJ_crawl(wsj_news, path):
    print('start crawling The Wall Street Journal news')
    regex = re.compile('timestamp.*')
    filter_string = ['@wsj.com', 'contributed to this article', 'Please email']
    headline_lst, date_lst, all_content_lst = ([] for i in range(3))

    for file in os.listdir(path):
        if file in wsj_news:

            with open(path+file, 'r') as news:
                data = news.read()
            soup = BeautifulSoup(data, features="lxml")
            try:
                headline = soup.find("h1", {"itemprop": "headline"}).get_text().strip()
            except:
                continue
            time = soup.find("time", {"class": regex}).get_text()
            time_date = process_time(time)
            try:
                header = soup.find('h2', {'class': 'sub-head'}).get_text().strip()
            except:
                try:
                    header = soup.find('h2', {'itemprop': 'description'}).get_text().strip()
                except:
                    pass
            try:
                content_div = soup.find("div", {"class": "article-content"})
                content = content_div.find_all("p")
                head = ''
            except:
                try:
                    content_div = soup.find("div", {"class": 'paywall'})
                    head = content_div.find_previous_sibling('p').get_text()
                    content = content_div.find_all("p")
                except:
                    continue
            content_lst = [tag.text for tag in content if not any (a in tag.text for a in filter_string)]
            all_content = ' '.join(content_lst)
            combined_content = header+' '+head+' '+all_content

            headline_lst.append(headline)
            date_lst.append(time_date)
            all_content_lst.append(combined_content)
        
    WSJ_news = pd.DataFrame({
            'headline': headline_lst,
            'date': date_lst,
            'content': all_content_lst
     })

    WSJ_news['date'] = WSJ_news['date'].apply(date_format)
    print('finish crawling The Wall Street Journal news')
    return WSJ_news

def get_news_lst(path):
#     with open('Data/NewsAPI/finance/index.json', 'r') as f:
    with open(path, 'r') as f:
        data = json.load(f)

    aurl, burl, curl, durl, eurl, furl, gurl = ([] for i in range(7))
    news_dict = dict.fromkeys(['source_news','url'])
    for news in data['articles']:
        if news['source']['name'] == 'CNN':
            html = news['url'].split('https://')[1]
            html1 = html.replace('/',':')
            aurl.append(html1)
            
        elif news['source']['name'] == 'Reuters':
            html = news['url'].split('https://')[1]
            html1 = html.replace('/',':')
            burl.append(html1)
        
        elif news['source']['name'] == 'The Washington Post':
            try:
                html = news['url'].split('https://')[1]
            except:
                html = news['url'].split('http://')[1]
            html1 = html.replace('/',':')
            curl.append(html1)
        
        elif news['source']['name'] == 'BBC News':
            html = news['url'].split('https://')[1]
            html1 = html.replace('/',':')
            durl.append(html1)
        
        elif news['source']['name'] == 'Bloomberg':
            html = news['url'].split('https://')[1]
            html1 = html.replace('/',':')
            eurl.append(html1)
        
        elif news['source']['name'] == 'The Wall Street Journal':
            html = news['url'].split('https://')[1]
            html1 = html.replace('/',':')
            furl.append(html1)
        
        else:
            try:
                html = news['url'].split('http://')[1]
            except:
                html = news['url'].split('https://')[1]
            html1 = html.replace('/',':')
            gurl.append(html1) # Google News
    return aurl, burl, curl, durl, eurl, furl

if __name__ == '__main__':
    path = os.path.join('../../Data', 'NewsAPI/')
    cnn_lst, reu_news, washin_news, bbc_news, bloom_news, wsj_news = get_news_lst(path+'stock/index.json')
    path = os.path.join('../../Data', 'NewsAPI', 'stock', 'WebPage/')
    WSJ_news_df = WSJ_crawl(wsj_news, path)
    bloomberg_news_df = bloomberg_crawl(bloom_news, path)
    bbc_news_df = BBC_crawl(bbc_news, path)
    washin_news_df = washington_crawl(washin_news, path)
    reuters_news_df = reuters_crawl(reu_news, path)
    cnn_news_df = cnn_crawl(cnn_lst, path)

start crawling The Wall Street Journal news
finish crawling The Wall Street Journal news
start crawling bloomberg news
finish crawling bloomberg news
start crawling BBC news
finish crawling BBC news
start crawling washington news
finish crawling washington news


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


start crawling cnn news
finish crawling cnn news


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
