In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from collections import defaultdict
from urllib.request import Request, urlopen
from newspaper import Article

## Define Scrapers

In [2]:
def news_download(link):
    """Download an article with newspaper and return its title and summary.

    Parameters
    ----------
    link : str
        URL of the article to fetch.

    Returns
    -------
    tuple of (str, str)
        ``(title, summary)`` as extracted by newspaper, or ``('', '')`` when
        downloading, parsing or summarising the article fails.
    """
    article = Article(link)

    try:
        article.download()
        article.parse()
        article.nlp()

        title, summary = article.title, article.summary

    # Best effort: if an article cannot be downloaded (e.g. it requires
    # authorization), skip it and return empty strings.
    # Catch Exception rather than using a bare except so that a kernel
    # interrupt (KeyboardInterrupt) or SystemExit still stops a long run.
    except Exception:
        title, summary = '', ''

    return title, summary

In [3]:
def scrape_news(url, topic):
    """Scrape one topic listing page into a DataFrame of articles.

    Parameters
    ----------
    url : str
        Address of the topic listing page.
    topic : str
        Label written to the ``Topic`` column of every returned row.

    Returns
    -------
    pandas.DataFrame
        One row per listing entry that has a link, with columns ``link``,
        ``title``, ``date``, ``summary`` and ``Topic``. The columns are
        present even when no entry is found. ``summary`` is ``''`` when the
        article could not be downloaded or summarised, and ``date`` is ``''``
        when the entry carries no date element.

    Raises
    ------
    urllib.error.URLError
        If the listing page itself cannot be fetched.
    """
    # Send a browser-like User-Agent with the request
    # (presumably the site rejects urllib's default one -- TODO confirm).
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    # The context manager closes the HTTP response once the body is read.
    with urlopen(req) as response:
        page = response.read().decode('utf-8')
    soup = BeautifulSoup(page, 'html.parser')

    records = []
    for entry in soup.find_all(class_='subSleeve'):
        anchor = entry.find('a')
        link = anchor.get('href') if anchor is not None else None
        if not link:
            # Nothing to follow for this entry; skip it instead of letting
            # one malformed teaser abort the whole topic.
            continue

        date_tag = entry.find(class_='date')

        # The listing's own title is kept; only the summary is taken from
        # the downloaded article.
        try:
            _, summary = news_download(link)
        # Exception (not a bare except) so Ctrl-C can still stop the run.
        except Exception:
            summary = ''

        records.append({
            'link': link,
            'title': anchor.get_text(),
            'date': date_tag.get_text() if date_tag is not None else '',
            'summary': summary,
        })

    # Explicit columns keep the schema stable when the page has no entries.
    df = pd.DataFrame(records, columns=['link', 'title', 'date', 'summary'])
    df['Topic'] = topic

    return df

## Define a Dictionary for Topics and Links

In [14]:
# Topic label -> Compliance Week topic listing URL.
# Every URL shares one prefix, so only the trailing slug is spelled out per topic.
cw_dict = {
    label: 'https://www.complianceweek.com/topics/' + slug
    for label, slug in (
        ('Regulatory Enforcement', 'regulatory-enforcement'),
        ('Regulatory Policy', 'regulatory-policy'),
        ('Accounting & Auditing', 'accounting-and-auditing'),
        ('Risk Management', 'risk-management'),
        ('Anti-Corruption', 'anti-corruption'),
        ('Data Privacy', 'data-privacy'),
        ('Cybersecurity', 'cyber-security'),
        ('Technology', 'technology'),
        ('Boards & Shareholders', 'boards-and-shareholders'),
        ('Internal Controls', 'internal-controls'),
        ('Investigations', 'investigations'),
        ('Financial Services', 'financial-services'),
    )
}

# Run the Scraper

In [15]:
# Scrape every topic page, then stack the per-topic frames with a single
# concat call instead of re-copying a growing frame on each iteration.
# Each frame keeps its own index, so row labels repeat across topics.
topic_frames = [scrape_news(url, topic) for topic, url in cw_dict.items()]
news_df = pd.concat(topic_frames)

In [20]:
# Summarise the combined frame: entry count, columns, non-null counts and dtypes.
news_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 585 entries, 0 to 49
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   link     585 non-null    object
 1   title    585 non-null    object
 2   date     585 non-null    object
 3   summary  585 non-null    object
 4   Topic    585 non-null    object
dtypes: object(5)
memory usage: 27.4+ KB


# Unify format

In [39]:
# Add a constant source column and two all-NaN placeholder columns.
news_df['Website']='Compliance Week'
news_df['Paragrah'] = np.nan
news_df['Image URL'] = np.nan
# Put the columns in the positional order that the rename below relies on.
news_df = news_df[['Website', 'title', 'link', 'Paragrah', 'Image URL', 'date', 'summary', 'Topic']]
# Positional rename: every label of `df` except its last, followed by 'Topic'.
# All eight labels set above are replaced here.
# NOTE(review): `df` is not defined in any cell shown in this notebook (the
# only `df` visible is local to scrape_news), so on a fresh kernel this line
# raises NameError. Presumably it is a previously built frame whose columns
# hold the target names -- confirm, and load or define it explicitly.
news_df.columns = list(df.columns[:-1]) + ['Topic']
#news_df.to_csv('compliance_week_1112.csv', index=False)