In [178]:
#for scraping
import requests
from bs4 import BeautifulSoup

#to treat the data
from dateutil.parser import parse
import json

#add data as dataframe and make math calculations
import pandas as pd
import numpy as np


In [200]:
def get_news_url(news_website, section):
    """Collect the link targets found inside the matching sections of a listing page.

    Parameters
    ----------
    news_website : str
        URL of the news / press-release listing page to download.
    section : str
        CSS selector identifying the container element(s) whose links are wanted.

    Returns
    -------
    list of str
        The ``href`` of every ``<a>`` tag inside each matched container, in
        document order. Anchors without an ``href`` are skipped. Duplicates are
        NOT removed here; callers de-duplicate.
    """
    # Plain HTTP request: there is no browser/driver session to close afterwards.
    response = requests.get(news_website)
    soup = BeautifulSoup(response.text, 'lxml')

    url_list = []
    # Loop names are kept distinct from the ``section`` argument on purpose.
    for container in soup.select(section):
        for anchor in container.find_all('a'):
            href = anchor.get('href')
            if href:  # an anchor without href would otherwise add None to the list
                url_list.append(href)
    return url_list

In [180]:
def get_content(webpage, class_name, class_date):
    """Download one press-release page and extract its body text and its date text.

    Parameters
    ----------
    webpage : str
        URL of the press-release page.
    class_name : str
        CSS selector of the element holding the press-release content.
    class_date : str
        CSS selector of the element holding the publication date.

    Returns
    -------
    list
        ``[content_text, date_text]`` taken from the FIRST element matching each
        selector. The date is returned as raw text; parsing is left to the caller.

    Raises
    ------
    IndexError
        If either selector matches nothing on the page.
    """
    # Plain HTTP request: there is no browser/driver session to close afterwards.
    response = requests.get(webpage)
    soup = BeautifulSoup(response.text, 'lxml')

    content_text = soup.select(class_name)[0].get_text()
    date_text = soup.select(class_date)[0].get_text()
    return [content_text, date_text]


### Conservatives Scraping first

In [203]:
#Conservative party of Canada URLs:
#Build the list of news index pages: the landing page plus /page/2 ... /page/20
news_website = "https://www.conservative.ca/news/"
news_website2 = "https://www.conservative.ca/news/page/"
page_num = np.arange(2,21,1) #pages 2 through 20 inclusive (the stop value 21 is excluded)

page_news_list = [news_website] + [news_website2+str(number) for number in page_num]

#loop through the news index pages collecting every link in the news section
url_list = []
for page in page_news_list:
    url_list.extend(get_news_url(page, '.section--news'))

#clean up the list
#dict.fromkeys removes duplicates but keeps first-seen order, so reruns give the same row order
url_list = list(dict.fromkeys(url_list))
#drops missing/empty hrefs and the pagination ("load more") urls
url_list = [item for item in url_list if item and 'page' not in item]

#scrape every press release for its content and its date
#(this point might take a while since it requests each url)
content_scrapping = []
content_date = []
for url in url_list:
    items = get_content(url, '.post-content', '.post-date')
    content_scrapping.append(items[0])
    content_date.append(items[1])

In [204]:
#adds everything to a DataFrame ('party' is a scalar and is repeated on every row)
conservatives_df = pd.DataFrame(data = {'party': 'conservative', 'url': url_list,
                                        'content': content_scrapping, 'date': content_date})
#turns the date strings into actual dates; only the 'date' column is read,
#so apply the parser to that column instead of iterating over whole rows
conservatives_df['date'] = conservatives_df['date'].apply(parse)

#backup
conservatives_df.to_csv('conservatives_df.csv', index = False)

### Liberal Scraping


In [206]:
#Liberal party of Canada URLs:
#Build the list of index pages: the landing page plus page/2 ... page/20
news_website = "https://www.liberal.ca/media-releases/"
page_num = np.arange(2,21,1) #pages 2 through 20 inclusive (the stop value 21 is excluded)

page_news_list = [news_website] + [news_website+'page/'+str(number) for number in page_num]

#collect every link found in the '.home-section' blocks of each index page
liberal_url_list = []
for page in page_news_list:
    liberal_url_list.extend(get_news_url(page, '.home-section'))

#dict.fromkeys removes duplicates but keeps first-seen order, so reruns give the same row order
liberal_url_list = list(dict.fromkeys(liberal_url_list))
#drops missing/empty hrefs, the pagination ("load more") urls,
#and any link that contains 'media-releases' (e.g. the index pages themselves)
liberal_url_list = [item for item in liberal_url_list
                    if item and 'page' not in item and 'media-releases' not in item]

#looping through all the pages (this point might take a while since it requests each url)
content_liberal = []
content_date_liberal = []
for url in liberal_url_list:
    items = get_content(url, '.blog-content', '.byline')
    content_liberal.append(items[0])
    content_date_liberal.append(items[1])

In [210]:
#adds everything to a DataFrame ('party' is a scalar and is repeated on every row)
liberal_df = pd.DataFrame(data = {'party': 'liberal', 'url': liberal_url_list,
                                  'content': content_liberal, 'date': content_date_liberal})
#turns the date strings into actual dates; only the 'date' column is read,
#so apply the parser to that column instead of iterating over whole rows
liberal_df['date'] = liberal_df['date'].apply(parse)

#liberal backup
liberal_df.to_csv('liberal_df.csv', index = False)

### NDP Scraping
Of course they are different -_-. The NDP website doesn't have pagination; luckily, the "load more" button makes a request that returns a JSON file with all the news links, which is what we will be using.

In [211]:
#builds the urls requested by the NDP "load more" button, one url per page number
news_generic = 'https://www.ndp.ca/latest?action_handler=canadandp-home/block--news-list&action=block--news-list--more&json=1&page='
page_num = np.arange(2,15,1) #page numbers 2 through 14 (the stop value 15 is excluded)
#the page number is appended to the end of the generic url
page_news_list = [news_generic+str(number) for number in page_num]

In [212]:
def pull_ndp_news_url(webpage):
    """Return the press-release links listed in one NDP "load more" JSON response.

    Parameters
    ----------
    webpage : str
        URL of the JSON endpoint to request.

    Returns
    -------
    list
        The ``'link'`` value of every entry under the response's ``'list'`` key,
        in the order they appear in the response.
    """
    payload = json.loads(requests.get(webpage).text)
    return [entry['link'] for entry in payload['list']]


In [158]:
#gets the list of all press releases by the NDP
ndp_url_list = []
for page in page_news_list:
    ndp_url_list.extend(pull_ndp_news_url(page))

#removes the repeated urls from ndp
#dict.fromkeys keeps first-seen order, so reruns give the same row order
ndp_url_list = list(dict.fromkeys(ndp_url_list))

#looping through all the pages to get their content
#(this point might take a while since it requests each url)
ndp_content = []
ndp_date = []
for url in ndp_url_list:
    items = get_content(url, '.news2-body', '.news2-date')
    ndp_content.append(items[0])
    ndp_date.append(items[1])

In [213]:
#adds everything to a DataFrame ('party' is a scalar and is repeated on every row)
ndp_df = pd.DataFrame(data = {'party': 'NDP', 'url': ndp_url_list,
                              'content': ndp_content, 'date': ndp_date})
#turns the date strings into actual dates; only the 'date' column is read,
#so apply the parser to that column instead of iterating over whole rows
ndp_df['date'] = ndp_df['date'].apply(parse)

#backup
ndp_df.to_csv('ndp_df.csv', index = False)

In [214]:
#Concatenating the three party frames into a single one
#(the conservative frame is named conservatives_df, with an "s", where it is built)
frames = [conservatives_df, liberal_df, ndp_df]
df = pd.concat(frames)
#backup
df.to_csv('canada_parties_pr.csv', index = False)
df.head()

Unnamed: 0,party,url,content,date
0,conservative,https://www.conservative.ca/a-new-conservative...,"\r\nScheer to restore order, fairness, and com...",2019-10-09
1,conservative,https://www.conservative.ca/the-last-time-just...,\r\nTrudeau can’t be trusted to make life more...,2019-09-30
2,conservative,https://www.conservative.ca/the-hon-andrew-sch...,"\r\nSEPTEMBER 29, 2019\r\nFOR IMMEDIATE RELEAS...",2019-09-29
3,conservative,https://www.conservative.ca/100-day-action-pla...,\r\nFamilies will start saving hundreds of dol...,2019-10-17
4,conservative,https://www.conservative.ca/justin-trudeau-ref...,\r\nTrudeau’s plan means less coverage and hig...,2019-09-23
