In [244]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
import time
import re
import string

### Call `scrape_website` to retrieve one page of article headlines.

In [None]:
# Module-level accumulators filled by `scrape_website`.
# Author and publication date, taken from each statement footer.
authors, dates = [], []
# Statement text and the absolute URL of its fact-check article.
statements, urls = [], []
# Source name and the absolute URL of the source's page.
sources, source_urls = [], []
# Alt text of the image inside each statement's meter element.
targets = []

def scrape_website(page_number:int):
    ''' Scrape one listing page of fact-checks and store what it finds.

    Results are appended to the module-level lists ``authors``, ``dates``,
    ``statements``, ``urls``, ``sources``, ``source_urls`` and ``targets``.
    Nothing is returned.

    Parameters
    ----------
    page_number: int
        Page number.

    Raises
    ------
    ValueError
        If a statement footer does not contain the '•' separator.
    IndexError
        If a footer has fewer than three tokens after the separator, or a
        quote/meta element contains no link.
    '''

    domain = 'https://www.politifact.com'
    URL = domain + '/factchecks/list/?page=' + str(page_number)
    # Bounded timeout, consistent with the other scrapers in this notebook,
    # so a stalled connection cannot hang the run indefinitely.
    webpage = requests.get(URL, timeout=120)
    soup = BeautifulSoup(webpage.text, 'html.parser')

    statement_footer = soup.find_all('footer', attrs={'class':'m-statement__footer'}) # Author and date
    statement_quote = soup.find_all('div', attrs={'class':'m-statement__quote'}) # Statement
    statement_meta = soup.find_all('div', attrs={'class':'m-statement__meta'}) # Source
    target = soup.find_all('div', attrs={'class':'m-statement__meter'}) # Target

    for footer in statement_footer:
        # Footer tokens: one leading word, the author's name, '•', then
        # month, day and year.
        tokens = footer.text.split()
        sep_index = tokens.index('•')
        # Take every token between the leading word and the separator, so
        # names of any length are kept whole instead of exactly two tokens.
        full_name = ' '.join(tokens[1:sep_index])
        month = tokens[sep_index + 1]
        day = tokens[sep_index + 2]
        year = tokens[sep_index + 3]

        authors.append(full_name)
        dates.append(month + ' ' + day + ' ' + year)

    for quote in statement_quote:
        # The first link in the quote carries both the text and the article href.
        quote_link = quote.find_all('a')[0]
        statements.append(quote_link.text.strip())
        urls.append(domain + quote_link.get('href'))

    for meta in statement_meta:
        # The first link in the meta element names the source and links to its page.
        source_link = meta.find_all('a')[0]
        sources.append(source_link.text.strip())
        source_urls.append(domain + source_link.get('href'))

    for meter in target:
        # The stored value is the alt text of the meter image.
        rating = meter.find('div', attrs = {'class':'c-image'}).find('img').get('alt')
        targets.append(rating)

### Call `scrape_article` to retrieve full data related to the article.

In [None]:
# Site root; `scrape_article` prefixes it to the category links it collects.
domain = 'https://www.politifact.com'

# Module-level accumulators filled by `scrape_article`: a call that runs to
# completion appends one entry to each list.
highlights, articles, references = [], [], []
categories, category_urls = [], []

def scrape_article(URL:str):
    ''' Scrape and store relevant information about one article.

    Appends one entry to each of the module-level lists ``highlights``,
    ``articles``, ``references``, ``categories`` and ``category_urls``.
    Nothing is returned.

    NOTE: entries are appended as each part is parsed, so an exception raised
    midway leaves these lists with different lengths.

    Parameters
    ----------
    URL: str
        URL of an article.

    Raises
    ------
    AttributeError
        If the page has no ``m-textblock`` or ``m-superbox__content`` article
        element, or a category list item contains no link.
    '''
    webpage = requests.get(URL, timeout=120)
    soup = BeautifulSoup(webpage.text, 'html.parser')

    # The 'short-on-time' box is optional; record None when it is absent.
    tldr = soup.find_all('div', attrs={'class':'short-on-time'})
    highlights.append(tldr[0].text.strip() if tldr else None)

    # Article body: all paragraphs joined into a single string.
    paragraphs = soup.find('article', attrs={'class':'m-textblock'}).find_all('p')
    articles.append(" ".join(p.text.strip() for p in paragraphs))

    # References: keep the part of each paragraph before its first comma;
    # paragraphs without a comma are skipped.
    refs = soup.find('article', attrs={'class':'m-superbox__content'}).find_all('p')
    reference = []
    for r in refs:
        # Split the stripped text itself. Taking the comma position from the
        # unstripped text would shift the cut by any leading whitespace.
        head, comma, _ = r.text.strip().partition(',')
        if comma:
            reference.append(head)
    references.append(reference)

    # Categories: the text and absolute URL of every list item on the page.
    cats = soup.find_all('li', attrs={'class': 'm-list__item'})
    categories.append([cat.text.strip() for cat in cats])
    category_urls.append([domain + cat.find('a').get('href') for cat in cats])


### Call `scrape_source_score` to retrieve the scorecard of an article's source.

In [287]:
# Module-level accumulators filled by `scrape_source_score`: a call that runs
# to completion appends one integer to each list.
src_true, src_mostly_true, src_half_true = [], [], []
src_mostly_false, src_false, src_pants_on_fire = [], [], []

def scrape_source_score(URL:str):
    ''' Fetch a source's page and record its scorecard tallies.

    The first integer in each ``m-scorecard__checks`` paragraph is read. The
    six values are appended, in page order, to ``src_true``,
    ``src_mostly_true``, ``src_half_true``, ``src_mostly_false``,
    ``src_false`` and ``src_pants_on_fire``. Nothing is returned.

    Parameters
    ----------
    URL: str
        URL of a source.
    '''
    response = requests.get(URL, timeout=120)
    page = BeautifulSoup(response.text, 'html.parser')

    tallies = []
    for entry in page.find_all('p', attrs={'class':'m-scorecard__checks'}):
        numbers = re.findall(pattern='[0-9]+', string=entry.text.strip())
        tallies.append(int(numbers[0]))

    # Exactly six tallies are expected; the unpacking raises ValueError
    # otherwise, before any list is modified.
    (n_true, n_mostly_true, n_half_true,
     n_mostly_false, n_false, n_pants_on_fire) = tallies

    src_true.append(n_true)
    src_mostly_true.append(n_mostly_true)
    src_half_true.append(n_half_true)
    src_mostly_false.append(n_mostly_false)
    src_false.append(n_false)
    src_pants_on_fire.append(n_pants_on_fire)

In [300]:
# Load the dataset from CSV and preview its first rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which suggests the
# file was written with its index included. Consider index_col=0 here, after
# confirming that nothing downstream relies on that column.
df = pd.read_csv('../data/politifact_plus.csv')
df.head()

Unnamed: 0,author,headline,source,date,target,highlight,article,references,categories,category_urls,src_true,src_mostly_true,src_half_true,src_mostly_false,src_false,src_pants_on_fire
0,Ciara O'Rourke,"“The airport in Salzburg, Austria, has a count...",Instagram posts,"October 30, 2023",False,This rumor stems from an ad. No such counter e...,A social media post poised to encourage people...,"instagram_post,the_local,washington_post,x_post","transportation,facebook_fact_checks,instagram_...","['https://www.politifact.com/transportation/',...",5,3,16,54,480,157
1,Ciara O'Rourke,Video shows Palestinians pretending to be corp...,Viral image,"October 30, 2023",False,This video is 10 years old and shows student p...,The Gaza Health Ministry has said the Palestin...,"instagram_post,instagram_post,youtube","israel,facebook_fact_checks,viral_image","['https://www.politifact.com/israel/', 'https:...",4,13,35,53,764,340
2,Loreben Tuquero,The life span of a wind tower generator lasts ...,Facebook posts,"October 30, 2023",False,A study by energy industry experts showed that...,Let’s clear the air. Do wind turbine component...,"none,x_post,facebook_post,interview,interview,...","climate_change,energy,facebook_fact_checks,fac...","['https://www.politifact.com/climate-change/',...",24,50,108,247,1532,595
3,Ciara O'Rourke,Matthew Perry died because of a COVID-19 vaccine.,Instagram posts,"October 30, 2023",False,"Actor Matthew Perry died Oct. 28, but his caus...","Actor Matthew Perry died Oct. 28, setting off ...","instagram_post,instagram_post,instagram_post,i...","facebook_fact_checks,coronavirus,instagram_posts",['https://www.politifact.com/facebook-fact-che...,5,3,16,54,480,157
4,Jill Terreri,A discrepancy in the number of ballots and vot...,New York Citizens Audit,"October 29, 2023",False,New York Citizens Audit is comparing certified...,The claim is startling: New York’s election re...,"interview,interview,interview,new_york_citizen...","elections,new_york,new_york_citizens_audit","['https://www.politifact.com/elections/', 'htt...",0,0,0,0,1,0
