In [1]:
import datetime
import os
import re

import curlify
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from dateutil.parser import parse as date_parse
from IPython.display import display, Markdown
from tqdm.notebook import tqdm

tqdm.pandas()

## Data
### Loading

In [2]:
# Load the fact-check metadata spreadsheet from the working directory.
# (The space before ".xlsx" is part of the actual file name.)
articles = pd.read_excel("Metadata for fact checks .xlsx")
# Display one randomly chosen row as a quick look at the available columns.
articles.sample()

Unnamed: 0,Unnamed: 1,url,title,summary,claimant,verdict,party,topic,sub category,date_published,creator,publisher,license_type,copyright,verdict_simplified
129,5430672.0,https://www.abc.net.au/news/5430672,Fact check: Chris Bowen scaremongering on retu...,Commentary around the Federal Government's pro...,Chris Bowen,Scaremongering,Labor,Economy,,Mon Jun 02 07:00:00 EST 2014,RMIT ABC Fact Check,Australian Broadcasting Corporation,All rights reserved,2000 ABC,Negative


### Formatting date

In [3]:
def getFormattedDate(date):
    """Convert one `date_published` cell into a datetime.

    Parameters
    ----------
    date : datetime.datetime, str, or missing value
        Values that are already datetimes are returned untouched; strings are
        parsed with ``dateutil.parser.parse`` (imported as ``date_parse``).

    Returns
    -------
    datetime.datetime or float
        The parsed datetime, or ``np.nan`` when the cell is empty or cannot
        be parsed.
    """
    if isinstance(date, datetime.datetime):
        return date
    # Empty cells (None / NaN) have nothing to parse; skip the parser entirely.
    if date is None or (isinstance(date, float) and np.isnan(date)):
        return np.nan
    try:
        return date_parse(date)
    # Only parsing failures are mapped to NaN. A bare `except:` would also hide
    # KeyboardInterrupt and genuine programming errors (e.g. a NameError).
    except (ValueError, OverflowError, TypeError):
        return np.nan
    
# Parse the raw `date_published` values into a new `date` column; cells that
# cannot be parsed become NaN.
articles['date'] = articles['date_published'].apply(getFormattedDate)
# Show ten random rows side by side to eyeball the parsed values against the
# originals (both source formats appear in the data).
articles[['date_published', 'date']].sample(10)



Unnamed: 0,date_published,date
67,Mon Nov 04 09:02:21 EST 2013,2013-11-04 09:02:21
62,Wed Oct 23 11:06:02 EST 2013,2013-10-23 11:06:02
173,Mon Nov 24 17:20:45 EST 2014,2014-11-24 17:20:45
194,Mon May 04 11:34:30 EST 2015,2015-05-04 11:34:30
424,"17 Apr 2019, 9:11am",2019-04-17 09:11:00
66,Wed Oct 30 17:41:19 EST 2013,2013-10-30 17:41:19
168,Mon Nov 24 09:37:13 EST 2014,2014-11-24 09:37:13
425,"18 Apr 2019, 6:19am",2019-04-18 06:19:00
55,Mon Oct 21 09:31:30 EST 2013,2013-10-21 09:31:30
45,Thu Sep 26 18:10:39 EST 2013,2013-09-26 18:10:39


In [23]:
# Scratch check of the claim extraction on a single article.
# NOTE: `sendQuery` is defined in a later cell, so on a fresh kernel that cell
# has to be run before this one.
article = sendQuery("https://www.abc.net.au/news/4856762")
# Name the parser explicitly so the parse tree does not depend on which
# optional parsers happen to be installed.
soup = BeautifulSoup(article, "html.parser")
# Raw string: "\s" in a normal string literal is an invalid escape sequence.
claim_label = re.compile(r'The claim:\s*')
claim = (
    soup.find('strong', string=claim_label)  # `string=` replaces the deprecated `text=`
    .parent.text.replace("The claim: ", "")
)

In [29]:
# One session shared by every request in the notebook, so connections are
# reused and cookies set by a server are kept for later calls.
s = requests.Session()


def sendQuery(url, params=None, cookies=None, timeout=30):
    """Send a GET request through the shared session and return the body text.

    Parameters
    ----------
    url : str
        Address to fetch.
    params : dict, optional
        Query-string parameters; requests percent-encodes them, so pass raw
        (unencoded) values.
    cookies : dict, optional
        Extra cookies for this request only; they are merged with whatever the
        session already holds.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang a long scraping loop indefinitely.

    Returns
    -------
    str
        The decoded response body. The HTTP status is not checked here.

    Raises
    ------
    requests.RequestException
        On connection problems or when `timeout` is exceeded.
    """
    # Going through `s.get` (rather than sending a hand-prepared request) lets
    # the session apply its own state: stored cookies and environment settings.
    response = s.get(url, params=params, cookies=cookies, timeout=timeout)
    return response.text

def _extractLabelledText(soup, label_pattern):
    """Return the text of the element whose bold label matches, or NaN if absent."""
    label = re.compile(label_pattern)
    tag = soup.find('strong', string=label)
    if tag is None or tag.parent is None:
        return np.nan
    # Strip the label with the same pattern that located it, so every variant
    # the pattern accepts (e.g. "The claims:") is removed, not just one spelling.
    return label.sub("", tag.parent.text)


def getClaimVerdict(url):
    """Fetch a fact-check article and pull out its claim and verdict text.

    Parameters
    ----------
    url : str
        Address of the article page.

    Returns
    -------
    pandas.Series
        Two values, ``[claim, verdict]``. A value is NaN when its bold label
        ("The claim:" / "The claims:" or "The verdict:") is not found on the
        page; both are NaN when the page cannot be downloaded.
    """
    try:
        article = sendQuery(url)
    # Only network failures are tolerated; a bare `except:` would also swallow
    # KeyboardInterrupt, making a long scrape impossible to stop.
    except requests.RequestException:
        return pd.Series([np.nan, np.nan])

    # Explicit parser keeps results independent of which parsers are installed.
    soup = BeautifulSoup(article, "html.parser")
    # The two fields are extracted independently, so a page missing one label
    # still yields the other.
    claim = _extractLabelledText(soup, r'The claim[s]{0,1}:\s*')
    verdict = _extractLabelledText(soup, r'The verdict:\s*')
    return pd.Series([claim, verdict])
    
# Download every article (one request per row, with a progress bar) and store
# the extracted claim / verdict text in two new columns.
articles[["claim", "verdict_complete"]] = articles["url"].progress_apply(getClaimVerdict)

  0%|          | 0/488 [00:00<?, ?it/s]

## NewsBank
### Setting requests

In [6]:
# Cookies for the NewsBank requests. The "oamps" value is a session token, i.e.
# a credential, so it is read from the environment instead of being stored in
# the notebook: set NEWSBANK_OAMPS_TOKEN before starting the kernel. An unset
# variable yields an empty cookie, and the proxy will then treat the request
# as unauthenticated.
cookies = {
    "_ga":"GA1.2.1611251493.1660786621",
    "_gat":"1",
    "_gid": "GA1.2.1611251493.1660786621",
    "oamps": os.environ.get("NEWSBANK_OAMPS_TOKEN", ""),
}


# NewsBank search endpoint, reached through the OpenAthens proxy host.
url = "https://infoweb-newsbank-com.ap1.proxy.openathens.net/apps/news/results?"
# Query parameters shared by every NewsBank search; getNumberArticles fills in
# the per-row values on a copy of this dict.
base_params = {
    "p": "AWNB",
    "fld-base-0": "alltext",
    "val-base-0": "",
    # Raw value on purpose: requests percent-encodes parameters itself, so the
    # pre-encoded "YMD_date%3AD" was sent double-encoded as "YMD_date%253AD".
    "sort": "YMD_date:D",
    "maxresults": 1000,
    "t": ""
}

### Parameters

In [5]:
def getDateRange(date):
    """Format the four-week window centred on `date` as one search string.

    The result looks like ``"MM/DD/YYYY - MM/DD/YYYY"``: two weeks before
    `date`, then two weeks after it.

    Note: a later cell defines another `getDateRange` that returns a tuple
    instead; once that cell has run, this version is shadowed.
    """
    half_window = datetime.timedelta(weeks=2)
    bounds = (date - half_window, date + half_window)
    return " - ".join(day.strftime("%m/%d/%Y") for day in bounds)


def getNumberArticles(row):
    """Count the NewsBank search results for one fact-check row.

    Parameters
    ----------
    row : pandas.Series
        A row of `articles`; its "title" is used as the search text and its
        "date" (when present) restricts the search to the surrounding window.

    Returns
    -------
    int
        Number of <article> elements on the results page. The request asks for
        at most `base_params["maxresults"]` results, so the count is capped.

    Notes
    -----
    Expects the `getDateRange` version that returns a single string; a later
    cell redefines that name to return a tuple.
    """
    params = base_params.copy()
    # Bracket access: attribute access (`row.title`) only works as long as no
    # Series attribute of the same name exists.
    params["val-base-0"] = row["title"]

    if not pd.isnull(row["date"]):
        # The boolean connector only makes sense together with the second
        # clause, so it is added here rather than unconditionally.
        params["bln-base-1"] = "and"
        params["fld-base-1"] = "YMD_date"
        params["val-base-1"] = getDateRange(row["date"])

    # Send the NewsBank cookies defined in the settings cell; without them the
    # request carries no session for the proxy.
    search_result = sendQuery(url, params, cookies)
    # Explicit parser keeps the count independent of which parsers are installed.
    soup = BeautifulSoup(search_result, "html.parser")
    return len(soup.find_all('article'))

### Scraping

In [6]:
# Run one NewsBank search per row (with a progress bar) and store the number
# of results found for each fact check.
articles["number_articles_newsbank"] = articles.progress_apply(getNumberArticles, axis=1)

  0%|          | 0/488 [00:00<?, ?it/s]

In [45]:
(articles["number_articles_newsbank"]>0).value_counts()

False    416
True      72
Name: number_articles_newsbank, dtype: int64

In [34]:
# Save the table, including the NewsBank counts, to CSV (row index omitted);
# the Google News section below reloads this file.
articles.to_csv("facts_check_news_bank.csv", index=False)

## Google News

In [7]:
# Reload the saved table, replacing the in-memory `articles`; `date` is parsed
# back into datetimes because CSV stores it as text.
articles = pd.read_csv("facts_check_news_bank.csv", parse_dates=["date"])

In [9]:
articles.sample()

Unnamed: 0.1,Unnamed: 0,Unnamed: 2,url,title,summary,claimant,verdict,party,topic,sub category,date_published,creator,publisher,license_type,copyright,verdict_simplified,date,number_articles_newsbank
134,134,5452698.0,https://www.abc.net.au/news/5452698,Foreign aid cuts make up one fifth of budget s...,World Vision Australia chief executive Tim Cos...,Tim Costello,Checks Out,Industry,Foreign relation,Foreign aid,Tue May 20 07:00:00 EST 2014,RMIT ABC Fact Check,Australian Broadcasting Corporation,All rights reserved,2000 ABC,Positive,2014-05-20 07:00:00,3


In [12]:
articles.loc[0]

Unnamed: 0                                                                  0
                                                                    4821544.0
url                                       https://www.abc.net.au/news/4821544
title                       No evidence to support Foreign Minister Bob Ca...
summary                     Foreign Minister Bob Carr claims people arrivi...
claimant                                                             Bob Carr
verdict                                                       Unsubstantiated
party                                                                   Labor
topic                                                             Immigration
sub category                                                              NaN
date_published                                   Wed Aug 14 00:12:38 EST 2013
creator                                                   RMIT ABC Fact Check
publisher                                 Australian Broadcastin

In [None]:
# Draft configuration for querying Google search directly. It is not used by
# the cells below, which go through the GoogleNews package instead.
# The names are distinct from the NewsBank `url` / `base_params` so that
# running this cell cannot change what getNumberArticles sends.
google_url = "https://www.google.com/search?"
google_base_params = {
    "q": ""  # search terms; left empty here and filled in per query
}

In [42]:
from GoogleNews import GoogleNews

# Single client instance shared by all the Google News cells below.
googlenews = GoogleNews()


def getDateRange(date):
    """Return the bounds of the four-week window centred on `date`.

    The result is a ``(start, end)`` tuple of ``MM/DD/YYYY`` strings: two weeks
    before `date` and two weeks after it.

    Note: this redefines the earlier `getDateRange`, which returned a single
    "start - end" string and is what getNumberArticles expects.
    """
    half_window = datetime.timedelta(weeks=2)
    start, end = date - half_window, date + half_window
    date_format = "%m/%d/%Y"
    return start.strftime(date_format), end.strftime(date_format)

# Trial run on the first article: limit the search to the window around its
# publication date, then search for its title.
googlenews.set_time_range(*getDateRange(articles["date"][0]))
googlenews.search(articles["title"][0])

In [40]:
getDateRange(articles["date"][0])  #after:2013-07-31 before:2013-08-28  

('07/31/2013', '08/28/2013')

In [39]:
articles["title"][0]

"No evidence to support Foreign Minister Bob Carr's economic migrants claims"

In [46]:
googlenews.total_count()

3

In [43]:
googlenews.results()

[{'title': 'Foreign aid: how much does Australia spend now?',
  'media': 'The Guardian',
  'date': '2 Aug 2013',
  'datetime': nan,
  'desc': 'Earlier this month, the foreign minister, Bob Carr, told the Guardian there \nwas no alternative to the delay in growth because of low government revenue.',
  'link': 'https://www.theguardian.com/world/2013/aug/22/australian-foreign-aid-policy',
  'img': 'data:image/gif;base64,R0lGODlhAQABAIAAAP///////yH5BAEKAAEALAAAAAABAAEAAAICTAEAOw=='},
 {'title': "Speaking softly means Australia's voice will not be heard on ...",
  'media': 'Human Rights Watch',
  'date': '8 Aug 2013',
  'datetime': nan,
  'desc': 'It seems to have taken an apparent chemical weapons attack killing hundreds \nin Syria to bring human rights into the foreign policy debate of the \nAustralian...',
  'link': 'https://www.hrw.org/news/2013/08/28/speaking-softly-means-australias-voice-will-not-be-heard-human-rights',
  'img': 'data:image/gif;base64,R0lGODlhAQABAIAAAP///////yH5BAEKA

In [58]:
import time
def getNumberNews(row, pause=0.0):
    """Count the Google News results for one fact-check row.

    Parameters
    ----------
    row : pandas.Series
        A row of `articles`; its "title" is the search text and its "date"
        (when present) restricts the search to the surrounding window.
    pause : float, optional
        Seconds to sleep after the search. The default of 0 keeps the original
        pacing; a positive value throttles the loop, which is useful because
        Google answers rapid successive searches with HTTP 429. It can be
        supplied through apply's keyword arguments.

    Returns
    -------
    int
        Number of results returned for this row's search, 0 when there are none.
    """
    # The shared client keeps the results of earlier searches, so without
    # clearing it each row would be credited with stale or cumulative counts
    # (including after a failed request).
    googlenews.clear()
    if not pd.isnull(row["date"]):
        googlenews.set_time_range(*getDateRange(row["date"]))
    # NOTE(review): when the date is missing, the time range set for an earlier
    # row presumably still applies to this search; confirm and reset if needed.
    googlenews.search(row["title"])
    results = googlenews.results()

    if pause:
        time.sleep(pause)

    # NOTE(review): a failed request also ends up as 0 here, indistinguishable
    # from "no coverage".
    return len(results) if results else 0

# Run one Google News search per row (with a progress bar) and store the
# number of results found for each fact check.
articles["number_articles_google"] = articles.progress_apply(getNumberNews, axis=1)

  0%|          | 0/488 [00:00<?, ?it/s]

HTTP Error 429: Too Many Requests
0
HTTP Error 429: Too Many Requests
1
HTTP Error 429: Too Many Requests
2


KeyboardInterrupt: 

In [56]:
googlenews.results()

[{'title': 'Foreign aid: how much does Australia spend now?',
  'media': 'The Guardian',
  'date': '2 Aug 2013',
  'datetime': nan,
  'desc': 'Earlier this month, the foreign minister, Bob Carr, told the Guardian there \nwas no alternative to the delay in growth because of low government revenue.',
  'link': 'https://www.theguardian.com/world/2013/aug/22/australian-foreign-aid-policy',
  'img': 'data:image/gif;base64,R0lGODlhAQABAIAAAP///////yH5BAEKAAEALAAAAAABAAEAAAICTAEAOw=='},
 {'title': "Speaking softly means Australia's voice will not be heard on ...",
  'media': 'Human Rights Watch',
  'date': '8 Aug 2013',
  'datetime': nan,
  'desc': 'It seems to have taken an apparent chemical weapons attack killing hundreds \nin Syria to bring human rights into the foreign policy debate of the \nAustralian...',
  'link': 'https://www.hrw.org/news/2013/08/28/speaking-softly-means-australias-voice-will-not-be-heard-human-rights',
  'img': 'data:image/gif;base64,R0lGODlhAQABAIAAAP///////yH5BAEKA

In [1]:
!pip list

Package                      Version
---------------------------- -----------
absl-py                      1.2.0
anaconda-clean               1.0
argon2-cffi                  21.3.0
argon2-cffi-bindings         21.2.0
asttokens                    2.0.5
astunparse                   1.6.3
attrs                        21.4.0
backcall                     0.2.0
beautifulsoup4               4.11.1
bleach                       5.0.1
cachetools                   5.2.0
certifi                      2022.6.15
cffi                         1.15.1
charset-normalizer           2.1.0
cycler                       0.11.0
debugpy                      1.6.2
decorator                    5.1.1
defusedxml                   0.7.1
entrypoints                  0.4
executing                    0.9.1
fastjsonschema               2.16.1
flatbuffers                  1.12
fonttools                    4.34.4
gast                         0.4.0
google-auth                  2.9.1
google-auth-o

In [3]:
!pip install jupyter_contrib_nbextensions

Defaulting to user installation because normal site-packages is not writeable
Collecting jupyter_contrib_nbextensions
  Using cached jupyter_contrib_nbextensions-0.5.1-py2.py3-none-any.whl (20.9 MB)
Collecting jupyter-contrib-core>=0.3.3
  Using cached jupyter_contrib_core-0.4.0-py2.py3-none-any.whl (17 kB)
Collecting pyyaml
  Using cached PyYAML-6.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (701 kB)
Collecting jupyter-latex-envs>=1.3.8
  Using cached jupyter_latex_envs-1.4.6-py2.py3-none-any.whl
Collecting jupyter-highlight-selected-word>=0.1.1
  Using cached jupyter_highlight_selected_word-0.2.0-py2.py3-none-any.whl (11 kB)
Collecting lxml
  Using cached lxml-4.9.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (6.9 MB)
Collecting jupyter-nbextensions-configurator>=0.4.0
  Using cached jupyter_nbextensions_configurator-0.5.0-py2.py3-none-any.whl (467 kB)


Installing collected packages: jupyter-highlight-selected-word, pyyaml, lxml, jupyter-latex-envs, jupyter-contrib-core, jupyter-nbextensions-configurator, jupyter_contrib_nbextensions
[0mSuccessfully installed jupyter-contrib-core-0.4.0 jupyter-highlight-selected-word-0.2.0 jupyter-latex-envs-1.4.6 jupyter-nbextensions-configurator-0.5.0 jupyter_contrib_nbextensions-0.5.1 lxml-4.9.1 pyyaml-6.0


In [5]:
!jupyter contrib nbextension install --user

[32m[I 15:49:55 InstallContribNbextensionsApp][m jupyter contrib nbextension install --user
[32m[I 15:49:55 InstallContribNbextensionsApp][m Installing jupyter_contrib_nbextensions nbextension files to jupyter data directory
[32m[I 15:49:55 InstallContribNbextensionsApp][m Installing /home/crarojasca/.local/lib/python3.8/site-packages/jupyter_contrib_nbextensions/nbextensions/ruler -> ruler
[32m[I 15:49:55 InstallContribNbextensionsApp][m Up to date: /home/crarojasca/.local/share/jupyter/nbextensions/ruler/ruler_editor.yaml
[32m[I 15:49:55 InstallContribNbextensionsApp][m Up to date: /home/crarojasca/.local/share/jupyter/nbextensions/ruler/icon.png
[32m[I 15:49:55 InstallContribNbextensionsApp][m Up to date: /home/crarojasca/.local/share/jupyter/nbextensions/ruler/readme.md
[32m[I 15:49:55 InstallContribNbextensionsApp][m Up to date: /home/crarojasca/.local/share/jupyter/nbextensions/ruler/main.js
[32m[I 15:49:55 InstallContribNbextensionsApp][m Up to date: /home

[32m[I 15:49:55 InstallContribNbextensionsApp][m Up to date: /home/crarojasca/.local/share/jupyter/nbextensions/keyboard_shortcut_editor/readme_shortcut_editor_success.png
[32m[I 15:49:55 InstallContribNbextensionsApp][m Up to date: /home/crarojasca/.local/share/jupyter/nbextensions/keyboard_shortcut_editor/readme_comma.png
[32m[I 15:49:55 InstallContribNbextensionsApp][m Up to date: /home/crarojasca/.local/share/jupyter/nbextensions/keyboard_shortcut_editor/keyboard_shortcut_editor.yaml
[32m[I 15:49:55 InstallContribNbextensionsApp][m - Validating: [32mOK[0m
[32m[I 15:49:55 InstallContribNbextensionsApp][m Installing /home/crarojasca/.local/lib/python3.8/site-packages/jupyter_contrib_nbextensions/nbextensions/nbTranslate -> nbTranslate
[32m[I 15:49:55 InstallContribNbextensionsApp][m Up to date: /home/crarojasca/.local/share/jupyter/nbextensions/nbTranslate/languages.js
[32m[I 15:49:55 InstallContribNbextensionsApp][m Up to date: /home/crarojasca/.local/share/jup