In [33]:
import os
import pyarrow as pa
import pyarrow.parquet as pq
import pandas as pd
from dotenv import load_dotenv
from newsdataapi import NewsDataApiClient

In [2]:
# Loading API KEY from environment
load_dotenv()
NEWSDATA_API = os.getenv("newsdata_api")
api = NewsDataApiClient(apikey = NEWSDATA_API)

In [62]:
# Defining keywords by pillar dictionary
keywords_pillar = {
    "pillar_1" : [
        "accountability",
        "abuse of power",
        "constitution",
        "oversight",
        "president",
        "prime minister",
        "minister",
        "secretary",
        "army",
        "opposition",
        "supreme court",
        "judiciary",
        "ruling",
        "congress",
        "senate",
        "assembly",
        "parliament",
        "judicial system",
        "government",
        "legislature",
        "ombudsman",
        "elections",
        "civil society",
        "judicial independence",
        "public officer",
        "government official",
        "authoritarian",
        "authoritarianism",
        "democracy",
        "democratic",
        "rule of law",
        "censor",
        "persecution",
        "political party",
        "political parties",
        "world justice project",
        "impartial",
        "v-dem",
        "magistrate",
        "elections",
        "vote"
    ],
    "pillar_2" : [
        "corrupt",
        "corruption",
        "bribery",
        "accountability",
        "embezzlement",
        "lobbying",
        "nepotism",
        "misappropiation",
        "government contract",
        "comptroller"
    ],
    "pillar_3" : [
        "open government",
        "transparent",
        "transparency",
        "disclosure",
        "government contract",
        "regulation",
        "right to information",
        "comptroller",
        "audit",
        "transparency international"
    ],
    "pillar_4" : [
        "freedom",
        "media",
        "human rights",
        "liberty",
        "liberties",
        "liberal",
        "equality",
        "due process",
        "discrimination",
        "discriminatory",
        "bias",
        "labor rights",
        "labor unions",
        "free media",
        "journalist",
        "presumption of innocence",
        "civil society",
        "rights",
        "immigrants",
        "political organizations",
        "collective bargaining",
        "collective action",
        "consumer protection",
        "legality",
        "freedom house",
        "human rights watch",
        "amnesty international"
    ],
    "pillar_5" : [
        "order",
        "law",
        "peace",
        "peaceful",
        "security",
        "crime",
        "safety",
        "homicide",
        "violence",
        "kidnapping",
        "extortion",
        "war"
    ],
    "pillar_6" : [
        "regulatory enforcement",
        "regulation",
        "regulatory",
        "administrative proceedings",
        "proceedings",
        "expropiate",
        "expropiation",
        "compliance",
        "permit"

    ],
    "pillar_7" : [
        "civil justice",
        "justice",
        "civil",
        "police",
        "judge",
        "court",
        "ruling",
        "law",
        "judiciary",
        "judicial",
        "prosecutor",
        "prosecution",
        "impartial",
        "appeal",
        "dispute resolution",
        "alternative justice",
        "public trial",
        "trial",
        "deffense attorney"
    ],
    "pillar_8" : [
        "criminal justice",
        "justice",
        "criminal",
        "prosecutor",
        "prosecution",
        "police",
        "judge",
        "court",
        "ruling",
        "law",
        "judiciary",
        "judicial",
        "prosecutor",
        "appeal",
        "dispute resolution",
        "alternative justice",
        "public trial",
        "trial",
        "deffense attorney"
    ]
}

# Keywords list
keywords = [y for x in keywords_pillar.values() for y in x]
keywords = list(set(keywords))


In [60]:
# Newspapers
newspapers = [
    "wsj.com",
    "usatoday.com",
    "latimes.com",
    "portlandtribune.com",
    "nytimes.co",
    "bostonglobe.com",
    "chicagotribune.com",
    "seattletimes.com",
    "denverpost.com",
    "indianapolisrecorder.com",
    "tampabay.com",
    "washingtonpost.com",
    "post-gazette.com",
    "phoenixnewtimes.com",
    "houstonchronicle.com",
    "stltoday.com",
    "sfchronicle.com",
    "azcentral.com",
    "inquirer.com",
    "jsonline.com",
    "miamiherald.com",
    "sandiegouniontribune.com",
    "bostonherald.com",
    "oregonlive.com",
    "cincinnati.com",
    "chicago.suntimes.com"
]

In [38]:
# Defining a function to fetch news articles
def newsfetch(key, source):
    """
    This function collects a specific keyword and uses the NewsData API to scrap
    news articles from their massive archive.

    Parameters:
        key: String. Keyword(s) to search for in the NewsData API.
    """
    
    print(f"Searching articles in {source}")
    print(f"Searching news articles for: {key}")

    counter = 1
    print("Page no. " + str(counter) + " of results")

    # Fetching news articles
    response = api.archive_api(q = f'"{key}"',
                               domainurl = source)
    data     = pd.DataFrame(response)
    results  = pd.DataFrame(response["results"])

    if data.empty == False:
        np = data.loc[0,"nextPage"]
        while np is not None:
            counter  = counter + 1
            print("Page no. " + str(counter) + " of results")
            response = api.archive_api(q    = f'"{key}"',
                                    page = np)
            data     = pd.DataFrame(response)
            results  = pd.concat([results, pd.DataFrame(response["results"])])
            
            np = data.loc[0,"nextPage"]

    return results

In [64]:
test = [newsfetch(key=k, source=n) for n in newspapers for k in keywords]

Searching articles in wsj.com
Searching news articles for: law
Page no. 1 of results
Searching articles in wsj.com
Searching news articles for: abuse of power
Page no. 1 of results
Searching articles in wsj.com
Searching news articles for: rule of law
Page no. 1 of results
Searching articles in wsj.com
Searching news articles for: freedom house
Page no. 1 of results
Searching articles in wsj.com
Searching news articles for: deffense attorney
Page no. 1 of results
Searching articles in wsj.com
Searching news articles for: court
Page no. 1 of results
Searching articles in wsj.com
Searching news articles for: presumption of innocence
Page no. 1 of results
Searching articles in wsj.com
Searching news articles for: rights
Page no. 1 of results
Searching articles in wsj.com
Searching news articles for: bias
Page no. 1 of results
Searching articles in wsj.com
Searching news articles for: constitution
Page no. 1 of results
Searching articles in wsj.com
Searching news articles for: peaceful
Pag

NewsdataException: {'status': 'error', 'results': {'message': 'Access Denied! Please upgrade your plan to access the archive endpoint.', 'code': 'AccessDenied'}}

In [65]:
test

[Empty DataFrame
 Columns: []
 Index: [],
 Empty DataFrame
 Columns: []
 Index: [],
                           article_id  \
 0   2ec9cb3752ace5c2603130290d2c194e   
 1   f30d9d2572670380d719dfa1291794d3   
 2   ada49ed7848dfea1eff641017a4e0088   
 3   1a1df11febd7cbe07c868c40f3ca78e1   
 4   30c150e2e68a9f92e52b63816f0e195d   
 5   0e4dac9bdbcd3a2ad7c6c02e08b39803   
 6   9f16dd2378700b3e07b140e9ab65ca56   
 7   e18ee28167de0cd33e8a6863e786486e   
 8   6a7ea96ab6f8aef8a4339674e2a0b855   
 9   22c419a3bbc3796ec10dae2c449de642   
 10  f6b4cbef45c14815c7756936334f75cc   
 
                                                 title  \
 0   Presidential centers from Hoover to Bush and O...   
 1   Teen in stolen car leads police on 132 mph cha...   
 2   Niger’s president vows democracy will prevail ...   
 3   Sweden moves closer to NATO membership after d...   
 4   SCOTUS unfreezes Louisiana redistricting case ...   
 5   With Trump under indictment, House GOP calls o...   
 6   Donald Trum