In [33]:
import os
import pandas as pd
from dotenv import load_dotenv
from newsdataapi import NewsDataApiClient

In [2]:
# Populate the process environment through python-dotenv, then read the
# NewsData key stored under "newsdata_api" and build the API client with it.
load_dotenv()
NEWSDATA_API = os.environ.get("newsdata_api")
api = NewsDataApiClient(apikey=NEWSDATA_API)

In [62]:
# Search keywords, grouped by pillar.
# Each entry is sent to the API as an exact phrase, so spelling matters and
# every entry must be unique (a duplicate costs one extra request per source).
keywords = [

    # Agents
    "government",
    "president",
    "prime minister",
    "minister",
    "secretary",
    "army",
    "opposition",
    "congress",
    "senate",
    "assembly",
    "parliament",
    "legislature",
    "supreme court",
    "judiciary",
    "judicial system",
    "ombudsman",
    "civil society",
    "public officer",
    "political party",
    "political parties",
    "world justice project",
    "v-dem",
    "magistrate",
    "comptroller",
    "transparency international",
    "freedom house",
    "human rights watch",
    "amnesty international",
    "police",
    "judge",
    "court",
    "defense attorney",

    # Accountability and Corruption
    "accountability",
    "oversight",
    "corrupt",
    "corruption",
    "bribery",
    "graft",
    "fraud",
    "patronage",
    "embezzlement",
    "lobbying",
    "nepotism",
    "misappropriation",
    "government contract",
    "transparent",
    "transparency",
    "disclosure",
    "audit",

    # Authoritarianism
    "abuse of power",
    "authoritarian",
    "authoritarianism",
    "populism",
    "populist",

    # Elections
    "elections",
    "vote",
    "ballot",

    # Freedom and rights
    "censor",
    "persecution",
    "freedom",
    "media",
    "human right",
    "protest",
    "demonstration",
    "liberty",
    "liberties",
    "liberal",
    "equality",
    "due process",
    "discrimination",
    "discriminatory",
    "bias",
    "expression",
    "labor rights",
    "property rights",
    "labor unions",
    "free media",
    "immigrants",
    "immigration",
    "asylum",

    # Institutions
    "constitution",
    "legislative",
    "democracy",
    "democratic",
    "rule of law",
    "governance",
    "impartial",
    "consumer protection",
    "legality",
    "investigation",

    # Justice
    "judicial independence",
    "civil justice",
    "justice",
    "judicial",
    "prosecution",
    "appeal",
    "dispute resolution",
    "alternative justice",
    "public trial",
    "trial",
    "criminal justice",
    "criminal",

    # Regulation
    "regulatory enforcement",
    "regulation",
    "regulatory",
    "administrative proceedings",
    "presumption of innocence",
    "proceedings",
    "expropriate",
    "expropriation",
    "compliance",
    "permit",

    # Security
    "security",
    "crime",
    "safety",
    "homicide",
    "violence",
    "kidnapping",
    "extortion",
]

In [38]:
# Defining a function to fetch news articles
def newsfetch(key, source):
    """
    Fetch every page of archive results for one keyword from one source.

    The keyword is sent as an exact (double-quoted) phrase to
    ``api.archive_api`` and the request is restricted to ``source`` through
    the ``domainurl`` argument. Pages are followed through the ``nextPage``
    token of each response until the token is missing or a page comes back
    empty.

    Parameters:
        key: String. Keyword(s) to search for in the NewsData API.
        source: Value passed as ``domainurl`` on every request, i.e. the news
            source the search is restricted to.

    Returns:
        pandas.DataFrame with the "results" entries of all pages stacked in
        request order under a fresh 0..N-1 index. Empty when nothing matched.
    """

    print(f"Searching articles in {source}")
    print(f"Searching news articles for: {key}")

    query = f'"{key}"'
    frames = []
    next_page = None
    counter = 0

    while True:
        counter += 1
        print("Page no. " + str(counter) + " of results")

        # The source filter is repeated on every page: the page token only
        # marks the position in the result set, it does not carry the filters.
        request = {"q": query, "domainurl": source}
        if next_page is not None:
            request["page"] = next_page
        response = api.archive_api(**request)

        # Read the payload directly from the response mapping so an empty page
        # cannot fail on a positional lookup.
        results = response.get("results") or []
        if results:
            frames.append(pd.DataFrame(results))

        next_page = response.get("nextPage")
        if not results or not next_page:
            break

    if not frames:
        return pd.DataFrame()

    # Single concatenation at the end instead of growing a frame page by page.
    return pd.concat(frames, ignore_index=True)

In [None]:
# One archive search per (newspaper, keyword) pair, newspapers in the outer loop.
# NOTE(review): `newspapers` is not defined in the cells visible here — confirm
# it is assigned before this cell runs on a fresh kernel.
test = [
    newsfetch(key=keyword, source=domain)
    for domain in newspapers
    for keyword in keywords
]