In [1]:
import pandas as pd
import os
import requests
import json
import time
from dotenv import load_dotenv

In [2]:
# Helper that renders a whole pandas data frame, without row or column truncation
def showEverything(df):
    """
    Display `df` with pandas' display options temporarily overridden.

    While the frame is rendered there is no limit on rows or columns, the display width is
    1000, floats use a precision of 3 and column headers are left-justified. The previous
    option values are restored when the call finishes.
    """
    display_overrides = {
        'display.max_rows'          : None,
        'display.max_columns'       : None,
        'display.width'             : 1000,
        'display.precision'         : 3,
        'display.colheader_justify' : 'left',
    }

    # option_context takes a flat (name, value, name, value, ...) argument list
    flat_args = [item for pair in display_overrides.items() for item in pair]

    with pd.option_context(*flat_args):
        display(df)

## Loading API Key

In [3]:
# Reading both Newscatcher API keys from the environment, after load_dotenv() has run.
# os.getenv returns None for a variable that is not set.
load_dotenv()
newscatcher_key_v2, newscatcher_key_v3 = (
    os.getenv(key_name)
    for key_name in ("newscatcher_key_v2", "newscatcher_key_v3")
)

## Checking Subscription Plan (only for V3)

In [179]:
# Defining URL and header for the V3 subscription endpoint
url      = "https://v3-api.newscatcherapi.com/api/subscription"
headers  = {"x-api-token" : newscatcher_key_v3}

# Sending the GET. Without a timeout, requests waits indefinitely on a stalled connection,
# which would freeze the kernel; 30 seconds is ample for this small status call.
response = requests.get(url, headers=headers, timeout=30)

# Parsing results
json_format = json.loads(response.text)
print(json_format)

{'active': True, 'concurrent_calls': 1, 'plan': 'v3_nlp', 'plan_calls': 10000, 'remaining_calls': 9036, 'historical_days': 60}


## Loading Keywords and Sources

In [173]:
sources = pd.read_excel("../inputs/EU_sources_1.xlsx")

# Reducing every URL to its bare host name ("https://www.example.com/" -> "example.com").
# The scheme and a leading "www." are removed only at the start of the string, and everything
# from the first "/" (or whitespace) onwards is dropped, so a URL that carries a path does not
# end up with the path glued onto the host ("example.com/news" -> "example.com").
# shortURL is later used as the API `sources` value and as the output file name.
sources["shortURL"] = (
    sources["URL"]
    .replace(r"^\s*(?:https?://)?(?:www\.)?", "", regex=True)
    .replace(r"[/\s].*$", "", regex=True)
)

In [88]:
# Reading the keyword table and reshaping it to long format: "Group" is kept as identifier,
# every other column name goes to "language" and its cell values go to "keyword"
keywords      = pd.read_excel("../inputs/keywords_1.xlsx")
keywords_long = keywords.melt(id_vars = "Group", var_name = "language", value_name = "keyword")

In [93]:
# Reading the institution names; rows without a value in "translation" are discarded
institutions_raw = pd.read_excel("../inputs/country_institutions_1.xlsx")
institutions     = institutions_raw.dropna(subset = ["translation"])

## Gathering News Articles

In [143]:
def gatherKeywords(language, country):
    """
    Build the list of query-styled strings used to search one news source.

    The function reads the module-level `keywords_long` and `institutions` data frames.

    Parameters
    ----------
    language : value compared against keywords_long["language"]
    country  : value compared against institutions["country"]

    Returns
    -------
    list of str
        One query per keyword batch (batches are the sorted unique values of "Group"),
        followed by one query built from the institution names of `country`. A query is a
        sequence of double-quoted terms joined by " OR "; when a batch has no usable term
        for the requested language/country, its query is the empty string.
    """

    def quote_terms(terms):
        # Missing cells (NaN/None) are skipped: they carry no search term and cannot be
        # concatenated with the surrounding quotes
        return " OR ".join('"' + word + '"' for word in terms.dropna())

    output = []

    # Collapsing keyword batches
    language_keywords = keywords_long.loc[keywords_long["language"] == language]
    for batch in sorted(keywords_long["Group"].unique()):
        subset      = language_keywords.loc[language_keywords["Group"] == batch]
        query_style = quote_terms(subset["keyword"])
        # A "/" inside a keyword separates alternatives: each one becomes its own quoted term
        query_style = query_style.replace("/", '" OR "')
        output.append(query_style)

    # Collapsing institutional names
    institutional_names = institutions.loc[institutions["country"] == country]
    output.append(quote_terms(institutional_names["translation"]))

    return output

def _newsRequest(query, source, page, date_0, date_1, v2):
    """
    Return the (url, headers, params) triple for one page of a search call, targeting the
    V2 endpoint when `v2` is truthy and the V3 endpoint otherwise.
    """
    if v2:
        url      = "https://api.newscatcherapi.com/v2/search?"
        headers  = {"x-api-key" : newscatcher_key_v2}
        params   = {
            "q"         : query,
            "sources"   : source,
            "page"      : page,
            "page_size" : 100,
            "from"      : date_0,
            "to"        : date_1,
            "sort_by"   : "date"
        }
    else:
        url      = "https://v3-api.newscatcherapi.com/api/search?"
        headers  = {"x-api-token" : newscatcher_key_v3}
        params   = {
            "q"         : query,
            "sources"   : source,
            "page"      : page,
            "page_size" : 1000,
            "from_"     : date_0,
            "to_"       : date_1,
            "sort_by"   : "date"
        }

    return url, headers, params


def newsFetcher(query, source, date_0 = "7 months ago", date_1 = "2 months ago", v2 = True):
    """
    This function takes a query and a news source as inputs and returns a data frame with all the
    results of that specific query through either V2 or V3 of the Newscatcher API.

    Parameters
    ----------
    query  : query string sent as "q"
    source : news source sent as "sources"
    date_0 : lower bound of the search window ("from" in V2, "from_" in V3)
    date_1 : upper bound of the search window ("to" in V2, "to_" in V3)
    v2     : use the V2 endpoint and key when truthy, the V3 ones otherwise

    Returns
    -------
    pandas.DataFrame or None
        Every page of articles concatenated with a fresh index, or None when nothing was
        collected (including the case of an empty `query`, for which no call is made).

    Raises
    ------
    RuntimeError
        When a response lacks the "total_pages"/"total_hits" counters (e.g. an error payload).

    Notes
    -----
    Pages are requested one per second. When a search reports 10,000 or more hits and its last
    page has been read, the search restarts from page 1 with the upper bound moved to the
    "published_date" of the last article read (presumably because the API does not serve results
    beyond that amount - TODO confirm). Articles on the boundary may therefore appear twice.
    """

    # Creating an empty list to store results
    outputs = []

    # An empty query carries no search term, so no API call is spent on it
    if not query:
        return None

    # Defining initial counters (npage is replaced by the reported "total_pages" after each call)
    page   = 1
    npage  = 100

    while page <= npage:

        # Defining URL, header, and parameters
        url, headers, params = _newsRequest(query, source, page, date_0, date_1, v2)

        # Sending a GET call; the timeout keeps a stalled connection from hanging the whole run
        response = requests.get(url, params=params, headers=headers, timeout=120)
        time.sleep(1) # The API has a restriction of 1 call per second

        # Parsing the response in a JSON format
        json_data = json.loads(response.text)

        # Payloads without the paging counters cannot be processed: fail with the API's own message
        if not isinstance(json_data, dict) or "total_pages" not in json_data or "total_hits" not in json_data:
            raise RuntimeError(
                f"Unexpected API response (HTTP {response.status_code}) for source '{source}': {json_data}"
            )

        # Updating counters
        npage      = json_data["total_pages"]
        total_hits = json_data["total_hits"]
        articles   = json_data.get("articles") or []

        # Nothing (more) to collect: stop instead of requesting the very same page again
        if not total_hits or not articles:
            break

        # Converting results to pandas data frame
        df = pd.DataFrame(articles)
        min_date   = df.iloc[-1]["published_date"]
        outputs.append(df)

        # Increasing/Resetting counts
        if total_hits < 10000 or page < npage:
            page = page + 1
        elif min_date == date_1:
            # The window can no longer shrink; restarting would repeat the same requests forever
            break
        else:
            page   = 1
            date_1 = min_date

    # Merging entire list of data frames
    if outputs:
        results = pd.concat(outputs, ignore_index=True)
    else:
        results = None

    return results

def extractNews(source, country, language, from_ = "7 months ago", to_ = "2 months ago", v2 = True):
    """
    Run every pre-defined query against a single news source.

    The queries are the ones returned by gatherKeywords(language, country). Each of them is
    handed to newsFetcher together with `source`, the `from_`/`to_` window and the `v2` switch.

    Returns a one-entry dictionary {source: {"Batch 1": ..., "Batch 2": ..., ...}} in which
    every inner value is whatever newsFetcher returned for the query in that position.
    """

    print(f"Extracting news articles from: {source}")

    # Results are keyed by the 1-based position of each query
    fetched_by_batch = {}
    position = 0
    for query in gatherKeywords(language, country):
        position += 1
        print(f"===== Extracting articles from Batch #{position}")
        fetched_by_batch[f"Batch {position}"] = newsFetcher(query, source, from_, to_, v2)

    return {source : fetched_by_batch}

def mergeData(dta, version):
    """
    This function takes a list containing all the returned data from the API and writes one
    gzip-compressed parquet file per news source.

    Parameters
    ----------
    dta     : iterable of dictionaries shaped {source: {batch_name: DataFrame or None}}
    version : label inserted in the output folder name "../data/data-extraction-1-{version}"

    Every source name is printed. For a source with at least one non-empty batch, its batches are
    concatenated and saved as "{source}.parquet.gzip" inside the output folder, which is created
    when it does not exist yet. Sources whose batches are all None produce no file.
    """
    output_dir = f"../data/data-extraction-1-{version}"

    for element in dta:
        for source, batches in element.items():
            print(source)
            data_list = [data for data in batches.values() if data is not None]
            if data_list:
                # A missing folder would otherwise abort the save after the API calls were spent
                os.makedirs(output_dir, exist_ok = True)
                master_data = pd.concat(data_list)
                master_data.to_parquet(f"{output_dir}/{source}.parquet.gzip", compression = "gzip")

### Using API V2 to gather news

In [None]:
# V2 run over every row of `sources`, using extractNews' default date window.
# (For a quick trial, slice `sources` first, e.g. with .loc[64:64].)
per_source_v2 = sources.apply(
    lambda row: extractNews(row["shortURL"], row["Country"], row["Language"]),
    axis = 1
)
results_list_v2 = per_source_v2.tolist()

In [None]:
# Saving the V2 results, one parquet file per source
mergeData(dta = results_list_v2, version = "v2")

### Using API V3 to gather news

In [182]:
# V3 run: only sources flagged HP == "Yes", restricted to index labels 110 through 208,
# searched over the window "2 months ago" -> "1 day ago"
hp_sources = sources.loc[sources["HP"] == "Yes"]
hp_chunk   = hp_sources.loc[110:208]

results_list_v3 = hp_chunk.apply(
    lambda row: extractNews(
        row["shortURL"], row["Country"], row["Language"],
        from_ = "2 months ago", to_ = "1 day ago", v2 = False
    ),
    axis = 1
).tolist()

Extracting news articles from: kathimerini.gr
===== Extracting articles from Batch #1
===== Extracting articles from Batch #2
===== Extracting articles from Batch #3
===== Extracting articles from Batch #4
===== Extracting articles from Batch #5
===== Extracting articles from Batch #6
===== Extracting articles from Batch #7
===== Extracting articles from Batch #8
===== Extracting articles from Batch #9
===== Extracting articles from Batch #10
Extracting news articles from: protothema.gr
===== Extracting articles from Batch #1
===== Extracting articles from Batch #2
===== Extracting articles from Batch #3
===== Extracting articles from Batch #4
===== Extracting articles from Batch #5
===== Extracting articles from Batch #6
===== Extracting articles from Batch #7
===== Extracting articles from Batch #8
===== Extracting articles from Batch #9
===== Extracting articles from Batch #10
Extracting news articles from: tanea.gr
===== Extracting articles from Batch #1
===== Extracting articles f

In [183]:
# Saving the V3 results, one parquet file per source
mergeData(dta = results_list_v3, version = "v3")

kathimerini.gr
protothema.gr
tanea.gr
magyarhirlap.hu
magyarnemzet.hu
vaol.hu
nepszava.hu
advertiser.ie
irishexaminer.com
irishtimes.com
independent.ie
corriere.it
lastampa.it
laverita.info
ilfoglio.it
repubblica.it
bnn-news.com
ir.lv
la.lv
db.lv
baltic-review.com
ve.lt
lrytas.lt
wort.lu
lequotidien.lu
tageblatt.lu
timesofmalta.com
independent.com.mt
talk.mt
ad.nl
telegraaf.nl
nrc.nl
volkskrant.nl
trouw.nl
wyborcza.pl
fakt.pl
rp.pl
publico.pt
expresso.pt
jn.pt
adevarul.ro
libertatea.ro
evz.ro
sme.sk
pravda.sk
dennikn.sk
dnevnik.si
delo.si
vecer.com
elpais.com
lavanguardia.com
elmundo.es
abc.es
diariovasco.com
aftonbladet.se
dn.se
svd.se
gp.se


## Compiling and saving data

In [112]:
# Listing the per-source parquet files. os.listdir returns entries in arbitrary order and also
# reports unrelated files (hidden/system files, checkpoints), so the list is filtered by
# extension and sorted: the compilation is then reproducible and read_parquet only sees data files.
files_v2 = sorted(
    x for x in os.listdir("../data/data-extraction-1-v2")
    if x.endswith(".parquet.gzip")
)
data_sources_v2 = [
    pd.read_parquet(f"../data/data-extraction-1-v2/{x}")
    for x in files_v2
]

# Stacking all sources and keeping the first occurrence of every article "_id"
master_v2 = pd.concat(data_sources_v2).drop_duplicates(subset = "_id")
master_v2.to_parquet("../data/master_1v2.parquet.gzip", compression = "gzip")

In [184]:
# Listing the per-source parquet files. os.listdir returns entries in arbitrary order and also
# reports unrelated files (hidden/system files, checkpoints), so the list is filtered by
# extension and sorted: the compilation is then reproducible and read_parquet only sees data files.
files_v3 = sorted(
    x for x in os.listdir("../data/data-extraction-1-v3")
    if x.endswith(".parquet.gzip")
)
data_sources_v3 = [
    pd.read_parquet(f"../data/data-extraction-1-v3/{x}")
    for x in files_v3
]

# Stacking all sources and keeping the first occurrence of every article "id"
master_v3 = pd.concat(data_sources_v3).drop_duplicates(subset = "id")
master_v3.to_parquet("../data/master_1v3.parquet.gzip", compression = "gzip")

In [187]:
# NOTE(review): this call is identical to the save on the last line of the previous cell,
# so it rewrites the same file with the same data frame.
master_v3.to_parquet("../data/master_1v3.parquet.gzip", compression = "gzip")