In [120]:
import pandas as pd
import os
import requests
import json
import time
from dotenv import load_dotenv

## Loading API Key

In [4]:
# Load variables from the local .env file, then read the NewsCatcher API key
# from the environment (the key is None when the variable is not set)
load_dotenv()
newscatcher_key = os.environ.get("newscatcher_key")

## Checking subscription plan

In [130]:
# Defining URL and header for the subscription endpoint
url      = "https://v3-api.newscatcherapi.com/api/subscription"
headers  = {"x-api-token" : newscatcher_key}

# Sending the GET (the timeout keeps the cell from hanging on a stalled connection)
response = requests.get(url, headers=headers, timeout=30)

# Stopping here with the HTTP status if the call failed, instead of
# failing later on an unparsable or unexpected body
response.raise_for_status()

# Parsing results
json_format = response.json()
print(json_format)

{'active': True, 'concurrent_calls': 2, 'plan': 'v3_nlp', 'plan_calls': 30000, 'remaining_calls': 29335, 'historical_days': 60}


## Defining extraction functions

In [157]:
# Article fields selected when displaying search results
cols = [
    "title",
    "published_date",
    "link",
    "domain_url",
    "name_source",
    "language",
    "country",
    "description",
    "content",
    "word_count",
    "is_opinion",
    "id",
]

def newsFetcher(source):
    """
    Fetch a small sample of articles for one source from the NewsCatcher v3
    search endpoint and print a one-line summary of the response.

    Parameters
    ----------
    source : str
        Value sent as the "sources" filter of the search request (the notebook
        passes short domains such as "krone.at").

    Returns
    -------
    dict
        "summary" : list ``[status, total_hits, status]`` read from the response.
                    The status appears twice; the layout is kept unchanged so that
                    code indexing the list (e.g. ``summary[1]``) keeps working.
                    NOTE(review): the third slot was probably meant to hold a
                    different field - confirm before changing it.
        "data"    : pandas DataFrame built from the "articles" list of the response.

    Raises
    ------
    requests.exceptions.RequestException
        If the request times out, cannot be sent, or the API answers with a
        4xx/5xx status code.
    KeyError
        If the response lacks "status", "total_hits" or "articles".
    """

    # Defining URL, header, and parameters
    url      = "https://v3-api.newscatcherapi.com/api/search?"
    params   = {
        "q"         : "*",     # presumably a match-all query - confirm against the API docs
        "sources"   : source,
        "page_size" : 5
    }
    headers  = {"x-api-token" : newscatcher_key}

    # Sending the GET (the timeout keeps a stalled connection from blocking the whole loop)
    response = requests.get(url, params=params, headers=headers, timeout=30)
    time.sleep(2) # The API seems to fail when overwhelmed with calls

    # Surfacing HTTP errors explicitly rather than as a parsing or key error below
    response.raise_for_status()

    # Parsing the response
    json_data = response.json()

    # Status
    status   = json_data["status"]
    nhits    = json_data["total_hits"]

    # Printing summary of results
    print(f"Source: {source}, status: {status}, total hits: {nhits}")

    # Converting to pandas data frame
    df = pd.DataFrame(json_data["articles"])

    outcome = {
        "summary" : [status, nhits, status],
        "data"    : df
    }

    return outcome

def sourceFetcher(source):
    """
    Look up one outlet in the NewsCatcher v3 sources endpoint.

    Prints the queried source, sends it as the "source_url" parameter and
    pauses two seconds after the request.

    Parameters
    ----------
    source : str
        Value sent as "source_url" (the notebook passes short domains such as
        "krone.at").

    Returns
    -------
    str
        The "name_source" of the first entry in the response's "sources" list,
        or the literal "Source not found" when that list is empty.

    Raises
    ------
    requests.exceptions.RequestException
        If the request times out, cannot be sent, or the API answers with a
        4xx/5xx status code.
    KeyError
        If the response has no "sources" field.
    """

    print(f"Source: {source}")

    # Defining URL, header, and parameters
    url      = "https://v3-api.newscatcherapi.com/api/sources?"
    params   = {
        "source_url" : source
    }
    headers  = {"x-api-token" : newscatcher_key}

    # Sending the GET (the timeout keeps a stalled connection from blocking the whole loop)
    response = requests.get(url, params=params, headers=headers, timeout=30)
    time.sleep(2) # The API seems to fail when overwhelmed with calls

    # Surfacing HTTP errors explicitly rather than as a parsing or key error below
    response.raise_for_status()

    # Parsing the response
    json_data = response.json()

    # Preparing the outcome: name of the first match, or a marker when nothing matched
    matches = json_data["sources"]
    if not matches:
        return "Source not found"

    return matches[0]["name_source"]

## Checking sources with the API

In [62]:
# Loading the list of target outlets
sources = pd.read_excel("../inputs/EU_sources_1.xlsx")

# Reducing each URL to its bare host: trim surrounding whitespace, drop the scheme
# and a leading "www.", then keep only what precedes the first "/".
# Removing every "/" instead would glue path segments onto the domain
# ("https://www.example.com/news/" -> "example.comnews").
sources["shortURL"] = (
    sources["URL"]
    .str.strip()
    .str.replace(r"^https?://", "", regex=True)
    .str.replace(r"^www\.", "", regex=True)
    .str.split("/")
    .str[0]
)

In [121]:
# Query the sources endpoint once per outlet and keep the reported name
# (or the "Source not found" marker) alongside each row
sources["newscatcher"] = sources["shortURL"].map(sourceFetcher)

Source: diepresse.com
Source: derstandard.at
Source: krone.at
Source: profil.at
Source: heute.at
Source: kleinezeitung.at
Source: volksblatt.at
Source: vn.at
Source: wienerzeitung.at
Source: kurier.at
Source: neue.at
Source: sn.at
Source: rtbf.be
Source: standaard.be
Source: lalibre.be
Source: lesoir.be
Source: lecho.be
Source: tijd.be
Source: gva.be
Source: nieuwsblad.be
Source: hln.be
Source: demorgen.be
Source: knack.be
Source: lalibre.be
Source: sudinfo.be
Source: lavenir.net
Source: burgasnews.com
Source: varna24.bg
Source: 24chasa.bg
Source: trud.bg
Source: dnevnik.bg
Source: capital.bg
Source: standartnews.com
Source: bta.bg
Source: glas-slavonije.hr
Source: slobodnadalmacija.hr
Source: novilist.hr
Source: 24sata.hr
Source: jutarnji.hr
Source: vecernji.hr
Source: rtl.hr
Source: hrt.hr
Source: nacional.hr
Source: politis.com.cy
Source: philenews.com
Source: cyprus-mail.com
Source: sigmalive.com
Source: archiv.hn.cz
Source: blesk.cz
Source: mfdnes.cz
Source: pravo.cz
Source: denik

In [123]:
# Outlets for which the sources endpoint returned no match
sources[sources["newscatcher"].eq("Source not found")]

Unnamed: 0,Country,Name,City,NUTS,URL,Language,Editorial,API Availability,shortURL,newscatcher
8,Austria,Wiener Zeitung,Vienna,AT1,https://www.wienerzeitung.at/,German,liberal,Yes,wienerzeitung.at,Source not found
20,Belgium,Het Laatste Nieuws,Antwerp,BE2,https://www.hln.be/,Dutch,center-right,Yes,hln.be,Source not found
34,Croatia,Glas Slavonije,Osijek,HR02,https://www.glas-slavonije.hr/,Croatian,,No,glas-slavonije.hr,Source not found
47,Czechia,Hospodářské noviny,Praha,CZ01,https://archiv.hn.cz/,Czech,,No,archiv.hn.cz,Source not found
49,Czechia,Mladá fronta Dnes,Praha,CZ01,https://www.mfdnes.cz/,Czech,,No,mfdnes.cz,Source not found
50,Czechia,Právo,Praha,CZ01,https://www.pravo.cz/,Czech,center-left,No,pravo.cz,Source not found
70,Finland,Helsinki Times,Helsinki,FI1B,https://www.helsinkitimes.fi/,English,,No,helsinkitimes.fi,Source not found
119,Hungary,Magyar Hirlap,Közép-Magyarország,HU1,https://www.magyarhirlap.hu/,Hungarian,conservative,Yes,magyarhirlap.hu,Source not found
150,Latvia,Liesma,\tVidzeme,LV008,https://www.eliesma.lv/,Latvian,,No,eliesma.lv,Source not found
151,Latvia,Latgales Laiks,Latgale,LV005,https://latgaleslaiks.lv/,Latvian,,No,latgaleslaiks.lv,Source not found


In [188]:
# Short URLs of the outlets for which the sources endpoint returned no match
sources_nodata = sources.loc[sources["newscatcher"].eq("Source not found"), "shortURL"].tolist()

## Testing News Articles Search Endpoint

In [158]:
# Run the sample article search for every outlet, in the order of the spreadsheet
# NOTE(review): the same short URL can appear more than once in the sheet, in which
# case it is queried once per occurrence - confirm whether that is intended
target_sources = sources["shortURL"].to_list()
test_results   = list(map(newsFetcher, target_sources))

Source: diepresse.com, status: ok, total hits: 860
Source: derstandard.at, status: ok, total hits: 1060
Source: krone.at, status: ok, total hits: 1850
Source: profil.at, status: ok, total hits: 39
Source: heute.at, status: ok, total hits: 1220
Source: kleinezeitung.at, status: ok, total hits: 1809
Source: volksblatt.at, status: ok, total hits: 632
Source: vn.at, status: ok, total hits: 247
Source: wienerzeitung.at, status: ok, total hits: 1
Source: kurier.at, status: ok, total hits: 955
Source: neue.at, status: ok, total hits: 97
Source: sn.at, status: ok, total hits: 1379
Source: rtbf.be, status: ok, total hits: 1605
Source: standaard.be, status: ok, total hits: 590
Source: lalibre.be, status: ok, total hits: 80
Source: lesoir.be, status: ok, total hits: 153
Source: lecho.be, status: ok, total hits: 281
Source: tijd.be, status: ok, total hits: 326
Source: gva.be, status: ok, total hits: 1972
Source: nieuwsblad.be, status: ok, total hits: 1245
Source: hln.be, status: ok, total hits: 0


In [183]:
# One row per queried outlet with the total hits from its search result
# (the second entry of each result's "summary" list)
summary_list = [result["summary"][1] for result in test_results]
summary_df   = pd.DataFrame(dict(source=target_sources, nhits=summary_list))

In [201]:
# Outlets with fewer than 25 hits, highest counts first
summary_df[summary_df["nhits"].lt(25)].sort_values("nhits", ascending=False)

Unnamed: 0,source,nhits
148,db.lv,21
84,lavoixdunord.fr,17
31,capital.bg,16
171,wyborcza.pl,15
40,rtl.hr,14
61,bt.dk,14
195,primorske.svet24.si,9
194,vecer.com,7
53,berlingske.dk,7
157,tageblatt.lu,5


In [202]:
# Stack the per-outlet article tables into a single frame
# (row labels are not renumbered, so each chunk keeps its own labels)
data_chunks = [result["data"] for result in test_results]
master_data = pd.concat(data_chunks)
# To inspect every row and column, display master_data.loc[:, cols] inside
# pd.option_context('display.max_rows', None, 'display.max_columns', None,
#                   'display.width', 1000, 'display.precision', 3,
#                   'display.colheader_justify', 'left')

In [194]:
# Articles whose domain belongs to an outlet that the sources endpoint did not find
master_data.loc[master_data["domain_url"].isin(sources_nodata), cols]

Unnamed: 0,title,published_date,link,domain_url,name_source,language,country,description,content,word_count,is_opinion,id
0,Firmenpleiten: Flüchten Unternehmen aus der Ve...,2024-02-13 04:05:51,https://www.wienerzeitung.at/a/firmenpleiten-f...,wienerzeitung.at,Wiener Zeitung,de,AT,Mit Kika/Leiner und später Signa war 2023 das ...,Mit Kika/Leiner und später Signa war 2023 das ...,723,False,87005d97f8826959d059bed39814f3c2
0,"About 10,000 Finnish service workers to go on ...",2024-02-07 07:00:29,https://www.helsinkitimes.fi/finland/finland-n...,helsinkitimes.fi,Helsinki Times,en,FI,SERVICE UNION UNITED (PAM) on Tuesday announce...,The union revealed that the strikes will cover...,325,False,180374bb6a3b9d6ad3c203c8ff3604a8
1,Finland's Compass Group Culinary Team strikes ...,2024-02-07 21:12:30,https://www.helsinkitimes.fi/finland/finland-n...,helsinkitimes.fi,Helsinki Times,en,FI,In a triumphant showcase of culinary excellenc...,In a triumphant showcase of culinary excellenc...,451,False,7f343e146d2248bffd72a12f6d9a23f7
2,Widespread strikes to hit Finland: Major disru...,2024-02-09 16:00:19,https://www.helsinkitimes.fi/finland/finland-n...,helsinkitimes.fi,Helsinki Times,en,FI,Finland is bracing for a wave of political str...,Finland is bracing for a wave of political str...,414,False,db8f20bf2989b3a7b727143d84817d3f
3,VR to operate half of its long-distance trains...,2024-02-13 16:03:51,https://www.helsinkitimes.fi/finland/finland-n...,helsinkitimes.fi,Helsinki Times,en,FI,"VR, the Finnish state-owned railway company, a...","VR , the Finnish state-owned railway company, ...",369,False,866951b83f9a0a77024514d3e9da5e5c
0,Bold Bass: Introducing Sonos Sub Mini,2024-02-07 19:21:06,https://baltic-review.com/sonos-sub-mini,baltic-review.com,The Baltic Review,en,US,"Within the Sonos cosmos, the Sonos Sub Mini is...","Within the Sonos cosmos, the Sonos Sub Mini is...",1569,False,066ed83526971b1c61226c3154d32529
1,Baltic simplicity and mastery of geometry: Lit...,2024-02-07 18:06:07,https://baltic-review.com/lithuanian-straw-gar...,baltic-review.com,The Baltic Review,en,US,"'Sodai' in Lithuanian, also known as straw mob...","'Sodai' in Lithuanian, also known as straw mob...",289,False,3fccfa13dc16bf9cf69095eb868df007
0,Tageblatt-Serie / Ein Einblick: So sieht es hi...,2024-02-07 05:48:00,https://www.tageblatt.lu/headlines/ein-einblic...,tageblatt.lu,Tageblatt,de,LU,900: Es ist wohl das wichtigste Gerät der Fußb...,"Kantine, Sprachunterricht, Fitnessraum: Im vie...",633,False,657a41c7e9beff90411520eb5ed8ce78
1,Kommentar / Wahlverschiebung löst institutione...,2024-02-08 05:02:00,https://www.tageblatt.lu/headlines/wahlverschi...,tageblatt.lu,Tageblatt,de,LU,Der 'Phare des Mamelles' vor der senegalesisch...,Der 'Phare des Mamelles' vor der senegalesisch...,440,False,361d00c01dead94196ef32c2cd1a8d67
2,Basketball / Contern baut Siegesserie aus,2024-02-10 08:02:00,https://www.tageblatt.lu/sport/andere_sportart...,tageblatt.lu,Tageblatt,de,LU,"Düdelingens Profilazarett ist mit Stephens, Lo...",Contern bestätigte am Freitagabend seine ausge...,480,False,6bbe8238df6c60c9ce221fa83a6a6325


In [196]:
# Articles where the content or the description is missing
master_data.loc[master_data[["content", "description"]].isna().any(axis=1)]

Unnamed: 0,title,author,authors,journalists,published_date,published_date_precision,updated_date,updated_date_precision,link,domain_url,...,language,description,content,word_count,is_opinion,twitter_account,all_links,all_domain_links,id,score
2,Бюлетин,Capital.bg,[Capital.bg],,2024-02-08 14:14:00,timezone unknown,2024-02-08 17:35:45,timezone unknown,https://www.capital.bg/biznes/imoti/2024/02/08...,capital.bg,...,bg,,"Пред вас е ""брой"" на специализирания бюлетин з...",199,False,@capitalbg,[https://mediakit.economedia.bg/advertising/#h...,"[youtube.com, linkedin.com, digitalkconference...",ec897006e1ec20e2803eb03922e60ca1,1.699454
1,"Local woman raises over €3,000 for Meath River...",,[],[],2024-02-07 00:00:00,date,,,https://www.meathchronicle.ie/2024/02/07/local...,meathchronicle.ie,...,en,,Quiz and raffle fundraiser at The Willows Pub ...,314,False,,"[https://www.anglocelt.ie/, http://www.offalyi...","[offalyindependent.ie, westmeathexaminer.ie, a...",cc8ade33806928c68e1781016828703f,1.203476
