In [1]:
from urllib.request import urlopen
import pandas as pd
from gdeltdoc import GdeltDoc, Filters

## Get the list of all GDELT themes
We need the list of predefined themes to be able to filter articles by theme.

In [2]:
# Location of the GDELT GKG theme lookup file (plain text, parsed by get_themes below).
THEMES_URL = "http://data.gdeltproject.org/api/v2/guides/LOOKUP-GKGTHEMES.TXT"


def get_themes(url: str, timeout: float = 30.0) -> pd.DataFrame:
    """Download a theme lookup table and return it as a DataFrame.

    The resource at ``url`` is expected to be UTF-8 text holding one
    tab-separated ``theme<TAB>count`` pair per line.

    Args:
        url: Location of the lookup file.
        timeout: Seconds to wait on the connection before giving up, so a
            stalled server cannot hang the notebook indefinitely.

    Returns:
        A DataFrame with a ``theme`` column (str) and a ``count`` column (int),
        in the same order as the lines of the file. An empty response yields
        an empty DataFrame.

    Raises:
        urllib.error.URLError: If the resource cannot be fetched.

    Malformed non-blank lines (wrong number of fields, non-numeric count) are
    not repaired; they surface as the corresponding pandas error.
    """
    with urlopen(url, timeout=timeout) as response:
        text = response.read().decode("utf-8")

    # splitlines() copes with both "\n" and "\r\n" endings; blank lines are
    # dropped because they would otherwise become rows with a missing count
    # and break the integer cast below.
    rows = [line.split("\t") for line in text.splitlines() if line.strip()]

    df = pd.DataFrame(rows, columns=["theme", "count"])
    df["count"] = df["count"].astype(int)

    return df

def get_climate_themes(themes_df: pd.DataFrame) -> list[str]:
    """Return the theme names that contain the substring "CLIMATE".

    Args:
        themes_df: Table with a string ``theme`` column.

    Returns:
        The matching theme names, in the order they appear in ``themes_df``.
    """
    is_climate = themes_df["theme"].str.contains("CLIMATE")
    return themes_df.loc[is_climate, "theme"].to_list()

# Download the full theme lookup table and display it as the cell output.
themes_df = get_themes(THEMES_URL)
themes_df

Unnamed: 0,theme,count
0,TAX_FNCACT,999601552
1,TAX_ETHNICITY,410780218
2,EPU_POLICY,384818230
3,CRISISLEX_CRISISLEXREC,373229208
4,TAX_WORLDLANGUAGES,348186680
...,...,...
59310,TAX_WORLDLANGUAGES_PUNAPA,1
59311,TAX_WORLDBIRDS_SWALLOWTAILED_HUMMINGBIRDS,1
59312,TAX_WORLDMAMMALS_PACIFIC_DEGU,1
59313,TAX_WORLDBIRDS_FLAMECRESTED_TANAGER,1


In [3]:
# Keep only the theme names containing "CLIMATE" and display them.
climate_themes = get_climate_themes(themes_df)
climate_themes

['WB_405_BUSINESS_CLIMATE',
 'WB_567_CLIMATE_CHANGE',
 'ENV_CLIMATECHANGE',
 'UNGP_CLIMATE_CHANGE_ACTION',
 'WB_1949_CLIMATE_SMART_AGRICULTURE',
 'WB_568_CLIMATE_SERVICES',
 'WB_579_CLIMATE_CHANGE_MITIGATION',
 'WB_571_CLIMATE_SCIENCE',
 'WB_1841_SHORT_LIVED_CLIMATE_POLLUTANTS',
 'WB_1844_MARKET_BASED_CLIMATE_CHANGE_MITIGATION',
 'WB_1773_CLIMATE_CHANGE_IMPACTS',
 'WB_1847_CLIMATE_FINANCE',
 'WB_574_CLIMATE_CHANGE_ADAPTATION',
 'WB_959_CLIMATE_CHANGE_LAW',
 'WB_747_SOCIAL_RESILIENCE_AND_CLIMATE_CHANGE',
 'WB_1774_CLIMATE_FORECASTING',
 'WB_2673_JOBS_AND_CLIMATE_CHANGE',
 'TAX_AIDGROUPS_CLIMATE_ACTION_NETWORK',
 'WB_572_CLIMATE_RESILIENT_DEVELOPMENT',
 'WB_2639_CLIMATE_EFFICIENT_INDUSTRIES',
 'WB_573_CLIMATE_RISK_MANAGEMENT',
 'WB_1849_PUBLIC_CLIMATE_FINANCE',
 'WB_1838_CLIMATE_RISK_SCREENING',
 'WB_1850_PRIVATE_CLIMATE_FINANCE',
 'WB_1839_OZONE_LAYER_DEPLETION_AND_CLIMATE_CHANGE',
 'WB_575_COMMUNITY_BASED_CLIMATE_ADAPTATION',
 'WB_1750_CLIMATE_CHANGE_ADAPTATION_IMPACTS']

## Scrape the GDELT API for Latvian climate articles
We will use this wrapper around the GDELT API: https://github.com/alex9smith/gdelt-doc-api

In [4]:
# Query the GDELT article search once per (theme, year) pair and stack the results.
YEARS = [2022, 2023, 2024]
COUNTRY_CODE = "LG"  # country filter for the Latvian articles this section targets

# The client does not depend on the loop variables, so build it once and reuse it.
gd = GdeltDoc()

partial_articles_dfs = []

for theme in climate_themes:
    for year in YEARS:
        # NOTE(review): confirm whether gdeltdoc treats end_date as inclusive;
        # if it is converted to midnight, articles from 31 December are missed.
        filters = Filters(
            start_date=f"{year}-01-01",
            end_date=f"{year}-12-31",
            theme=theme,
            country=COUNTRY_CODE,
        )

        # NOTE(review): many queries report exactly 250 rows, which looks like a
        # per-request cap; shorter date windows may be needed for full coverage.
        partial_articles_df = gd.article_search(filters)
        print(f"{len(partial_articles_df)} articles found for theme {theme}, in {year}")
        if not partial_articles_df.empty:
            partial_articles_dfs.append(partial_articles_df)

# pd.concat raises an opaque error on an empty list; fail with a clear message instead.
if not partial_articles_dfs:
    raise ValueError("No articles were returned for any theme/year combination")

articles_df = pd.concat(partial_articles_dfs)

250 articles found for theme WB_405_BUSINESS_CLIMATE, in 2022
250 articles found for theme WB_405_BUSINESS_CLIMATE, in 2023
250 articles found for theme WB_405_BUSINESS_CLIMATE, in 2024
250 articles found for theme WB_567_CLIMATE_CHANGE, in 2022
205 articles found for theme WB_567_CLIMATE_CHANGE, in 2023
250 articles found for theme WB_567_CLIMATE_CHANGE, in 2024
250 articles found for theme ENV_CLIMATECHANGE, in 2022
151 articles found for theme ENV_CLIMATECHANGE, in 2023
150 articles found for theme ENV_CLIMATECHANGE, in 2024
250 articles found for theme UNGP_CLIMATE_CHANGE_ACTION, in 2022
156 articles found for theme UNGP_CLIMATE_CHANGE_ACTION, in 2023
159 articles found for theme UNGP_CLIMATE_CHANGE_ACTION, in 2024
149 articles found for theme WB_1949_CLIMATE_SMART_AGRICULTURE, in 2022
73 articles found for theme WB_1949_CLIMATE_SMART_AGRICULTURE, in 2023
78 articles found for theme WB_1949_CLIMATE_SMART_AGRICULTURE, in 2024
126 articles found for theme WB_568_CLIMATE_SERVICES, in 

In [5]:
# Keep only Latvian-language articles. The explicit .copy() makes the column
# assignment below act on an independent frame rather than on a slice of the
# concatenated results, which avoids pandas' SettingWithCopyWarning.
latvian_articles_df = articles_df[articles_df["language"] == "Latvian"].copy()
latvian_articles_df["seendate"] = pd.to_datetime(latvian_articles_df["seendate"])

# The same article can be returned for several themes, so de-duplicate on URL.
# The count is computed outside the f-string: reusing double quotes inside a
# double-quoted f-string is a SyntaxError on Python versions before 3.12.
n_duplicates = latvian_articles_df["url"].duplicated().sum()
print(f"Deleting {n_duplicates} duplicates")
articles_df = latvian_articles_df.drop_duplicates("url")
print(f"{len(articles_df)} unique articles found")
    

Deleting 1191 duplicates
1683 unique articles found


In [6]:
# Number of articles per source domain.
articles_df["domain"].value_counts()

domain
nra.lv              459
lsm.lv              388
delfi.lv            351
la.lv               225
diena.lv             65
reitingi.lv          60
ogrenet.lv           30
bnn.lv               20
tvnet.lv             19
ventasbalss.lv       19
ir.lv                17
mfa.gov.lv           13
ntz.lv                5
president.lv          5
latgaleslaiks.lv      3
vm.gov.lv             2
220.lv                1
brivalatvija.lv       1
Name: count, dtype: int64

In [9]:
# Write the filtered, de-duplicated article table to CSV (path is relative to the notebook's working directory).
articles_df.to_csv("../data/latvian_article_links.csv")