## Filter Articles For Each Company

In [1]:
import libs
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display 

import yfinance as yf
import duckdb

import io
from urllib.request import urlopen
import zipfile
import os

from menuinst.platforms.win_utils.knownfolders import folder_path
from sipbuild.generator.parser.tokens import keywords
# Keep rendered DataFrames short: show at most 5 rows before pandas truncates.
pd.options.display.max_rows = 5

In [2]:
# Open the GDELT DuckDB database file in read-only mode.
conn = duckdb.connect(
    database='../../eda-ddb/eda-gdelt.ddb',
    read_only=True,
    config={'access_mode': 'READ_ONLY'},
)

In [3]:
# Broad, industry-level search terms per stock ticker. The terms are later
# matched against article URLs (SOURCEURL), hence the hyphenated slug form.
stocks_to_keywords_broad = dict(
    ADM=["agriculture", "commodities", "grain", "processing", "sustainability"],
    JNJ=["healthcare", "pharmaceuticals", "medical-devices", "consumer-health", "vaccines"],
    NEM=["mining", "gold", "precious-metals", "exploration", "sustainability"],
    V=["payments", "credit-cards", "digital-payments", "transactions", "financial-services"],
    PG=["consumer-goods", "hygiene", "household-products", "personal-care", "brands"],
    ABBV=["biopharmaceuticals", "immunology", "oncology", "healthcare"],
    CVX=["oil", "energy", "natural-gas", "petroleum", "sustainability"],
    PEP=["pepsi", "beverages", "snacks", "food-products"],
    T=["telecommunications", "wireless", "internet", "broadband", "5g", "internet-services"],
    VZ=["telecommunications", "wireless", "5g", "broadband", "internet", "internet-services"],
    AAPL=["iphone", "ipad", "ios", "tim-cook", "icloud"],
    CTVA=["agriculture", "biotechnology", "seeds", "crop-protection", "sustainability"],
    XOM=["exxon-mobil", "oil", "energy", "petroleum", "natural-gas", "exploration"],
    JPM=["banking", "investment", "financial-services", "wealth-management", "loans"],
    DE=["agriculture", "machinery", "construction", "equipment", "sustainability"],
    COP=["oil", "energy", "exploration", "natural-gas", "sustainability"],
    MA=["payments", "credit-cards", "financial-services", "transactions", "digital-payments"],
    KO=["sprite", "fanta", "schweppes", "powerade", "beverages", "soft-drinks"],
    MSFT=["software", "cloud", "windows", "satya-nadella"],
    DOW=["dow", "chemicals", "materials", "plastics", "manufacturing", "sustainability"],
    FCX=["mining", "copper", "gold", "exploration", "sustainability"],
    NVDA=["gpu", "artificial-intelligence", "graphics", "semiconductors"],
    BP=["british-petroleum", "oil", "energy", "petroleum", "natural-gas", "renewable-energy"],
    PFE=["pharmaceuticals", "vaccines", "biopharmaceuticals", "healthcare"],
    TMUS=["telecommunications", "wireless", "5g", "broadband", "mobile"],
)

# Narrower, multi-word search terms per stock ticker (same tickers as the
# broad mapping). Like the broad terms, these are matched against article URLs.
stocks_to_keywords_specific = dict(
    ADM=["agribusiness", "grain-processing", "sustainable-agriculture", "commodity-markets", "food-supply-chain"],
    JNJ=["healthcare-innovation", "pharmaceutical-research", "medical-devices", "consumer-health-products", "vaccine-development"],
    NEM=["gold-mining", "precious-metals-investment", "mining-operations", "sustainability-in-mining", "gold-exploration"],
    V=["digital-payments", "credit-card-services", "financial-technology", "payment-processing", "transaction-security"],
    PG=["consumer-goods", "hygiene-products", "personal-care-brands", "household-brands", "product-innovation"],
    ABBV=["biopharmaceuticals", "immunology-research", "oncology-treatments", "healthcare-solutions"],
    CVX=["oil-and-gas", "energy-sector", "petroleum-production", "sustainable-energy", "natural-gas-exploration"],
    PEP=["beverages-industry", "snack-foods", "consumer-products", "sustainability-in-food", "brand-marketing"],
    T=["telecommunications-services", "wireless-network", "broadband-internet", "entertainment-services"],
    VZ=["5g-technology", "telecommunications-network", "internet-services", "wireless-solutions"],
    AAPL=["iphone-development", "ios-software", "consumer-electronics", "tech-innovation"],
    CTVA=["agricultural-biotechnology", "crop-seeds", "sustainable-agriculture", "agricultural-innovation"],
    XOM=["oil-industry", "energy-resources", "petroleum-markets", "natural-gas-production"],
    JPM=["banking-industry", "investment-banking", "financial-services-innovation", "wealth-management-strategies"],
    DE=["deere", "agriculture-machinery", "construction-equipment", "sustainable-agriculture", "farm-technology"],
    COP=["oil-and-gas-exploration", "energy-production", "natural-gas-resources", "sustainability-in-energy"],
    MA=["digital-payments", "financial-services", "credit-card-industry", "payment-processing-technology"],
    KO=["coke", "cola", "sprite", "fanta", "schweppes", "powerade", "beverages-market", "soft-drink-industry", "brand-marketing", "consumer-goods"],
    MSFT=["cloud-computing", "software-development", "windows-platform", "technology-leadership"],
    DOW=["dow", "chemical-manufacturing", "materials-science", "plastics-production", "sustainable-materials"],
    FCX=["copper-mining", "gold-mining", "mineral-resources", "sustainability-in-mining"],
    NVDA=["gpu-technology", "artificial-intelligence", "graphics-processing", "semiconductor-industry"],
    BP=["british-petroleum", "energy-sector", "petroleum-industry", "natural-gas-production", "renewable-energy-solutions"],
    PFE=["pharmaceuticals-research", "vaccine-innovation", "biopharmaceuticals", "healthcare-development"],
    TMUS=["telecommunications-industry", "5g-network", "mobile-services", "broadband-solutions"],
)

In [5]:

# Per-ticker exclusion terms: an article URL containing any of these is dropped.
# Every ticker starts with an empty blacklist ...
stocks_to_keywords_blacklist = {
    ticker: []
    for ticker in (
        "ADM", "JNJ", "NEM", "V", "PG",
        "ABBV", "CVX", "PEP", "T", "VZ",
        "AAPL", "CTVA", "XOM", "JPM", "DE",
        "COP", "MA", "KO", "MSFT", "DOW",
        "FCX", "NVDA", "BP", "PFE", "TMUS",
    )
}
# ... and only the tickers that need exclusions are overridden here.
stocks_to_keywords_blacklist.update({
    "JNJ": ["boris"],
    "V": ["citizenship", "passport", "trump"],
    "DOW": ["down", "shutdown"],
})


In [6]:
# Company / brand names per ticker, in URL-slug form. The first name of each
# list is also used to build the output CSV file name for that ticker.
stocks_to_company_names = {
    "ADM": ["archer-daniels", "archer-daniels-midland"],
    "JNJ": ["johnson-and-johnson", "johnson-johnson"],
    "NEM": ["newmont-corporation"],
    "V": ["visa"],
    "PG": ["procter-and-gamble"],
    "ABBV": ["abbvie"],
    "CVX": ["chevron"],
    "PEP": ["pepsico", "pepsi"],
    "T": ["att", "at-t"],
    "VZ": ["verizon"],
    "AAPL": ["apple"],
    "CTVA": ["corteva"],
    "XOM": ["exxon-mobil", "exxon"],
    "JPM": ["jpmorgan-chase"],
    "DE": ["deere", "john-deere"],
    "COP": ["conocophillips"],
    "MA": ["mastercard"],
    "KO": ["coca-cola", "coke", "cola", "sprite", "fanta", "schweppes", "powerade"],
    "MSFT": ["microsoft", "azure"],
    "DOW": ["dow-jones", "dow"],
    "FCX": ["freeport-mcmoran"],
    "NVDA": ["nvidia", "geforce"],
    "BP": ["bp", "british-petroleum"],
    "PFE": ["pfizer"],
    "TMUS": ["t-mobile", "tmobile"],
}

In [7]:
# News-site domain fragments. An article is kept only when its URL contains at
# least one of these fragments.
sources = [
    # Major U.S. News
    'nytimes.com',               # The New York Times
    'washingtonpost.com',         # The Washington Post
    'bbc.com',                    # BBC News
    'reuters.com',                # Reuters
    'apnews.com',                 # Associated Press (AP)
    'npr.org',                    # National Public Radio (NPR)
    'politico.com',               # Politico
    'pbs.org',                    # PBS News
    'propublica.org',             # ProPublica
    'fivethirtyeight.com',        # FiveThirtyEight
    'cnn.com',                    # CNN
    'foxnews.com',                # Fox News
    'msnbc.com',                  # MSNBC
    'abcnews.go.com',             # ABC News
    'cbsnews.com',                # CBS News
    'usatoday.com',               # USA Today
    'latimes.com',                # Los Angeles Times
    'bloomberg.com',              # Bloomberg
    'wsj.com',                    # The Wall Street Journal
    'forbes.com',                 # Forbes
    'time.com',                   # TIME
    'newsweek.com',               # Newsweek
    'huffpost.com',               # HuffPost
    'vox.com',                    # Vox
    'axios.com',                  # Axios
    'buzzfeednews.com',           # BuzzFeed News

    # Major International News
    'theguardian.com',            # The Guardian (UK)
    'thetimes.co.uk',             # The Times (UK)
    'telegraph.co.uk',            # The Telegraph (UK)
    'independent.co.uk',          # The Independent (UK)
    'ft.com',                     # Financial Times (UK)
    'the-sun.com',                # The Sun (UK)
    'lemonde.fr',                 # Le Monde (France)
    'lefigaro.fr',                # Le Figaro (France)
    'dw.com',                     # Deutsche Welle (Germany)
    'spiegel.de',                 # Der Spiegel (Germany)
    'aljazeera.com',              # Al Jazeera (Qatar)
    'rt.com',                     # Russia Today (Russia)
    'haaretz.com',                # Haaretz (Israel)
    'timesofisrael.com',          # The Times of Israel (Israel)
    'straitstimes.com',           # The Straits Times (Singapore)
    'chinadaily.com.cn',          # China Daily (China)
    'japantimes.co.jp',           # The Japan Times (Japan)
    'abc.net.au',                 # ABC News (Australia)
    'smh.com.au',                 # The Sydney Morning Herald (Australia)
    'thestar.com',                # Toronto Star (Canada)
    'cbc.ca',                     # CBC News (Canada)
    'globalnews.ca',              # Global News (Canada)
    'elpais.com',                 # El País (Spain)
    'elmundo.es',                 # El Mundo (Spain)
    'clarin.com',                 # Clarín (Argentina)
    'folha.uol.com.br',           # Folha de S.Paulo (Brazil)
    'nation.co.ke',               # Daily Nation (Kenya)

    # Financial & Business
    'cnbc.com',                   # CNBC
    'marketwatch.com',            # MarketWatch
    'businessinsider.com',        # Business Insider
    'economist.com',              # The Economist
    'barrons.com',                # Barron’s
    'ft.com',                     # Financial Times (already listed above; de-duplicated below)

    # Tech & Science
    'wired.com',                  # WIRED
    'techcrunch.com',             # TechCrunch
    'mashable.com',               # Mashable
    'theverge.com',               # The Verge
    'arstechnica.com',            # Ars Technica
    'gizmodo.com',                # Gizmodo
    'cnet.com',                   # CNET
    'scientificamerican.com',     # Scientific American
    'nature.com',                 # Nature
    'newscientist.com',           # New Scientist

    # Specialized Journalism & Investigative Reporting
    'propublica.org',             # ProPublica (already listed above; de-duplicated below)
    'theintercept.com',           # The Intercept
    'democracynow.org',           # Democracy Now!
    'motherjones.com',            # Mother Jones
    'rollingstone.com',           # Rolling Stone
    'slate.com',                  # Slate
    'jacobinmag.com',             # Jacobin
    'newyorker.com',              # The New Yorker
    'vanityfair.com',             # Vanity Fair

    # Regional U.S. News
    'chicagotribune.com',         # Chicago Tribune
    'dallasnews.com',             # The Dallas Morning News
    'miamiherald.com',            # Miami Herald
    'boston.com',                 # Boston Globe
    'sfchronicle.com',            # San Francisco Chronicle
    'philly.com',                 # The Philadelphia Inquirer
    'startribune.com',            # Star Tribune (Minneapolis)
    'azcentral.com',              # The Arizona Republic

    # Latin America
    'eltiempo.com',               # El Tiempo (Colombia)
    'lanacion.com.ar',            # La Nación (Argentina)
    'elcomercio.pe',              # El Comercio (Peru)
    'eluniversal.com.mx',         # El Universal (Mexico)

    # Africa & Middle East
    'mg.co.za',                   # Mail & Guardian (South Africa)
    'punchng.com',                # The Punch (Nigeria)
    'gulfnews.com',               # Gulf News (UAE)
    'arabnews.com',               # Arab News (Saudi Arabia)
    'dailystar.com.lb',           # The Daily Star (Lebanon)
]

# Some outlets appear under more than one category. Drop the repeats (keeping
# first-seen order) so each domain contributes a single condition to the query.
sources = list(dict.fromkeys(sources))

In [11]:
def generate_sql_query(search_terms, blacklist_terms,  news_sources, fast=True):
    """Build the SQL query that selects ``gdelt`` rows by matching ``SOURCEURL``.

    A row is selected when its ``SOURCEURL`` matches at least one search term
    AND at least one news source AND none of the blacklist terms. All matching
    uses SQL ``LIKE`` patterns.

    Args:
        search_terms: Iterable of terms to look for in the URL
            (e.g. ``"gold-mining"``). Must contain at least one term.
        blacklist_terms: Iterable of terms that exclude a URL when they occur
            anywhere in it. May be empty or ``None`` (no exclusion clause).
        news_sources: Iterable of domain fragments (e.g. ``"reuters.com"``).
            May be empty or ``None`` (no source restriction).
        fast: When ``True`` a search term may occur anywhere in the URL.
            When ``False`` the term must directly follow a ``/``, or follow a
            ``-`` and then be followed by ``-``, ``/`` or the end of the URL.

    Returns:
        str: The query text, terminated with ``;``.

    Raises:
        ValueError: If ``search_terms`` is empty, because the resulting
            ``WHERE ()`` clause would not be valid SQL.

    Note:
        Sources and blacklist terms are plain substring matches, so a source
        such as ``ft.com`` also matches URLs on ``microsoft.com``.
        ``%`` and ``_`` inside a term keep their ``LIKE`` wildcard meaning.
    """

    def _like(pattern):
        # Double single quotes so a term like "barron's" cannot end the SQL
        # string literal early.
        return "SOURCEURL LIKE '" + pattern.replace("'", "''") + "'"

    # Materialise the inputs once so generators, tuples and None all behave
    # like the plain lists the notebook normally passes.
    search_terms = [str(term) for term in (search_terms or ())]
    blacklist_terms = [str(term) for term in (blacklist_terms or ())]
    news_sources = [str(source) for source in (news_sources or ())]

    if not search_terms:
        raise ValueError("search_terms must contain at least one term")

    if fast:
        templates = ("%{}%",)
    else:
        templates = ("%-{}-%", "%-{}", "%-{}/%", "%/{}-%", "%/{}%")

    search_conditions = " OR ".join(
        _like(template.format(term))
        for term in search_terms
        for template in templates
    )

    query = f"""
    SELECT Day, FractionDate, GoldsteinScale, AvgTone, NumArticles, NumMentions, SOURCEURL
    FROM gdelt
    WHERE ({search_conditions})
    """

    if news_sources:
        # One source per line keeps the printed query readable with ~100 sources.
        news_source_conditions = " OR ".join(
            _like(f"%{source}%") + "\n" for source in news_sources
        )
        query += f"AND ({news_source_conditions})\n    "

    if blacklist_terms:
        blacklist_conditions = " OR ".join(
            _like(f"%{term}%") for term in blacklist_terms
        )
        query += f"AND NOT ({blacklist_conditions})"

    query += ";"
    return query

In [9]:
# Smoke-test the query builder with placeholder values and show the SQL it emits.
print(
    generate_sql_query(
        search_terms=["terms"],
        blacklist_terms=["blacklist"],
        news_sources=["news"],
        fast=False,
    )
)


    SELECT Day, FractionDate, GoldsteinScale, AvgTone, NumArticles, NumMentions, SOURCEURL
    FROM gdelt
    WHERE (SOURCEURL LIKE '%-terms-%' OR SOURCEURL LIKE '%-terms' OR SOURCEURL LIKE '%-terms/%' OR SOURCEURL LIKE '%/terms-%' OR SOURCEURL LIKE '%/terms%')
    AND (SOURCEURL LIKE '%news%'
)
    AND NOT (SOURCEURL LIKE '%blacklist%');


In [12]:

def filter_gdelt(keywords, blacklist_words, folder_name, stock=None):
    """Query GDELT for one stock's keywords and save the matching rows as CSV.

    Builds the query with ``generate_sql_query`` (slug-boundary matching,
    restricted to the notebook-level ``sources``), runs it on the
    notebook-level ``conn``, keeps one row per ``SOURCEURL``, replaces the
    ``Day``/``FractionDate`` columns with a parsed ``Date`` column, displays
    the frame and writes it to
    ``{folder_name}/{stock}_{first company name}.csv``.

    Args:
        keywords: Search terms to match in article URLs.
        blacklist_words: Terms that exclude an article URL.
        folder_name: Output directory; created if it does not exist.
        stock: Ticker used to name the output file. When omitted, the
            notebook-level variable ``stock`` is used, which is how existing
            three-argument callers behave.

    Raises:
        NameError: If ``stock`` is not passed and no global ``stock`` exists.
    """
    if stock is None:
        # Backward compatibility: older call sites pass three arguments and rely
        # on a loop variable named `stock` living in the notebook's global scope.
        try:
            stock = globals()["stock"]
        except KeyError:
            raise NameError(
                "name 'stock' is not defined; pass stock=<ticker> to filter_gdelt"
            ) from None

    out_path = f"{folder_name}/{stock}_{(stocks_to_company_names[stock])[0]}.csv"
    print(f"In Progress!: {out_path}")

    # Create the output folder before the (slow) table scan so a bad path fails early.
    if folder_name:
        os.makedirs(folder_name, exist_ok=True)

    q = generate_sql_query(keywords, blacklist_words, sources, fast=False)

    raw: pd.DataFrame = conn.query(q).execute().fetchdf()
    df: pd.DataFrame = (
        raw
        .drop_duplicates(subset=['SOURCEURL'])
        # `Day` is parsed with the YYYYMMDD format into a proper datetime column.
        .assign(Date=lambda frame: pd.to_datetime(frame['Day'], format='%Y%m%d'))
        .drop(columns=['Day', 'FractionDate'])
    )
    display(df)
    df.to_csv(out_path, index=False)

    print(f"Done!: {out_path}", len(df))



In [None]:
# One output folder per keyword set. The company names are always appended to
# the search terms, so the "stocks_to_company_names" run uses an empty list of
# extra keywords for every ticker (a bare list() here has no .keys() and would
# raise AttributeError once the first two sets finish).
folder = ["stocks_to_keywords_specific", "stocks_to_keywords_broad", "stocks_to_company_names"]
keyword_sets = [
    stocks_to_keywords_specific,
    stocks_to_keywords_broad,
    {ticker: [] for ticker in stocks_to_company_names},
]

for folder_name, stock_to_words in zip(folder, keyword_sets):
    # NOTE: the loop variable must stay named `stock` -- filter_gdelt reads it
    # from the notebook's global scope to build the output file name.
    for stock, extra_words in stock_to_words.items():
        filter_gdelt(
            extra_words + stocks_to_company_names[stock],
            stocks_to_keywords_blacklist[stock],
            folder_name,
        )

In Progress!: stocks_to_keywords_specific/ADM_archer-daniels.csv


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,GoldsteinScale,AvgTone,NumArticles,NumMentions,SOURCEURL,Date
0,0.0,-0.002303,10.0,10.0,https://uk.reuters.com/article/uk-column-russe...,2019-01-08
1,0.0,-5.303863,21.0,21.0,https://uk.reuters.com/article/us-britain-eu-t...,2019-01-24
...,...,...,...,...,...,...
1027,-9.0,1.515152,5.0,5.0,https://www.forbes.com/councils/forbestechcoun...,2024-09-09
1030,7.0,2.802102,4.0,4.0,https://punchng.com/agribusiness-foundation-tr...,2024-09-13


Done!: stocks_to_keywords_specific/ADM_archer-daniels.csv 238
In Progress!: stocks_to_keywords_specific/JNJ_johnson-and-johnson.csv


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,GoldsteinScale,AvgTone,NumArticles,NumMentions,SOURCEURL,Date
0,0.0,-4.055304,160.0,160.0,https://in.reuters.com/article/us-johnson-john...,2019-01-01
2,-0.3,-2.905569,1.0,1.0,https://www.foxnews.com/politics/dem-rep-johns...,2019-01-02
...,...,...,...,...,...,...
130808,0.0,-0.903614,2.0,2.0,https://www.independent.co.uk/news/uk/politics...,2024-10-15
130816,3.0,-2.081448,2.0,2.0,https://www.foxnews.com/media/speaker-johnson-...,2024-10-15


Done!: stocks_to_keywords_specific/JNJ_johnson-and-johnson.csv 22029
In Progress!: stocks_to_keywords_specific/NEM_newmont-corporation.csv


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,GoldsteinScale,AvgTone,NumArticles,NumMentions,SOURCEURL,Date
0,-7.0,2.472188,10.0,10.0,https://www.forbes.com/sites/greatspeculations...,2019-01-15
1,-9.0,-1.851852,1.0,1.0,https://www.cbc.ca/news/canada/nova-scotia/kir...,2019-01-16
...,...,...,...,...,...,...
667,1.0,-1.439791,10.0,10.0,https://punchng.com/no-state-can-ban-mining-op...,2024-10-04
673,4.0,-2.571861,4.0,4.0,https://punchng.com/fg-to-resolve-osun-gold-mi...,2024-10-09


Done!: stocks_to_keywords_specific/NEM_newmont-corporation.csv 158
In Progress!: stocks_to_keywords_specific/V_visa.csv


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,GoldsteinScale,AvgTone,NumArticles,NumMentions,SOURCEURL,Date
0,-4.0,-4.186047,10.0,10.0,https://punchng.com/thousands-to-miss-visa-app...,2019-01-01
6,4.0,5.949657,10.0,10.0,https://gulfnews.com/uae/government/long-term-...,2019-01-01
...,...,...,...,...,...,...
24450,3.2,2.588556,2.0,2.0,https://gulfnews.com/living-in-uae/visa-immigr...,2024-10-14
24452,3.0,1.832461,10.0,10.0,https://gulfnews.com/uae/expat-residents-of-gc...,2024-10-14


Done!: stocks_to_keywords_specific/V_visa.csv 4003
In Progress!: stocks_to_keywords_specific/PG_procter-and-gamble.csv


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,GoldsteinScale,AvgTone,NumArticles,NumMentions,SOURCEURL,Date
0,7.0,-3.401361,2.0,2.0,https://www.azcentral.com/story/money/business...,2019-01-17
1,-4.4,-0.332326,54.0,54.0,https://uk.reuters.com/article/uk-britain-stoc...,2019-01-21
...,...,...,...,...,...,...
248,-8.0,-1.073171,10.0,10.0,https://www.forbes.com/sites/arthurkellermann/...,2024-08-07
249,1.9,0.115320,23.0,23.0,https://punchng.com/propak-summit-to-promote-p...,2024-08-23


Done!: stocks_to_keywords_specific/PG_procter-and-gamble.csv 82
In Progress!: stocks_to_keywords_specific/ABBV_abbvie.csv


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [None]:
# Close the DuckDB connection once all queries have run.
conn.close()