# Query the GDELT events, mentions and Global Knowledge Graph (GKG) tables

In [50]:
from datetime import date, timedelta
from gdelt import gdelt as gdelt_client
import pandas as pd
import re

In [124]:
def document_has_theme(document, themes=[]):
    """Tell whether a document is tagged with at least one requested theme.

    The document's "Themes" value is converted to a string and split on ";".
    A theme only matches when it equals one of the resulting tokens exactly;
    substrings do not count.

    Args:
        document: Mapping-like row exposing a "Themes" entry.
        themes: Theme names to look for. The list is only read, never mutated.

    Returns:
        True if any requested theme is among the document's themes,
        False otherwise (including when ``themes`` is empty).
    """
    tagged_themes = str(document["Themes"]).split(";")
    return any(wanted in tagged_themes for wanted in themes)

def query_documents(graphs, mentions, themes=[]):
    """Select the documents referenced by the given mentions.

    Args:
        graphs: DataFrame with a "DocumentIdentifier" column (and a "Themes"
            column when ``themes`` is non-empty).
        mentions: DataFrame with a "MentionIdentifier" column.
        themes: Optional theme names; when non-empty, only documents carrying
            at least one of them (per ``document_has_theme``) are kept.

    Returns:
        The rows of ``graphs`` whose "DocumentIdentifier" appears among the
        mentions' "MentionIdentifier" values, optionally narrowed by theme.
    """
    is_mentioned = graphs["DocumentIdentifier"].isin(mentions["MentionIdentifier"])
    mentioned_documents = graphs.loc[is_mentioned]

    # Without a theme restriction every mentioned document qualifies.
    if len(themes) == 0:
        return mentioned_documents

    # Row-wise check, because each row's theme string has to be split.
    carries_theme = mentioned_documents.apply(document_has_theme, args=[themes], axis=1)
    return mentioned_documents[carries_theme]
    
def query_mentions(mentions, events):
    """Select the mentions that belong to any of the given events.

    Args:
        mentions: DataFrame with a "GLOBALEVENTID" column.
        events: DataFrame with a "GLOBALEVENTID" column.

    Returns:
        The rows of ``mentions`` whose "GLOBALEVENTID" occurs in ``events``.
    """
    wanted_event_ids = events["GLOBALEVENTID"]
    belongs_to_event = mentions["GLOBALEVENTID"].isin(wanted_event_ids)
    return mentions.loc[belongs_to_event]

def query_top_most_events(mentions, events, max_count):
    """Return the events that are mentioned most often.

    Args:
        mentions: DataFrame with a "GLOBALEVENTID" column; each row counts as
            one mention of that event.
        events: DataFrame with a "GLOBALEVENTID" column.
        max_count: Maximum number of distinct events to keep.

    Returns:
        The rows of ``events`` whose id is among the ``max_count`` most
        frequently mentioned ids. Rows keep their order from ``events``; they
        are not re-ranked by mention count.
    """
    # value_counts sorts by descending frequency.
    mention_counts = mentions["GLOBALEVENTID"].value_counts(sort=True)
    # Slice by position explicitly: the event ids form the index here, and a
    # plain ``[:n]`` on a Series is not positional for every index dtype.
    top_most_event_ids = mention_counts.iloc[:max_count].index.tolist()
    is_top_most = events["GLOBALEVENTID"].isin(top_most_event_ids)
    return events.loc[is_top_most]

def query_complex(date_of_interest=None, max_count=3, full_day=True, trusted_sources=[], min_confidence=100, themes=[]):
    """Fetch one day of GDELT data and reduce it to the most-mentioned events.

    Downloads the "events", "mentions" and "gkg" tables for the given date,
    keeps the mentions that satisfy the confidence (and, optionally, source)
    filter, picks the events those mentions reference most often, and collects
    the mentions and documents that belong to these events.

    Args:
        date_of_interest: ``datetime.date`` to query. ``None`` (the default)
            means "today", resolved on every call rather than once when the
            function is defined.
        max_count: Maximum number of top events to keep.
        full_day: Passed to the client's ``Search`` as ``coverage``
            (presumably whole day vs. a single update; confirm against the
            gdelt client documentation).
        trusted_sources: Values of "MentionSourceName" to accept; an empty
            sequence disables the source filter. Only read, never mutated.
        min_confidence: Minimum "Confidence" a mention needs to be counted.
        themes: Optional theme filter forwarded to ``query_documents``. Only
            read, never mutated.

    Returns:
        Tuple ``(top_most_events, events_mentions, events_documents)``.
    """
    # A ``date.today()`` default would be frozen at definition time and go
    # stale in a long-running kernel, so resolve "today" per call.
    if date_of_interest is None:
        date_of_interest = date.today()
    search_date = date_of_interest.strftime("%Y %m %d")

    client = gdelt_client(version=2)
    events = client.Search(search_date, table="events", coverage=full_day)
    mentions = client.Search(search_date, table="mentions", coverage=full_day)
    graphs = client.Search(search_date, table="gkg", coverage=full_day)
    del client

    is_confident = mentions["Confidence"] >= min_confidence
    if 0 < len(trusted_sources):
        is_trusted = mentions["MentionSourceName"].isin(trusted_sources)
        trusted_mentions = mentions.loc[is_trusted & is_confident]
    else:
        trusted_mentions = mentions.loc[is_confident]

    top_most_events = query_top_most_events(trusted_mentions, events, max_count)
    events_mentions = query_mentions(trusted_mentions, top_most_events)
    events_documents = query_documents(graphs, events_mentions, themes)
    return (top_most_events, events_mentions, events_documents)

def describe_gdelt_result(gdelt_result):
    """Condense a query result into locations and displayable news entries.

    Args:
        gdelt_result: Tuple ``(events, mentions, documents)``. ``events`` must
            provide an "ActionGeo_FullName" column; ``documents`` must provide
            "SourceCommonName", "Extras" and "SharingImage" columns. The
            mentions element is not used.

    Returns:
        Dict with two keys:
            "locations": the "ActionGeo_FullName" values of the events.
            "news": list of dicts with "title", "source", "links" and "image".
                A document is only included when it has a page title, a
                source, an http(s) image URL and at least one http(s) link
                that no earlier document already claimed.
    """
    (events, _mentions, documents) = gdelt_result

    # Non-greedy groups so that repeated tags yield separate matches instead
    # of one match spanning from the first opening to the last closing tag.
    title_pattern = re.compile(r"<PAGE_TITLE>(.+?)</PAGE_TITLE>")
    links_pattern = re.compile(r"<PAGE_LINKS>(.+?)</PAGE_LINKS>")
    url_pattern = re.compile(r"https?://")

    news = []
    # Links are de-duplicated across ALL documents, not just within one.
    # NOTE(review): a link is marked as seen even when its document ends up
    # being dropped below; this keeps the original behavior. Confirm intent.
    seen_links = set()

    document_rows = zip(
        documents["SourceCommonName"].values,
        documents["Extras"].values,
        documents["SharingImage"].values,
    )
    for source, extra, image_url in document_rows:
        # Missing extras arrive as NaN (a float); treat them as "no tags"
        # rather than letting the regex search raise TypeError.
        if not isinstance(extra, str):
            extra = ""

        title = None
        for title_match in title_pattern.finditer(extra):
            # With several titles the last one wins.
            title = title_match.group(1)

        links = []
        for links_match in links_pattern.finditer(extra):
            for link in links_match.group(1).split(";"):
                if url_pattern.match(link) and link not in seen_links:
                    seen_links.add(link)
                    links.append(link)

        # str() guards against a NaN image cell.
        image = image_url if url_pattern.match(str(image_url)) else None

        if title is not None and source is not None and links and image is not None:
            news.append({
                "title": title,
                "source": source,
                "links": links,
                "image": image,
            })

    return {
        "locations": events["ActionGeo_FullName"].values,
        "news": news,
    }
    
def gdelt_result_tohtml(gdelt_result):
    """Render a query result as an HTML fragment.

    The fragment starts with a "Locations" paragraph and then, for every news
    entry produced by ``describe_gdelt_result``, shows the title, the source,
    the image and a bullet list of links.

    Args:
        gdelt_result: Tuple ``(events, mentions, documents)`` as accepted by
            ``describe_gdelt_result``.

    Returns:
        The HTML fragment as a string.
    """
    # Local import keeps this function self-contained. Titles and URLs
    # originate from crawled pages, so every value is escaped before it is
    # placed into markup or into a single-quoted attribute.
    from html import escape

    description = describe_gdelt_result(gdelt_result)

    parts = ["<p>Locations: "]
    for location in description["locations"]:
        parts.append("{} ".format(escape(str(location))))
    parts.append("</p>")

    for news_entry in description["news"]:
        parts.append("<p><b>{}</b></p>".format(escape(str(news_entry["title"]))))
        parts.append("<p>Source: {}</p>".format(escape(str(news_entry["source"]))))
        parts.append("<p><img src='{}'/></p>".format(escape(str(news_entry["image"]))))
        # A list may not be nested inside a paragraph, so the label paragraph
        # is closed before the list starts.
        parts.append("<p>Links:</p>")
        parts.append("<ul>")
        for link in news_entry["links"]:
            parts.append("<li><a href='{0}'>{0}</a></li>".format(escape(str(link))))
        parts.append("</ul>")

    return "".join(parts)

In [42]:
# News outlets whose mentions are accepted when ranking events.
trusted_sources = [
    "bbc.co.uk",
    "cbsnews.com",
    "dailymail.co.uk",
    "latimes.com",
    "msn.com",
    "nytimes.com",
    "reuters.com",
    "sputniknews.com",
    "swissinfo.ch",
    "usatoday.com",
    "washingtonpost.com",
    "washingtontimes.com",
    "yahoo.com",
]

# An empty list means documents are not filtered by theme.
themes = []

# Query the previous calendar day and keep only the single top event.
yesterday = date.today() - timedelta(days=1)
gdelt_result = query_complex(
    yesterday,
    max_count=1,
    full_day=True,
    trusted_sources=trusted_sources,
    themes=themes,
)
describe_gdelt_result(gdelt_result)

66243    https://www.msn.com/en-us/news/world/un-team-m...
Name: SOURCEURL, dtype: object
183157    https://www.msn.com/en-us/news/world/un-team-m...
187225    https://uk.reuters.com/article/uk-mali-securit...
187226    https://www.reuters.com/article/us-mali-securi...
191300    https://in.reuters.com/article/mali-security/u...
195797    https://www.swissinfo.ch/eng/u-n--team-meets-d...
200318    http://www.msn.com/en-nz/news/world/u-n-team-m...
204416    https://af.reuters.com/article/worldNews/idAFK...
204417    https://af.reuters.com/article/topNews/idAFKBN...
217786    https://www.reuters.com/article/us-mali-securi...
217787    https://uk.reuters.com/article/uk-mali-securit...
233028    https://af.reuters.com/article/topNews/idAFKBN...
233029    https://in.reuters.com/article/mali-security/m...
321131    https://uk.reuters.com/article/uk-mali-securit...
326615    https://in.reuters.com/article/mali-security/u...
331787    https://www.msn.com/en-us/news/world/us-halts-...
337548    

In [121]:
# Peek at the first five rows of the documents frame (third tuple element).
gdelt_result[2].values[:5]

array([['20200821104500-689', 20200821104500, 1, 'msn.com',
        'https://www.msn.com/en-us/news/world/un-team-meets-detained-mali-president-as-coup-supporters-plan-to-rally/ar-BB18dT7o',
        nan, nan,
        'TAX_WORLDMAMMALS;TAX_WORLDMAMMALS_HUMAN;SELF_IDENTIFIED_HUMAN_RIGHTS;WB_2203_HUMAN_RIGHTS;WB_2432_FRAGILITY_CONFLICT_AND_VIOLENCE;UNGP_CRIME_VIOLENCE;TAX_FNCACT;TAX_FNCACT_OFFICIALS;TAX_ETHNICITY;TAX_ETHNICITY_MALI;LEADER;TAX_FNCACT_PRESIDENT;USPEC_POLITICS_GENERAL1;REBELLION;GENERAL_GOVERNMENT;RESIGNATION;ARREST;CRISISLEX_C07_SAFETY;CRISISLEX_CRISISLEXREC;INSURGENCY;WB_2433_CONFLICT_AND_VIOLENCE;WB_2464_INSURGENCY;WB_2462_POLITICAL_VIOLENCE_AND_WAR;TERROR;ARMEDCONFLICT;PEACEKEEPING;SECURITY_SERVICES;WB_2470_PEACE_OPERATIONS_AND_CONFLICT_MANAGEMENT;WB_2471_PEACEKEEPING;MEDIA_SOCIAL;ALLIANCE;TAX_FNCACT_LEADERS;ELECTION;EPU_CATS_MIGRATION_FEAR_FEAR;TAX_ETHNICITY_FRENCH;TAX_WORLDLANGUAGES;TAX_WORLDLANGUAGES_FRENCH;TAX_RELIGION;TAX_RELIGION_ISLAMIC;TAX_TERROR_GROUP;TAX_TERROR

In [55]:
# Peek at the "SharingImage" value of the first five documents.
gdelt_result[2]["SharingImage"].values[:5]

array([nan,
       'https://s2.reutersmedia.net/resources/r/?m=02&d=20200821&t=2&i=1530485786&r=LYNXMPEG7K0TC',
       'https://s1.reutersmedia.net/resources/r/?m=02&d=20200821&t=2&i=1530487058&r=LYNXMPEG7K0U2',
       'https://s4.reutersmedia.net/resources/r/?m=02&d=20200821&t=2&i=1530487329&r=LYNXMPEG7K0U9',
       'https://www.swissinfo.ch/resource/image/45982994/landscape_ratio16x9/1920/1080/23f05750e647b8f1409fbde3dfbcf74a/LO/image_kbn25h19o.jpg'],
      dtype=object)

In [122]:
# Summarize the query result as event locations plus per-document news entries.
describe_gdelt_result(gdelt_result)

{'locations': array(['Bamako, Bamako, Mali'], dtype=object),
 'news': [{'image': 'https://s2.reutersmedia.net/resources/r/?m=02&d=20200821&t=2&i=1530485786&r=LYNXMPEG7K0TC',
   'links': ['http://thomsonreuters.com/en/about-us/trust-principles.html'],
   'source': 'reuters.com',
   'title': 'U.N. team meets detained Mali president as coup supporters plan to rally'},
  {'image': 'https://s.yimg.com/uu/api/res/1.2/ApRWaFNJfrafM0vVD.hpMQ--~B/aD01NzY7dz0xMDI0O3NtPTE7YXBwaWQ9eXRhY2h5b24-/https://media.zenfs.com/en/france_24_english_articles_100/01b8407d87873d0818657e38244ee12e',
   'links': ['http://www.france24.com/en/20200818-ecowas-calls-on-mali-soldiers-to-end-the-mutiny',
    'http://www.france24.com/en/20200820-a-transitional-civilian-or-military-president-will-take-power-in-mali-junta-tells-france-24',
    'http://www.france24.com/en/20200821-un-team-meets-mali-s-deposed-president-keita-following-coup',
    'http://www.france24.com/en/tag/Ibrahim-Boubacar-Keita/',
    'http://www.fran

In [123]:
from IPython.display import HTML, display

# Render the summarized result as HTML inside the notebook.
html_news = gdelt_result_tohtml(gdelt_result)
display(HTML(html_news))