# Requirements

In [1]:
!pip install --user gdelt



# Import modules

In [12]:
from datetime import date, timedelta
from gdelt import gdelt as gdelt_client
import matplotlib.pyplot as plot
import pandas
import tempfile

# Query the knowledge graph
Use the `coverage` option to query all records of a day. Without it, only the records collected during the last 15 minutes are returned.
Use the `date` argument to select the day to query.

In [27]:
def get_graph(date, coverage=False):
    """Fetch the GKG table for a single day and normalise its DATE column.

    Args:
        date: Day to query; it is formatted with ``strftime("%Y %m %d")``.
        coverage: Forwarded unchanged to ``Search``.

    Returns:
        The frame returned by ``Search`` with the ``DATE`` column cast to
        ``str`` and cut down to its first 14 characters.
    """
    gkg_client = gdelt_client(version=2)
    records = gkg_client.Search(date.strftime("%Y %m %d"), table="gkg", coverage=coverage)
    del gkg_client

    records = records.astype({"DATE": str})
    # Keep only the leading 14 characters of every timestamp string.
    records["DATE"] = records["DATE"].str[:14]
    return records

def get_graph_range(from_date, to_date, coverage=False):
    """Fetch the GKG table for every day of an inclusive date range.

    One ``Search`` call is issued per day from ``from_date`` up to and
    including ``to_date``, all through a single client instance.

    Args:
        from_date: First day of the range.
        to_date: Last day of the range (inclusive). May equal ``from_date``,
            in which case exactly that one day is queried.
        coverage: Forwarded unchanged to every ``Search`` call.

    Returns:
        The per-day frames stacked row-wise (each frame keeps its own row
        labels, so labels may repeat), with ``DATE`` cast to ``str`` and cut
        down to its first 14 characters. ``None`` when ``to_date`` lies
        before ``from_date``.

    Raises:
        Whatever ``Search`` raises; nothing is caught here, so one failing
        day aborts the whole range.
    """
    day_count = (to_date - from_date).days
    if day_count < 0:
        # Reversed range: keep the established "no result" signal.
        return None

    client = gdelt_client(version=2)
    daily_graphs = []
    for offset in range(day_count + 1):
        day = from_date + timedelta(days=offset)
        daily_graph = client.Search(day.strftime("%Y %m %d"), table="gkg", coverage=coverage)
        daily_graph = daily_graph.astype({"DATE": str})
        daily_graph["DATE"] = daily_graph["DATE"].apply(lambda date_str: date_str[:14])
        daily_graphs.append(daily_graph)
    del client

    if len(daily_graphs) == 1:
        return daily_graphs[0]
    # Concatenate once at the end instead of re-copying the growing frame
    # on every iteration.
    return pandas.concat(daily_graphs, axis=0)

def get_today_graph(coverage=False):
    """Return the result of ``get_graph`` for ``date.today()``."""
    today = date.today()
    return get_graph(today, coverage)

def get_yesterday_graph(coverage=False):
    """Return the result of ``get_graph`` for the day before ``date.today()``."""
    one_day = timedelta(days=1)
    yesterday = date.today() - one_day
    return get_graph(yesterday, coverage)

In [28]:
# Query window: the three days ending yesterday (start_date and end_date are
# both included by get_graph_range).
end_date = date.today()-timedelta(days=1)
start_date = end_date-timedelta(days=2)
graph = get_graph_range(start_date, end_date, coverage=False)
# Keep only the records whose V2Themes text contains the coronavirus theme;
# na=False treats records without themes as non-matching.
corona_records = graph.loc[graph["V2Themes"].str.contains("TAX_DISEASE_CORONAVIRUS", na=False)]

# The exported file names below are stamped with the last day of the window.
report_date = end_date



ValueError: This GDELT query returned no data. Check query parameters and retry

In [None]:
# Write the filtered records to corona_<YYYYMMDD>.csv in the system temp
# directory; index=False leaves the row labels out of the file.
corona_records.to_csv("{}/corona_{}.csv".format(tempfile.gettempdir(), report_date.strftime("%Y%m%d")), index=False)

# Exploding locations and filtering by location type

In [59]:
from enum import Enum

class location_type(Enum):
    """Location type
        Defines the different location types.

        UNKNOWN (0) is the type a gdelt_location receives when it is
        created without an explicit type id.
        NOTE(review): the ids 1-5 presumably mirror the location type codes
        used in the GDELT location fields -- confirm against the codebook.
    """
    # Fallback used when no type id is supplied.
    UNKNOWN = 0
    COUNTRY = 1
    USSTATE = 2
    USCITY = 3
    WORLDCITY = 4
    WORLDSTATE = 5

class gdelt_location:
    """GDELT location
        Defines a GDELT location.

        All values except the type id are stored exactly as passed in (no
        conversion of the coordinates is performed). The type id is
        converted with int() and looked up in the location_type enum.
    """
    def __init__(self, location_typeid=0, name=None, country_code=None, admin1_code=None, lat=None, lon=None, feature_id=None):
        # int() allows the id to arrive as text (e.g. "4"). An id that is not
        # a member value of location_type makes the enum lookup raise
        # ValueError.
        self.location_type = location_type(int(location_typeid))
        self.location_name = name
        self.country_code = country_code
        self.admin1_code = admin1_code
        self.location_lat = lat
        self.location_lon = lon
        self.feature_id = feature_id

    def has_location_type(self, location_type):
        """Return True when this location has exactly the given type."""
        return location_type == self.location_type

    def location_type_matches(self, location_types):
        """Return True when this location's type is contained in location_types."""
        return self.location_type in location_types

    def __str__(self):
        """Return the location name, or an empty string when there is none."""
        # __str__ has to return a str: handing back None (the default name)
        # would make str(location) raise TypeError.
        if self.location_name is None:
            return ""
        return str(self.location_name)

    def __repr__(self):
        """Return a debugging representation with type, name and coordinates."""
        return "{}(location_type={!r}, name={!r}, lat={!r}, lon={!r})".format(
            type(self).__name__,
            self.location_type,
            self.location_name,
            self.location_lat,
            self.location_lon,
        )
    
class location_filter():
    """Location Filter
        Defines different filters which can be applied on the dataframes.

        Every filter reads the "GDELT_Locations" column, which has to hold
        one location object per row, and returns the matching rows. The row
        mask is computed from that single column, so an empty dataframe
        yields an empty result instead of an indexing error.
    """
    def filter_by_type(self, gkg_dataframe, location_type):
        """Return the rows whose location has exactly the given type."""
        matches = gkg_dataframe["GDELT_Locations"].map(
            lambda location: location.has_location_type(location_type))
        # astype(bool) keeps the mask boolean even when it is empty; the
        # plain array makes the selection positional.
        return gkg_dataframe.loc[matches.astype(bool).to_numpy()]

    def filter_by_types(self, gkg_dataframe, location_types):
        """Return the rows whose location type is one of location_types."""
        matches = gkg_dataframe["GDELT_Locations"].map(
            lambda location: location.location_type_matches(location_types))
        return gkg_dataframe.loc[matches.astype(bool).to_numpy()]

def split_location_entries(locations):
    """Turn already-split location entries into gdelt_location objects.

    An entry with exactly seven fields is passed positionally to
    gdelt_location; every other entry yields a default gdelt_location().
    """
    parsed = []
    for fields in locations:
        if len(fields) == 7:
            parsed.append(gdelt_location(*fields))
        else:
            parsed.append(gdelt_location())
    return parsed

def split_locations(record):
    """Parse the "Locations" value of a record into gdelt_location objects.

    The value is converted to text, cut into entries at ";" and each entry
    into fields at "#"; the field lists go to split_location_entries.
    """
    raw_entries = str(record["Locations"]).split(";")
    field_lists = [entry.split("#") for entry in raw_entries]
    return split_location_entries(field_lists)


# Filter the locations by location type
- We explode the records on the locations column, so that every row holds exactly one location.
- We filter the exploded rows by location type (e.g. to return only cities).

In [60]:
corona_locations = corona_records.copy(deep=True)
if not corona_locations.empty:
    # One list of gdelt_location objects per record, then one row per
    # location after the explode.
    corona_locations["GDELT_Locations"] = corona_records.apply(split_locations, axis=1)
    corona_locations_exploded = corona_locations.explode("GDELT_Locations")

    # NOTE: this name shadows the built-in filter() for the rest of the notebook.
    filter = location_filter()
    city_types = [location_type.WORLDCITY, location_type.USCITY]
    corona_filtered_locations = filter.filter_by_types(corona_locations_exploded, city_types)
    del city_types
else:
    # Nothing to parse: only add the (empty) column so later cells find it.
    corona_locations["GDELT_Locations"] = []
    corona_filtered_locations = corona_locations

# The unfiltered frames are no longer needed.
del corona_records, graph

# Extract the coordinates and the name from the GDELT location

In [61]:
def to_point_locations(gkg_dataframe):
    """Flatten the location objects of a dataframe into plain columns.

    Args:
        gkg_dataframe: Frame with the record columns "GKGRECORDID", "DATE",
            "SourceCommonName", "DocumentIdentifier" and a "GDELT_Locations"
            column holding one location object per row.

    Returns:
        A new frame with the four record columns followed by
        "Location_Name", "Location_Lat" and "Location_Lon", which are read
        from each location object's location_name, location_lat and
        location_lon attributes. An empty input yields an empty frame that
        has just these seven columns.
    """
    record_columns = ["GKGRECORDID", "DATE", "SourceCommonName", "DocumentIdentifier"]
    location_columns = ["Location_Name", "Location_Lat", "Location_Lon"]

    if gkg_dataframe.empty:
        return pandas.DataFrame(columns=record_columns + location_columns)

    locations = gkg_dataframe["GDELT_Locations"]
    point_locations = gkg_dataframe[record_columns].copy(deep=True)
    point_locations["Location_Name"] = [location.location_name for location in locations]
    point_locations["Location_Lat"] = [location.location_lat for location in locations]
    point_locations["Location_Lon"] = [location.location_lon for location in locations]
    return point_locations

# Flatten the filtered locations into name/latitude/longitude columns, then
# drop the reference to the intermediate frame.
corona_point_locations = to_point_locations(corona_filtered_locations)
del corona_filtered_locations

# Bare expression at the end of the cell: shows the frame as the cell output.
corona_point_locations

Unnamed: 0,GKGRECORDID,DATE,SourceCommonName,DocumentIdentifier,Location_Name,Location_Lat,Location_Lon
1,20200314000000-1,20200314000000,mykxlg.com,https://www.mykxlg.com/news/national/on-eve-of...,"Hollywood, California, United States",34.0983,-118.327
1,20200314000000-1,20200314000000,mykxlg.com,https://www.mykxlg.com/news/national/on-eve-of...,"Anaheim, California, United States",33.8353,-117.915
3,20200314000000-3,20200314000000,newportri.com,https://www.newportri.com/zz/news/20200313/sch...,"Philadelphia, Pennsylvania, United States",39.9523,-75.1638
3,20200314000000-3,20200314000000,newportri.com,https://www.newportri.com/zz/news/20200313/sch...,"Cleveland, Ohio, United States",41.4995,-81.6954
3,20200314000000-3,20200314000000,newportri.com,https://www.newportri.com/zz/news/20200313/sch...,"Valley High School, California, United States",33.7231,-117.901
...,...,...,...,...,...,...,...
79207,20200315191500-1533,20200315191500,rockymounttelegram.com,https://www.rockymounttelegram.com/news/nation...,"Iditarod, Alaska, United States",62.5444,-158.095
79207,20200315191500-1533,20200315191500,rockymounttelegram.com,https://www.rockymounttelegram.com/news/nation...,"Anchorage, Alaska, United States",61.2181,-149.9
79207,20200315191500-1533,20200315191500,rockymounttelegram.com,https://www.rockymounttelegram.com/news/nation...,"Kaltag, Alaska, United States",64.3272,-158.722
79207,20200315191500-1533,20200315191500,rockymounttelegram.com,https://www.rockymounttelegram.com/news/nation...,"Nulato, Alaska, United States",64.7194,-158.103


# Save point locations to temp

In [62]:
# Write the point locations to corona_locations_<YYYYMMDD>.gkg.csv in the
# system temp directory; index=False leaves the row labels out of the file.
corona_point_locations.to_csv("{}/corona_locations_{}.gkg.csv".format(tempfile.gettempdir(), report_date.strftime("%Y%m%d")), index=False)

# The frame has been written; drop the reference to it.
del corona_point_locations