# Analysing the GDELT events

In [1]:
#!pip install gdelt

In [2]:
from arcgis.gis import GIS
from arcgis.features import GeoAccessor
import datetime as dt
import gdelt as gd
import numpy as np
import pandas as pd

In [3]:
gdelt_client = gd.gdelt()

In [15]:
end_date = dt.datetime.now()
start_date = end_date - dt.timedelta(days=1)
end_date_string = end_date.strftime('%Y %m %d')
start_date_string = start_date.strftime('%Y %m %d')
gdelt_events = gdelt_client.Search([start_date_string, end_date_string], table='events', coverage=True)





In [5]:
gdelt_events['CAMEOCodeDescription'].unique()
gdelt_events.columns

Index(['GLOBALEVENTID', 'SQLDATE', 'MonthYear', 'Year', 'FractionDate',
       'Actor1Code', 'Actor1Name', 'Actor1CountryCode', 'Actor1KnownGroupCode',
       'Actor1EthnicCode', 'Actor1Religion1Code', 'Actor1Religion2Code',
       'Actor1Type1Code', 'Actor1Type2Code', 'Actor1Type3Code', 'Actor2Code',
       'Actor2Name', 'Actor2CountryCode', 'Actor2KnownGroupCode',
       'Actor2EthnicCode', 'Actor2Religion1Code', 'Actor2Religion2Code',
       'Actor2Type1Code', 'Actor2Type2Code', 'Actor2Type3Code', 'IsRootEvent',
       'EventCode', 'CAMEOCodeDescription', 'EventBaseCode', 'EventRootCode',
       'QuadClass', 'GoldsteinScale', 'NumMentions', 'NumSources',
       'NumArticles', 'AvgTone', 'Actor1Geo_Type', 'Actor1Geo_FullName',
       'Actor1Geo_CountryCode', 'Actor1Geo_ADM1Code', 'Actor1Geo_ADM2Code',
       'Actor1Geo_Lat', 'Actor1Geo_Long', 'Actor1Geo_FeatureID',
       'Actor2Geo_Type', 'Actor2Geo_FullName', 'Actor2Geo_CountryCode',
       'Actor2Geo_ADM1Code', 'Actor2Geo_ADM2Code

In [6]:
#gdelt_events.groupby(['Actor1Name', 'ActionGeo_FullName', 'Actor2Name', 'CAMEOCodeDescription']).size().sort_values(ascending=False).reset_index(name='counts')
gdelt_events.groupby(['ActionGeo_FullName', 'CAMEOCodeDescription', 'DATEADDED']).size().sort_values(ascending=False).reset_index(name='counts')

Unnamed: 0,ActionGeo_FullName,CAMEOCodeDescription,DATEADDED,counts
0,South Korea,Express intent to meet or negotiate,20211107014500,60
1,"Tehran, Tehran, Iran",Sign formal agreement,20211106133000,52
2,Netherlands,Cooperate militarily,20211107024500,40
3,"Brussels, Bruxelles-Capitale, Belgium",Engage in negotiation,20211107104500,40
4,Indonesia,Sign formal agreement,20211106021500,38
...,...,...,...,...
61818,"Nagasaki, Yamagata, Japan",Host a visit,20211106143000,1
61819,"Nagasaki, Yamagata, Japan",Host a visit,20211106051500,1
61820,"Nagasaki, Yamagata, Japan",Engage in symbolic act,20211107123000,1
61821,"Nagasaki, Yamagata, Japan",Engage in symbolic act,20211107093000,1


In [7]:
# Find demonstrations/protests
protests_codes = ['140', '141'] \
    + ['141' + str(index) for index in range(1, 5)] \
    + ['142'] \
    + ['142' + str(index) for index in range(1, 5)] \
    + ['143'] \
    + ['143' + str(index) for index in range(1, 5)] \
    + ['144'] \
    + ['144' + str(index) for index in range(1, 5)] \
    + ['145'] \
    + ['145' + str(index) for index in range(1, 5)]

# Geo types 3:='US city', 4:= 'World city'
geo_types = [3, 4]
#gdelt_protests = gdelt_events.query('EventCode in @protests_codes').query('ActionGeo_Type in @geo_types')
protests_rootcode = '14'
start_dateadded = int(start_date.strftime('%Y%m%d%H%M%S'))
end_dateadded = int(end_date.strftime('%Y%m%d%H%M%S'))
gdelt_protests = gdelt_events[(gdelt_events.EventRootCode == protests_rootcode) & (gdelt_events.ActionGeo_Type.isin(geo_types)) & (gdelt_events.DATEADDED >= start_dateadded) & (gdelt_events.DATEADDED <= end_dateadded)]
gdelt_protests.groupby(['ActionGeo_FullName']).size().sort_values(ascending=False).reset_index(name='counts')

Unnamed: 0,ActionGeo_FullName,counts
0,"Glasgow, Glasgow City, United Kingdom",70
1,"Baghdad, Baghdad, Iraq",33
2,"London, London, City of, United Kingdom",27
3,"Khartoum, Al Khartum, Sudan",18
4,"Paris, France (general), France",15
...,...,...
156,"Muhammad Akbar, North-West Frontier, Pakistan",1
157,"Naraingarh, Haryana, India",1
158,"Ndjamena, (CD04), Chad",1
159,"Patna, Bihar, India",1


In [16]:
import io
import requests

def download_knowledge_graph(datetime_of_interest):
    last_valid_datetime = datetime_of_interest - dt.timedelta(minutes=datetime_of_interest.minute % 15,
                             seconds=datetime_of_interest.second)
    
    file_url = 'http://data.gdeltproject.org/gdeltv2/{0}.gkg.csv.zip'.format(last_valid_datetime.strftime('%Y%m%d%H%M%S'))
    web_result = requests.get(file_url)
    if 404 == web_result.status_code:
        return None
    
    with io.BytesIO(web_result.content) as file_stream:
        try:
            gkg_frame = pd.read_csv(file_stream, compression='zip', sep='\t', header=None, warn_bad_lines=False)
            gkg_frame.columns = ['GKGRECORDID',
                'DATE',
                'SourceCollectionIdentifier',
                'SourceCommonName',
                'DocumentIdentifier',
                'Counts',
                'V2Counts',
                'Themes',
                'V2Themes',
                'Locations',
                'V2Locations',
                'Persons',
                'V2Persons',
                'Organizations',
                'V2Organizations',
                'V2Tone',
                'Dates',
                'GCAM',
                'SharingImage',
                'RelatedImages',
                'SocialImageEmbeds',
                'SocialVideoEmbeds',
                'Quotations',
                'AllNames',
                'Amounts',
                'TranslationInfo',
                'Extras']
        
            return gkg_frame
        except Exception as ex:
            raise Exception('Reading knowlegde from {0} failed, cause: {1}'.format(file_url, ex))

def filter_by_theme(gkg_frame, theme):
    return gkg_frame[gkg_frame['V2Themes'].str.contains(theme, na=False)]

def to_locations(gkg_frame):
    gkg_locations = gkg_frame['V2Locations'].str.split(';')
    gdelt_locations = []
    for index, locations in gkg_locations.items():
        for location in locations:
            try:
                location_values = location.split("#")
                # Location type is LONG
                # 3:= US City
                # 4:= World City
                geo_res = int(location_values[0])
                if 3 == geo_res or 4 == geo_res:
                    name = location_values[1]
                    # Latitude and Longitude are FLOAT
                    lat = float(location_values[5])
                    lon = float(location_values[6])
                    new_location = gdelt_location(geo_res, lat, lon, name)
                    gdelt_locations.append(new_location)
            except:
                # Swallow any exception like float parsing
                pass
        
    return gdelt_locations



class gdelt_location(object):
    
    def __init__(self, geo_res, lat, lon, name):
        self._geo_res = geo_res
        self._lat = lat
        self._lon = lon
        self._name = name
        
    @property
    def lat(self):
        return self._lat
    
    @property
    def lon(self):
        return self._lon
    
    @property
    def name(self):
        return self._name
    
    def __repr__(self):
        return '{0} at {1}, {2}'.format(self._name, self._lon, self._lat)
        
def query_locations(start_date, end_date):
    if (end_date < start_date):
        return []
    
    gdelt_locations = {}
    date_of_interest = start_date
    while (date_of_interest < end_date):
        try:
            gdelt_knowledge = download_knowledge_graph(date_of_interest)
            if not None is gdelt_knowledge:
                gdelt_protests = filter_by_theme(gdelt_knowledge, 'TAX_FNCACT_DEMONSTRATORS')
                locations = to_locations(gdelt_protests)
                for location in locations:
                    if location.name in gdelt_locations:
                        gdelt_locations[location.name]['count'] += 1
                    else:
                        gdelt_locations[location.name] = {
                            'location' : location,
                            'count': 1
                        }
        except Exception as ex:
            print(ex)
        date_of_interest += dt.timedelta(minutes=15)
        
    return gdelt_locations



query_locations(start_date, end_date)

#gdelt_knowledge.columns

Reading knowlegde from http://data.gdeltproject.org/gdeltv2/20211106154500.gkg.csv.zip failed, cause: 'utf-8' codec can't decode byte 0xed in position 160: invalid continuation byte
Reading knowlegde from http://data.gdeltproject.org/gdeltv2/20211106161500.gkg.csv.zip failed, cause: 'utf-8' codec can't decode byte 0xe9 in position 54: invalid continuation byte
Reading knowlegde from http://data.gdeltproject.org/gdeltv2/20211106163000.gkg.csv.zip failed, cause: 'utf-8' codec can't decode byte 0xe8 in position 73: invalid continuation byte
'float' object is not iterable
'float' object is not iterable
Reading knowlegde from http://data.gdeltproject.org/gdeltv2/20211107010000.gkg.csv.zip failed, cause: 'utf-8' codec can't decode byte 0xe9 in position 345: invalid continuation byte
Reading knowlegde from http://data.gdeltproject.org/gdeltv2/20211107020000.gkg.csv.zip failed, cause: 'utf-8' codec can't decode byte 0xe1 in position 65: invalid continuation byte
'float' object is not iterable


{'Amsterdam, Noord-Holland, Netherlands': {'location': Amsterdam, Noord-Holland, Netherlands at 4.91667, 52.35,
  'count': 292},
 'Sydney, New South Wales, Australia': {'location': Sydney, New South Wales, Australia at 151.217, -33.8833,
  'count': 84},
 "Seoul, Soul-T'ukpyolsi, South Korea": {'location': Seoul, Soul-T'ukpyolsi, South Korea at 127.0, 37.5664,
  'count': 20},
 'Melbourne, Victoria, Australia': {'location': Melbourne, Victoria, Australia at 144.967, -37.8167,
  'count': 46},
 'Nairobi, Nairobi Area, Kenya': {'location': Nairobi, Nairobi Area, Kenya at 36.8167, -1.28333,
  'count': 6},
 'Paris, France (General), France': {'location': Paris, France (General), France at 2.33333, 48.8667,
  'count': 422},
 'Stoke Newington, Hackney, United Kingdom': {'location': Stoke Newington, Hackney, United Kingdom at -0.083333, 51.5667,
  'count': 6},
 'Glasgow, Glasgow City, United Kingdom': {'location': Glasgow, Glasgow City, United Kingdom at -4.25, 55.8333,
  'count': 2832},
 'Londo

In [61]:
gdelt_protests = gdelt_knowledge[gdelt_knowledge['V2Themes'].str.contains('TAX_FNCACT_DEMONSTRATORS', na=False)]
gdelt_locations = gdelt_protests['V2Locations'].str.split(';')
gdelt_values = []
for index, locations in gdelt_locations.items():
    try:
        values = []
        for location in locations:
            location_values = location.split("#")
            # Location type is LONG
            location_values[0] = int(location_values[0])
            # Latitude and Longitude are FLOAT
            location_values[5] = float(location_values[5])
            location_values[6] = float(location_values[6])
            # Only append to the Feature ID
            values += location_values[:8]
        
        gdelt_values.append(values)
    except:
        # Swallow any exception like float parsing
        pass

gdelt_values

[[1,
  'Scotland',
  'UK',
  'UK',
  '',
  54.0,
  -4.0,
  'UK',
  1,
  'Scotland',
  'UK',
  'UK',
  '',
  54.0,
  -4.0,
  'UK',
  1,
  'Scotland',
  'UK',
  'UK',
  '',
  54.0,
  -4.0,
  'UK',
  4,
  'Scottish Highlands, Highland, United Kingdom',
  'UK',
  'UKV3',
  '40172',
  57.5,
  -4.5,
  '-2598681',
  1,
  'United Kingdom',
  'UK',
  'UK',
  '',
  54.0,
  -4.0,
  'UK',
  1,
  'United Kingdom',
  'UK',
  'UK',
  '',
  54.0,
  -4.0,
  'UK',
  1,
  'United Kingdom',
  'UK',
  'UK',
  '',
  54.0,
  -4.0,
  'UK',
  1,
  'United Kingdom',
  'UK',
  'UK',
  '',
  54.0,
  -4.0,
  'UK',
  1,
  'United Kingdom',
  'UK',
  'UK',
  '',
  54.0,
  -4.0,
  'UK',
  1,
  'United Kingdom',
  'UK',
  'UK',
  '',
  54.0,
  -4.0,
  'UK',
  1,
  'United Kingdom',
  'UK',
  'UK',
  '',
  54.0,
  -4.0,
  'UK',
  1,
  'United Kingdom',
  'UK',
  'UK',
  '',
  54.0,
  -4.0,
  'UK',
  1,
  'United Kingdom',
  'UK',
  'UK',
  '',
  54.0,
  -4.0,
  'UK',
  1,
  'United Kingdom',
  'UK',
  'UK',
  '',
  54.

In [7]:
# Connect to ArcGIS Online anonymously
gis = GIS()

europe_map = gis.map("Europe")
europe_map.basemap = "dark-gray-vector"

In [8]:
WGS84 = 4326
gdelt_spatial = GeoAccessor.from_xy(gdelt_events, x_column='ActionGeo_Long', y_column='ActionGeo_Lat', sr=WGS84)
gdelt_spatial.spatial.plot(map_widget=europe_map, renderer_type='s')
europe_map

MapView(layout=Layout(height='400px', width='100%'))

In [9]:
import algorithm

In [10]:
gdelt_events.groupby(['ActionGeo_FullName']).size().sort_values(ascending=False).reset_index(name='counts')

Unnamed: 0,ActionGeo_FullName,counts
0,"Gaza, Israel (general), Israel",4456
1,"Jerusalem, Israel (general), Israel",2409
2,United States,1359
3,"Washington, District of Columbia, United States",1160
4,"New York, United States",1095
...,...,...
4973,"Kill Devil Hills, North Carolina, United States",1
4974,"King Saud University, Ar Riya?, Saudi Arabia",1
4975,"Kingdom Of Buganda, Mityana, Uganda",1
4976,"Kingstown, Saint George, Saint Vincent And The...",1


In [26]:
names_values = gdelt_events['ActionGeo_FullName'].dropna()
names_list = names_values.tolist()
algorithm.count(names_list, 'Gaza, Israel (general), Israel')

4456

In [27]:
names_arr = names_values.to_numpy()
algorithm.count(names_arr, 'Gaza, Israel (general), Israel')

4456