Adopted from GDELT Data Wrangle by James Houghton https://nbviewer.jupyter.org/github/JamesPHoughton/Published_Blog_Scripts/blob/master/GDELT%20Wrangler%20-%20Clean.ipynb

Additional GDELT resources: 
    
    GDELT library overview: https://colab.research.google.com/drive/1rnKEHKV1StOwGtFPsCctKDPTBB_kHOc_?usp=sharing 
    
    GDELT with big data: https://github.com/linwoodc3/gdeltPyR/wiki/Pulling-Large-GDELT-Data
        

# PART I: Get GDELT DATA FOR NIGER


### Get the GDELT index files

In [1]:
import requests
import lxml.html as lh

gdelt_base_url = 'http://data.gdeltproject.org/events/'

# get the list of all the links on the gdelt file page
page = requests.get(gdelt_base_url+'index.html') #Grab GDELT reference list which is by day
doc = lh.fromstring(page.content)
link_list = doc.xpath("//*/ul/li/a/@href") #Returns all the possible CSV files of GDELT data as a references list

# separate out those links that begin with four digits 
'''
Will extract just the days resulting in list like: 
['20200617.export.CSV.zip',
 '20200616.export.CSV.zip',
 '20200615.export.CSV.zip',...]
 Until 2015
'''

file_list = [x for x in link_list if str.isdigit(x[0:4])]
file_list

['20201011.export.CSV.zip',
 '20201010.export.CSV.zip',
 '20201009.export.CSV.zip',
 '20201008.export.CSV.zip',
 '20201007.export.CSV.zip',
 '20201006.export.CSV.zip',
 '20201005.export.CSV.zip',
 '20201004.export.CSV.zip',
 '20201003.export.CSV.zip',
 '20201002.export.CSV.zip',
 '20201001.export.CSV.zip',
 '20200930.export.CSV.zip',
 '20200929.export.CSV.zip',
 '20200928.export.CSV.zip',
 '20200927.export.CSV.zip',
 '20200926.export.CSV.zip',
 '20200925.export.CSV.zip',
 '20200924.export.CSV.zip',
 '20200923.export.CSV.zip',
 '20200922.export.CSV.zip',
 '20200921.export.CSV.zip',
 '20200920.export.CSV.zip',
 '20200919.export.CSV.zip',
 '20200918.export.CSV.zip',
 '20200917.export.CSV.zip',
 '20200916.export.CSV.zip',
 '20200915.export.CSV.zip',
 '20200914.export.CSV.zip',
 '20200913.export.CSV.zip',
 '20200912.export.CSV.zip',
 '20200911.export.CSV.zip',
 '20200910.export.CSV.zip',
 '20200909.export.CSV.zip',
 '20200908.export.CSV.zip',
 '20200907.export.CSV.zip',
 '20200906.export.CS

In [2]:
#Counters to help assess how many files are coming and going out
infilecounter = 0
outfilecounter = 0

In [22]:
print(len(file_list))

2861


### Uses GDELT Index file list to download GDELT data for that day for that country

In [3]:
import os.path #To help navigate the file directories
import urllib #To request from GDELT
import zipfile #TO unzip the files we downlaod
import glob #To go through multiple files in a directory
import operator 

local_path = './results/' # Will save to empy results folder to help keep file clean

fips_country_code = 'NG'  ## !!!!! THIS IS THE NIGER COUNTRY CODE GETS ONLY NIGER DATA!!!!

#Adjust list number to get days wanted 
for compressed_file in file_list[:7]: #!!!!!Only getting index 0 to 6!!!!!!
    print(compressed_file,)
    
    # if we dont have the compressed file stored locally, go get it. Keep trying if necessary.
    while not os.path.isfile(local_path+compressed_file): 
        print('downloading,'),
        urllib.request.urlretrieve(url=gdelt_base_url+compressed_file, 
                           filename=local_path+compressed_file)
        
    # extract the contents of the compressed file to a temporary directory    
    print('extracting,'),
    z = zipfile.ZipFile(file=local_path+compressed_file, mode='r')    
    z.extractall(path=local_path+'tmp/')
    
    # parse each of the csv files in the working directory, 
    print('parsing,'),
    for infile_name in glob.glob(local_path+'tmp/*'):
        outfile_name = local_path+fips_country_code+'%04i.tsv'%outfilecounter
        
        # open the infile and outfile
        with open(infile_name, mode='r', encoding="ISO-8859-1") as infile, open(outfile_name, mode='w') as outfile:
            for line in infile:
                # extract lines with our interest country code
                if fips_country_code in operator.itemgetter(51, 37, 44)(line.split('\t')):    
                    outfile.write(line)
            outfilecounter +=1
            
        # delete the temporary file
        os.remove(infile_name)
    infilecounter +=1
    print('done', infilecounter)
    

20201011.export.CSV.zip
downloading,
extracting,
parsing,
done 1
20201010.export.CSV.zip
downloading,
extracting,
parsing,
done 2
20201009.export.CSV.zip
downloading,
extracting,
parsing,
done 3
20201008.export.CSV.zip
downloading,
extracting,
parsing,
done 4
20201007.export.CSV.zip
downloading,
extracting,
parsing,
done 5
20201006.export.CSV.zip
downloading,
extracting,
parsing,
done 6
20201005.export.CSV.zip
downloading,
extracting,
parsing,
done 7


# PART II:  PARSE DATA AGAIN

### Read in the data

In [4]:
import pandas as pd

# Get the GDELT field names from a helper file
colnames = pd.read_csv('CSV.header.fieldids.csv')['Field Name']


# Build DataFrames from each of the intermediary files
files = glob.glob(local_path+fips_country_code+'*')
DFlist = []
for active_file in files:
    print(active_file)
    DFlist.append(pd.read_csv(active_file, sep='\t', header=None, dtype=str,
                              names=colnames, index_col=['GLOBALEVENTID'], encoding='iso-8859-1'))

# Merge the file-based dataframes and save a pickle
DF = pd.concat(DFlist)
DF.to_pickle(local_path+'backup'+fips_country_code+'.pickle')    
    
# once everythin is safely stored away, remove the temporary files
for active_file in files:
    os.remove(active_file)

./results/NG0002.tsv
./results/NG0003.tsv
./results/NG0006.tsv
./results/NG0000.tsv
./results/NG0004.tsv
./results/NG0005.tsv
./results/NG0001.tsv


In [5]:
import pickle

Niger_Data = pd.read_pickle(r"./results/backupNG.pickle")

### See top 5 lines of data

In [6]:
Niger_Data.head()

Unnamed: 0_level_0,SQLDATE,MonthYear,Year,FractionDate,Actor1Code,Actor1Name,Actor1CountryCode,Actor1KnownGroupCode,Actor1EthnicCode,Actor1Religion1Code,...,Actor2Geo_FeatureID,ActionGeo_Type,ActionGeo_FullName,ActionGeo_CountryCode,ActionGeo_ADM1Code,ActionGeo_Lat,ActionGeo_Long,ActionGeo_FeatureID,DATEADDED,SOURCEURL
GLOBALEVENTID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
950900071,20201009,202010,2020,2020.7644,ITA,ITALY,ITA,,,,...,NG,1,Niger,NG,NG,16,8,NG,20201009,https://www.ibtimes.com/french-italian-captive...
950900073,20201009,202010,2020,2020.7644,ITA,ITALIAN,ITA,,,,...,-1072484,1,Niger,NG,NG,16,8,NG,20201009,https://www.ibtimes.com/french-italian-captive...
950900075,20201009,202010,2020,2020.7644,ITA,ITALY,ITA,,,,...,NG,1,Niger,NG,NG,16,8,NG,20201009,https://www.ibtimes.com/french-italian-captive...
950900082,20201009,202010,2020,2020.7644,ITAMED,ITALY,ITA,,,,...,IT,1,Niger,NG,NG,16,8,NG,20201009,https://www.ibtimes.com/french-italian-captive...
950900473,20201009,202010,2020,2020.7644,MLI,MALI,MLI,,,,...,NG,1,Niger,NG,NG,16,8,NG,20201009,https://www.ibtimes.com/french-italian-captive...


### Helper Function  to turn codebooks  into look up tables

In [7]:
def ref_dict(df):
    cols = list(df)
    ref_dict = {}
    for row in df.iterrows(): 
        ref_dict[row[1][cols[0]]] = row[1][cols[1]]
    
    return ref_dict

### Convert each codebook and store in object

In [8]:
#Read in event codes
eventCodes = ref_dict(pd.read_csv("./Ref Codes/CAMEO.eventcodes.txt", sep='\t'))
#Read in Goldsteinscale
goldScale = ref_dict(pd.read_csv("./Ref Codes/CAMEO.goldsteinscale.txt", sep='\t'))
#Read in ethnic groups
ethnicCodes =ref_dict(pd.read_csv("./Ref Codes/CAMEO.ethnic.txt", sep='\t'))
#Read in known Groups
knownGroups = ref_dict(pd.read_csv("./Ref Codes/CAMEO.knowngroup.txt", sep='\t'))
#Read in relgion
religionCodes = ref_dict(pd.read_csv("./Ref Codes/CAMEO.religion.txt", sep='\t'))
#Read in type
typeCodes = ref_dict(pd.read_csv("./Ref Codes/CAMEO.type.txt", sep='\t'))

eventCodes

{1: 'MAKE PUBLIC STATEMENT',
 10: 'DEMAND',
 11: 'DISAPPROVE',
 12: 'REJECT',
 13: 'THREATEN',
 14: 'PROTEST',
 15: 'EXHIBIT FORCE POSTURE',
 16: 'REDUCE RELATIONS',
 17: 'COERCE',
 18: 'ASSAULT',
 19: 'FIGHT',
 2: 'APPEAL',
 20: 'USE UNCONVENTIONAL MASS VIOLENCE',
 21: 'Appeal for material cooperation, not specified below',
 211: 'Appeal for economic cooperation',
 212: 'Appeal for military cooperation',
 213: 'Appeal for judicial cooperation',
 214: 'Appeal for intelligence',
 22: 'Appeal for diplomatic cooperation, such as policy support',
 23: 'Appeal for aid, not specified below',
 231: 'Appeal for economic aid',
 232: 'Appeal for military aid',
 233: 'Appeal for humanitarian aid',
 234: 'Appeal for military protection or peacekeeping',
 24: 'Appeal for political reform, not specified below',
 241: 'Appeal for change in leadership',
 242: 'Appeal for policy change',
 243: 'Appeal for rights',
 244: 'Appeal for change in institutions, regime',
 25: 'Appeal to yield',
 251: 'Appeal 

In [9]:
# Turn colnames into list for ref

cross_ref = list(colnames)


In [10]:
# Create look up table to get values instead of numbers

look_up_code = {"eventCodes": [26,27,28], "goldScale":[30], "ethnicCodes":[9,19], "knownGroups":[8,18], 
                "religionCodes":[10,11,20,21], "typeCodes":[12,13,14,22,23,24]}

In [16]:
'''
Helper function to user can reorient data based on interest from codes

data: Niger_Data - pandas dataframe
ref: key value from look_look_code - string
codebook: reference 
'''

import math

def search_dict(data,ref, codebook):
    res = {}
    look_up = look_up_code[ref]
    col_names = []
    for i in look_up: 
        col_names.append(cross_ref[i])
    
    for col in col_names: 
        for row in data.iterrows(): 
            if isinstance(row[1][col],float):
                #print (type(row[1][col]), col)
                pass
            else: 
                #print (col)
                var = codebook[row[1][col]].upper()
                #print (var, row[1][col])
                if var in res.keys(): 
                    #print(row[1][col])
                    res[var].append(dict(row[1]))
                else: 
                    res[var] = [dict(row[1])]
    return res
    


In [17]:
res = search_dict(Niger_Data, "ethnicCodes", ethnicCodes)
res.keys()

dict_keys(['TUAREG', 'IGBO', 'IJAW'])

In [18]:
#verfication to ensure code is working properly
for k,v in res.items(): 
    print (k, ": ", len(v))

TUAREG :  2
IGBO :  2
IJAW :  4


In [19]:
#Put each collection of articles in a Dataframe
list_res = []

for cat in res.values(): 
    #print(cat)
    list_res.append(pd.DataFrame(cat))
    

In [21]:
list_res[0] #access the group you are interested in by changing the variables

Unnamed: 0,SQLDATE,MonthYear,Year,FractionDate,Actor1Code,Actor1Name,Actor1CountryCode,Actor1KnownGroupCode,Actor1EthnicCode,Actor1Religion1Code,...,Actor2Geo_FeatureID,ActionGeo_Type,ActionGeo_FullName,ActionGeo_CountryCode,ActionGeo_ADM1Code,ActionGeo_Lat,ActionGeo_Long,ActionGeo_FeatureID,DATEADDED,SOURCEURL
0,20201007,202010,2020,2020.7589,tmh,TUAREG,,,tmh,,...,-1077349,4,"Agadez, Agadez, Niger",NG,NG01,16.9733,7.99111,-1077349,20201007,https://northerntransmissions.com/mdou-moctar-...
1,20201007,202010,2020,2020.7589,BUS,PRODUCER,,,,,...,-1077349,4,"Agadez, Agadez, Niger",NG,NG01,16.9733,7.99111,-1077349,20201007,https://northerntransmissions.com/mdou-moctar-...


### Homework 4: Do some type of analysis with GDELT data. It can be country focused (e.g. Guatemala) or topic focused (e.g. attacks or bilateral agreements)

### Must write in the first cell what you are interested in. Code must work but results can be garabage. Update the GDELT parameters to get the information you want and then include some type of plot can be a graph or can be a map.  

### Total Points Possible 19
      

In [5]:
import requests
import lxml.html as lh
import os.path #To help navigate the file directories
import urllib #To request from GDELT
import zipfile #TO unzip the files we downlaod
import glob #To go through multiple files in a directory
import operator 
import pandas as pd

'''I'm looking at the current Armenia-Azerbaijan conflict over Nagorno-Karabakh. In the interests of just doing the assignment, I'll just show the sites of a) Armenian strikes, b) Azeri strikes, and c) ground engagements.'''

gdelt_base_url = 'http://data.gdeltproject.org/events/'

# get the list of all the links on the gdelt file page
page = requests.get(gdelt_base_url+'index.html') #Grab GDELT reference list which is by day
doc = lh.fromstring(page.content)
link_list = doc.xpath("//*/ul/li/a/@href") #Returns all the possible CSV files of GDELT data as a references list

# separate out those links that begin with four digits 
'''
Will extract just the days resulting in list like: 
['20200617.export.CSV.zip',
 '20200616.export.CSV.zip',
 '20200615.export.CSV.zip',...]
 Until 2015
'''

infilecounter = 0
outfilecounter = 0

file_list = [x for x in link_list if str.isdigit(x[0:4])]

local_path = './results/' # Will save to empy results folder to help keep file clean

fips_country_codes = ['AJ', 'AM']

#Adjust list number to get days wanted 
for compressed_file in file_list[:16]: #!!!!!Only getting index 0 to 6!!!!!!
    print(compressed_file,)
    
    # if we dont have the compressed file stored locally, go get it. Keep trying if necessary.
    while not os.path.isfile(local_path+compressed_file): 
        print('downloading,'),
        urllib.request.urlretrieve(url=gdelt_base_url+compressed_file, 
                           filename=local_path+compressed_file)
        
    # extract the contents of the compressed file to a temporary directory    
    print('extracting,'),
    z = zipfile.ZipFile(file=local_path+compressed_file, mode='r')    
    z.extractall(path=local_path+'tmp/')
    
    # parse each of the csv files in the working directory, 
    print('parsing,'),
    for infile_name in glob.glob(local_path+'tmp/*'):
        outfile_name = local_path+'HWK'+'%04i.tsv'%outfilecounter
        
        # open the infile and outfile
        with open(infile_name, mode='r', encoding="ISO-8859-1") as infile, open(outfile_name, mode='w') as outfile:
            for line in infile:
                # extract lines with our interest country code
                if any([x in operator.itemgetter(51, 37, 44)(line.split('\t')) for x in fips_country_codes]):    
                    outfile.write(line)
            outfilecounter +=1
            
        # delete the temporary file
        os.remove(infile_name)
    infilecounter +=1
    print('done', infilecounter)
    

# Get the GDELT field names from a helper file
colnames = pd.read_csv('CSV.header.fieldids.csv')['Field Name']


# Build DataFrames from each of the intermediary files
files = glob.glob(local_path+'HWK'+'*')
DFlist = []
for active_file in files:
    print(active_file)
    DFlist.append(pd.read_csv(active_file, sep='\t', header=None, dtype=str,
                              names=colnames, index_col=['GLOBALEVENTID'], encoding='iso-8859-1'))

# Merge the file-based dataframes and save a pickle
DF = pd.concat(DFlist)
DF.to_pickle(local_path+'backup'+'HWK'+'.pickle')    
    
# once everythin is safely stored away, remove the temporary files
for active_file in files:
    os.remove(active_file)

20201011.export.CSV.zip
extracting,
parsing,
done 1
20201010.export.CSV.zip
extracting,
parsing,
done 2
20201009.export.CSV.zip
extracting,
parsing,
done 3
20201008.export.CSV.zip
extracting,
parsing,
done 4
20201007.export.CSV.zip
extracting,
parsing,
done 5
20201006.export.CSV.zip
extracting,
parsing,
done 6
20201005.export.CSV.zip
extracting,
parsing,
done 7
20201004.export.CSV.zip
extracting,
parsing,
done 8
20201003.export.CSV.zip
extracting,
parsing,
done 9
20201002.export.CSV.zip
extracting,
parsing,
done 10
20201001.export.CSV.zip
extracting,
parsing,
done 11
20200930.export.CSV.zip
extracting,
parsing,
done 12
20200929.export.CSV.zip
extracting,
parsing,
done 13
20200928.export.CSV.zip
extracting,
parsing,
done 14
20200927.export.CSV.zip
downloading,
extracting,
parsing,
done 15
20200926.export.CSV.zip
downloading,
extracting,
parsing,
done 16
./results/HWK0012.tsv
./results/HWK0002.tsv
./results/HWK0001.tsv
./results/HWK0008.tsv
./results/HWK0006.tsv
./results/HWK0014.tsv
./r

In [7]:
import pickle

homework_data = pd.read_pickle(r"./results/backupHWK.pickle")

In [10]:
homework_data.columns.values.tolist()

['SQLDATE',
 'MonthYear',
 'Year',
 'FractionDate',
 'Actor1Code',
 'Actor1Name',
 'Actor1CountryCode',
 'Actor1KnownGroupCode',
 'Actor1EthnicCode',
 'Actor1Religion1Code',
 'Actor1Religion2Code',
 'Actor1Type1Code',
 'Actor1Type2Code',
 'Actor1Type3Code',
 'Actor2Code',
 'Actor2Name',
 'Actor2CountryCode',
 'Actor2KnownGroupCode',
 'Actor2EthnicCode',
 'Actor2Religion1Code',
 'Actor2Religion2Code',
 'Actor2Type1Code',
 'Actor2Type2Code',
 'Actor2Type3Code',
 'IsRootEvent',
 'EventCode',
 'EventBaseCode',
 'EventRootCode',
 'QuadClass',
 'GoldsteinScale',
 'NumMentions',
 'NumSources',
 'NumArticles',
 'AvgTone',
 'Actor1Geo_Type',
 'Actor1Geo_FullName',
 'Actor1Geo_CountryCode',
 'Actor1Geo_ADM1Code',
 'Actor1Geo_Lat',
 'Actor1Geo_Long',
 'Actor1Geo_FeatureID',
 'Actor2Geo_Type',
 'Actor2Geo_FullName',
 'Actor2Geo_CountryCode',
 'Actor2Geo_ADM1Code',
 'Actor2Geo_Lat',
 'Actor2Geo_Long',
 'Actor2Geo_FeatureID',
 'ActionGeo_Type',
 'ActionGeo_FullName',
 'ActionGeo_CountryCode',
 'Acti

In [42]:
# See what convential conflict codes we have
pd.unique(homework_data.loc[homework_data['EventRootCode'] == '19'].EventCode)


array(['190', '193', '194', '195', '192', '196', '191'], dtype=object)

In [63]:
# Commit an act of data malpractice and pretend that "tanks" whatever the heck this decides that includes aren't involved in this conflict; treating all 194s as artillery strikes.

indirect_attacks_ARM = homework_data.loc[(homework_data['EventCode'].isin(['194', '195'])) & (homework_data['Actor1CountryCode'] == 'ARM')]
indirect_attacks_AZE = homework_data.loc[(homework_data['EventCode'].isin(['194', '195'])) & (homework_data['Actor1CountryCode'] == 'AZE')]
ground_engagements = homework_data.loc[homework_data['EventCode'] == '193']

print('Total ARM indirect: {}   |   Total AZE indirect: {}   |   Total ground engagements: {}'.format(len(indirect_attacks_ARM), len(indirect_attacks_AZE), len(ground_engagements)))

Total ARM indirect: 396   |   Total AZE indirect: 395   |   Total ground engagements: 1984


In [75]:
homework_events = pd.concat([indirect_attacks_AZE, indirect_attacks_ARM, ground_engagements])

In [92]:

from bokeh.plotting import figure, output_notebook, show #builds interactive graphs for python
from bokeh.models import Range1d
import math #this is used in graphic section to use the irrational number pi
from bokeh.tile_providers import get_provider, Vendors
from pyproj import Transformer

output_notebook() #Allows inline plotting for Juptyer notebook

In [93]:
''' NOT USING THIS.  It's got some issues I need to work out, but that aren't relevant to this assignment.

# Compute our map bounds
def map_bounds(latitudes, longitudes, x_buffer = 1, y_buffer = 1):
    # Takes pandas series data for lat and long coords and determines the map bounds
    lat_min, lat_max = min(latitudes.astype('float').values), max(latitudes.astype('float').values)
    lon_min, lon_max = min(longitudes.astype('float').values), max(longitudes.astype('float').values)
    bounding_box = [
        (lat_min - x_buffer, lon_max + y_buffer),
        (lat_max + x_buffer, lon_min - y_buffer)
    ]
    return bounding_box'''



In [112]:
# Coordinate conversion and bounding box setup

import numpy as np

bounds = [(36.00, 51.25), (44.25, 43.00)]

transformer = Transformer.from_crs('epsg:4326','epsg:3857')
bounding_box = []
for pt in transformer.itransform(bounds): 
    bounding_box.append(pt)
frames = [indirect_attacks_ARM, indirect_attacks_AZE, ground_engagements]
for frame in frames:
    ll_points = list(zip(frame.ActionGeo_Lat.astype(float).tolist(), frame.ActionGeo_Long.astype(float).tolist()))
    wm_points = list(transformer.itransform(ll_points))
    frame['wm_latitude'] = [x[1] for x in wm_points]
    frame['wm_longitude'] = [x[0] for x in wm_points]

In [113]:
#Plots the bounding box
p = figure(x_range=(bounding_box[0][0], bounding_box[1][0]),y_range=(bounding_box[0][1], bounding_box[1][1]),x_axis_type="mercator", y_axis_type="mercator")
tile_provider = get_provider('STAMEN_TERRAIN')
#add the map form the Bokeh map vendor in this case Stamen_Terrain --- see documentation
p.add_tile(tile_provider)
# Places a circle for each converted lat/long attack
dot_colors = ['firebrick', 'cyan', 'mediumseagreen']
frames = [indirect_attacks_ARM, indirect_attacks_AZE, ground_engagements]
for frame, color in zip(frames, dot_colors):
    for row in frame.itertuples():
        p.circle(x=row.wm_latitude, y=row.wm_longitude, color=color)

#shows the plot
show(p)