Adopted from GDELT Data Wrangle by James Houghton https://nbviewer.jupyter.org/github/JamesPHoughton/Published_Blog_Scripts/blob/master/GDELT%20Wrangler%20-%20Clean.ipynb

Additional GDELT resources: 
    
    GDELT library overview: https://colab.research.google.com/drive/1rnKEHKV1StOwGtFPsCctKDPTBB_kHOc_?usp=sharing 
    
    GDELT with big data: https://github.com/linwoodc3/gdeltPyR/wiki/Pulling-Large-GDELT-Data
        

# PART I: Get GDELT DATA FOR NIGER


### Get the GDELT index files

In [1]:
# PIKE HERE!!
# interested in Chinese Gov invovlement in South Africa
# Changed country code to SF and looked into codeType rather than eventType
# made a bar graph of instances by country fro ActionGeo_CountryType
# results are kind of garbage, and the Chinese weren't as involved as expected 
# tried to find average of AvgTone in articles

In [2]:
import requests
import lxml.html as lh

gdelt_base_url = 'http://data.gdeltproject.org/events/'

# get the list of all the links on the gdelt file page
page = requests.get(gdelt_base_url+'index.html') #Grab GDELT reference list which is by day
doc = lh.fromstring(page.content)
link_list = doc.xpath("//*/ul/li/a/@href") #Returns all the possible CSV files of GDELT data as a references list

# separate out those links that begin with four digits 
'''
Will extract just the days resulting in list like: 
['20200617.export.CSV.zip',
 '20200616.export.CSV.zip',
 '20200615.export.CSV.zip',...]
 Until 2015
'''

file_list = [x for x in link_list if str.isdigit(x[0:4])]
file_list

['20201004.export.CSV.zip',
 '20201003.export.CSV.zip',
 '20201002.export.CSV.zip',
 '20201001.export.CSV.zip',
 '20200930.export.CSV.zip',
 '20200929.export.CSV.zip',
 '20200928.export.CSV.zip',
 '20200927.export.CSV.zip',
 '20200926.export.CSV.zip',
 '20200925.export.CSV.zip',
 '20200924.export.CSV.zip',
 '20200923.export.CSV.zip',
 '20200922.export.CSV.zip',
 '20200921.export.CSV.zip',
 '20200920.export.CSV.zip',
 '20200919.export.CSV.zip',
 '20200918.export.CSV.zip',
 '20200917.export.CSV.zip',
 '20200916.export.CSV.zip',
 '20200915.export.CSV.zip',
 '20200914.export.CSV.zip',
 '20200913.export.CSV.zip',
 '20200912.export.CSV.zip',
 '20200911.export.CSV.zip',
 '20200910.export.CSV.zip',
 '20200909.export.CSV.zip',
 '20200908.export.CSV.zip',
 '20200907.export.CSV.zip',
 '20200906.export.CSV.zip',
 '20200905.export.CSV.zip',
 '20200904.export.CSV.zip',
 '20200903.export.CSV.zip',
 '20200902.export.CSV.zip',
 '20200901.export.CSV.zip',
 '20200831.export.CSV.zip',
 '20200830.export.CS

In [3]:
#Counters to help assess how many files are coming and going out
infilecounter = 0
outfilecounter = 0

### Uses GDELT Index file list to download GDELT data for that day for that country

In [4]:
import os.path #To help navigate the file directories
import urllib #To request from GDELT
import zipfile #TO unzip the files we downlaod
import glob #To go through multiple files in a directory
import operator 

local_path = './results/' # Will save to empy results folder to help keep file clean

fips_country_code = 'SF'  # Changed country code to South Africa

#Adjust list number to get days wanted 
for compressed_file in file_list[:9]: # To get the past 9 days
    print(compressed_file,)
    
    # if we dont have the compressed file stored locally, go get it. Keep trying if necessary.
    while not os.path.isfile(local_path+compressed_file): 
        print('downloading,'),
        urllib.request.urlretrieve(url=gdelt_base_url+compressed_file, 
                           filename=local_path+compressed_file)
        
    # extract the contents of the compressed file to a temporary directory    
    print('extracting,'),
    z = zipfile.ZipFile(file=local_path+compressed_file, mode='r')    
    z.extractall(path=local_path+'tmp/')
    
    # parse each of the csv files in the working directory, 
    print('parsing,'),
    for infile_name in glob.glob(local_path+'tmp/*'):
        outfile_name = local_path+fips_country_code+'%04i.tsv'%outfilecounter
        
        # open the infile and outfile
        with open(infile_name, mode='r', encoding="ISO-8859-1") as infile, open(outfile_name, mode='w') as outfile:
            for line in infile:
                # extract lines with our interest country code
                if fips_country_code in operator.itemgetter(51, 37, 44)(line.split('\t')):    
                    outfile.write(line)
            outfilecounter +=1
            
        # delete the temporary file
        os.remove(infile_name)
    infilecounter +=1
    print('done', infilecounter)
    

20201004.export.CSV.zip
extracting,
parsing,
done 1
20201003.export.CSV.zip
extracting,
parsing,
done 2
20201002.export.CSV.zip
extracting,
parsing,
done 3
20201001.export.CSV.zip
extracting,
parsing,
done 4
20200930.export.CSV.zip
extracting,
parsing,
done 5
20200929.export.CSV.zip
extracting,
parsing,
done 6
20200928.export.CSV.zip
extracting,
parsing,
done 7
20200927.export.CSV.zip
extracting,
parsing,
done 8
20200926.export.CSV.zip
extracting,
parsing,
done 9


# PART II:  PARSE DATA AGAIN

### Read in the data

In [5]:
import pandas as pd

# Get the GDELT field names from a helper file
colnames = pd.read_csv('CSV.header.fieldids.csv')['Field Name']


# Build DataFrames from each of the intermediary files
files = glob.glob(local_path+fips_country_code+'*')
DFlist = []
for active_file in files:
    print(active_file)
    DFlist.append(pd.read_csv(active_file, sep='\t', header=None, dtype=str,
                              names=colnames, index_col=['GLOBALEVENTID'], encoding='iso-8859-1'))

# Merge the file-based dataframes and save a pickle
DF = pd.concat(DFlist)
DF.to_pickle(local_path+'backup'+fips_country_code+'.pickle')    
    
# once everythin is safely stored away, remove the temporary files
for active_file in files:
    os.remove(active_file)

./results/SF0000.tsv
./results/SF0001.tsv
./results/SF0002.tsv
./results/SF0003.tsv
./results/SF0004.tsv
./results/SF0005.tsv
./results/SF0006.tsv
./results/SF0007.tsv
./results/SF0008.tsv


In [6]:
import pickle

SF_Data = pd.read_pickle(r"./results/backupSF.pickle") # CHanged from Niger_Data to SF_Data

### See top 5 lines of data

In [7]:
SF_Data.head()

Unnamed: 0_level_0,SQLDATE,MonthYear,Year,FractionDate,Actor1Code,Actor1Name,Actor1CountryCode,Actor1KnownGroupCode,Actor1EthnicCode,Actor1Religion1Code,...,Actor2Geo_FeatureID,ActionGeo_Type,ActionGeo_FullName,ActionGeo_CountryCode,ActionGeo_ADM1Code,ActionGeo_Lat,ActionGeo_Long,ActionGeo_FeatureID,DATEADDED,SOURCEURL
GLOBALEVENTID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
950095801,20201004,202010,2020,2020.7507,AFR,AFRICA,AFR,,,,...,204226.0,5,"Gauteng, Gauteng, South Africa",SF,SF06,-26.0833,28.25,204226,20201004,https://lowvelder.co.za/lnn/1144782/covid-19-a...
950096906,20201004,202010,2020,2020.7507,MED,LOCAL MEDIA,,,,,...,,5,"Gauteng, Gauteng, South Africa",SF,SF06,-26.0833,28.25,204226,20201004,https://lowvelder.co.za/lnn/1144782/covid-19-a...
950097736,20201004,202010,2020,2020.7507,,,,,,,...,-1224926.0,4,"Durban, KwaZulu-Natal, South Africa",SF,SF02,-29.85,31.0167,-1224926,20201004,https://www.enca.com/news/emirates-cancels-fli...
950097860,20201004,202010,2020,2020.7507,AFR,AFRICA,AFR,,,,...,,4,"Cape Town, Western Cape, South Africa",SF,SF11,-33.9167,18.4167,-1217214,20201004,https://www.enca.com/news/police-expect-more-a...
950097861,20201004,202010,2020,2020.7507,AFR,AFRICA,AFR,,,,...,,4,"Johannesburg, Gauteng, South Africa",SF,SF06,-26.2,28.0833,-1240261,20201004,https://mynorthwest.com/2206366/south-african-...


### Helper Function  to turn codebooks  into look up tables

In [8]:
def ref_dict(df):
    cols = list(df)
    ref_dict = {}
    for row in df.iterrows(): 
        ref_dict[row[1][cols[0]]] = row[1][cols[1]]
    
    return ref_dict

### Convert each codebook and store in object

In [9]:
#Read in event codes
eventCodes = ref_dict(pd.read_csv("./Ref Codes/CAMEO.eventcodes.txt", sep='\t'))
#Read in Goldsteinscale
goldScale = ref_dict(pd.read_csv("./Ref Codes/CAMEO.goldsteinscale.txt", sep='\t'))
#Read in ethnic groups
ethnicCodes =ref_dict(pd.read_csv("./Ref Codes/CAMEO.ethnic.txt", sep='\t'))
#Read in known Groups
knownGroups = ref_dict(pd.read_csv("./Ref Codes/CAMEO.knowngroup.txt", sep='\t'))
#Read in relgion
religionCodes = ref_dict(pd.read_csv("./Ref Codes/CAMEO.religion.txt", sep='\t'))
#Read in type
typeCodes = ref_dict(pd.read_csv("./Ref Codes/CAMEO.type.txt", sep='\t'))

print(typeCodes)

{'COP': 'Police forces', 'GOV': 'Government', 'INS': 'Insurgents', 'JUD': 'Judiciary', 'MIL': 'Military', 'OPP': 'Political Opposition', 'REB': 'Rebels', 'SEP': 'Separatist Rebels', 'SPY': 'State Intelligence', 'UAF': 'Unaligned Armed Forces', 'AGR': 'Agriculture', 'BUS': 'Business', 'CRM': 'Criminal', 'CVL': 'Civilian', 'DEV': 'Development', 'EDU': 'Education', 'ELI': 'Elites', 'ENV': 'Environmental', 'HLH': 'Health', 'HRI': 'Human Rights', 'LAB': 'Labor', 'LEG': 'Legislature', 'MED': 'Media', 'REF': 'Refugees', 'MOD': 'Moderate', 'RAD': 'Radical', 'AMN': 'Amnesty International', 'IRC': 'Red Cross', 'GRP': 'Greenpeace', 'UNO': 'United Nations', 'PKO': 'Peacekeepers', 'UIS': 'Unidentified State Actor', 'IGO': 'Inter-Governmental Organization', 'IMG': 'International Militarized Group', 'INT': 'International/Transnational Generic', 'MNC': 'Multinational Corporation', 'NGM': 'Non-Governmental Movement', 'NGO': 'Non-Governmental Organization', 'SET': 'Settler'}


In [10]:
# Turn colnames into list for ref

cross_ref = list(colnames)
#cross_ref

In [11]:
# Create look up table to get values instead of numbers

look_up_code = {"eventCodes": [26,27,28], "goldScale":[30], "ethnicCodes":[9,19], "knownGroups":[8,18], 
                "religionCodes":[10,11,20,21] , "typeCodes":[12,13,14,22,23,24]}

In [12]:
'''
Helper function to user can reorient data based on interest from codes

data: SF_Data - pandas dataframe
ref: key value from look_look_code - string
codebook: reference 
'''

import math

def search_dict(data,ref, codebook):
    res = {}
    look_up = look_up_code[ref]
    col_names = []
    for i in look_up: 
        col_names.append(cross_ref[i])
    
    #print(col_names)
    for col in col_names: 
        for row in data.iterrows():
            if isinstance(row[1][col],float):
                #print (type(row[1][col]), col)
                pass
            else: 
                #print (col)
                var = codebook[row[1][col]].upper()
                #print (var, row[1][col])
                if var in res.keys(): 
                    #print(row[1][col])
                    res[var].append(dict(row[1]))
                else: 
                    res[var] = [dict(row[1])]
    return res
    


In [13]:
res = search_dict(SF_Data, "typeCodes", typeCodes)
#res.keys()

In [14]:
#verfication to ensure code is working properly
for k,v in res.items(): 
    print (k, ": ", len(v))

MEDIA :  428
BUSINESS :  1015
GOVERNMENT :  2333
INTER-GOVERNMENTAL ORGANIZATION :  139
CIVILIAN :  860
EDUCATION :  555
POLICE FORCES :  1150
CRIMINAL :  171
JUDICIARY :  591
MILITARY :  352
LABOR :  212
AGRICULTURE :  68
ELITES :  117
MULTINATIONAL CORPORATION :  101
LEGISLATURE :  227
POLITICAL OPPOSITION :  125
UNIDENTIFIED STATE ACTOR :  9
STATE INTELLIGENCE :  61
HUMAN RIGHTS :  26
RADICAL :  2
HEALTH :  370
NON-GOVERNMENTAL ORGANIZATION :  56
UNALIGNED ARMED FORCES :  35
REBELS :  21
REFUGEES :  22
ENVIRONMENTAL :  6
SEPARATIST REBELS :  14
SETTLER :  11
INTERNATIONAL/TRANSNATIONAL GENERIC :  5
DEVELOPMENT :  16


In [15]:
#Put each collection of articles in a Dataframe
list_res = []

for cat in res.values(): 
    #print(cat)
    list_res.append(pd.DataFrame(cat))
    

In [16]:
list_res[2] #access the type interested in by changing the variables
# interested in Government actions/types
# this index may change based on the day and number of articles published for TypeCode

Unnamed: 0,SQLDATE,MonthYear,Year,FractionDate,Actor1Code,Actor1Name,Actor1CountryCode,Actor1KnownGroupCode,Actor1EthnicCode,Actor1Religion1Code,...,Actor2Geo_FeatureID,ActionGeo_Type,ActionGeo_FullName,ActionGeo_CountryCode,ActionGeo_ADM1Code,ActionGeo_Lat,ActionGeo_Long,ActionGeo_FeatureID,DATEADDED,SOURCEURL
0,20201004,202010,2020,2020.7507,GOV,GOVERNMENT,,,,,...,,4,"Durban, KwaZulu-Natal, South Africa",SF,SF02,-29.85,31.0167,-1224926,20201004,https://www.enca.com/news/emirates-cancels-fli...
1,20201004,202010,2020,2020.7507,GOV,GOVERNMENT,,,,,...,-1224926,4,"Durban, KwaZulu-Natal, South Africa",SF,SF02,-29.85,31.0167,-1224926,20201004,https://www.enca.com/news/emirates-cancels-fli...
2,20201004,202010,2020,2020.7507,KENGOV,KENYA,KEN,,,,...,,1,South Africa,SF,SF,-30,26,SF,20201004,https://nation.africa/kenya/news/politics/uhur...
3,20201004,202010,2020,2020.7507,GOVEDU,EDUCATION DEPARTMENT,,,,,...,,1,South Africa,SF,SF,-30,26,SF,20201004,https://theconversation.com/what-south-africas...
4,20201004,202010,2020,2020.7507,GOV,GOVERNMENT,,,,,...,SF,1,South Africa,SF,SF,-30,26,SF,20201004,https://mybroadband.co.za/news/government/3700...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2328,20200928,202009,2020,2020.7342,AFR,AFRICA,AFR,,,,...,-1224926,4,"Durban, KwaZulu-Natal, South Africa",SF,SF02,-29.85,31.0167,-1224926,20200928,https://www.dailysun.co.za/News/cele-visits-ar...
2329,20200928,202009,2020,2020.7342,,,,,,,...,-1250176,4,"Kwandengezi, KwaZulu-Natal, South Africa",SF,SF02,-29.8375,30.7778,-1250176,20200928,https://ewn.co.za/2020/09/28/4-people-killed-i...
2330,20200927,202009,2020,2020.7315,AFR,AFRICA,AFR,,,,...,SF,1,South Africa,SF,SF,-30,26,SF,20200927,https://citizen.co.za/news/south-africa/politi...
2331,20200927,202009,2020,2020.7315,COP,DEPUTY,,,,,...,SF,1,South Africa,SF,SF,-30,26,SF,20200927,https://citizen.co.za/news/south-africa/politi...


### Homework 4: Do some type of analysis with GDELT data. It can be country focused (e.g. Guatemala) or topic focused (e.g. attacks or bilateral agreements)

### Must write in the first cell what you are interested in. Code must work but results can be garabage. Update the GDELT parameters to get the information you want and then include some type of plot can be a graph or can be a map.  

### Total Points Possible 19
      

In [17]:
Gov_data = pd.DataFrame(list_res[2]) #since list_res[0] is hardcoded, this also needs to change based on the day

#particularly interested in Chinese influence in South Africa, so looking to see China as ActionGeo

China_dict = {} #make an empty datastructure (dictionary) to fill

#This loop goes through each row and counts the number of entries by country of action
for index, row in Gov_data.iterrows():
    if row["ActionGeo_CountryCode"] in China_dict.keys():
        China_dict[row["ActionGeo_CountryCode"]] += 1 
    else:
        China_dict[row["ActionGeo_CountryCode"]] = 1 

China_dict  #This shows how many events took place in a country


{'SF': 2144,
 'GM': 9,
 'ZA': 4,
 'UK': 8,
 'ZI': 83,
 'CH': 8,
 'CE': 5,
 'GR': 2,
 'RS': 5,
 'UP': 2,
 'RW': 2,
 'US': 8,
 'WA': 2,
 'CU': 3,
 'LT': 1,
 'CF': 2,
 'MZ': 4,
 'ET': 3,
 'BC': 4,
 'MT': 2,
 'SA': 2,
 'NI': 1,
 'SW': 5,
 'IR': 6,
 'IN': 5,
 'AE': 1,
 'QA': 4,
 'AF': 2,
 'IS': 1,
 'MP': 1,
 'GH': 2,
 'IV': 1,
 'OD': 1}

In [18]:
from bokeh.plotting import figure, output_notebook, show #builds interactive graphs for python
from bokeh.models import Range1d
import math #this is used in graphic section to use the irrational number pi
output_notebook() #Allows inline plotting for Juptyer notebook

In [19]:
# code adapted from Session 3 HW

countries = list(China_dict.keys())
instances = list(China_dict.values())

In [20]:
#Sort from smallest to largest
sorted_countries = sorted(countries, key=lambda x: instances[countries.index(x)])
sorted_instances = sorted(instances)


In [21]:
# Uses the bokeh library to plot an interactive graph ---this is very basic view of its capability
# makes the figure
p = figure(x_range = sorted_countries, plot_width=800, plot_height=1200)
#Plots the data
p.vbar(x=sorted_countries, width=0.5, bottom=0,
       top= sorted_instances, color="firebrick")

p.xaxis.major_label_orientation = math.pi/2  #Rotates the labels on the X axis

In [22]:
show(p)

In [23]:
China_dict = {k : v for k,v in China_dict.items() if v > 2}

China_dict

# removed those items from dictionary where there were only one or two instances
# to try to make a little bit better graphic

#code from StackOverflow article

{'SF': 2144,
 'GM': 9,
 'ZA': 4,
 'UK': 8,
 'ZI': 83,
 'CH': 8,
 'CE': 5,
 'RS': 5,
 'US': 8,
 'CU': 3,
 'MZ': 4,
 'ET': 3,
 'BC': 4,
 'SW': 5,
 'IR': 6,
 'IN': 5,
 'QA': 4}

In [24]:
countries = list(China_dict.keys())
instances = list(China_dict.values())

In [25]:
#Sort from smallest to largest
sorted_countries = sorted(countries, key=lambda x: instances[countries.index(x)])
sorted_instances = sorted(instances)

In [26]:

p = figure(x_range = sorted_countries, plot_width=800, plot_height=1200)
#Plots the data
p.vbar(x=sorted_countries, width=0.5, bottom=0,
       top= sorted_instances, color="firebrick")

p.xaxis.major_label_orientation = math.pi/2  #Rotates the labels on the X axis

In [27]:
show(p)

In [28]:
# this doesn't help at all, as South Africa dominates the number of articles
# Did not see China as much as I wanted to at all

In [29]:
# since that didn't work out, I tried to find the average tone of the articles

AvgTone_dict = {} #make an empty datastructure (dictionary) to fill

#This loop goes through each row and counts the number of entries by event type
for index, row in Gov_data.iterrows():
    if row["AvgTone"] in AvgTone_dict.keys():
        AvgTone_dict[row["AvgTone"]] += 1 
    else:
        AvgTone_dict[row["AvgTone"]] = 1 

#AvgTone_dict

In [30]:
AvgTone_dict = {float(k) : int(v) for k,v in AvgTone_dict.items()}

In [31]:
Tonelist = []
for key in AvgTone_dict:
    Tonelist.append(key)

In [32]:
ToneValue = []
for key in AvgTone_dict.keys():
    ToneValue.append(AvgTone_dict[key])

In [33]:
resultlist = []
for i in range(0, len(Tonelist)):
    resultlist.append(Tonelist[i] * ToneValue[i])
#resultlist
# since some values appeared more than one time, to get an accurate average I needed to 
# multiply the key by the value

In [34]:
total = sum(resultlist)
avg = total/len(Tonelist)
print(avg) # the average tone of the articles is negative, how surprising

-6.093224013376986
