# 6.1 Sourcing Open Data

## GDELT Conflict Dataset 1.0 (2021)

#### -Errol Hinkamp

##### Table of Contents

1. Import libraries
2. Import data
3. Clean data
4. Basic descriptive analysis

# 1. Import libraries

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

# 2. Import data

In [2]:
# Import dataframe
# The project folder defaults to the original author's location. Set the
# GDELT_PROJECT_DIR environment variable to point at a different checkout so
# the notebook can be re-run on another machine without editing this cell.
path = os.environ.get('GDELT_PROJECT_DIR', r'C:\Users\Errol\Documents\Data Analyst Work\Achievement 6')
data = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'gdelt_conflict_1_0.csv'))

In [3]:
# Show every column when a dataframe is rendered (None removes the cap)
pd.set_option('display.max_columns', None)

In [5]:
# Basic dataframe info: row count, column names, non-null counts, dtypes and memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 176432 entries, 0 to 176431
Data columns (total 14 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Year                  176432 non-null  int64  
 1   CountryCode           176432 non-null  object 
 2   CountryName           176432 non-null  object 
 3   SumEvents             176432 non-null  int64  
 4   TotalEvents           176432 non-null  int64  
 5   NormalizedEvents1000  176432 non-null  float64
 6   EventRootCode         176432 non-null  int64  
 7   EventRootDescr        176432 non-null  object 
 8   EventCode             176432 non-null  int64  
 9   EventDescr            176432 non-null  object 
 10  GoldsteinScale        176432 non-null  float64
 11  AvgNumMentions        176432 non-null  float64
 12  SumNumMentions        176432 non-null  int64  
 13  AvgAvgTone            176432 non-null  float64
dtypes: float64(4), int64(6), object(4)
memory usage: 18.8+ MB

In [6]:
# Quick visual check: preview the first 20 rows to eyeball the column contents
data.head(20)

Unnamed: 0,Year,CountryCode,CountryName,SumEvents,TotalEvents,NormalizedEvents1000,EventRootCode,EventRootDescr,EventCode,EventDescr,GoldsteinScale,AvgNumMentions,SumNumMentions,AvgAvgTone
0,1979,CH,China,350,33541,10.43499,19,FIGHT,193,Fight with small arms and light weapons,-10.0,3.988571,1396,1.906
1,1979,HR,Croatia,14,714,19.607843,19,FIGHT,190,"Use conventional military force, not specified...",-10.0,2.642857,37,4.335594
2,1979,SU,Sudan,16,1354,11.816839,17,COERCE,173,"Arrest, detain, or charge with legal action",-5.0,6.0,96,3.358047
3,1979,SY,Syria,71,3119,22.763706,19,FIGHT,193,Fight with small arms and light weapons,-10.0,3.647887,259,3.557653
4,1979,GM,Germany,33,6615,4.988662,17,COERCE,172,"Impose administrative sanctions, not specified...",-5.0,4.484848,148,4.841399
5,1979,UK,United Kingdom,67,15721,4.261815,18,ASSAULT,180,"Use unconventional violence, not specified below",-9.0,4.313433,289,4.057995
6,1979,UK,United Kingdom,7,15721,0.445264,17,COERCE,1711,Confiscate property,-9.2,3.857143,27,3.936435
7,1979,IS,Israel,6,17311,0.3466,19,FIGHT,191,"Impose blockade, restrict movement",-9.5,5.666667,34,3.751923
8,1979,AO,Angola,21,1530,13.72549,19,FIGHT,192,Occupy territory,-9.5,5.52381,116,2.859268
9,1979,RS,Russia,8,27338,0.292633,17,COERCE,1711,Confiscate property,-9.2,3.375,27,4.230203


# 3. Clean data

In [7]:
# Check for mixed data types
# A column is "mixed" when its values are stored as more than one Python type
# (e.g. str alongside float). Prints the name of every such column; no output
# means every column is homogeneous.
for col in data.columns:
    # Series.map(type) replaces DataFrame.applymap, which is deprecated since
    # pandas 2.1. Counting distinct types also avoids indexing the first row,
    # so an empty dataframe no longer raises.
    if data[col].map(type).nunique() > 1:
        print(col)

##### No mixed-type columns found

In [8]:
# Count the missing (NaN/None) entries in each column
data.isna().sum()

Year                    0
CountryCode             0
CountryName             0
SumEvents               0
TotalEvents             0
NormalizedEvents1000    0
EventRootCode           0
EventRootDescr          0
EventCode               0
EventDescr              0
GoldsteinScale          0
AvgNumMentions          0
SumNumMentions          0
AvgAvgTone              0
dtype: int64

##### No missing values

In [10]:
# Collect rows that exactly repeat an earlier row (first occurrences are not flagged)
is_repeat = data.duplicated()
data_dups = data.loc[is_repeat]
data_dups

Unnamed: 0,Year,CountryCode,CountryName,SumEvents,TotalEvents,NormalizedEvents1000,EventRootCode,EventRootDescr,EventCode,EventDescr,GoldsteinScale,AvgNumMentions,SumNumMentions,AvgAvgTone


##### No duplicates

# 4. Basic descriptive analysis

In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
data.describe()

Unnamed: 0,Year,SumEvents,TotalEvents,NormalizedEvents1000,EventRootCode,EventCode,GoldsteinScale,AvgNumMentions,SumNumMentions,AvgAvgTone
count,176432.0,176432.0,176432.0,176432.0,176432.0,176432.0,176432.0,176432.0,176432.0,176432.0
mean,2004.12207,475.137237,120847.0,6.220068,17.944591,621.729533,-8.385991,6.857883,5985.704,1.917248
std,11.607784,8511.682202,827066.1,21.52162,0.903649,706.792802,1.961109,7.33911,141333.6,3.948198
min,1979.0,1.0,1.0,4.2e-05,17.0,170.0,-10.0,1.0,1.0,-21.052632
25%,1996.0,3.0,4481.0,0.336431,17.0,180.0,-10.0,3.939394,16.0,1.01847
50%,2006.0,13.0,16483.0,1.076684,18.0,191.0,-9.2,5.25,71.0,3.389922
75%,2014.0,72.0,56997.0,4.092455,19.0,1711.0,-7.0,7.996457,480.0,4.447779
max,2021.0,893895.0,23770350.0,1000.0,20.0,1833.0,-5.0,512.25,16834210.0,20.0


In [21]:
# Get medians of numerical columns
# numeric_only=True is required because the dataframe also holds text columns
# (CountryCode, CountryName, EventRootDescr, EventDescr); without it,
# pandas 2.0+ raises a TypeError instead of silently skipping them.
data.median(numeric_only=True)

Year                     2006.000000
SumEvents                  13.000000
TotalEvents             16483.000000
NormalizedEvents1000        1.076684
EventRootCode              18.000000
EventCode                 191.000000
GoldsteinScale             -9.200000
AvgNumMentions              5.250000
SumNumMentions             71.000000
AvgAvgTone                  3.389922
dtype: float64

In [22]:
# Get modes (most frequent value) for all columns, numeric and text alike.
# NOTE: each column's mode is computed independently, so the values in a result
# row need not come from the same record.
data.mode()

Unnamed: 0,Year,CountryCode,CountryName,SumEvents,TotalEvents,NormalizedEvents1000,EventRootCode,EventRootDescr,EventCode,EventDescr,GoldsteinScale,AvgNumMentions,SumNumMentions,AvgAvgTone
0,2016,BM,United States,1,21821,20.833333,17,COERCE,190,"Use conventional military force, not specified...",-10.0,2.0,9,0.0
