In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import bokeh
from collections import Counter

In [2]:
# all bokeh imports

from bokeh.io import output_notebook
output_notebook()
from bokeh.io import show, output_file
from bokeh.models import ColumnDataSource
from bokeh.palettes import Spectral6
from bokeh.plotting import figure
from bokeh.transform import factor_cmap
from bokeh.models import FactorRange
from bokeh.palettes import Category20b
from bokeh.palettes import viridis, Set2
from bokeh.models import ColumnDataSource
from bokeh.layouts import row, gridplot, column
from bokeh.models import Range1d
from bokeh.palettes import Category20
from math import pi
from bokeh.io import output_file, show
from bokeh.palettes import Category20c
from bokeh.plotting import figure
from bokeh.transform import cumsum


Let's try to read our data; the encoding type was found on the web.

Detailed info about meaning of the data was obtained here: https://www.start.umd.edu/gtd/downloads/Codebook.pdf

In [3]:
# Load the raw Global Terrorism Database export.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids the mixed-type DtypeWarning on load.
data_orig = pd.read_csv('../globalterrorismdb_0718dist.csv', encoding='ISO-8859-1',
                        low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


Now I'd use df.head() to get an initial idea of the data, though df.tail() should work equally fine

In [4]:
# Peek at both ends of the raw table. Only the last expression of a cell is
# rendered, so these two previews are evaluated but not displayed.
data_orig.head()
data_orig.tail()

# Columns removed from the working copy; 'provstate' and 'specificity' are
# deliberately kept.
delete = [
    'eventid', 'approxdate', 'extended', 'country', 'region',
    'latitude', 'longitude',
    'vicinity',
]
data = data_orig.drop(columns=delete)
data.tail()

Unnamed: 0,iyear,imonth,iday,resolution,country_txt,region_txt,provstate,city,specificity,location,...,addnotes,scite1,scite2,scite3,dbsource,INT_LOG,INT_IDEO,INT_MISC,INT_ANY,related
181686,2017,12,31,,Somalia,Sub-Saharan Africa,Middle Shebelle,Ceelka Geelow,2.0,The incident occurred near the town of Balcad.,...,,"""Somalia: Al-Shabaab Militants Attack Army Che...","""Highlights: Somalia Daily Media Highlights 2 ...","""Highlights: Somalia Daily Media Highlights 1 ...",START Primary Collection,0,0,0,0,
181687,2017,12,31,,Syria,Middle East & North Africa,Lattakia,Jableh,1.0,The incident occurred at the Humaymim Airport.,...,,"""Putin's 'victory' in Syria has turned into a ...","""Two Russian soldiers killed at Hmeymim base i...","""Two Russian servicemen killed in Syria mortar...",START Primary Collection,-9,-9,1,1,
181688,2017,12,31,,Philippines,Southeast Asia,Maguindanao,Kubentog,2.0,The incident occurred in the Datu Hoffer distr...,...,,"""Maguindanao clashes trap tribe members,"" Phil...",,,START Primary Collection,0,0,0,0,
181689,2017,12,31,,India,South Asia,Manipur,Imphal,1.0,The incident occurred in the Mantripukhri neig...,...,,"""Trader escapes grenade attack in Imphal,"" Bus...",,,START Primary Collection,-9,-9,0,-9,
181690,2017,12,31,,Philippines,Southeast Asia,Maguindanao,Cotabato City,1.0,,...,,"""Security tightened in Cotabato following IED ...","""Security tightened in Cotabato City,"" Manila ...",,START Primary Collection,-9,-9,0,-9,


Some first thoughts:
* Quite a lot of missing values that we will have to handle later
* There are too many variables to fit the screen 
* And those that fit don't always have very descriptive names :(

`data.info()` will reveal all variables with their types and non-null counts, and
`data.describe()` would give us some basic statistics.

In next two cells we can examine the type of data and missing values.


In [5]:
# Full per-column listing: dtype, non-null count and deep memory usage.
# NOTE(review): `null_counts` was renamed to `show_counts` in newer pandas
# releases — confirm the installed pandas version before re-running.
data.info(verbose=True, null_counts=True, memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181691 entries, 0 to 181690
Data columns (total 127 columns):
iyear                 181691 non-null int64
imonth                181691 non-null int64
iday                  181691 non-null int64
resolution            2220 non-null object
country_txt           181691 non-null object
region_txt            181691 non-null object
provstate             181270 non-null object
city                  181257 non-null object
specificity           181685 non-null float64
location              55495 non-null object
summary               115562 non-null object
crit1                 181691 non-null int64
crit2                 181691 non-null int64
crit3                 181691 non-null int64
doubtterr             181690 non-null float64
alternative           29011 non-null float64
alternative_txt       29011 non-null object
multiple              181690 non-null float64
success               181691 non-null int64
suicide               181691 non-null int

In [6]:
# Number of missing (NaN) entries in every column.
data.isna().sum()

iyear                      0
imonth                     0
iday                       0
resolution            179471
country_txt                0
region_txt                 0
provstate                421
city                     434
specificity                6
location              126196
summary                66129
crit1                      0
crit2                      0
crit3                      0
doubtterr                  1
alternative           152680
alternative_txt       152680
multiple                   1
success                    0
suicide                    0
attacktype1                0
attacktype1_txt            0
attacktype2           175377
attacktype2_txt       175377
attacktype3           181263
attacktype3_txt       181263
targtype1                  0
targtype1_txt              0
targsubtype1           10373
targsubtype1_txt       10373
                       ...  
propextent            117626
propextent_txt        117626
propvalue             142702
propcomment   

In [7]:
# Last 15 rows of the 'doubtterr' flag next to its follow-up 'alternative' column.
flag_columns = ['doubtterr', 'alternative']
data[flag_columns].tail(15)

Unnamed: 0,doubtterr,alternative
181676,0.0,
181677,0.0,
181678,0.0,
181679,1.0,1.0
181680,1.0,1.0
181681,0.0,
181682,0.0,
181683,0.0,
181684,0.0,
181685,0.0,


There is no missing data in dates or general location of the event. We can spot quite a few null values throughout the data, but examining the columns 'doubtterr' and 'alternative' we can see that sometimes NaNs exist because a "flag" of some related category is set to zero. Here doubtterr is non-zero if there is some doubt as to whether the event can be properly classified as an act of terror. If there is no such doubt, the alternative column will be NaN (Reference: p. 16 of the Codebook). Therefore not all NaNs are real missing values. Data scientists usually hate NaNs, because they have to design nifty ways of dealing with them, but here, for example, the many NaNs in the *resolution* column mean that most attacks didn't last more than a day, so for now I won't bother dealing with them.

To add to that, some data wasn't collected before 1997 and that can explain some missing values in older events

To show some things in a more appealing way than rows of numbers, I'll try to present them graphically. Matplotlib, especially with Seaborn, is the usual way to go, but to experiment with something new I will try to sneak in some __bokeh__ interactive plots.


In [8]:




# Count attacks per year. A Counter keeps keys in first-seen order, so the
# years are sorted explicitly to keep the categorical x-axis chronological
# even if the rows of `data` are not ordered by date.
counter = Counter(data['iyear'])
sorted_years = sorted(counter)
years = [str(year) for year in sorted_years]
attack_counts = [counter[year] for year in sorted_years]

p = figure(x_range=years, plot_height=350, title="Attacks by year",
           toolbar_location='right')

p.vbar(x=years, top=attack_counts, width=0.8)

p.xgrid.grid_line_color = None
p.xaxis.major_label_orientation = 1  # label rotation, in radians
p.xaxis.axis_label = "Year"
p.yaxis.axis_label = "Number of attacks"
p.y_range.start = 0

show(p)

That doesn't look very optimistic: after a fairly calm turn of the century, starting from 2005 we can observe a surge in attacks. I will examine the numbers of fatalities and the types of attacks later; for now let's break the attacks down by region.

In [9]:
# Number of attacks recorded for each region over the whole data set.
counter = Counter(data['region_txt'])
countries = [str(region) for region in counter]  # region names used as x-axis categories
bar_heights = list(counter.values())

p = figure(
    title="All attacks per region",
    x_range=countries,
    plot_height=350,
    toolbar_location='right',
)
p.vbar(x=countries, top=bar_heights, width=0.8)

# Cosmetics: bars start at zero, labels are rotated, no vertical grid lines.
p.y_range.start = 0
p.xaxis.major_label_orientation = 1
p.xgrid.grid_line_color = None

show(p)

In [10]:


def attacks_by_year_region(region, y_max=5000):
    """Build a bar chart of the yearly number of attacks in one region.

    Reads the notebook-level ``data`` frame.

    Parameters
    ----------
    region : str
        Value of ``region_txt`` selecting the rows to count.
    y_max : int, float or None, optional
        Upper limit of the y-axis. The default of 5000 gives every region the
        same scale so the charts can be compared side by side. Pass ``None``
        to let bokeh scale each chart to its own data.

    Returns
    -------
    bokeh figure
        The configured figure; it is not shown by this function.
    """
    temp = data.loc[data['region_txt'] == region]
    counter = Counter(temp['iyear'])

    # Sort the years so the categorical axis is chronological regardless of
    # the row order in ``data``.
    sorted_years = sorted(counter)
    years = [str(year) for year in sorted_years]
    counts = [counter[year] for year in sorted_years]
    source = ColumnDataSource(data=dict(years=years, counts=counts, color=viridis(len(years))))

    p = figure(x_range=years, plot_width=460, plot_height=350, title="Attacks in {} by year".format(region),
               toolbar_location='right')

    p.vbar(x='years', top='counts', width=0.8, color='color', source=source)

    p.xgrid.grid_line_color = None
    p.xaxis.major_label_orientation = 1
    p.y_range.start = 0
    if y_max is not None:
        # Fixed common scale; skipping this gives each chart its own scale.
        bottom, top = 0, y_max
        p.y_range = Range1d(bottom, top)

    return p

# Sort the region names: iterating a bare set of strings yields an arbitrary
# order, which would shuffle the grid layout between kernel sessions.
regions = sorted(set(data['region_txt']))

# One chart per region, laid out two per row.
ppp = [attacks_by_year_region(r) for r in regions]

grid = gridplot(ppp, ncols=2)
show(grid)


~~The output above shows probably one of the gravest mistakes statisticians make at the beginning of their careers, which is also a trick politicians use to prove their point: a different scale on each vertical axis. This way South Asia and Southeast Asia seem to be equally affected by terrorism, until we notice that the top value of the former is 5000 while that of the latter is 1200.~~

(The real reason I made this remark is that I didn't feel like googling how to fix the graphs, and now I have. Commenting out the line in the function above that fixes the y-range (`p.y_range=Range1d(bottom, top)`) shows how easy it is to change our perception of the world situation by manipulating the presentation of the data just a little bit. The world seems a little bit less scary with proper axes.)

In [11]:

def attacks_by_country_region(region):
    """Build a bar chart of the number of attacks per country in one region.

    Reads the notebook-level ``data`` frame.

    Parameters
    ----------
    region : str
        Value of ``region_txt`` selecting the rows to count.

    Returns
    -------
    bokeh figure
        The configured figure; it is not shown by this function.
    """
    temp = data.loc[data['region_txt'] == region]

    counter = Counter(temp['country_txt'])
    countries = list(map(str, counter.keys()))

    # Category20 only provides palettes for 3..20 colours. Outside that range
    # the 20-colour palette is cycled so that the colour column always has
    # exactly one entry per country; a length mismatch between the columns of
    # a ColumnDataSource is not valid.
    n_countries = len(countries)
    if 3 <= n_countries <= 20:
        color = list(Category20[n_countries])
    else:
        base_palette = Category20[20]
        color = [base_palette[i % len(base_palette)] for i in range(n_countries)]

    source = ColumnDataSource(data=dict(countries=countries, counts=list(counter.values()), color=color))

    p = figure(x_range=countries, plot_width=800, plot_height=450,
               title="Attacks in {} by country, years 1970-2017".format(region),
               toolbar_location='right')

    p.vbar(x='countries', top='counts', width=0.8, color='color', source=source)

    p.xgrid.grid_line_color = None
    p.xaxis.major_label_orientation = 1
    p.y_range.start = 0

    return p

# Per-country charts for the two European regions, stacked vertically.
p1, p2 = (
    attacks_by_country_region(region_name)
    for region_name in ('Eastern Europe', 'Western Europe')
)

show(column(p1, p2))




This plot can seem confusing, but the Codebook states that attacks are labelled according to the countries and borders that existed on the date of the event. That explains why we see East Germany or Yugoslavia here.



In [12]:


# Share of each primary attack type over the whole data set, drawn as a pie chart.
x = Counter(data['attacktype1_txt'])

data_plot = (
    pd.Series(x)
    .reset_index(name='value')
    .rename(columns={'index': 'Attack'})
)
share = data_plot['value'] / data_plot['value'].sum()
data_plot['angle'] = share * 2 * pi          # wedge size in radians
data_plot['color'] = Category20c[len(x)]     # one palette colour per attack type
data_plot['percent'] = (share * 100).round(2)

p = figure(
    title="Types of attacks in years 1970-2017",
    plot_height=600,
    plot_width=600,
    toolbar_location=None,
    tools="hover",
    tooltips="@Attack: @percent %",
    x_range=(-0.50, 1.0),
)

# Each wedge starts where the previous one ended (cumulative sum of angles).
p.wedge(
    x=0, y=1, radius=0.4,
    start_angle=cumsum('angle', include_zero=True),
    end_angle=cumsum('angle'),
    line_color="white", fill_color='color', legend='Attack',
    source=data_plot,
)

# A pie chart needs neither axes nor grid lines.
p.axis.axis_label = None
p.axis.visible = False
p.grid.grid_line_color = None

show(p)

Over the span of 47 years almost 50% of the terrorist attacks were bombings and almost a quarter were armed assaults.

Below I will try to break down those numbers to see if there is any change in the means used by attackers.
I will compare the span of 20 years: 1998-2017, mainly because that's the period we can fairly remember


In [13]:
def pie_attacks_by_years(start, end):
    """Build a pie chart of primary attack types for an inclusive year range.

    Reads the notebook-level ``data`` frame.

    Parameters
    ----------
    start, end : int
        First and last year (both inclusive) compared against ``iyear``.

    Returns
    -------
    bokeh figure
        The configured figure; it is not shown by this function.
    """
    x = Counter(data.loc[(data['iyear']<=end) & (data['iyear']>=start)]['attacktype1_txt'])
    data_plot = pd.Series(x).reset_index(name='value').rename(columns={'index':'Attack'})
    data_plot['angle'] = data_plot['value']/data_plot['value'].sum() * 2*pi

    # Category20c only provides palettes for 3..20 colours; indexing it with
    # any other category count raises KeyError. Clamp the lookup and take
    # exactly one colour per category (cycling if there are more than 20).
    n_categories = len(x)
    palette = Category20c[min(max(n_categories, 3), 20)]
    data_plot['color'] = [palette[i % len(palette)] for i in range(n_categories)]
    data_plot['percent'] = round(data_plot['value']/data_plot['value'].sum()*100,2)

    p = figure(plot_height=500, plot_width=600, title="Types of attacks in years {}-{}".format(start, end), toolbar_location=None,
               tools="hover", tooltips="@Attack: @percent %", x_range=(-0.50, 1.0))

    # Each wedge starts where the previous one ended (cumulative sum of angles).
    p.wedge(x=0, y=1, radius=0.4,
            start_angle=cumsum('angle', include_zero=True), end_angle=cumsum('angle'),
            line_color="white", fill_color='color', legend='Attack', source=data_plot)

    p.axis.axis_label=None
    p.axis.visible=False
    p.grid.grid_line_color = None
    return p

# Two consecutive decades; the earlier one is placed on top.
p1, p2 = (pie_attacks_by_years(first, last) for first, last in ((2008, 2017), (1998, 2007)))

decade_pies = [p2, p1]
show(column(decade_pies))

The percentage of bombings is slightly smaller when we analyze the whole data set than in the 10-year periods; however, it remained almost the same in 1998-2007 and 2008-2017.
Similarly, armed assault holds 2nd place in all three figures.

(Hovering the plots should show percentages for each category. Unfortunately, category colors change between the plots which makes it hard for visual comparison)  

In [14]:


def pie_attacks_by_target_years(start, end):
    """Build a pie chart of the ten most frequent target types for a year range.

    Reads the notebook-level ``data`` frame. Percentages are relative to the
    ten target types shown, not to all attacks in the period.

    Parameters
    ----------
    start, end : int
        First and last year (both inclusive) compared against ``iyear``.

    Returns
    -------
    bokeh figure
        The configured figure; it is not shown by this function.
    """
    x = Counter(data.loc[(data['iyear']<=end) & (data['iyear']>=start)]['targtype1_txt'])
    x = dict(x.most_common(10))
    data_plot = pd.Series(x).reset_index(name='value').rename(columns={'index':'Target'})
    data_plot['angle'] = data_plot['value']/data_plot['value'].sum() * 2*pi

    # Category20b only provides palettes for 3..20 colours; indexing it with
    # fewer than 3 categories raises KeyError. Clamp the lookup and take
    # exactly one colour per category.
    n_categories = len(x)
    palette = Category20b[min(max(n_categories, 3), 20)]
    data_plot['color'] = [palette[i % len(palette)] for i in range(n_categories)]
    data_plot['percent'] = round(data_plot['value']/data_plot['value'].sum()*100,2)

    # The chart shows target types, so the title and hover text say so.
    p = figure(plot_height=500, plot_width=600, title="Targets of attacks in years {}-{}".format(start, end), toolbar_location=None,
               tools="hover", tooltips="@Target: @percent %", x_range=(-0.50, 1.0))

    # Each wedge starts where the previous one ended (cumulative sum of angles).
    p.wedge(x=0, y=1, radius=0.4,
            start_angle=cumsum('angle', include_zero=True), end_angle=cumsum('angle'),
            line_color="white", fill_color='color', legend='Target', source=data_plot)

    p.axis.axis_label=None
    p.axis.visible=False
    p.grid.grid_line_color = None
    return p

# Two consecutive decades; the earlier one is placed on top.
p1, p2 = (pie_attacks_by_target_years(first, last) for first, last in ((2008, 2017), (1998, 2007)))

decade_pies = [p2, p1]
show(column(decade_pies))

Interestingly, there appears to be some increase in the assaults on military facilities while those on police remain unchanged. 

One last thing to examine will be number of casualties

In [15]:
# Yearly sum of 'nkill' over all attacks ...
temp = data.groupby('iyear')['nkill'].sum()

# ... and over bombings/explosions only.
is_bombing = data['attacktype1_txt'] == 'Bombing/Explosion'
temp2 = data.loc[is_bombing].groupby('iyear')['nkill'].sum()

In [16]:
# Scatter plot of the yearly 'nkill' totals: all attacks vs. bombings only.
# `temp` and `temp2` are the per-year sums computed in the previous cell.
TOOLS = "pan,wheel_zoom,box_zoom,reset,save,box_select"

# The former `x = np.linspace(1970,1, 2017)` was removed: its arguments were
# in the wrong order (start, stop, num) and the result was never used.
p1 = figure(title="Casualties per year: all attacks vs. bombings", tools=TOOLS)

p1.circle(temp.index,   temp.values, legend="Casualties globally")
p1.circle(temp2.index, temp2.values, legend='Casualties from bombing', color='orange')

p1.xaxis.axis_label = "Year"
p1.yaxis.axis_label = "Casualties (sum of nkill)"

show(p1)

We can see that there was a period in time when the majority of casualties came from attacks other than bombings, but around 2000 this gap began to close, only to resurface again 10 years later.