In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import bokeh
from bokeh.io import output_notebook
output_notebook()

Let's try to read our data, encoding type was found on the web

Detailed info about meaning of the data was obtained here: https://www.start.umd.edu/gtd/downloads/Codebook.pdf

In [24]:
data = pd.read_csv('globalterrorismdb_0718dist.csv', encoding='ISO-8859-1')

  interactivity=interactivity, compiler=compiler, result=result)


Now I'd use df.head() to get an initial idea of the data, though df.tail() should work equally fine

In [25]:
data.head()
data.tail()

delete = ['eventid', 'approxdate', 'extended', 'country', 'region', 
          'latitude', 'longitude', 'provstate', 'specificity',
         'vicinity']
data=data.drop(delete, axis=1)
data.tail()

Unnamed: 0,iyear,imonth,iday,resolution,country_txt,region_txt,city,location,summary,crit1,...,addnotes,scite1,scite2,scite3,dbsource,INT_LOG,INT_IDEO,INT_MISC,INT_ANY,related
181686,2017,12,31,,Somalia,Sub-Saharan Africa,Ceelka Geelow,The incident occurred near the town of Balcad.,12/31/2017: Assailants opened fire on a Somali...,1,...,,"""Somalia: Al-Shabaab Militants Attack Army Che...","""Highlights: Somalia Daily Media Highlights 2 ...","""Highlights: Somalia Daily Media Highlights 1 ...",START Primary Collection,0,0,0,0,
181687,2017,12,31,,Syria,Middle East & North Africa,Jableh,The incident occurred at the Humaymim Airport.,12/31/2017: Assailants launched mortars at the...,1,...,,"""Putin's 'victory' in Syria has turned into a ...","""Two Russian soldiers killed at Hmeymim base i...","""Two Russian servicemen killed in Syria mortar...",START Primary Collection,-9,-9,1,1,
181688,2017,12,31,,Philippines,Southeast Asia,Kubentog,The incident occurred in the Datu Hoffer distr...,12/31/2017: Assailants set fire to houses in K...,1,...,,"""Maguindanao clashes trap tribe members,"" Phil...",,,START Primary Collection,0,0,0,0,
181689,2017,12,31,,India,South Asia,Imphal,The incident occurred in the Mantripukhri neig...,12/31/2017: Assailants threw a grenade at a Fo...,1,...,,"""Trader escapes grenade attack in Imphal,"" Bus...",,,START Primary Collection,-9,-9,0,-9,
181690,2017,12,31,,Philippines,Southeast Asia,Cotabato City,,12/31/2017: An explosive device was discovered...,1,...,,"""Security tightened in Cotabato following IED ...","""Security tightened in Cotabato City,"" Manila ...",,START Primary Collection,-9,-9,0,-9,


Some first thoughts:
* Quite a lot of missing values that we will have to handle later
* There are too many variables to fit the screen 
* And those that fit don't always have very descriptive names :(

data.info() will reveal all variables, 
data.describe() would give us some some basic statistics

In next two cells we can examine the type of data and missing values.


In [22]:
data.info(verbose=True, null_counts=True, memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181691 entries, 0 to 181690
Data columns (total 125 columns):
iyear                 181691 non-null int64
imonth                181691 non-null int64
iday                  181691 non-null int64
resolution            2220 non-null object
country_txt           181691 non-null object
region_txt            181691 non-null object
city                  181257 non-null object
location              55495 non-null object
summary               115562 non-null object
crit1                 181691 non-null int64
crit2                 181691 non-null int64
crit3                 181691 non-null int64
doubtterr             181690 non-null float64
alternative           29011 non-null float64
alternative_txt       29011 non-null object
multiple              181690 non-null float64
success               181691 non-null int64
suicide               181691 non-null int64
attacktype1           181691 non-null int64
attacktype1_txt       181691 non-null object

In [27]:
data.isnull().sum()

iyear                      0
imonth                     0
iday                       0
resolution            179471
country_txt                0
region_txt                 0
city                     434
location              126196
summary                66129
crit1                      0
crit2                      0
crit3                      0
doubtterr                  1
alternative           152680
alternative_txt       152680
multiple                   1
success                    0
suicide                    0
attacktype1                0
attacktype1_txt            0
attacktype2           175377
attacktype2_txt       175377
attacktype3           181263
attacktype3_txt       181263
targtype1                  0
targtype1_txt              0
targsubtype1           10373
targsubtype1_txt       10373
corp1                  42550
target1                  636
                       ...  
propextent            117626
propextent_txt        117626
propvalue             142702
propcomment   

In [32]:
data[['doubtterr', 'alternative']].tail(15)

Unnamed: 0,doubtterr,alternative
181676,0.0,
181677,0.0,
181678,0.0,
181679,1.0,1.0
181680,1.0,1.0
181681,0.0,
181682,0.0,
181683,0.0,
181684,0.0,
181685,0.0,


We can spot quite a few null values throughout the data, but examining columns 'doubtterr' and 'alternative' we can see that sometimes NaNs exist if a "flag" of some related category is set to zero. Here doubterr is non-zero if there exist some doubts as to whether the event can be properly classified as act of terror. If the answer is no, the alternative column will be NaN (Reference: p. 16 of the Codebook). Therefore not all NaNs are real missing values and in my opinion shouldn't be removed up front.
To add to that, some data wasn't collected before 1997.

That's still pretty overwhelming...
Maybe we could try to show sth in more graphical way
Matplotlib is alright, but kinda boring, seaborn looks nicer 
and I'll probably come back to it after first major problems
Until then I'll give __bokeh__ a try

In [7]:
from collections import Counter
 #
from bokeh.io import show, output_file
from bokeh.models import ColumnDataSource
from bokeh.palettes import Spectral6
from bokeh.plotting import figure
from bokeh.transform import factor_cmap
from bokeh.models import FactorRange


counter = Counter(data['iyear'])
years = list(map(str, counter.keys()))

p = figure(x_range=years, plot_height=350, title="Attacks by year",
           toolbar_location='right')

p.vbar(x=years, top=list(counter.values()), width=0.8)

p.xgrid.grid_line_color = None
p.xaxis.major_label_orientation = 1
p.y_range.start = 0

show(p)

In [38]:
counter = Counter(data['region_txt'])
#counter = 
countries = list(map(str, counter.keys()))

p = figure(x_range=countries, plot_height=350, title="Attacks by year",
           toolbar_location='right')

p.vbar(x=countries, top=list(counter.values()), width=0.8)

p.xgrid.grid_line_color = None
p.xaxis.major_label_orientation = 1
p.y_range.start = 0

show(p)

In [71]:
from bokeh.palettes import Category20b
from bokeh.palettes import viridis, Set2
from bokeh.models import ColumnDataSource

temp = data.loc[data['region_txt']=='North America']
counter = Counter(temp['iyear'])
#counter = 
countries = list(map(str, counter.keys()))

source = ColumnDataSource(data=dict(countries=countries, counts=list(counter.values()), color=viridis(47)))

p = figure(x_range=countries, plot_height=350, title="Attacks in North America by year",
           toolbar_location='right')

p.vbar(x='countries', top='counts', width=0.8, color='color', source=source)

p.xgrid.grid_line_color = None
p.xaxis.major_label_orientation = 1
p.y_range.start = 0

show(p)

counter = Counter(temp['country_txt'])
countries = list(map(str, counter.keys()))

source = ColumnDataSource(data=dict(countries=countries, counts=list(counter.values()), color=viridis(3)))

p = figure(x_range=countries, plot_height=350, title="Attacks in North America by country",
           toolbar_location='right')

p.vbar(x='countries', top='counts', width=0.8, color='color', source=source)

p.xgrid.grid_line_color = None
p.xaxis.major_label_orientation = 1
p.y_range.start = 0

show(p)
