In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import bokeh
from bokeh.io import output_notebook
output_notebook()

Let's try to read our data; the encoding type was found on the web.

Detailed info about meaning of the data was obtained here: https://www.start.umd.edu/gtd/downloads/Codebook.pdf

In [3]:
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk; chunked inference is what triggers the mixed-type
# DtypeWarning on wide CSVs like this one.
data = pd.read_csv('../globalterrorismdb_0718dist.csv', encoding='ISO-8859-1',
                   low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


Now I'd use df.head() to get an initial idea of the data, though df.tail() should work equally well.

In [4]:
# Only the value of a cell's last expression is rendered, so this head() call
# on a non-final line displays nothing.
data.head()
data.tail()

# Columns that are not needed for this exploration. 'country' and 'region' are
# dropped while their '*_txt' counterparts are kept; 'provstate' and
# 'specificity' are deliberately left in the frame.
delete = ['eventid', 'approxdate', 'extended', 'country', 'region',
          'latitude', 'longitude', 'vicinity']

# errors='ignore' keeps this cell idempotent: re-running it after the columns
# are already gone would otherwise raise a KeyError.
data = data.drop(delete, axis=1, errors='ignore')
data.tail()

Unnamed: 0,iyear,imonth,iday,resolution,country_txt,region_txt,provstate,city,specificity,location,...,addnotes,scite1,scite2,scite3,dbsource,INT_LOG,INT_IDEO,INT_MISC,INT_ANY,related
181686,2017,12,31,,Somalia,Sub-Saharan Africa,Middle Shebelle,Ceelka Geelow,2.0,The incident occurred near the town of Balcad.,...,,"""Somalia: Al-Shabaab Militants Attack Army Che...","""Highlights: Somalia Daily Media Highlights 2 ...","""Highlights: Somalia Daily Media Highlights 1 ...",START Primary Collection,0,0,0,0,
181687,2017,12,31,,Syria,Middle East & North Africa,Lattakia,Jableh,1.0,The incident occurred at the Humaymim Airport.,...,,"""Putin's 'victory' in Syria has turned into a ...","""Two Russian soldiers killed at Hmeymim base i...","""Two Russian servicemen killed in Syria mortar...",START Primary Collection,-9,-9,1,1,
181688,2017,12,31,,Philippines,Southeast Asia,Maguindanao,Kubentog,2.0,The incident occurred in the Datu Hoffer distr...,...,,"""Maguindanao clashes trap tribe members,"" Phil...",,,START Primary Collection,0,0,0,0,
181689,2017,12,31,,India,South Asia,Manipur,Imphal,1.0,The incident occurred in the Mantripukhri neig...,...,,"""Trader escapes grenade attack in Imphal,"" Bus...",,,START Primary Collection,-9,-9,0,-9,
181690,2017,12,31,,Philippines,Southeast Asia,Maguindanao,Cotabato City,1.0,,...,,"""Security tightened in Cotabato following IED ...","""Security tightened in Cotabato City,"" Manila ...",,START Primary Collection,-9,-9,0,-9,


Some first thoughts:
* Quite a lot of missing values that we will have to handle later
* There are too many variables to fit the screen 
* And those that fit don't always have very descriptive names :(

data.info() will reveal all variables, 
data.describe() would give us some basic statistics

In the next two cells we can examine the types of the data and the missing values.


In [6]:
# Print every column with its dtype, non-null count and deep memory usage.
# The `null_counts` keyword was renamed to `show_counts` and later removed from
# pandas, so try the current name first and fall back to the old one on pandas
# versions that do not know `show_counts` (they raise TypeError before printing).
try:
    data.info(verbose=True, show_counts=True, memory_usage='deep')
except TypeError:
    data.info(verbose=True, null_counts=True, memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181691 entries, 0 to 181690
Data columns (total 127 columns):
iyear                 181691 non-null int64
imonth                181691 non-null int64
iday                  181691 non-null int64
resolution            2220 non-null object
country_txt           181691 non-null object
region_txt            181691 non-null object
provstate             181270 non-null object
city                  181257 non-null object
specificity           181685 non-null float64
location              55495 non-null object
summary               115562 non-null object
crit1                 181691 non-null int64
crit2                 181691 non-null int64
crit3                 181691 non-null int64
doubtterr             181690 non-null float64
alternative           29011 non-null float64
alternative_txt       29011 non-null object
multiple              181690 non-null float64
success               181691 non-null int64
suicide               181691 non-null int

In [7]:
# Number of missing entries in each column (summing the null mask down the rows).
data.isnull().sum(axis=0)

iyear                      0
imonth                     0
iday                       0
resolution            179471
country_txt                0
region_txt                 0
provstate                421
city                     434
specificity                6
location              126196
summary                66129
crit1                      0
crit2                      0
crit3                      0
doubtterr                  1
alternative           152680
alternative_txt       152680
multiple                   1
success                    0
suicide                    0
attacktype1                0
attacktype1_txt            0
attacktype2           175377
attacktype2_txt       175377
attacktype3           181263
attacktype3_txt       181263
targtype1                  0
targtype1_txt              0
targsubtype1           10373
targsubtype1_txt       10373
                       ...  
propextent            117626
propextent_txt        117626
propvalue             142702
propcomment   

In [8]:
# Show the 'doubtterr' flag next to its follow-up column for the last 15 rows.
data.loc[:, ['doubtterr', 'alternative']].tail(15)

Unnamed: 0,doubtterr,alternative
181676,0.0,
181677,0.0,
181678,0.0,
181679,1.0,1.0
181680,1.0,1.0
181681,0.0,
181682,0.0,
181683,0.0,
181684,0.0,
181685,0.0,


There is no missing data in the dates or the general location of the event. We can spot quite a few null values throughout the data, but examining the columns 'doubtterr' and 'alternative' shows that NaNs sometimes appear because a "flag" in a related category is set to zero. Here doubtterr is non-zero if there is some doubt as to whether the event can properly be classified as an act of terror. If there is no such doubt, the alternative column will be NaN (Reference: p. 16 of the Codebook). Therefore not all NaNs are real missing values. For now I won't bother dealing with them.
To add to that, some data wasn't collected before 1997.

That's still pretty overwhelming...
Maybe we could try to show something in a more graphical way
Matplotlib is alright, but kinda boring, seaborn looks nicer 
and I'll probably come back to it after first major problems
Until then I'll give __bokeh__ a try

In [9]:
from collections import Counter
 #
from bokeh.io import show, output_file
from bokeh.models import ColumnDataSource
from bokeh.palettes import Spectral6
from bokeh.plotting import figure
from bokeh.transform import factor_cmap
from bokeh.models import FactorRange


# Tally the number of recorded events for every year in the dataset.
counter = Counter(data['iyear'])
# The x-axis range is built from the year labels converted to strings.
years = [str(year) for year in counter]

p = figure(
    title="Attacks by year",
    x_range=years,
    plot_height=350,
    toolbar_location='right',
)

# One bar per year, in the same order as the labels above.
p.vbar(x=years, top=[counter[year] for year in counter], width=0.8)

# Cosmetics: bars start at zero, x labels are rotated, vertical grid lines hidden.
p.y_range.start = 0
p.xaxis.major_label_orientation = 1
p.xgrid.grid_line_color = None

show(p)

In [10]:
# Tally the number of recorded events for every region in the dataset.
counter = Counter(data['region_txt'])
regions = list(map(str, counter.keys()))

# The bars are regions, so the title names regions rather than years.
p = figure(x_range=regions, plot_height=350, title="Attacks by region",
           toolbar_location='right')

p.vbar(x=regions, top=list(counter.values()), width=0.8)

# Cosmetics: hide vertical grid lines, rotate x labels, start bars at zero.
p.xgrid.grid_line_color = None
p.xaxis.major_label_orientation = 1
p.y_range.start = 0

show(p)

In [52]:
from bokeh.palettes import Category20b
from bokeh.palettes import viridis, Set2
from bokeh.models import ColumnDataSource
from bokeh.layouts import row, gridplot, column


def attacks_by_year_region(region):
    """Build a bar chart of the number of attacks per year in one region.

    Parameters
    ----------
    region : str
        Value matched against the ``region_txt`` column of the global ``data`` frame.

    Returns
    -------
    bokeh figure
        One bar per year found for the region, coloured from the viridis
        palette. The figure is returned, not shown.
    """
    region_rows = data.loc[data['region_txt'] == region]
    yearly_counts = Counter(region_rows['iyear'])
    year_labels = [str(year) for year in yearly_counts]

    # The 'countries' key actually holds year labels; the key name is kept
    # because the glyph below refers to the column by that name.
    source = ColumnDataSource(data={
        'countries': year_labels,
        'counts': list(yearly_counts.values()),
        'color': viridis(len(year_labels)),
    })

    p = figure(
        x_range=year_labels,
        plot_width=460,
        plot_height=350,
        title="Attacks in {} by year".format(region),
        toolbar_location='right',
    )
    p.vbar(x='countries', top='counts', width=0.8, color='color', source=source)

    # Cosmetics: bars start at zero, x labels are rotated, vertical grid lines hidden.
    p.y_range.start = 0
    p.xaxis.major_label_orientation = 1
    p.xgrid.grid_line_color = None

    return p
    
# Build one per-year chart for each of four regions.
p1, p2, p3, p4 = (
    attacks_by_year_region(name)
    for name in ('North America', 'Eastern Europe', 'Western Europe', 'East Asia')
)

# Lay the charts out two per row: North America, East Asia / Western Europe, Eastern Europe.
grid = gridplot([p1, p4, p3, p2], ncols=2)
show(grid)


In [54]:
from bokeh.palettes import Category20

def attacks_by_country_region(region):
    """Build a bar chart of the number of attacks per country in one region.

    Parameters
    ----------
    region : str
        Value matched against the ``region_txt`` column of the global ``data`` frame.

    Returns
    -------
    bokeh figure
        One bar per country found for the region. The figure is returned, not shown.
    """
    temp = data.loc[data['region_txt'] == region]

    counter = Counter(temp['country_txt'])
    countries = list(map(str, counter.keys()))
    n_countries = len(countries)

    # All columns handed to the ColumnDataSource should have the same length,
    # so the colour list must contain exactly one entry per country.
    if n_countries in Category20:
        # Category20 provides a dedicated palette for this number of colours.
        color = list(Category20[n_countries])
    else:
        # No palette of that size: take colours from the 20-colour palette,
        # wrapping around when there are more countries than colours.
        base = Category20[20]
        color = [base[i % len(base)] for i in range(n_countries)]

    source = ColumnDataSource(data=dict(countries=countries,
                                        counts=list(counter.values()),
                                        color=color))

    p = figure(x_range=countries, plot_width=800, plot_height=450,
               title="Attacks in {} by country, years 1970-2017".format(region),
               toolbar_location='right')

    p.vbar(x='countries', top='counts', width=0.8, color='color', source=source)

    # Cosmetics: hide vertical grid lines, rotate x labels, start bars at zero.
    p.xgrid.grid_line_color = None
    p.xaxis.major_label_orientation = 1
    p.y_range.start = 0

    return p

# Per-country charts for the two European regions, stacked vertically.
p1, p2 = (
    attacks_by_country_region(name)
    for name in ('Eastern Europe', 'Western Europe')
)

show(column(p1, p2))




This plot can seem confusing, but the Codebook states that attacks are labelled according to the borders and countries as they existed on the date of the event. That explains why East Germany and Yugoslavia appear here.



In [37]:
# List the palette sizes that Category20 provides (the keys of the palette dict).
Category20.keys()

dict_keys([3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20])

In [38]:
# Inspect the 14-colour variant of the palette: a list of hex colour strings.
Category20[14]

['#1f77b4',
 '#aec7e8',
 '#ff7f0e',
 '#ffbb78',
 '#2ca02c',
 '#98df8a',
 '#d62728',
 '#ff9896',
 '#9467bd',
 '#c5b0d5',
 '#8c564b',
 '#c49c94',
 '#e377c2',
 '#f7b6d2']

In [1]:
import tensorflow as tf

ModuleNotFoundError: No module named 'tensorflow'