In [16]:
# import packages
import pandas as pd
import numpy as np

In [17]:
# Read the dataset from disk; row 0 of the CSV is used as the header.
df = pd.read_csv(filepath_or_buffer="data/global_terrorism_dataset.csv", header=0)
print("Dataset Size: [row, col]", df.shape)
df.head()

Dataset Size: [row, col] (129922, 31)


Unnamed: 0,eventid,iyear,imonth,iday,latitude,longitude,extended,vicinity,crit1,crit2,...,region,region_txt,attacktype1_txt,targtype1_txt,natlty1_txt,weaptype1_txt,target1,gname,ncasualties,has_casualties
0,197000000001,1970,7,2,18.456792,-69.951164,0,0,1,1,...,2,Central America & Caribbean,Assassination,Private Citizens & Property,Dominican Republic,Unknown,julio guzman,mano-d,1,1
1,197000000002,1970,0,0,19.432608,-99.133207,0,0,1,1,...,1,North America,Hostage Taking (Kidnapping),Government (Diplomatic),Belgium,Unknown,"nadine chaval, daughter",23rd of september communist league,0,0
2,197001000001,1970,1,0,15.478598,120.599741,0,0,1,1,...,5,Southeast Asia,Assassination,Journalists & Media,United States,Unknown,employee,unknown,1,1
3,197001000002,1970,1,0,37.983773,23.728157,0,0,1,1,...,8,Western Europe,Bombing/Explosion,Government (Diplomatic),United States,Explosives/Bombs/Dynamite,u.s. embassy,unknown,0,0
4,197001010002,1970,1,1,37.005105,-89.176269,0,0,1,1,...,1,North America,Armed Assault,Police,United States,Firearms,cairo police headquarters,black nationalists,0,0


In [18]:
# check column names
# `column_names` is kept as a variable because the column-selection cell
# further down filters it to decide which columns to retain.
column_names = df.columns
column_names

Index(['eventid', 'iyear', 'imonth', 'iday', 'latitude', 'longitude',
       'extended', 'vicinity', 'crit1', 'crit2', 'crit3', 'doubtterr',
       'multiple', 'success', 'suicide', 'claimed', 'property', 'ishostkid',
       'nkill', 'nwound', 'country_txt', 'region', 'region_txt',
       'attacktype1_txt', 'targtype1_txt', 'natlty1_txt', 'weaptype1_txt',
       'target1', 'gname', 'ncasualties', 'has_casualties'],
      dtype='object')

In [19]:
# Count the missing (NaN/None) entries in every column
df.isna().sum()

eventid                0
iyear                  0
imonth                 0
iday                   0
latitude            2867
longitude           2867
extended               0
vicinity               0
crit1                  0
crit2                  0
crit3                  0
doubtterr              0
multiple               0
success                0
suicide                0
claimed            42068
property               0
ishostkid            161
nkill                  0
nwound                 0
country_txt            0
region                 0
region_txt             0
attacktype1_txt        0
targtype1_txt          0
natlty1_txt         1164
weaptype1_txt          0
target1                0
gname                  0
ncasualties            0
has_casualties         0
dtype: int64

In [20]:
# Drop the columns that are not used in the rest of the notebook
remove_columns = [
    "iyear", "iday", "latitude", "longitude", "claimed",
    "ishostkid", "natlty1_txt", "region", "gname", "extended",
    "eventid", "crit1", "crit2", "crit3", "weaptype1_txt", "doubtterr", "suicide",
    "property", "has_casualties", "vicinity", "multiple", "nkill", "nwound", "success"]
selected_columns = [x for x in column_names if x not in remove_columns]
# Keep only the selected columns. The explicit .copy() makes the result an
# independent frame, so new columns assigned to `df` later are not written to a
# slice of the loaded data (which would raise SettingWithCopyWarning).
df = df[selected_columns].copy()
df.head()

Unnamed: 0,imonth,country_txt,region_txt,attacktype1_txt,targtype1_txt,target1,ncasualties
0,7,Dominican Republic,Central America & Caribbean,Assassination,Private Citizens & Property,julio guzman,1
1,0,Mexico,North America,Hostage Taking (Kidnapping),Government (Diplomatic),"nadine chaval, daughter",0
2,1,Philippines,Southeast Asia,Assassination,Journalists & Media,employee,1
3,1,Greece,Western Europe,Bombing/Explosion,Government (Diplomatic),u.s. embassy,0
4,1,United States,North America,Armed Assault,Police,cairo police headquarters,0


In [21]:
# Re-check for null or any missing values in the remaining columns
df.isna().sum()

imonth             0
country_txt        0
region_txt         0
attacktype1_txt    0
targtype1_txt      0
target1            0
ncasualties        0
dtype: int64

In [22]:
# Collect the selected columns whose dtype is "object" (the string columns)
object_type_columns = list(
    filter(lambda name: df[name].dtype == "object", selected_columns)
)
object_type_columns

['country_txt', 'region_txt', 'attacktype1_txt', 'targtype1_txt', 'target1']

In [23]:
# Encode every string column as integer labels stored in a categorical column
from sklearn.preprocessing import LabelEncoder

labler = LabelEncoder()

# Build all "<column>_code" columns first and attach them in a single step.
# DataFrame.assign returns a new frame, so nothing is written into a frame that
# may itself be a slice of another one (no SettingWithCopyWarning), and the
# category dtype is applied as part of the same expression.
# Note: `labler` is re-fitted for each column, so after this cell it only holds
# the classes of the last column in `object_type_columns`.
encoded_columns = {
    column + "_code": pd.Series(
        labler.fit_transform(df[column]), index=df.index
    ).astype("category")
    for column in object_type_columns
}
df = df.assign(**encoded_columns)
df.head()

Unnamed: 0,imonth,country_txt,region_txt,attacktype1_txt,targtype1_txt,target1,ncasualties,country_txt_code,region_txt_code,attacktype1_txt_code,targtype1_txt_code,target1_code
0,7,Dominican Republic,Central America & Caribbean,Assassination,Private Citizens & Property,julio guzman,1,45,1,1,13,31401
1,0,Mexico,North America,Hostage Taking (Kidnapping),Government (Diplomatic),"nadine chaval, daughter",0,114,6,6,5,37488
2,1,Philippines,Southeast Asia,Assassination,Journalists & Media,employee,1,137,9,1,7,21436
3,1,Greece,Western Europe,Bombing/Explosion,Government (Diplomatic),u.s. embassy,0,66,11,2,5,60318
4,1,United States,North America,Armed Assault,Police,cairo police headquarters,0,186,6,0,12,11810


In [24]:
# Independent copy holding only the month, casualty count and text columns
dff = df.loc[
    :,
    ["imonth", "country_txt", "region_txt", "target1", "ncasualties", "targtype1_txt", "attacktype1_txt"],
].copy()
dff.head()

Unnamed: 0,imonth,country_txt,region_txt,target1,ncasualties,targtype1_txt,attacktype1_txt
0,7,Dominican Republic,Central America & Caribbean,julio guzman,1,Private Citizens & Property,Assassination
1,0,Mexico,North America,"nadine chaval, daughter",0,Government (Diplomatic),Hostage Taking (Kidnapping)
2,1,Philippines,Southeast Asia,employee,1,Journalists & Media,Assassination
3,1,Greece,Western Europe,u.s. embassy,0,Government (Diplomatic),Bombing/Explosion
4,1,United States,North America,cairo police headquarters,0,Police,Armed Assault


In [25]:
# Write the text-column subset to disk, including the row index
dff.to_csv(path_or_buf="data/testing-again-for-satisfaction.csv", index=True)

In [26]:
# Display the text-column subset built above
dff

Unnamed: 0,imonth,country_txt,region_txt,target1,ncasualties,targtype1_txt,attacktype1_txt
0,7,Dominican Republic,Central America & Caribbean,julio guzman,1,Private Citizens & Property,Assassination
1,0,Mexico,North America,"nadine chaval, daughter",0,Government (Diplomatic),Hostage Taking (Kidnapping)
2,1,Philippines,Southeast Asia,employee,1,Journalists & Media,Assassination
3,1,Greece,Western Europe,u.s. embassy,0,Government (Diplomatic),Bombing/Explosion
4,1,United States,North America,cairo police headquarters,0,Police,Armed Assault
5,1,Uruguay,South America,juan maria de lucah/chief of directorate of in...,0,Police,Assassination
6,1,United States,North America,"r.o.t.c. offices at university of wisconsin, m...",0,Military,Facility/Infrastructure Attack
7,1,United States,North America,selective service headquarters in madison wisc...,0,Government (General),Facility/Infrastructure Attack
8,1,United States,North America,packard properties building of detroit michigan,0,Government (General),Facility/Infrastructure Attack
9,1,East Germany (GDR),Eastern Europe,jurists ball (palais au frankturm),0,Government (General),Bombing/Explosion


In [42]:
# Rows whose target description is exactly "convoy of relief workers"
dff.loc[dff["target1"].eq("convoy of relief workers")]

Unnamed: 0,imonth,country_txt,region_txt,target1,ncasualties,targtype1_txt,attacktype1_txt
129920,12,Philippines,Southeast Asia,convoy of relief workers,0,NGO,Bombing/Explosion


In [27]:
# Country name -> encoded value, for countries with more than 2000 rows.
# Names are kept in order of first appearance in the data.
country_counts = df["country_txt"].value_counts()
country_names = [
    name for name in df["country_txt"].unique()
    if country_counts.get(name, 0) > 2000
]

# drop_duplicates keeps the first row of every country, i.e. the same row a
# top-to-bottom scan would stop at, without iterating over the frame per name.
country_codes = (
    df.drop_duplicates(subset="country_txt")
    .set_index("country_txt")["country_txt_code"]
)
for name in country_names:
    print(name, country_codes.loc[name])

Philippines 137
United States 186
Spain 163
Turkey 180
United Kingdom 185
Pakistan 131
India 78
France 58
Colombia 33
El Salvador 50
Thailand 176
Afghanistan 0
Peru 136
Somalia 156
Iraq 82
Nigeria 128
Yemen 196


In [28]:
# Region name -> encoded value, for regions with more than 100 rows.
# Names are kept in order of first appearance in the data.
region_counts = df["region_txt"].value_counts()
region_names = [
    name for name in df["region_txt"].unique()
    if region_counts.get(name, 0) > 100
]

# drop_duplicates keeps the first row of every region, i.e. the same row a
# top-to-bottom scan would stop at, without iterating over the frame per name.
region_codes = (
    df.drop_duplicates(subset="region_txt")
    .set_index("region_txt")["region_txt_code"]
)
for name in region_names:
    print(name, region_codes.loc[name])

Central America & Caribbean 1
North America 6
Southeast Asia 9
Western Europe 11
South America 7
Eastern Europe 4
Sub-Saharan Africa 10
Middle East & North Africa 5
East Asia 3
Australasia & Oceania 0
South Asia 8
Central Asia 2


In [29]:
# Attack type name -> encoded value, for attack types with more than 100 rows.
# Names are kept in order of first appearance in the data.
attack_type_counts = df["attacktype1_txt"].value_counts()
attack_type_names = [
    name for name in df["attacktype1_txt"].unique()
    if attack_type_counts.get(name, 0) > 100
]

# drop_duplicates keeps the first row of every attack type, i.e. the same row a
# top-to-bottom scan would stop at, without iterating over the frame per name.
attack_type_codes = (
    df.drop_duplicates(subset="attacktype1_txt")
    .set_index("attacktype1_txt")["attacktype1_txt_code"]
)
for name in attack_type_names:
    print(name, attack_type_codes.loc[name])

Assassination 1
Hostage Taking (Kidnapping) 6
Bombing/Explosion 2
Armed Assault 0
Facility/Infrastructure Attack 3
Hijacking 4
Unarmed Assault 7
Hostage Taking (Barricade Incident) 5
Unknown 8


In [30]:
# Target type name -> encoded value, for target types with more than 100 rows.
# Names are kept in order of first appearance in the data.
target_type_counts = df["targtype1_txt"].value_counts()
target_type_names = [
    name for name in df["targtype1_txt"].unique()
    if target_type_counts.get(name, 0) > 100
]

# drop_duplicates keeps the first row of every target type, i.e. the same row a
# top-to-bottom scan would stop at, without iterating over the frame per name.
target_type_codes = (
    df.drop_duplicates(subset="targtype1_txt")
    .set_index("targtype1_txt")["targtype1_txt_code"]
)
for name in target_type_names:
    print(name, target_type_codes.loc[name])

Private Citizens & Property 13
Government (Diplomatic) 5
Journalists & Media 7
Police 12
Military 9
Government (General) 6
Educational Institution 3
Business 2
Violent Political Party 21
Unknown 19
Transportation 18
Utilities 20
Airports & Aircraft 1
Religious Figures/Institutions 14
Telecommunication 15
Food or Water Supply 4
NGO 10
Terrorists/Non-State Militia 16
Other 11
Tourists 17
Maritime 8
Abortion Related 0


In [40]:
# Number of rows for each encoded target-type value
df["targtype1_txt_code"].value_counts()

13    33890
12    20598
6     18837
2     15206
18     5884
20     5711
19     4501
9      4380
14     3803
3      3798
5      2877
7      2455
16     2134
21     1346
1      1082
15      950
10      761
11      587
17      320
8       290
4       281
0       231
Name: targtype1_txt_code, dtype: int64

In [31]:
# Number of rows for each distinct target1 description
df["target1"].value_counts()

civilians                                                                         5766
unknown                                                                           5557
vehicle                                                                           1720
checkpoint                                                                        1565
officers                                                                          1506
patrol                                                                            1496
village                                                                           1440
police station                                                                    1191
bus                                                                               1119
market                                                                            1035
high tension line tower                                                            959
office                                     

In [32]:
# target1 name -> encoded value, for the hand-picked names below that occur in
# more than 1000 rows. A name that does not occur at all counts as 0 rows.
target1_counts = df["target1"].value_counts()
target1_names = [
    name for name in ["civilians", "unknown", "vehicle", "market", "officers"]
    if target1_counts.get(name, 0) > 1000
]

# drop_duplicates keeps the first row of every target1 value, i.e. the same row
# a top-to-bottom scan would stop at, without iterating over the frame per name.
target1_codes = (
    df.drop_duplicates(subset="target1")
    .set_index("target1")["target1_code"]
)
for name in target1_names:
    print(name, target1_codes.loc[name])

civilians 15037
unknown 61046
vehicle 61386
market 34315
officers 39201


In [33]:
# Save the attack-type text next to its encoded value (one line per row of df)
df.loc[:, ["attacktype1_txt", "attacktype1_txt_code"]].to_csv(
    "data/attack_type.csv", index=False
)

In [34]:
# Save the target-type text next to its encoded value (one line per row of df)
df.loc[:, ["targtype1_txt", "targtype1_txt_code"]].to_csv(
    "data/target_type.csv", index=False
)

In [35]:
# Every column of df except the original string columns, in column order
features = df.columns[~df.columns.isin(object_type_columns)].tolist()
features

['imonth',
 'ncasualties',
 'country_txt_code',
 'region_txt_code',
 'attacktype1_txt_code',
 'targtype1_txt_code',
 'target1_code']

In [36]:
# Restrict df to the feature columns only
df = df.loc[:, features]

In [37]:
# Preview the first rows of the feature-only frame
df.head()

Unnamed: 0,imonth,ncasualties,country_txt_code,region_txt_code,attacktype1_txt_code,targtype1_txt_code,target1_code
0,7,1,45,1,1,13,31401
1,0,0,114,6,6,5,37488
2,1,1,137,9,1,7,21436
3,1,0,66,11,2,5,60318
4,1,0,186,6,0,12,11810


In [38]:
# Preview the last rows of the feature-only frame
df.tail()

Unnamed: 0,imonth,ncasualties,country_txt_code,region_txt_code,attacktype1_txt_code,targtype1_txt_code,target1_code
129917,12,2,131,8,0,12,41266
129918,12,7,0,8,2,16,61386
129919,12,7,0,8,2,13,15037
129920,12,0,137,9,2,10,18163
129921,12,0,41,10,6,13,62279


In [39]:
# Write the feature-only frame to disk without the row index
df.to_csv(path_or_buf="data/train.csv", index=False)