# Global Terrorism Attacks - Predicting The Responsible Group

**Table of Contents**
<div id="toc"></div>

## Load data

In [1]:
%matplotlib inline

import pandas as pd
import csv
from tqdm import tqdm_notebook
from sklearn import preprocessing

# Load the converted GTD export into a single DataFrame.
# low_memory=False is passed so pandas infers each column's dtype from the
# whole file rather than chunk by chunk.
# NOTE(review): the preview below shows stray characters such as "â" in the
# citation text, which may mean the file is not really latin1 - confirm.
gtd = pd.read_csv(
    'gtd_converted.csv',
    low_memory=False,
    encoding='latin1',
)
gtd.tail(n=3)

Unnamed: 0.1,Unnamed: 0,eventid,iyear,imonth,iday,approxdate,extended,resolution,country,country_txt,...,addnotes,scite1,scite2,scite3,dbsource,INT_LOG,INT_IDEO,INT_MISC,INT_ANY,related
170347,170347,201612310043,2016,12,31,,1,,229,Democratic Republic of the Congo,...,,"""DRC Armed Forces Dislodge Ugandan LRA Rebels ...",,,START Primary Collection,1,1,0,1,
170348,170348,201612310044,2016,12,31,,0,,130,Mexico,...,There is doubt that this incident meets terror...,"""Gunmen shoot at home of Mexican indigenous ri...","""â Gunmen shoot at home of Mexican indigenou...",,START Primary Collection,-9,-9,0,-9,
170349,170349,201701270001,2016,12,30,,0,,195,Sudan,...,There is doubt that this incident meets terror...,"""Rebels To Blame For Nierteti Killing: Central...","""Security deteriorates in West Darfur alongsid...",,START Primary Collection,0,0,0,0,


## Preprocessing

### Filter Groups

In [2]:
from collections import Counter

# Tally how many rows (attacks) are attributed to each group name.
group_count = Counter(gtd['gname'])
# Keep only groups with more than 3 attacks and leave out the 'Unknown' label.
# The 'Unknown' test is part of the comprehension instead of a later
# list.remove('Unknown') call, because remove() raises ValueError when
# 'Unknown' did not make it into the list (e.g. it has 3 or fewer rows).
filtered_groups = [
    group
    for group, counter in group_count.items()
    if counter > 3 and group != 'Unknown'
]

# Restrict the frame to rows whose group survived the filter.
gtd = gtd[gtd['gname'].isin(filtered_groups)]

gtd.shape

(88657, 136)

### Define Features

In [3]:
# Columns carried into the processed data set; 'gname' is the group name and
# the remaining entries are the attributes kept alongside it.
# Alternative selections that were tried, kept for reference:
#   ['gname', 'iyear', 'country', 'region', 'crit1', 'crit2', 'crit3', 'attacktype1', 'targtype1', 'targsubtype1', 'natlty1', 'weaptype1', 'weapsubtype1', 'nperps', 'nkill', 'nwound', 'ransom']
#   ['gname', 'iyear', 'country', 'region', 'attacktype1', 'weaptype1']
columns_to_keep = [
    'gname',
    'iyear',
    'country',
    'crit1',
    'crit2',
    'crit3',
    'attacktype1',
    'targtype1',
    'targsubtype1',
    'weaptype1',
    'weapsubtype1',
    'ransom',
]
# Project the frame onto the chosen columns, in the order listed above.
gtd = gtd[columns_to_keep]
gtd.tail(n=3)

Unnamed: 0,gname,iyear,country,crit1,crit2,crit3,attacktype1,targtype1,targsubtype1,weaptype1,weapsubtype1,ransom
170345,Boko Haram,2016,146,1,1,0,9,4,34.0,13,,
170347,Lord's Resistance Army (LRA),2016,229,1,1,1,6,14,75.0,13,,0.0
170349,Sudan Liberation Movement,2016,195,1,1,1,2,14,75.0,5,5.0,


### Transform Target

In [4]:
# Encode each group name as an integer label and expose it as 'groupId'.
# `number` stays bound to the fitted encoder so number.inverse_transform()
# can map ids back to group names later.
number = preprocessing.LabelEncoder()
# assign() returns a new frame instead of writing a column into `gtd`, which
# at this point is a selection taken from the loaded frame; that in-place
# write is what triggers pandas' SettingWithCopyWarning. Re-using the 'gname'
# key keeps the column in its original (first) position before the rename.
gtd = (
    gtd
    .assign(gname=number.fit_transform(gtd['gname']))
    .rename(columns={'gname': 'groupId'})
)

### Fill NaNs

In [5]:
# Replace every missing value in the frame with 0.
# TODO: 999 or 0?
gtd = gtd.fillna(value=0)

# Show the last three rows to check the fill.
gtd.iloc[-3:]

Unnamed: 0,groupId,iyear,country,crit1,crit2,crit3,attacktype1,targtype1,targsubtype1,weaptype1,weapsubtype1,ransom
170345,214,2016,146,1,1,0,9,4,34.0,13,0.0,0.0
170347,555,2016,229,1,1,1,6,14,75.0,13,0.0,0.0
170349,919,2016,195,1,1,1,2,14,75.0,5,5.0,0.0


In [6]:
# Write the processed frame to disk as UTF-8 CSV, without the row index.
gtd.to_csv(
    path_or_buf='gtd_processed_11features.csv',
    index=False,
    encoding='utf-8',
)