# 任务二：根据特征预测恐怖组织

**Author:** solid

**Date:** Sep 17, 2018

### 导入相关的包

In [1]:
import time
import collections

import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import warnings; warnings.simplefilter('ignore')

from sklearn.preprocessing import scale
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [2]:
# Display up to 150 rows and columns
pd.set_option('display.max_rows', 220)
pd.set_option('display.max_columns', 150)

# Set the figure size for plots
mpl.rcParams['figure.figsize'] = (14.6, 9.0)

# Set the Seaborn default style for plots
sns.set()

# Set the color palette
sns.set_palette(sns.color_palette("muted"))

### 加载数据集
加载预处理的数据集

In [3]:
# Load the preprocessed GTD dataset
gtd_df = pd.read_csv('gtd_eda_98t017.csv', low_memory=False, index_col = 0,
                      na_values=[''])

### 探索数据集结构
清理的数据帧包含48个属性，其中一个用于数据帧索引，110,844个观察值。

In [4]:
# Display a summary of the data frame
gtd_df.info(verbose = True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 113247 entries, 199801010001 to 201712310032
Data columns (total 48 columns):
iyear               113247 non-null int64
imonth              113247 non-null int64
iday                113247 non-null int64
country_txt         113247 non-null object
region_txt          113247 non-null object
provstate           113247 non-null object
city                113247 non-null object
latitude            113247 non-null float64
longitude           113247 non-null float64
specificity         113247 non-null float64
summary             113247 non-null object
attacktype1_txt     113247 non-null object
targtype1_txt       113247 non-null object
targsubtype1_txt    113247 non-null object
corp1               113247 non-null object
target1             113247 non-null object
natlty1_txt         113247 non-null object
gname               113247 non-null object
nperpcap            113247 non-null float64
weaptype1_txt       113247 non-null object
weapsubtype

### 找到主要的恐怖组织名单
获取有20次或更多次袭击的恐怖组织名单。

In [6]:
# Calculate the number of attacks by group
groups = gtd_df['gname'].value_counts()

# Include groups with at least 20 attacks
groups = groups[groups > 19]

# Exclude unknown groups
#group_list = groups.index[groups.index != 'Unknown']
group_list = groups.index

# Subset the data to major groups
major_groups = gtd_df[gtd_df['gname'].isin(group_list)]

# Display the number of attacks by group
major_groups['gname'].value_counts()

Unknown                                                         59049
Taliban                                                          7454
Islamic State of Iraq and the Levant (ISIL)                      5583
Al-Shabaab                                                       3274
Boko Haram                                                       2408
Communist Party of India - Maoist (CPI-Maoist)                   1876
New People's Army (NPA)                                          1795
Maoists                                                          1616
Tehrik-i-Taliban Pakistan (TTP)                                  1349
Revolutionary Armed Forces of Colombia (FARC)                    1295
Kurdistan Workers' Party (PKK)                                   1267
Houthi extremists (Ansar Allah)                                  1021
Al-Qaida in the Arabian Peninsula (AQAP)                         1011
Liberation Tigers of Tamil Eelam (LTTE)                           648
Al-Qaida in Iraq    

### 去掉包含文本和时间的属性
删除text和datetime属性，这些属性不会在模型中使用。

In [7]:
major_groups = major_groups.drop(['provstate', 'city', 'summary', 'corp1', 'target1',
                                  'scite1', 'dbsource', 'incident_date'], axis=1)

major_groups.info(verbose = True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 108901 entries, 199801010001 to 201712310032
Data columns (total 40 columns):
iyear               108901 non-null int64
imonth              108901 non-null int64
iday                108901 non-null int64
country_txt         108901 non-null category
region_txt          108901 non-null category
latitude            108901 non-null float64
longitude           108901 non-null float64
specificity         108901 non-null category
attacktype1_txt     108901 non-null category
targtype1_txt       108901 non-null category
targsubtype1_txt    108901 non-null category
natlty1_txt         108901 non-null category
gname               108901 non-null object
nperpcap            108901 non-null float64
weaptype1_txt       108901 non-null category
weapsubtype1_txt    108901 non-null category
nkill               108901 non-null float64
nkillus             108901 non-null float64
nkillter            108901 non-null float64
nwound              108901 non-nul

### 归一化数值属性
调整数值属性范围的差异。

In [8]:
scaler = preprocessing.RobustScaler()

# List of numeric attributes
scale_attrs = ['nperpcap', 'nkill', 'nkillus', 'nkillter', 'nwound', 'nwoundus', 'nwoundte']

# Standardize the attributes in place
major_groups[scale_attrs] = scaler.fit_transform(major_groups[scale_attrs])

# View the transformation
major_groups[scale_attrs].describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
nperpcap,108901.0,0.098851,1.754319,0.0,0.0,0.0,0.0,406.0
nkill,108901.0,0.735163,6.090337,-0.5,-0.5,0.0,0.5,784.5
nkillus,108901.0,0.037089,5.793346,0.0,0.0,0.0,0.0,1360.0
nkillter,108901.0,0.501382,4.218098,0.0,0.0,0.0,0.0,500.0
nwound,108901.0,1.16796,13.231184,0.0,0.0,0.0,1.0,2730.333333
nwoundus,108901.0,0.014141,0.71764,0.0,0.0,0.0,0.0,151.0
nwoundte,108901.0,0.106115,1.498207,0.0,0.0,0.0,0.0,200.0


### 分离属性：gname=Known or Unknown,将带known标签集作为训练集，训练一个多层感知机，预测带unknow标签的数据，训练集为2015年到2016年的数据集
Split the major groups into known and unknown.

In [9]:
# Excluded Unknown groups
known_maj_groups = major_groups[gtd_df['gname'] != "Unknown"]
print("Known Major Groups: {}".format(known_maj_groups.shape))

# Only include Unknown groups
unknown_maj_groups = major_groups[gtd_df['gname'] == "Unknown"]
print("Unknown Major Groups: {}".format(unknown_maj_groups.shape))

Known Major Groups: (49852, 40)
Unknown Major Groups: (59049, 40)


In [10]:
_15_known_maj_groups=known_maj_groups[gtd_df['iyear']==2015]
_16_known_maj_groups=known_maj_groups[gtd_df['iyear']==2016]
_15to16_known_maj_groups=_15_known_maj_groups.append(_16_known_maj_groups)
_15_unknown_maj_groups=unknown_maj_groups[gtd_df['iyear']==2015]
_16_unknown_maj_groups=unknown_maj_groups[gtd_df['iyear']==2016]
_15to16_unknown_maj_groups=_15_unknown_maj_groups.append(_16_unknown_maj_groups)
_17_unkown_maj_groups=unknown_maj_groups.loc[[201701090031,201702210037,201703120023,201705050009,201705050010,201707010028,201707020006,201708110018,201711010006,201712010003],:]
print("_15to16_unKnown Major Groups: {}".format(_15to16_unknown_maj_groups.shape))
print("_15to16_Known Major Groups: {}".format(_15to16_known_maj_groups.shape))

_15to16_unKnown Major Groups: (12310, 40)
_15to16_Known Major Groups: (15167, 40)


### 编码目标属性
Convert the text values of the terrorist groups to an encoded numeric value for the Multilayer perceptron.

In [11]:
# Create the encoder
le = preprocessing.LabelEncoder()

# Fit the encoder to the target
le.fit(_15to16_known_maj_groups['gname'])

LabelEncoder()

In [12]:
# View the labels
len(list(le.classes_))

152

In [13]:
# View the encoded values for th terrorist group names
label_codes = le.transform(_15to16_known_maj_groups['gname'])
len(label_codes)

15167

In [14]:
# Convert some integers into their category names
list(le.inverse_transform([0, 1, 2, 27]))

  if diff:


['Abu Sayyaf Group (ASG)',
 'Adan-Abyan Province of the Islamic State',
 'Ahrar al-Sham',
 'Balochistan Liberation United Front (BLUF)']

### 创建训练集和目标集
The original dataset is split into 80% training and 20% testing.

In [15]:
# Seed for reproducible results
seed = 1009

# Predictor variables
X = pd.get_dummies(_15to16_known_maj_groups.drop(['gname'], axis=1), drop_first=True)
print(X.shape)
# Labels
y = label_codes
print(y)
print(y.shape)
# Create an 80/20 split for training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = seed)

(15167, 586)
[100  62  62 ... 141  32  85]
(15167,)


### 训练模型

In [19]:
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(solver='adam',alpha=1e-5,hidden_layer_sizes=(80,160,250),random_state=1,verbose=True)
clf.fit(X_train,y_train)

Iteration 1, loss = 7.60899060
Iteration 2, loss = 3.31635744
Iteration 3, loss = 2.95263077
Iteration 4, loss = 2.55800719
Iteration 5, loss = 2.17297362
Iteration 6, loss = 1.88984710
Iteration 7, loss = 1.64039806
Iteration 8, loss = 1.43918915
Iteration 9, loss = 1.27202362
Iteration 10, loss = 1.12553786
Iteration 11, loss = 1.04415663
Iteration 12, loss = 0.94291013
Iteration 13, loss = 0.88446832
Iteration 14, loss = 0.83700353
Iteration 15, loss = 0.78702122
Iteration 16, loss = 0.74966790
Iteration 17, loss = 0.70165861
Iteration 18, loss = 0.66113556
Iteration 19, loss = 0.62825122
Iteration 20, loss = 0.61940573
Iteration 21, loss = 0.59046453
Iteration 22, loss = 0.56386068
Iteration 23, loss = 0.58257455
Iteration 24, loss = 0.54550576
Iteration 25, loss = 0.50265419
Iteration 26, loss = 0.49137459
Iteration 27, loss = 0.49147913
Iteration 28, loss = 0.46408811
Iteration 29, loss = 0.46227947
Iteration 30, loss = 0.45000796
Iteration 31, loss = 0.42351932
Iteration 32, los

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(80, 160, 250), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=True,
       warm_start=False)

### 预测测试集的标签

In [26]:
y_pred=clf.predict(X_test)

In [22]:
import heapq
heapq.nlargest(5, range(len(y_pred[0])), y_pred[0].take) 
y_pred[0][[56, 121, 136, 1, 128]]

array([0.96746934, 0.01781963, 0.00369832, 0.00640053, 0.00257257])

In [23]:
y_pred[0][(np.argmax(y_pred[0]))]

0.9674693448941377

In [27]:
y_test

array([ 56,  80,  31, ...,  71,  62, 112])

In [28]:
y_pred

array([ 56,  80,  31, ...,   6,  62, 112])

### 评估模型的预测能力（avg / total       0.86      0.87      0.85      3034）接近90%的准确率

In [29]:
from sklearn.metrics import classificat#
print(classification_report(y_test,y_pred))

[[11  0  0 ...  0  0  0]
 [ 0  6  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]
 ...
 [ 0  0  0 ...  6  0  0]
 [ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]]
             precision    recall  f1-score   support

          0       1.00      0.34      0.51        32
          1       0.75      0.86      0.80         7
          2       0.00      0.00      0.00         6
          3       0.00      0.00      0.00         2
          4       0.00      0.00      0.00         2
          5       1.00      1.00      1.00         1
          6       0.46      0.97      0.62        31
          7       0.93      0.98      0.95        41
          8       0.62      0.89      0.73         9
          9       0.99      1.00      1.00       194
         10       1.00      0.53      0.70        30
         11       0.60      0.75      0.67         4
         12       0.00      0.00      0.00         1
         13       0.70      1.00      0.82         7
         14       0.67      0.40      0.50      

### 预测表二的10个2017年的数据的恐怖组织（top5)

In [36]:
_17_predit=clf.predict_proba(pd.get_dummies(_17_unkown_maj_groups, drop_first=True))

In [38]:
_17_predit.shape


(10, 151)

In [42]:
for i in range(10):
    print(heapq.nlargest(5, range(len(_17_predit[i])), _17_predit[i].take))

[99, 143, 130, 72, 125]
[130, 143, 25, 99, 125]
[48, 137, 143, 99, 130]
[143, 48, 137, 99, 130]
[143, 48, 137, 99, 130]
[48, 143, 99, 130, 114]
[99, 143, 130, 18, 14]
[130, 143, 99, 125, 25]
[130, 143, 25, 99, 125]
[99, 143, 130, 125, 72]


### References

Albon, C. (2017). Convert Pandas categorical data for scikit-learn Retrieved from https://chrisalbon.com/machine_learning/preprocessing_structured_data/convert_pandas_categorical_column_into_integers_for_scikit-learn/

Keen, B. (2017). Feature scaling with scikit-learn Retrieved from http://benalexkeen.com/feature-scaling-with-scikit-learn/

Saabas, A. (2014). Selecting good features – Part III: random forests Retrieved from http://blog.datadive.net/selecting-good-features-part-iii-random-forests/

scikit-learn. (n.d.). Feature importances with forests of trees Retrieved from http://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html#sphx-glr-auto-examples-ensemble-plot-forest-importances-py