# 任务四
## 武器预测

**Author:** solid

**Date:** Sep 18, 2018

### Notebook Configuration

In [3]:
import time

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

In [4]:
# Let pandas print up to 150 rows and 150 columns before truncating
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 150)

# Default size for matplotlib figures
mpl.rc('figure', figsize=(14.6, 9.0))

# Apply the Seaborn default style to all plots
sns.set()

# Switch the active colour cycle to the "muted" palette
muted_palette = sns.color_palette("muted")
sns.set_palette(muted_palette)

### Load the Datasets
Load the dataset created by the EDA notebook.

In [5]:
# Read the preprocessed GTD extract: the first CSV column becomes the index
# and empty strings are treated as missing values
csv_options = dict(low_memory=False, index_col=0, na_values=[''])
gtd_df = pd.read_csv('gtd_eda_98t017.csv', **csv_options)

### Inspect the Structure
The cleansed data frame contains 48 columns, indexed by `eventid`, and 113,247 observations.

In [6]:
# Print the dtype and non-null count of every column
gtd_df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 113247 entries, 199801010001 to 201712310032
Data columns (total 48 columns):
iyear               113247 non-null int64
imonth              113247 non-null int64
iday                113247 non-null int64
country_txt         113247 non-null object
region_txt          113247 non-null object
provstate           113247 non-null object
city                113247 non-null object
latitude            113247 non-null float64
longitude           113247 non-null float64
specificity         113247 non-null float64
summary             113247 non-null object
attacktype1_txt     113247 non-null object
targtype1_txt       113247 non-null object
targsubtype1_txt    113247 non-null object
corp1               113247 non-null object
target1             113247 non-null object
natlty1_txt         113247 non-null object
gname               113247 non-null object
nperpcap            113247 non-null float64
weaptype1_txt       113247 non-null object
weapsubtype

### Convert Attributes to Correct Data Type
Convert a subset of the data frame attributes to categorical to align with the GTD code book as executed previously in the EDA notebook.

In [7]:
# Attributes that should be stored with the pandas category dtype
cat_attrs = [
    'extended_txt', 'country_txt', 'region_txt', 'specificity', 'vicinity_txt',
    'crit1_txt', 'crit2_txt', 'crit3_txt', 'doubtterr_txt', 'multiple_txt',
    'success_txt', 'suicide_txt', 'attacktype1_txt', 'targtype1_txt',
    'targsubtype1_txt', 'natlty1_txt', 'guncertain1_txt', 'individual_txt',
    'claimed_txt', 'weaptype1_txt', 'weapsubtype1_txt', 'property_txt',
    'ishostkid_txt', 'INT_LOG_txt', 'INT_IDEO_txt', 'INT_MISC_txt', 'INT_ANY_txt',
]

# Cast every listed column to category in a single pass
gtd_df = gtd_df.astype({attr: 'category' for attr in cat_attrs})

# incident_date (a feature added during EDA) is parsed into datetimes
gtd_df['incident_date'] = pd.to_datetime(gtd_df['incident_date'])

# Force gname to str so the column does not hold mixed types
gtd_df['gname'] = gtd_df['gname'].astype('str')

# Show the resulting dtypes
gtd_df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 113247 entries, 199801010001 to 201712310032
Data columns (total 48 columns):
iyear               113247 non-null int64
imonth              113247 non-null int64
iday                113247 non-null int64
country_txt         113247 non-null category
region_txt          113247 non-null category
provstate           113247 non-null object
city                113247 non-null object
latitude            113247 non-null float64
longitude           113247 non-null float64
specificity         113247 non-null category
summary             113247 non-null object
attacktype1_txt     113247 non-null category
targtype1_txt       113247 non-null category
targsubtype1_txt    113247 non-null category
corp1               113247 non-null object
target1             113247 non-null object
natlty1_txt         113247 non-null category
gname               113247 non-null object
nperpcap            113247 non-null float64
weaptype1_txt       113247 non-null categ

### 创建训练集和测试集
The original dataset is split into 80% training and 20% testing.

In [8]:
# Labels to predict: the weaptype1_txt column, displayed for a quick look
y = gtd_df.loc[:, 'weaptype1_txt']
y

eventid
199801010001      Firearms
199801010002    Explosives
199801010003      Firearms
199801020001    Explosives
199801020002      Firearms
199801040001    Explosives
199801040002    Explosives
199801050001    Explosives
199801050002    Explosives
199801050003      Firearms
199801050004      Firearms
199801060001      Firearms
199801060002    Explosives
199801060003         Melee
199801060004         Melee
199801070001    Explosives
199801080001         Melee
199801080002         Melee
199801090001    Explosives
199801090002    Explosives
199801100001         Melee
199801100002    Explosives
199801100003      Firearms
199801110001    Explosives
199801110002      Firearms
199801110003      Firearms
199801110004    Explosives
199801120001    Explosives
199801120002    Explosives
199801120003    Explosives
199801130001    Explosives
199801130002         Melee
199801130003    Explosives
199801140001    Explosives
199801140002    Explosives
199801150001    Explosives
199801170001      Fi

In [9]:
# Fixed seed so the train/test partition is reproducible
seed = 1009

# Feature matrix: one-hot encode the selected predictors, dropping the first
# level of each encoded attribute
feature_columns = ['country_txt', 'region_txt', 'attacktype1_txt', 'nkill']
X = pd.get_dummies(gtd_df[feature_columns], drop_first=True)

# Target labels
y = gtd_df['weaptype1_txt']

# Hold out 20% of the rows for testing, stratified on the label
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=seed,
    stratify=y,
)

# MLP
分类模型还是用多层感知机

特征向量：['country_txt', 'region_txt', 'attacktype1_txt', 'nkill']

标签:['weaptype1_txt']

In [11]:
# Multi-layer perceptron with three hidden layers (80, 160 and 250 units),
# trained with the Adam solver and regularisation strength alpha = 1e-5.
# verbose=True prints the training loss after every iteration and
# random_state fixes the model's random number generation.
clf = MLPClassifier(
    solver='adam',
    alpha=1e-5,
    hidden_layer_sizes=(80, 160, 250),
    random_state=1,
    verbose=True,
)

# Fit on the training split; the fitted estimator is the cell's output
clf.fit(X_train, y_train)

Iteration 1, loss = 0.44258105
Iteration 2, loss = 0.30393045
Iteration 3, loss = 0.29372095
Iteration 4, loss = 0.28921724
Iteration 5, loss = 0.28594859
Iteration 6, loss = 0.28453175
Iteration 7, loss = 0.28262204
Iteration 8, loss = 0.28242836
Iteration 9, loss = 0.28066576
Iteration 10, loss = 0.28050794
Iteration 11, loss = 0.27790240
Iteration 12, loss = 0.27735156
Iteration 13, loss = 0.27617326
Iteration 14, loss = 0.27478236
Iteration 15, loss = 0.27528130
Iteration 16, loss = 0.27388115
Iteration 17, loss = 0.27307668
Iteration 18, loss = 0.27223070
Iteration 19, loss = 0.27230448
Iteration 20, loss = 0.27193139
Iteration 21, loss = 0.27149546
Iteration 22, loss = 0.27095504
Iteration 23, loss = 0.27040042
Iteration 24, loss = 0.27020801
Iteration 25, loss = 0.26966740
Iteration 26, loss = 0.27003166
Iteration 27, loss = 0.26831658
Iteration 28, loss = 0.26838990
Iteration 29, loss = 0.26836830
Iteration 30, loss = 0.26824681
Training loss did not improve more than tol=0.000

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(80, 160, 250), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=True,
       warm_start=False)

# 评价模型

准确率:90%

In [12]:
# Predict a label for every row of the held-out test split
y_pred = clf.predict(X_test)

In [17]:
# Per-class precision, recall, F1-score and support on the test split
print(classification_report(y_test, y_pred))

                          precision    recall  f1-score   support

              Biological       1.00      0.20      0.33         5
                Chemical       0.92      0.27      0.42        44
              Explosives       0.98      0.95      0.97     12740
            Fake Weapons       0.00      0.00      0.00         2
                Firearms       0.81      0.92      0.86      6605
              Incendiary       0.79      0.81      0.80      1080
                   Melee       0.55      0.22      0.31       443
                   Other       0.33      0.12      0.17        17
            Radiological       1.00      1.00      1.00         2
      Sabotage Equipment       0.00      0.00      0.00        20
                 Unknown       0.80      0.74      0.76      1670
Vehicle (non-explosives)       0.79      0.50      0.61        22

             avg / total       0.90      0.90      0.90     22650



  'precision', 'predicted', average, warn_for)


### References

Bowne-Anderson, H. (n.d.). Measuring model performance. Retrieved from https://campus.datacamp.com/courses/supervised-learning-with-scikit-learn/classification?ex=9