# Classifying Threats with Text Classification
This notebook demonstrates how to classify threats using natural language processing techniques.


## Getting started

First, import the packages we'll need:

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from itertools import chain

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

Set global plotting options:

In [None]:
# Notebook-wide plotting defaults: white-grid style, fonts scaled by 1.4,
# the ten-colour 'tab10' palette and a 12 x 7.5 default figure size.
sns.set(
    context='notebook',
    style='whitegrid',
    palette=sns.color_palette('tab10', 10),
    font_scale=1.4,
    color_codes=False,
    rc={'figure.figsize': (12, 7.5)},
)

## Data ETL

#### Load the updated NC4 data and remove the blank lines between rows:

In [None]:
# Read the raw NC4 export; `skip_blank_lines=True` makes the parser ignore the
# empty lines found between rows of the file.
nc4 = pd.read_csv(
    '../../datasets/NC4/NC4_update_2019-09-04.csv',
    encoding='latin',
    skip_blank_lines=True,
)

# Keep only the `gist` text and the `type` category, dropping rows in which
# both are missing. A new frame is built here instead of calling
# `dropna(inplace=True)` on a column selection of `nc4`: the in-place form
# mutates a derived slice, which raises pandas' SettingWithCopyWarning.
nc4_mod = nc4[['gist', 'type']].dropna(how='all')
nc4_mod.shape

#### Load the Excel mapping workbook for the NC4 database:

In [None]:
# Read the NC4 -> CCS mapping workbook and expose its 'NC4 Category' column
# under the name 'type', so it can serve as the join key with the NC4 data.
mapping = pd.read_excel('../../datasets/NC4/NC4_CCS_Mapping.xlsx').rename(
    columns={'NC4 Category': 'type'}
)

#### Merge the NC4 data with the mapping table on the NC4 `type` column:

In [None]:
# Inner-join every NC4 record to its mapping row through the shared 'type'
# column, then switch to the shorter column names used by the cells below.
threats = (
    nc4_mod
    .merge(mapping, on='type', how='inner')
    .rename(columns={
        'gist': 'Text',
        'Proposed CCS Category': 'Category',
        'CCS Subcategory': 'Subcategory',
    })
)
threats.shape

In [None]:
# Preview the first five rows of the merged table.
threats.head(n=5)

## Splitting the merged dataset into train and test sets

In [None]:
# Reproducible 80/20 split: a seeded random sample forms the training set and
# every remaining row forms the test set.
TRAIN_FRACTION = 0.8
SPLIT_SEED = 99

train = threats.sample(frac=TRAIN_FRACTION, random_state=SPLIT_SEED)

# `drop` returns an independent frame. The previous boolean `.loc` selection
# was flagged by pandas as a copy of a slice, so adding a column to `test`
# later in the notebook raised SettingWithCopyWarning.
test = threats.drop(index=train.index)

# The two parts must partition the merged table exactly.
assert len(train) + len(test) == len(threats)

## Text Classification Modeling
#### Creating vocabulary from training set:

In [None]:
# Build the vocabulary with the same analyzer that CountVectorizer applies to
# documents (lower-casing, word tokenisation, English stop-word removal).
# A plain `str.split()` keeps capitalised words and tokens with attached
# punctuation; the vectorizer never emits such tokens, so those vocabulary
# entries became all-zero feature columns that still entered the naive Bayes
# smoothing term.
analyzer = CountVectorizer(stop_words='english').build_analyzer()
vocab_docs = set(
    chain.from_iterable(analyzer(doc) for doc in train['Text'].unique())
)

#### Fitting a multinomial naive Bayes model (Bayes' theorem) on the training set:

In [None]:
# Three-stage classifier fitted on the training texts and their subcategories:
# token counts over the fixed training vocabulary -> TF-IDF weighting ->
# multinomial naive Bayes without learned class priors (fit_prior=False).
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer(stop_words='english', vocabulary=vocab_docs)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB(fit_prior=False)),
]).fit(train['Text'], train['Subcategory'])

In [None]:
# Score the baseline pipeline on the held-out rows: accuracy is the share of
# test texts whose predicted subcategory equals the true one.
predicted = text_clf.predict(test['Text'])
print("Accuracy is %s " % np.mean(predicted == test['Subcategory']))

#### Optimising parameters using grid search:

In [None]:
# Hyper-parameter grid, keyed as '<pipeline step>__<parameter>'.
# NOTE(review): the 'vect' step is constructed with a fixed `vocabulary`, so
# the (1, 2) n-gram setting presumably adds no features unless that
# vocabulary contains two-word terms - confirm.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
}

In [None]:
# Cross-validated search over `parameters` on the training set, using every
# available CPU core (n_jobs=-1).
gs_clf = GridSearchCV(
    estimator=text_clf,
    param_grid=parameters,
    n_jobs=-1,
).fit(train['Text'], train['Subcategory'])

In [None]:
# Score the grid-searched model on the same held-out rows as the baseline.
predicted = gs_clf.predict(test['Text'])
print("Modified accuracy is %s " % np.mean(predicted == test['Subcategory']))

In [None]:
# Show the best cross-validation score together with the parameters that
# achieved it. A notebook cell displays only its last expression, so listing
# the two attributes on separate lines silently discarded the score.
gs_clf.best_score_, gs_clf.best_params_

#### Analyzing false predictions:

In [None]:
# Attach the predictions as a new column. `assign` returns a new frame, which
# avoids the SettingWithCopyWarning raised when a column is written into a
# frame that pandas tracks as a slice of `threats`.
test = test.assign(**{'Predicted Subcategory': predicted})

# Rows whose predicted subcategory differs from the true one.
# NOTE: this rebinds `nc4`, which earlier held the raw NC4 table; the name is
# kept because the following cells read the misclassified rows from it.
nc4 = test[test['Predicted Subcategory'] != test['Subcategory']]

In [None]:
# Per-subcategory row counts: 'Test' counts all test rows, 'False' counts the
# misclassified ones. The left join keeps every test subcategory, leaving
# 'False' missing where no row of that subcategory was misclassified.
test_data = test.groupby('Subcategory').size().reset_index(name='Test')
false_data = nc4.groupby('Subcategory').size().reset_index(name='False')
false_threats = test_data.merge(false_data, on='Subcategory', how='left')

In [None]:
# One bar chart per count column ('Test' and 'False'), sharing the
# subcategory axis; the first panel's legend goes in the upper-left corner.
axes = (
    false_threats
    .set_index('Subcategory')
    .plot.bar(subplots=True, rot=90, figsize=(15, 10), fontsize=12)
)
axes[0].legend(loc='upper left')

In [None]:
# Subcategories with no misclassified test rows: the left join above left
# their 'False' count missing.
no_errors = false_threats['False'].isna()
Correct_predicted_subcategories = false_threats.loc[no_errors, 'Subcategory']
Correct_predicted_subcategories

In [None]:
# Print every misclassified test text with its predicted and true label.
# All test texts are predicted in a single batch call: the previous version
# ran the whole pipeline once per row and printed the prediction as a
# one-element array instead of the label itself.
batch_predictions = gs_clf.predict(test.Text)
for item, label, result in zip(test.Text, test.Subcategory, batch_predictions):
    if result != label:
        print("Text is %s predicted label is %s, but true label is %s" % (item, result, label))