In [None]:
# In this notebook we use the summary (a description of the incident that happened) to predict its information source.

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn import metrics

In [2]:
# Load the incident dataset and preview its first rows.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs as-is on a machine where this exact file location exists.
csv_path = 'C:/Users/Aditi/Desktop/crypto/InfoSrc.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0.1,Unnamed: 0,Description.of.incident,Information.Source
0,1,The company's website was breached sometime ar...,California Attorney General
1,2,"The December 29, 2011 theft of a laptop from a...",California Attorney General
2,3,BDO was contracted by Rubio's to perform finan...,California Attorney General
3,4,"On February 10, 2012, DHI Mortgage became awar...",California Attorney General
4,5,An office burglary on New Year's Eve 2011 resu...,California Attorney General


In [3]:
# Class distribution of the target: number of rows per information source.
source_counts = df['Information.Source'].value_counts()
source_counts

US Department of Health and Human Services    2474
Dataloss DB                                   1757
Media                                         1063
Databreaches.net                               851
California Attorney General                    726
PHIPrivacy.net                                 552
Indiana Attorney General                       499
Maryland Attorney General                      311
Government Agency                              251
HHS via PHIPrivacy.net                         216
Security Breach Letter                         153
Vermont Attorney General                        41
Krebs On Security                               41
NAID                                            14
Health IT Security                               5
HHS via Databreaches.net                         5
New Hampshire Attorney General                   2
Massachusetts Attorney General                   1
Name: Information.Source, dtype: int64

In [4]:
# Remove every row (axis=0) in which at least one column is null/NaN.
df = df.dropna(axis=0, how='any')

In [5]:
# Encode each information source as an integer class id.
# The position of a name in this list is its label (0-17).
source_names = [
    'US Department of Health and Human Services',
    'New Hampshire Attorney General',
    'Media',
    'NAID',
    'Dataloss DB',
    'Databreaches.net',
    'Vermont Attorney General',
    'Krebs On Security',
    'PHIPrivacy.net',
    'Security Breach Letter',
    'Indiana Attorney General',
    'California Attorney General',
    'Health IT Security',
    'HHS via Databreaches.net',
    'Massachusetts Attorney General',
    'HHS via PHIPrivacy.net',
    'Government Agency',
    'Maryland Attorney General',
]
source_to_label = {name: idx for idx, name in enumerate(source_names)}

# Any source name missing from the mapping would become NaN in `label`.
df["label"] = df['Information.Source'].map(source_to_label)

In [6]:
# Labeled dataset: preview the first rows, which now carry the integer
# `label` column next to the original `Information.Source` text.
df.head()

Unnamed: 0.1,Unnamed: 0,Description.of.incident,Information.Source,label
0,1,The company's website was breached sometime ar...,California Attorney General,11
1,2,"The December 29, 2011 theft of a laptop from a...",California Attorney General,11
2,3,BDO was contracted by Rubio's to perform finan...,California Attorney General,11
3,4,"On February 10, 2012, DHI Mortgage became awar...",California Attorney General,11
4,5,An office burglary on New Year's Eve 2011 resu...,California Attorney General,11


In [7]:
# Feature: free-text incident description. Target: integer-encoded source.
x, y = df['Description.of.incident'], df['label']

# Both should have the same number of rows.
for series in (x, y):
    print(series.shape)

(8961,)
(8961,)


In [8]:
# Split the whole dataset into training (75%) and testing (25%) parts.
# The fixed random_state keeps the split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=1
)

# Report the size of each part, in the order they were returned.
for part in (X_train, X_test, y_train, y_test):
    print(part.shape)

(6720,)
(2241,)
(6720,)
(2241,)


In [9]:
# Vectorization using CountVectorizer.
# The vectorizer is fitted on the training text only; `Vec` is reused later
# to encode the test text, and `X_train_counts` feeds both TF-IDF and Model 2.
Vec = CountVectorizer()
X_train_counts = Vec.fit_transform(X_train)

In [10]:
# Inspect the sparse document-term matrix built from the training text.
X_train_counts

<6720x12947 sparse matrix of type '<class 'numpy.int64'>'
	with 252307 stored elements in Compressed Sparse Row format>

In [11]:
# (number of training documents, number of vocabulary terms)
X_train_counts.shape

(6720, 12947)

In [12]:
# Re-weight the raw training counts with TF-IDF.
# The transformer is fitted on the training counts, then applied to them.
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)
X_train_tfidf.shape

(6720, 12947)

In [13]:
# Model1
# Naive Bayes fitted directly on the TF-IDF features computed above.
# NOTE(review): `mod1` is not used by any later cell visible here; the
# evaluation goes through the `text_mod1` pipeline. Kept in case it is used elsewhere.
mod1 = MultinomialNB().fit(X_train_tfidf, y_train)

# The same three stages (counts -> TF-IDF -> Naive Bayes) chained into one
# estimator, so raw text can be passed straight to fit/predict.
# `Pipeline` is already imported in the imports cell at the top of the notebook.
text_mod1 = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('mod1', MultinomialNB()),
])

In [16]:
# Fit the whole Model 1 pipeline on the raw training text and report how long it takes.
%time text_mod1 = text_mod1.fit(X_train, y_train)

Wall time: 1.19 s


In [28]:
# Evaluate Model 1 (the TF-IDF pipeline) on the held-out test set.
y_pred1 = text_mod1.predict(X_test)

# Score the predictions already computed above instead of predicting a second
# time via `score`, and label the result as Model 1 (the message previously
# said "Model 2", a copy-paste slip).
print("Accuracy of MultinomialNB Model 1 is : ", metrics.accuracy_score(y_test, y_pred1) * 100)

Accuracy of MultinomialNB Model 2 is :  66.62204373047746


In [29]:
# Predicted integer class labels for the test set from the Model 1 pipeline.
y_pred1

array([10,  0,  0, ..., 17,  4,  0], dtype=int64)

In [19]:
# Model2
# Multinomial Naive Bayes trained directly on the raw token counts
# (no TF-IDF step); the fit is timed for comparison with Model 1.
mod2 = MultinomialNB()
%time mod2.fit( X_train_counts, y_train)

Wall time: 35 ms


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [20]:
# Encode the test text with the vectorizer that was fitted on the training
# text (transform only, no refit).
X_test_counts = Vec.transform(X_test)
X_test_counts

<2241x12947 sparse matrix of type '<class 'numpy.int64'>'
	with 82179 stored elements in Compressed Sparse Row format>

In [30]:
# Predict test-set labels with Model 2 from the count features.
y_pred2 = mod2.predict(X_test_counts)

In [31]:
# Share of test descriptions whose source Model 2 predicted correctly, in percent.
accuracy_mod2 = metrics.accuracy_score(y_test, y_pred2) * 100
print("Accuracy of MultinomialNB Model 2 is : ", accuracy_mod2)

Accuracy of MultinomialNB Model 2 is :  74.8326639892905


In [None]:
# Conclusion : Model 2 is much faster and has better accuracy than Model 1.
# Note : Accuracy is quite low because of the imbalanced nature of the dataset. I tried to balance it, but doing so
# ran into a memory usage issue.