# Finding Method of Databreach from various incidents description using Various Machine Learning Techniques

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
import nltk
from nltk.tokenize import RegexpTokenizer


In [72]:
df = pd.read_csv('/home/samroadie/Desktop/CRYPTO_PROJECT/Data_Breaches.csv')
df.head()

Unnamed: 0,Entity,Story,Year,Records Lost,Sector,Method of Leak
0,River City Media,A dodgy backup has allegedly resulted in over ...,2017,1,,Web
1,Unique Identification Authority of India,A report says that full data base has been exp...,2017,1000000000,Government,Poor security
2,Spambot,A misconfigured spambot has leaked over 700m r...,2017,7,,Web
3,Friend Finder Network,"Usernames, email addresses, passwords for site...",2016,4,,Web
4,Equifax,"If you have a credit report, there’s a good ch...",2017,1,,Financial


In [73]:
df['Method of Leak'].value_counts()

Hacked                           136
Lost / stolen device or media     46
Web                               21
Inside job                        18
Accidentally published            16
Poor security                     15
Financial                          4
Gaming                             4
Government                         3
Tech                               2
Retail                             2
Healthcare                         1
App                                1
Telecoms                           1
Name: Method of Leak, dtype: int64

In [74]:
df = df.dropna()

In [127]:
df.head()

Unnamed: 0,Entity,Story,Year,Records Lost,Sector,Method of Leak,label
1,Unique Identification Authority of India,A report says that full data base has been exp...,2017,1000000000,Government,Poor security,4.0
5,Dailymotion,"85.2m email addresses extracted, but only 18.3...",2016,85200000,Web,Hacked,0.0
6,Malaysian telcos & MVNOs,Oct. Data from numerous Malaysian telco & MVNO...,2014,46200000,Telecoms,Hacked,0.0
9,Al.type,Dec. The app's developer failed to secure the ...,2017,31293959,App,Poor security,4.0
12,Interpark,July. South Korean police are blaming North Ko...,2016,10000000,Web,Hacked,0.0


In [83]:
df['Method of Leak'].value_counts()

Hacked                           118
Lost / stolen device or media     42
Inside job                        16
Accidentally published            14
Poor security                     13
Name: Method of Leak, dtype: int64

In [84]:
df['Sector'].value_counts()

Web           51
Government    41
Healthcare    32
Financial     24
Telecoms      10
Retail         9
Tech           8
App            6
Academia       6
Gaming         5
Energy         3
Transport      3
Media          3
Legal          1
Military       1
Name: Sector, dtype: int64

In [85]:
df["label"] = df['Method of Leak'].map({'Hacked':0,     
'Lost / stolen device or media':1,
'Inside job ' : 2,
'Accidentally published' : 3,
'Poor security' : 4 })

In [112]:
df['label']

1      4.0
5      0.0
6      0.0
9      4.0
12     0.0
      ... 
262    0.0
264    1.0
265    0.0
266    1.0
267    1.0
Name: label, Length: 203, dtype: float64

In [121]:
dfmodel = pd.DataFrame()

In [122]:
dfmodel['Story'] = df['Story']

In [123]:
dfmodel['label'] = df['label']

In [125]:
dfmodel = dfmodel.dropna()

In [132]:
dfmodel

Unnamed: 0,Story,label
1,A report says that full data base has been exp...,4.0
5,"85.2m email addresses extracted, but only 18.3...",0.0
6,Oct. Data from numerous Malaysian telco & MVNO...,0.0
9,Dec. The app's developer failed to secure the ...,4.0
12,July. South Korean police are blaming North Ko...,0.0
...,...,...
262,Press report: Tokyo police have arrested two m...,0.0
264,Laptop lost/stolen containing employee data: n...,1.0
265,CardSystems was fingered by MasterCard after i...,0.0
266,Blame the messenger! A box of computer tapes c...,1.0


In [133]:
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(lowercase=True,stop_words='english',ngram_range = (1,1),tokenizer = token.tokenize)
text_counts= cv.fit_transform(dfmodel['Story'])

In [142]:
X_train, X_test, y_train, y_test = train_test_split(
    text_counts, dfmodel['label'], test_size=0.2, random_state=1)

# Using Multinomial Naive Bayes classifier 

In [148]:
Model1 = MultinomialNB().fit(X_train, y_train)
md1predicted= Model1.predict(X_test)


In [149]:
print("MultinomialNB Accuracy:",metrics.accuracy_score(y_test, md1predicted))

MultinomialNB Accuracy: 0.7894736842105263


In [150]:
print("confusion_matrix",confusion_matrix(y_test,md1predicted))

confusion_matrix [[20  2  1  3]
 [ 0  8  0  0]
 [ 2  0  2  0]
 [ 0  0  0  0]]


In [159]:
print("classification_report")
print(classification_report(y_test,md1predicted))

classification_report
              precision    recall  f1-score   support

         0.0       0.91      0.77      0.83        26
         1.0       0.80      1.00      0.89         8
         3.0       0.67      0.50      0.57         4
         4.0       0.00      0.00      0.00         0

    accuracy                           0.79        38
   macro avg       0.59      0.57      0.57        38
weighted avg       0.86      0.79      0.82        38



  _warn_prf(average, modifier, msg_start, len(result))


# Using Random Forest

In [152]:
model2 = RandomForestClassifier(n_estimators=1000, random_state=0)
model2.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [153]:
md2predicted = model2.predict(X_test)

In [155]:
print("Random Forest Accuracy",metrics.accuracy_score(y_test, md2predicted))

Random Forest Accuracy 0.868421052631579


In [156]:
print("confusion_matrix",confusion_matrix(y_test,md2predicted))

confusion_matrix [[26  0  0]
 [ 1  7  0]
 [ 4  0  0]]


In [158]:
print("classification_report")
print(classification_report(y_test,md2predicted))

classification_report
              precision    recall  f1-score   support

         0.0       0.84      1.00      0.91        26
         1.0       1.00      0.88      0.93         8
         3.0       0.00      0.00      0.00         4

    accuracy                           0.87        38
   macro avg       0.61      0.62      0.62        38
weighted avg       0.78      0.87      0.82        38

