In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn import tree
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report,plot_confusion_matrix

# Loading Dataset 

In [2]:
# Read the BitcoinHeist CSV into a DataFrame (path is relative to the notebook's working directory).
input_data = pd.read_csv(
    "data/BitcoinHeistData.csv",
)

In [3]:
# Plain-text preview of the first five rows (5 is head()'s default row count).
print(input_data.head(5))

                              address  year  day  length    weight  count  \
0   111K8kZAEnJg245r2cM6y9zgJGHZtJPy6  2017   11      18  0.008333      1   
1  1123pJv8jzeFQaCV4w644pzQJzVWay2zcA  2016  132      44  0.000244      1   
2  112536im7hy6wtKbpH1qYDWtTyMRAcA2p7  2016  246       0  1.000000      1   
3  1126eDRw2wqSkWosjTCre8cjjQW8sSeWH7  2016  322      72  0.003906      1   
4  1129TSjKtx65E35GiUo4AYVeyo48twbrGX  2016  238     144  0.072848    456   

   looped  neighbors       income            label  
0       0          2  100050000.0  princetonCerber  
1       0          1  100000000.0   princetonLocky  
2       0          2  200000000.0  princetonCerber  
3       0          2   71200000.0  princetonCerber  
4       0          1  200000000.0   princetonLocky  


# Preprocessing

#### Checking for Duplicates

In [4]:
# duplicated() flags every row that repeats an earlier one; summing the flags counts them.
print('No of duplicates in the Input Data:', input_data.duplicated().sum())

No of duplicates in the Input Data: 0


#### Checking for NaN/null values

In [5]:
# Count missing cells: per-column totals first, then the grand total over all columns.
print('No of NaN/Null values in Input Data:', input_data.isna().sum().sum())

No of NaN/Null values in Input Data: 0


#### Data Preparation

In [6]:
# Target: the ransomware-family name stored in the 'label' column.
Y = input_data['label']
# Predictors: every remaining column.
X = input_data.drop(columns=['label'])

print(X.head(5))

                              address  year  day  length    weight  count  \
0   111K8kZAEnJg245r2cM6y9zgJGHZtJPy6  2017   11      18  0.008333      1   
1  1123pJv8jzeFQaCV4w644pzQJzVWay2zcA  2016  132      44  0.000244      1   
2  112536im7hy6wtKbpH1qYDWtTyMRAcA2p7  2016  246       0  1.000000      1   
3  1126eDRw2wqSkWosjTCre8cjjQW8sSeWH7  2016  322      72  0.003906      1   
4  1129TSjKtx65E35GiUo4AYVeyo48twbrGX  2016  238     144  0.072848    456   

   looped  neighbors       income  
0       0          2  100050000.0  
1       0          1  100000000.0  
2       0          2  200000000.0  
3       0          2   71200000.0  
4       0          1  200000000.0  


##### Feature Subset Selection

In [7]:
# The wallet address is treated as irrelevant to the type of ransomware, and keeping
# it in X would encourage overfitting, so it is dropped together with the label.
X = input_data.drop(columns=['address', 'label'])

print(X.head(5))

   year  day  length    weight  count  looped  neighbors       income
0  2017   11      18  0.008333      1       0          2  100050000.0
1  2016  132      44  0.000244      1       0          1  100000000.0
2  2016  246       0  1.000000      1       0          2  200000000.0
3  2016  322      72  0.003906      1       0          2   71200000.0
4  2016  238     144  0.072848    456       0          1  200000000.0


#### Label Encoding

In [8]:
# Transform the non-numerical ransomware-family labels into integer class ids.
# The encoder is fitted on the raw 'label' column rather than on Y itself, so this
# cell is safe to re-run: fitting on Y would, on a second execution, re-encode the
# already-encoded integers and le.classes_ would no longer hold the family names.
le = preprocessing.LabelEncoder()
Y = le.fit_transform(input_data['label'])
# classes_[i] is the original label that was mapped to integer i.
print(le.classes_)

['montrealAPT' 'montrealComradeCircle' 'montrealCryptConsole'
 'montrealCryptXXX' 'montrealCryptoLocker' 'montrealCryptoTorLocker2015'
 'montrealDMALocker' 'montrealDMALockerv3' 'montrealEDA2' 'montrealFlyper'
 'montrealGlobe' 'montrealGlobeImposter' 'montrealGlobev3'
 'montrealJigSaw' 'montrealNoobCrypt' 'montrealRazy' 'montrealSam'
 'montrealSamSam' 'montrealVenusLocker' 'montrealWannaCry'
 'montrealXLocker' 'montrealXLockerv5.0' 'montrealXTPLocker'
 'paduaCryptoWall' 'paduaJigsaw' 'paduaKeRanger' 'princetonCerber'
 'princetonLocky' 'white']


#### Normalising the data

In [9]:
# Rescale each row (sample) of X to unit L2 norm. norm='l2' and axis=1 are the
# function's defaults, written out so the row-wise direction is explicit.
# NOTE(review): row-wise scaling mixes columns of very different magnitude (income is
# ~1e8 in the preview above) - confirm per-sample normalisation is what is intended.
X_n = preprocessing.normalize(X, norm='l2', axis=1)


#### Feature Scaling

In [10]:
# MinMaxScaler: rescale every column of the normalised matrix with the scaler's default range.
# NOTE(review): the scaler is fitted on all rows before the train/test split further
# down, so held-out rows influence the scaling - consider fitting on training rows only.
from sklearn.preprocessing import MinMaxScaler
scaler1 = MinMaxScaler()
X_mm = scaler1.fit_transform(X_n)

# Standard Scaler (alternative, currently disabled)
# from sklearn.preprocessing import StandardScaler
# scaler2 = StandardScaler().fit(X_n)
# X_st = scaler2.transform(X_n)

# Training the model

### Using Random train and test subsets


In [11]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_mm,
    Y,
    test_size=0.2,
    random_state=42,
)

In [12]:
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial Naive Bayes classifier on the training split; fit() returns the
# estimator itself, so construction and fitting can be chained.
clf1 = MultinomialNB().fit(X_train, Y_train)

In [13]:
# Predict on both splits so training and held-out accuracy can be compared side by side.
Y_train_pred = clf1.predict(X_train)
Y_test_pred = clf1.predict(X_test)

# Report: divider, training accuracy, divider, test accuracy, divider.
print('----------------------------------------------------------------------------------------------------\n')
print(
    'Accuracy Score on Training Data:',
    accuracy_score(y_true=Y_train, y_pred=Y_train_pred),
)
print('\n\n------------------------------------------------------------------------------------------------\n')
print(
    'Accuracy Score on Test Data:',
    accuracy_score(y_true=Y_test, y_pred=Y_test_pred),
)
print('\n------------------------------------------------------------------------------------------------')

----------------------------------------------------------------------------------------------------

Accuracy Score on Training Data: 0.9857244305093477


------------------------------------------------------------------------------------------------

Accuracy Score on Test Data: 0.9861093016079816

------------------------------------------------------------------------------------------------


### Using K-fold cross validation

In [14]:
# Manual k-fold cross-validation over contiguous, unshuffled blocks of rows.
# Fold i holds out rows [n*i, n*(i+1)) and trains on all remaining rows.
# The folds are built with array slicing instead of a per-row Python loop; the fold
# boundaries (and therefore the scores) are the same, but each fold is assembled in
# a couple of vectorised operations rather than one list append per row.
k = 5
# Rows per test fold. Because of the floor division, the trailing len(X_mm) % k rows
# never land in a test fold - they are part of the training data in every iteration.
n = len(X_mm) // k
train_accuracy_scores = []
test_accuracy_scores = []
# NOTE(review): rows are taken in file order without shuffling or stratification; if
# the CSV is grouped by label the folds will have skewed class mixes - confirm, and
# consider a shuffled/stratified splitter if so.
for i in range(k):

    # Train-test split
    start, stop = n * i, n * (i + 1)
    X_test = X_mm[start:stop]
    Y_test = Y[start:stop]
    X_train = np.concatenate((X_mm[:start], X_mm[stop:]))
    Y_train = np.concatenate((Y[:start], Y[stop:]))

    # Model training: a fresh classifier per fold so nothing carries over between folds
    clf2 = MultinomialNB()
    clf2 = clf2.fit(X_train, Y_train)

    # Accuracy calculation on both the training rows and the held-out fold
    Y_train_pred = clf2.predict(X_train)
    Y_test_pred = clf2.predict(X_test)
    train_accuracy_scores.append(accuracy_score(Y_train, Y_train_pred))
    test_accuracy_scores.append(accuracy_score(Y_test, Y_test_pred))

In [15]:
# Report the accuracy averaged over the k folds, using the same layout as the
# single-split report: divider, training mean, divider, test mean, divider.
print('----------------------------------------------------------------------------------------------------\n')
print(
    'Accuracy Score on Training Data:',
    np.asarray(train_accuracy_scores).mean(),
)
print('\n\n------------------------------------------------------------------------------------------------\n')
print(
    'Accuracy Score on Test Data:',
    np.asarray(test_accuracy_scores).mean(),
)
print('\n------------------------------------------------------------------------------------------------')

----------------------------------------------------------------------------------------------------

Accuracy Score on Training Data: 0.9858014072422663


------------------------------------------------------------------------------------------------

Accuracy Score on Test Data: 0.9858013950721622

------------------------------------------------------------------------------------------------
