In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report,plot_confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler

# Loading Dataset 

In [None]:
# Read the BitcoinHeist CSV (path is relative to the notebook's working directory).
input_data = pd.read_csv(filepath_or_buffer="data/BitcoinHeistData.csv")

In [None]:
# Preview the first rows. A bare last expression is rendered by the notebook as
# a rich table, which is easier to read than the plain-text output of print().
input_data.head()

# Preprocessing

#### Checking for Duplicates

In [None]:
# Count fully duplicated rows. Series.sum() adds up the boolean flags in
# vectorised code instead of iterating the Series element by element with the
# builtin sum(); the printed count is the same.
print('No of duplicates in the Input Data:', input_data.duplicated().sum())

#### Checking for NaN/null values

In [None]:
# Total number of missing cells across every column of the frame.
print('No of NaN/Null values in Input Data:', input_data.isna().to_numpy().sum())

#### Data Preparation

In [None]:
# Separate the target column from the rest of the frame:
#   X - every column except 'label'
#   Y - the 'label' column
X = input_data.drop(columns=['label'])
Y = input_data['label']

# Bare last expression so the notebook shows a rich table rather than print() text.
X.head()

##### Feature Subset Selection

In [None]:
# Feature subset selection: the 'address' attribute is considered irrelevant to
# the ransomware type, and keeping it in X would cause the model to overfit, so
# it is dropped together with the target column 'label'.
# X is rebuilt from input_data so this cell gives the same result on every run.
X = input_data.drop(columns=['address', 'label'])

# Bare last expression so the notebook shows a rich table rather than print() text.
X.head()

#### Label Encoding

In [None]:
# Transform the non-numerical class labels into integer codes with LabelEncoder.
# The encoder is fitted on input_data['label'] rather than on Y itself: the
# previous `Y = le.transform(Y)` form was self-referential, so re-running the
# cell fitted the encoder on already-encoded integers and le.classes_ lost the
# original label names. Reading from the source column keeps the cell idempotent.
le = preprocessing.LabelEncoder()
Y = le.fit_transform(input_data['label'])
print(le.classes_)

#### Normalising the data

In [None]:
# Scale each sample (row) of X to unit L2 norm; norm='l2' and axis=1 are the
# library defaults, spelled out here so the direction of the scaling is visible.
X_n = preprocessing.normalize(X, norm='l2', axis=1)


#### Feature Scaling

In [None]:
# Min-max scaling: fit a MinMaxScaler on the normalised features and rescale
# them column by column. MinMaxScaler is imported in the notebook's import cell.
# (sklearn.preprocessing.StandardScaler is a possible alternative scaler.)
# NOTE(review): the scaler is fitted on the whole dataset before the train/test
# split, so statistics of the test rows leak into the scaled training data.
# Consider fitting on the training partition only.
scaler1 = MinMaxScaler()
X_mm = scaler1.fit_transform(X_n)

# Training the model

### Using Random train and test subsets


In [None]:
# Hold out 20% of the scaled samples as a test set; random_state is fixed so the
# split is reproducible. train_test_split is imported in the notebook's import cell.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_mm, Y, test_size=0.2, random_state=42
)

In [None]:
# Fit a 3-nearest-neighbours classifier on the hold-out training partition.
# KNeighborsClassifier is imported in the notebook's import cell.
clf1 = KNeighborsClassifier(n_neighbors=3)
clf1 = clf1.fit(X_train, Y_train)

In [None]:
# Predict on both partitions of the hold-out split and compute the accuracy of
# each before printing the report.
Y_train_pred = clf1.predict(X_train)
Y_test_pred = clf1.predict(X_test)
holdout_train_accuracy = accuracy_score(Y_train, Y_train_pred)
holdout_test_accuracy = accuracy_score(Y_test, Y_test_pred)

print('----------------------------------------------------------------------------------------------------\n')
print('Accuracy Score on Training Data:', holdout_train_accuracy)
print('\n\n------------------------------------------------------------------------------------------------\n')
print('Accuracy Score on Test Data:', holdout_test_accuracy)
print('\n------------------------------------------------------------------------------------------------')

### Using K-fold cross validation

In [None]:
# Manual k-fold cross-validation of a 3-nearest-neighbours classifier.
# Folds are contiguous, unshuffled slices of X_mm / Y in their existing order.
# NOTE(review): if the rows of the CSV are grouped by label, unshuffled folds
# will be class-skewed; confirm the row order or shuffle with a fixed seed.
k = 5
n_samples = len(X_mm)
n = n_samples // k  # nominal fold size
train_accuracy_scores = []
test_accuracy_scores = []

# Array views let each fold be selected with one mask instead of appending
# rows one at a time in a Python loop.
X_all = np.asarray(X_mm)
Y_all = np.asarray(Y)

for i in range(k):
    # Train-test split: fold i is the held-out slice. The last fold also takes
    # the n_samples % k leftover rows, so every sample is tested exactly once
    # (previously those trailing rows were only ever used for training).
    start = n * i
    stop = n * (i + 1) if i < k - 1 else n_samples
    is_test = np.zeros(n_samples, dtype=bool)
    is_test[start:stop] = True

    X_test, Y_test = X_all[is_test], Y_all[is_test]
    X_train, Y_train = X_all[~is_test], Y_all[~is_test]

    # Model training
    clf2 = KNeighborsClassifier(n_neighbors=3)
    clf2 = clf2.fit(X_train, Y_train)

    # Accuracy calculation on the training folds and on the held-out fold
    Y_train_pred = clf2.predict(X_train)
    Y_test_pred = clf2.predict(X_test)
    train_accuracy_scores.append(accuracy_score(Y_train, Y_train_pred))
    test_accuracy_scores.append(accuracy_score(Y_test, Y_test_pred))

In [None]:
# Average the per-fold accuracies collected by the cross-validation loop, then
# print the report.
cv_train_accuracy = np.mean(train_accuracy_scores)
cv_test_accuracy = np.mean(test_accuracy_scores)

print('----------------------------------------------------------------------------------------------------\n')
print('Accuracy Score on Training Data:', cv_train_accuracy)
print('\n\n------------------------------------------------------------------------------------------------\n')
print('Accuracy Score on Test Data:', cv_test_accuracy)
print('\n------------------------------------------------------------------------------------------------')