In [24]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [25]:
# Load the dataset; the path is relative, so the CSV must sit in the notebook's working directory.
df = pd.read_csv('BitcoinHeistData.csv')

In [26]:
df.head()

Unnamed: 0,address,year,day,length,weight,count,looped,neighbors,income,label
0,111K8kZAEnJg245r2cM6y9zgJGHZtJPy6,2017,11,18,0.008333,1,0,2,100050000.0,princetonCerber
1,1123pJv8jzeFQaCV4w644pzQJzVWay2zcA,2016,132,44,0.000244,1,0,1,100000000.0,princetonLocky
2,112536im7hy6wtKbpH1qYDWtTyMRAcA2p7,2016,246,0,1.0,1,0,2,200000000.0,princetonCerber
3,1126eDRw2wqSkWosjTCre8cjjQW8sSeWH7,2016,322,72,0.003906,1,0,2,71200000.0,princetonCerber
4,1129TSjKtx65E35GiUo4AYVeyo48twbrGX,2016,238,144,0.072848,456,0,1,200000000.0,princetonLocky


In [27]:
df['address'].value_counts()

1LXrSb67EaH1LGc6d6kWHq8rgv4ZBQAcpU    420
16cVG72goMe4sNqZhnpmnqfCMZ1uSFbUit    261
12wQZTDmA8onM3sEt4jwcvzDxnNXxD8Vza    207
12YursV58dRT2c9iuZg3jEWfwgTDamBcnd    183
1LEq4WmpCrqBd7V3PywE2nvFUFC3QTe52x    176
                                     ... 
14m4NjEQjLKrcjtN3doN7TgNZi3nbvPnkL      1
1CJrNRSNJepexvLFt3wSKZkzrHRag2UMCA      1
1Fsi7R5115vXKcSmFEoDUqqmEW4oT2W5AV      1
1GTkpRYXAK71c5DP2V7irDmYtvmhS46h29      1
3LFFBxp15h9KSFtaw55np8eP5fv6kdK17e      1
Name: address, Length: 2631095, dtype: int64

In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2916697 entries, 0 to 2916696
Data columns (total 10 columns):
 #   Column     Dtype  
---  ------     -----  
 0   address    object 
 1   year       int64  
 2   day        int64  
 3   length     int64  
 4   weight     float64
 5   count      int64  
 6   looped     int64  
 7   neighbors  int64  
 8   income     float64
 9   label      object 
dtypes: float64(2), int64(6), object(2)
memory usage: 222.5+ MB


No null values are present in the dataset.

In [29]:
df.isnull().sum()

address      0
year         0
day          0
length       0
weight       0
count        0
looped       0
neighbors    0
income       0
label        0
dtype: int64

#### Encoding 'address' column into numerical column

In [30]:
# Replace every address string with an integer code so that all feature columns are numeric.
# The column is overwritten in place, so the original strings are only recoverable through `encoder`.
# NOTE(review): value_counts() reports 2,631,095 distinct addresses in 2,916,697 rows, so the
# code is close to a per-row identifier — confirm it is meant to be used as a model feature.
encoder = LabelEncoder()
df['address'] = encoder.fit_transform(df['address'])

In [31]:
df.head()

Unnamed: 0,address,year,day,length,weight,count,looped,neighbors,income,label
0,23,2017,11,18,0.008333,1,0,2,100050000.0,princetonCerber
1,128,2016,132,44,0.000244,1,0,1,100000000.0,princetonLocky
2,169,2016,246,0,1.0,1,0,2,200000000.0,princetonCerber
3,217,2016,322,72,0.003906,1,0,2,71200000.0,princetonCerber
4,293,2016,238,144,0.072848,456,0,1,200000000.0,princetonLocky


In [32]:
df['address'].value_counts()

1925732    420
481920     261
105390     207
65867      183
1895700    176
          ... 
292374       1
1074094      1
1445117      1
1506008      1
2580865      1
Name: address, Length: 2631095, dtype: int64

#### Scaling the Dataset

In [33]:
# Standardise every column except 'label', then re-attach the untouched label column.
# NOTE(review): the scaler is fitted on all rows, i.e. before any train/validation/test split.
scalar = StandardScaler().fit(df.drop('label', axis=1))
scaled_df = pd.DataFrame(
    data=scalar.transform(df.drop('label', axis=1)),
    columns=df.columns[:-1],
).assign(label=df['label'])

In [34]:
scaled_df

Unnamed: 0,address,year,day,length,weight,count,looped,neighbors,income,label
0,-1.729688,1.118540,-1.638825,-0.457910,-0.146203,-0.426499,-0.246819,-0.011525,-0.026830,princetonCerber
1,-1.729549,0.675552,-0.475496,-0.017100,-0.148404,-0.426499,-0.246819,-0.067333,-0.026830,princetonLocky
2,-1.729495,0.675552,0.620533,-0.763086,0.123693,-0.426499,-0.246819,-0.011525,-0.026215,princetonCerber
3,-1.729431,0.675552,1.351219,0.457618,-0.147408,-0.426499,-0.246819,-0.011525,-0.027007,princetonCerber
4,-1.729331,0.675552,0.543619,1.678322,-0.128644,-0.157216,-0.246819,-0.067333,-0.026215,princetonLocky
...,...,...,...,...,...,...,...,...,...,...
2916692,-1.688957,1.561528,1.428134,-0.763086,-0.118230,-0.426499,-0.246819,-0.067333,-0.019726,white
2916693,1.167364,1.561528,1.428134,-0.763086,0.123693,-0.426499,-0.246819,-0.067333,-0.027174,white
2916694,0.680718,1.561528,1.428134,-0.729177,3.117498,-0.423540,-0.240610,1.830120,-0.012703,white
2916695,-1.215344,1.561528,1.428134,-0.763086,-0.012389,-0.426499,-0.246819,-0.067333,-0.026350,white


#### Randomly shuffle the Dataset

In [35]:
# Shuffle all rows once. The fixed seed keeps the row order — and therefore the positional
# train/validation/test split and every accuracy reported below — repeatable after a
# kernel restart; without it each run produced a different split.
scaled_df = scaled_df.sample(frac=1, random_state=42)

In [36]:
scaled_df['label'].value_counts()

white                          2875284
paduaCryptoWall                  12390
montrealCryptoLocker              9315
princetonCerber                   9223
princetonLocky                    6625
montrealCryptXXX                  2419
montrealNoobCrypt                  483
montrealDMALockerv3                354
montrealDMALocker                  251
montrealSamSam                      62
montrealCryptoTorLocker2015         55
montrealGlobeImposter               55
montrealGlobev3                     34
montrealGlobe                       32
montrealWannaCry                    28
montrealRazy                        13
montrealAPT                         11
paduaKeRanger                       10
montrealFlyper                       9
montrealXTPLocker                    8
montrealVenusLocker                  7
montrealCryptConsole                 7
montrealXLockerv5.0                  7
montrealEDA2                         6
montrealJigSaw                       4
paduaJigsaw              

In [37]:
scaled_df

Unnamed: 0,address,year,day,length,weight,count,looped,neighbors,income,label
344957,-0.440514,-1.539389,1.274305,-0.559635,-0.113531,-0.404601,-0.246819,-0.011525,-0.003974,white
2329082,-1.601698,1.118540,-0.706239,-0.729177,0.123693,-0.426499,-0.246819,-0.011525,-0.026762,white
487126,-1.304006,-1.096400,-0.869682,-0.695269,-0.137286,-0.418213,-0.246819,-0.011525,-0.027058,white
1678181,0.792114,0.232564,0.053290,1.678322,-0.148110,1.337153,-0.246819,-0.011525,-0.027238,white
2807716,0.149866,1.561528,0.389790,-0.729177,0.123693,-0.426499,-0.246819,-0.011525,-0.027212,white
...,...,...,...,...,...,...,...,...,...,...
1669351,-1.365842,0.232564,-0.033239,-0.559635,-0.016641,-0.422356,-0.246819,-0.011525,-0.026398,white
460657,-1.450743,-1.096400,-1.129268,-0.763086,-0.012389,-0.426499,-0.246819,-0.067333,-0.026484,white
1256074,0.691980,-0.210424,-0.494725,1.678322,-0.145962,-0.015177,-0.246819,-0.011525,-0.024453,white
163841,1.252934,-1.539389,-0.465882,-0.729177,-0.147162,-0.426499,-0.246819,-0.011525,-0.026215,white


# Q3 Part-1

In [38]:
def train_test_validation_split(dataset: pd.DataFrame, size):
    """Split ``dataset`` by row position into train, validation and test parts.

    Rows are taken in their current order; no shuffling happens here, so the
    caller must shuffle beforehand if a random split is wanted.

    Parameters
    ----------
    dataset : pd.DataFrame
        Frame holding the feature columns plus a ``'label'`` column.
    size : sequence of numbers
        Percentages ``[train, validation, test]``, e.g. ``[70, 15, 15]``.
        Only the first two place the cut points: the test part always
        receives every remaining row, so no row is lost to rounding. The
        third entry is accepted for readability but not required.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_valid, y_valid, x_test, y_test)``. Each ``x_*``
        is the slice without ``'label'``; each ``y_*`` is the ``'label'``
        column of the same rows.
    """
    n = dataset.shape[0]
    train_end = round((size[0] / 100) * n)
    valid_end = train_end + round((size[1] / 100) * n)

    def _features_and_labels(start, stop):
        # Slice once, then separate the label column from the features.
        rows = dataset.iloc[start:stop]
        return rows.drop('label', axis=1), rows['label']

    x_train, y_train = _features_and_labels(0, train_end)
    x_valid, y_valid = _features_and_labels(train_end, valid_end)
    x_test, y_test = _features_and_labels(valid_end, None)
    return x_train, y_train, x_valid, y_valid, x_test, y_test
    

In [39]:
# 70 % of the shuffled rows for training, the next 15 % for validation, the remaining rows for testing.
x_train, y_train, x_valid, y_valid, x_test, y_test = train_test_validation_split(scaled_df, [70,15,15])

In [40]:
def DecisionTreeAlgorithm(split_criteria, x_train, y_train, x_valid, y_valid, x_test, y_test):
    """Fit one decision tree per depth and print its validation and test accuracy.

    For each maximum depth in 4, 8, 10, 15 and 20 a ``DecisionTreeClassifier``
    using ``split_criteria`` as its criterion is fitted on the training split
    and then scored on the validation split and the test split. Accuracy is
    the fraction of predictions equal to the true label. Nothing is returned;
    all results are printed.
    """
    def _fraction_correct(model, features, labels):
        # Share of rows whose predicted label matches the true label.
        return (model.predict(features) == labels).sum() / labels.shape[0]

    print(f"Training, Validating & Testing the Decision Tree with '{split_criteria}' criteria...")
    print("Accuracy of Decision Tree classifers with various depth is shown below.\n")

    for depth in (4, 8, 10, 15, 20):
        classifier = DecisionTreeClassifier(criterion=split_criteria, max_depth=depth)
        classifier.fit(x_train, y_train)

        # Validation split first, then the test split, one line each.
        valid_accuracy = _fraction_correct(classifier, x_valid, y_valid)
        print(f"Accuracy with 'Depth = {depth}' on Validation set is: {valid_accuracy}")

        test_accuracy = _fraction_correct(classifier, x_test, y_test)
        print(f"Accuracy with 'Depth = {depth}' on Testing set is: {test_accuracy} \n")

### Training Decision trees using the Gini index (for various depths)

In [41]:
DecisionTreeAlgorithm(split_criteria='gini', x_train=x_train, y_train=y_train,
                    x_valid=x_valid, y_valid=y_valid, x_test=x_test, y_test=y_test)

Training, Validating & Testing the Decision Tree with 'gini' criteria...
Accuracy of Decision Tree classifers with various depth is shown below.

Accuracy with 'Depth = 4' on Validation set is: 0.9857967337516143
Accuracy with 'Depth = 4' on Testing set is: 0.9859338428905793 

Accuracy with 'Depth = 8' on Validation set is: 0.9862835853304533
Accuracy with 'Depth = 8' on Testing set is: 0.986370410327677 

Accuracy with 'Depth = 10' on Validation set is: 0.9868161506725638
Accuracy with 'Depth = 10' on Testing set is: 0.9868869770333528 

Accuracy with 'Depth = 15' on Validation set is: 0.988612701569125
Accuracy with 'Depth = 15' on Testing set is: 0.9887178174370977 

Accuracy with 'Depth = 20' on Validation set is: 0.9873715728963097
Accuracy with 'Depth = 20' on Testing set is: 0.9873144016968988 



### Training Decision trees using the Entropy (for various depths)

In [42]:
DecisionTreeAlgorithm(split_criteria='entropy', x_train=x_train, y_train=y_train,
                    x_valid=x_valid, y_valid=y_valid, x_test=x_test, y_test=y_test)

Training, Validating & Testing the Decision Tree with 'entropy' criteria...
Accuracy of Decision Tree classifers with various depth is shown below.

Accuracy with 'Depth = 4' on Validation set is: 0.9857350201711981
Accuracy with 'Depth = 4' on Testing set is: 0.9858927004095962 

Accuracy with 'Depth = 8' on Validation set is: 0.9861190157826768
Accuracy with 'Depth = 8' on Testing set is: 0.9862629827384435 

Accuracy with 'Depth = 10' on Validation set is: 0.9871635752734255
Accuracy with 'Depth = 10' on Testing set is: 0.9873235444704506 

Accuracy with 'Depth = 15' on Validation set is: 0.9887269859773031
Accuracy with 'Depth = 15' on Testing set is: 0.9887612456114687 

Accuracy with 'Depth = 20' on Validation set is: 0.9875475708849042
Accuracy with 'Depth = 20' on Testing set is: 0.987520114101814 



The highest accuracy is observed for the decision tree with depth 15 and the "entropy" criterion (validation ≈ 0.98873, test ≈ 0.98876).


# Q3 Part-2

In [43]:
# Number of decision trees in the hand-built ensemble trained and evaluated below.
num_trees = 100

In [44]:
def train_trees(x_train, y_train, num_trees):
    """Train ``num_trees`` shallow trees, each on its own bootstrap sample.

    Every tree is a ``DecisionTreeClassifier(max_depth=3, criterion='entropy')``
    fitted on ``m // 2`` rows drawn with replacement from the ``m`` training
    rows. All feature columns are offered to every tree.

    Parameters
    ----------
    x_train : pd.DataFrame
        Training features; rows are selected by position (``.iloc``).
    y_train : pd.Series
        Training labels, positionally aligned with ``x_train``.
    num_trees : int
        How many trees to train.

    Returns
    -------
    list
        The fitted trees, in training order.
    """
    m = x_train.shape[0]
    sample_size = m // 2

    trees = []
    for _ in range(num_trees):
        # Draw row positions uniformly from 0..m-1 with replacement. Sampling
        # straight from `m` keeps NumPy's default integer width; the earlier
        # int16 index array could not represent positions above 32767, so on a
        # large training set the bootstrap never covered most of the rows.
        random_indices = np.random.choice(m, size=sample_size, replace=True)

        tree = DecisionTreeClassifier(max_depth=3, criterion='entropy')
        tree.fit(x_train.iloc[random_indices], y_train.iloc[random_indices])
        trees.append(tree)

    return trees

In [45]:
def predict_outputs(trees, x_test, y_test, num_trees):
    """Predict with every tree and combine the results by majority vote.

    Parameters
    ----------
    trees : sequence
        Fitted models exposing ``predict``; the first ``num_trees`` are used.
    x_test : array-like with ``shape``
        Samples to classify.
    y_test : unused
        Kept in the signature for callers; it is not read here.
    num_trees : int
        Number of trees to query.

    Returns
    -------
    tuple
        ``(y_prediction, trees_predictions)``: an object array with the most
        frequent label per sample, and an object array of shape
        ``(num_trees, n_samples)`` holding each tree's individual predictions.
    """
    n_samples = x_test.shape[0]

    # One row per tree, one column per sample.
    votes = np.empty(shape=(num_trees, n_samples), dtype='object')
    for tree_idx in range(num_trees):
        votes[tree_idx] = trees[tree_idx].predict(x_test)

    # For each sample (a column of `votes`) keep the label with the highest count.
    majority = np.empty(shape=(n_samples), dtype='object')
    for sample_idx, ballots in enumerate(votes.T):
        majority[sample_idx] = pd.Series(ballots).value_counts().index[0]

    return majority, votes

In [46]:
trees = train_trees(x_train=x_train, y_train=y_train, num_trees=num_trees)

#### Reporting Accuracy on Testing Set

In [47]:
y_prediction, trees_prediction = predict_outputs(
                trees=trees, x_test=x_test, y_test=y_test, num_trees=num_trees)

In [48]:
accuracy = (y_prediction == y_test).sum() / y_test.shape[0]

print(f"Random Forest with '{num_trees} Decision-Trees' each of depth 3.")
print(f'Accuracy of Random Forest on Testing set is: {accuracy}')

Random Forest with '100 Decision-Trees' each of depth 3.
Accuracy of Random Forest on Testing set is: 0.9858927004095962


#### Reporting Accuracy on Validation Set

In [49]:
y_prediction, trees_prediction = predict_outputs(
                trees=trees, x_test=x_valid, y_test=y_valid, num_trees=num_trees)

In [50]:
accuracy = (y_prediction == y_valid).sum() / y_valid.shape[0]

print(f"Random Forest with '{num_trees} Decision-Trees' each of depth 3.")
print(f'Accuracy of Random Forest on Validation Set is: {accuracy}')

Random Forest with '100 Decision-Trees' each of depth 3.
Accuracy of Random Forest on Validation Set is: 0.9857350201711981


# Q3 Part-3

In [51]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

In [52]:
# Ensemble sizes to compare: each value is used as `n_estimators` for one AdaBoost model.
estimators = [4, 8, 10, 15, 20]

In [59]:
def train_AdaBoost_classifiers(x_train, y_train, estimators):
    """Fit one AdaBoost ensemble per requested ensemble size.

    Each ensemble boosts a fresh ``DecisionTreeClassifier(max_depth=15,
    criterion='entropy')`` (presumably chosen because that configuration
    scored best among the single trees — confirm).

    Parameters
    ----------
    x_train, y_train
        Training features and labels passed unchanged to ``fit``.
    estimators : iterable of int
        Values used as ``n_estimators``, one fitted model per value.

    Returns
    -------
    list
        The fitted ``AdaBoostClassifier`` objects, in the order of ``estimators``.
    """
    AdaBoosts = []

    for n_estimator in estimators:
        tree = DecisionTreeClassifier(max_depth=15, criterion='entropy')
        # The base learner is passed positionally: its keyword was renamed from
        # `base_estimator` to `estimator` in scikit-learn 1.2 and the old name
        # was removed in 1.4, while the first positional slot works on both.
        adaboost = AdaBoostClassifier(tree, n_estimators=n_estimator)
        adaboost.fit(x_train, y_train)

        AdaBoosts.append(adaboost)

    return AdaBoosts

In [57]:
def test_AdaBoost_classifiers(adaboosts, x_test, y_test, estimators):
    for i in range(len(adaboosts)):
        # y_prediction = adaboosts[i].predict(x_test)  
        # accuracy = (y_prediction == y_test).sum() / y_test.shape[0]  
        print(f"Accuracy with 'n_estimators = {estimators[i]}' is: {adaboosts[i].score(x_test, y_test)}")

In [None]:
# Fit one AdaBoost model per entry of `estimators` on the training split.
# NOTE(review): this cell has no execution count in the saved notebook although the scoring
# cell below shows output, so those numbers come from an earlier run — re-run top to bottom
# before relying on them.
adaboosts = train_AdaBoost_classifiers(x_train=x_train, y_train=y_train, estimators=estimators)

In [58]:
test_AdaBoost_classifiers(adaboosts=adaboosts, x_test=x_test, y_test=y_test, estimators=estimators)

Accuracy with 'n_estimators = 4' is: 0.9887109603569338
Accuracy with 'n_estimators = 8' is: 0.9883543921884143
Accuracy with 'n_estimators = 10' is: 0.9853372769163253
Accuracy with 'n_estimators = 15' is: 0.9872572593622001
Accuracy with 'n_estimators = 20' is: 0.9876069704505559
