In [24]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder

##### Load the Dataset

In [25]:
df = pd.read_csv('BitcoinHeistData.csv')

In [26]:
df.head()

Unnamed: 0,address,year,day,length,weight,count,looped,neighbors,income,label
0,111K8kZAEnJg245r2cM6y9zgJGHZtJPy6,2017,11,18,0.008333,1,0,2,100050000.0,princetonCerber
1,1123pJv8jzeFQaCV4w644pzQJzVWay2zcA,2016,132,44,0.000244,1,0,1,100000000.0,princetonLocky
2,112536im7hy6wtKbpH1qYDWtTyMRAcA2p7,2016,246,0,1.0,1,0,2,200000000.0,princetonCerber
3,1126eDRw2wqSkWosjTCre8cjjQW8sSeWH7,2016,322,72,0.003906,1,0,2,71200000.0,princetonCerber
4,1129TSjKtx65E35GiUo4AYVeyo48twbrGX,2016,238,144,0.072848,456,0,1,200000000.0,princetonLocky


In [27]:
df['address'].value_counts()

1LXrSb67EaH1LGc6d6kWHq8rgv4ZBQAcpU    420
16cVG72goMe4sNqZhnpmnqfCMZ1uSFbUit    261
12wQZTDmA8onM3sEt4jwcvzDxnNXxD8Vza    207
12YursV58dRT2c9iuZg3jEWfwgTDamBcnd    183
1LEq4WmpCrqBd7V3PywE2nvFUFC3QTe52x    176
                                     ... 
14m4NjEQjLKrcjtN3doN7TgNZi3nbvPnkL      1
1CJrNRSNJepexvLFt3wSKZkzrHRag2UMCA      1
1Fsi7R5115vXKcSmFEoDUqqmEW4oT2W5AV      1
1GTkpRYXAK71c5DP2V7irDmYtvmhS46h29      1
3LFFBxp15h9KSFtaw55np8eP5fv6kdK17e      1
Name: address, Length: 2631095, dtype: int64

In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2916697 entries, 0 to 2916696
Data columns (total 10 columns):
 #   Column     Dtype  
---  ------     -----  
 0   address    object 
 1   year       int64  
 2   day        int64  
 3   length     int64  
 4   weight     float64
 5   count      int64  
 6   looped     int64  
 7   neighbors  int64  
 8   income     float64
 9   label      object 
dtypes: float64(2), int64(6), object(2)
memory usage: 222.5+ MB


#### No null values present in the Dataset

In [29]:
df.isnull().sum()

address      0
year         0
day          0
length       0
weight       0
count        0
looped       0
neighbors    0
income       0
label        0
dtype: int64

#### Encoding 'address' column into numerical column using 'Label Encoder'

In [30]:
# Replace every distinct address string by an integer code (fit + transform in one step).
# The 'address' column is overwritten in place; the original strings can only be
# recovered through the fitted `encoder`.
# NOTE(review): the codes are arbitrary identifiers, yet the trees below will split on
# them as ordinary numbers - confirm that this is intended.
encoder = LabelEncoder()
df['address'] = encoder.fit_transform(df['address'])

In [31]:
df.head()

Unnamed: 0,address,year,day,length,weight,count,looped,neighbors,income,label
0,23,2017,11,18,0.008333,1,0,2,100050000.0,princetonCerber
1,128,2016,132,44,0.000244,1,0,1,100000000.0,princetonLocky
2,169,2016,246,0,1.0,1,0,2,200000000.0,princetonCerber
3,217,2016,322,72,0.003906,1,0,2,71200000.0,princetonCerber
4,293,2016,238,144,0.072848,456,0,1,200000000.0,princetonLocky


In [32]:
df['address'].value_counts()

1925732    420
481920     261
105390     207
65867      183
1895700    176
          ... 
292374       1
1074094      1
1445117      1
1506008      1
2580865      1
Name: address, Length: 2631095, dtype: int64

#### Randomly shuffle the Dataset (since all the output labels are grouped)

In [33]:
# Shuffle all rows. The fixed seed makes the shuffle - and therefore the
# train/validation/test split and every accuracy reported below - reproducible
# under "Restart Kernel -> Run All".
df = df.sample(frac=1, random_state=42)

In [34]:
df

Unnamed: 0,address,year,day,length,weight,count,looped,neighbors,income,label
505972,506425,2012,110,24,8.786737e-01,16,0,2,1.941000e+09,white
2620114,2369474,2018,34,136,2.270832e-01,6873,0,2,4.556888e+07,white
1363813,17576,2014,238,0,1.000000e+00,1,0,2,9.998000e+07,white
1176214,2199457,2014,50,6,1.000000e+00,1,0,2,5.802917e+07,white
296992,2353600,2011,266,58,1.994046e-06,46,0,1,1.030000e+08,white
...,...,...,...,...,...,...,...,...,...,...
1906332,398193,2016,50,42,7.148791e-02,70,0,2,1.115550e+08,white
297907,156143,2011,267,14,5.156250e-01,2,0,2,6.368000e+09,white
805128,483661,2013,44,0,5.000000e-01,1,0,1,2.030000e+10,white
191875,2132493,2011,161,128,1.898076e-12,24,0,2,1.315936e+10,white


# Q3 Part-1

### Training a Decision Tree using both the Gini index and the Entropy by changing the max-depth

In [35]:
def train_test_validation_split(dataset: pd.DataFrame, size):
    """Split ``dataset`` row-wise into training, validation and testing parts.

    Rows are taken in their current order; nothing is shuffled here, so the
    caller must shuffle ``dataset`` beforehand if the rows are grouped.

    Parameters
    ----------
    dataset : pd.DataFrame
        Frame holding the feature columns plus a ``'label'`` column.
    size : sequence of three numbers
        Percentages ``(train, validation, test)``. Only the first two are used
        to compute row counts; the testing part receives every remaining row,
        so no row is lost to rounding.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_valid, y_valid, x_test, y_test)``. Each ``x_*``
        is the corresponding slice without the ``'label'`` column and each
        ``y_*`` is the ``'label'`` column of the same slice.

    Raises
    ------
    ValueError
        If ``size`` does not contain exactly three entries.
    """
    train_pct, valid_pct, _ = size

    n = dataset.shape[0]
    train_end = round((train_pct / 100) * n)
    valid_end = train_end + round((valid_pct / 100) * n)

    def _features_and_labels(part):
        # Separate the target column from the feature columns of one slice.
        return part.drop('label', axis=1), part['label']

    # Slice each part once, then split it into features and labels.
    x_train, y_train = _features_and_labels(dataset.iloc[:train_end])
    x_valid, y_valid = _features_and_labels(dataset.iloc[train_end:valid_end])
    x_test, y_test = _features_and_labels(dataset.iloc[valid_end:])
    return x_train, y_train, x_valid, y_valid, x_test, y_test
    

In [36]:
# 70% training / 15% validation / 15% testing, taken in the current row order of `df`
x_train, y_train, x_valid, y_valid, x_test, y_test = train_test_validation_split(df, [70,15,15])

In [85]:
def DecisionTreeAlgorithm(split_criteria, x_train, y_train, x_valid, y_valid, x_test, y_test,
                          depths=(4, 8, 10, 15, 20)):
    """Fit one decision tree per depth and print its validation and testing accuracy.

    Parameters
    ----------
    split_criteria : str
        Passed to ``DecisionTreeClassifier`` as ``criterion``.
    x_train, y_train
        Features and labels used to fit every tree.
    x_valid, y_valid
        Features and labels of the validation split.
    x_test, y_test
        Features and labels of the testing split.
    depths : iterable of int, optional
        Values tried for ``max_depth``; defaults to ``(4, 8, 10, 15, 20)``.

    Returns
    -------
    None
        Results are only printed.
    """
    def _accuracy(model, features, labels):
        # Fraction of samples whose predicted label equals the true label.
        predicted = model.predict(features)
        return (predicted == labels).sum() / labels.shape[0]

    print(f"Training, Validating & Testing the Decision Tree with '{split_criteria}' criteria...")
    print("Accuracy of Decision Tree classifers with various depth is shown below.\n")

    for depth in depths:
        # A fresh tree is trained from scratch for every depth.
        tree = DecisionTreeClassifier(max_depth=depth, criterion=split_criteria)
        tree.fit(x_train, y_train)

        validation_accuracy = _accuracy(tree, x_valid, y_valid)
        print(f"Accuracy with 'Depth = {depth}' on Validation set is: {validation_accuracy}")

        testing_accuracy = _accuracy(tree, x_test, y_test)
        print(f"Accuracy with 'Depth = {depth}' on Testing set is: {testing_accuracy}\n")

### Training Decision trees using the Gini index (for various depths)

In [86]:
DecisionTreeAlgorithm(split_criteria='gini', x_train=x_train, y_train=y_train,
                    x_valid=x_valid, y_valid=y_valid, x_test=x_test, y_test=y_test)

Training, Validating & Testing the Decision Tree with 'gini' criteria...
Accuracy of Decision Tree classifers with various depth is shown below.

Accuracy with 'Depth = 4' on Validation set is: 0.9857075919132353
Accuracy with 'Depth = 4' on Testing set is: 0.9859566998244588

Accuracy with 'Depth = 8' on Validation set is: 0.9864207266202673
Accuracy with 'Depth = 8' on Testing set is: 0.9866538363077824

Accuracy with 'Depth = 10' on Validation set is: 0.9868207220488908
Accuracy with 'Depth = 10' on Testing set is: 0.987005833089526

Accuracy with 'Depth = 15' on Validation set is: 0.9881921349470292
Accuracy with 'Depth = 15' on Testing set is: 0.9884366771503803

Accuracy with 'Depth = 20' on Validation set is: 0.9875087141861236
Accuracy with 'Depth = 20' on Testing set is: 0.9878103971620831



### Training Decision trees using the Entropy (for various depths)

In [87]:
DecisionTreeAlgorithm(split_criteria='entropy', x_train=x_train, y_train=y_train,
                    x_valid=x_valid, y_valid=y_valid, x_test=x_test, y_test=y_test)

Training, Validating & Testing the Decision Tree with 'entropy' criteria...
Accuracy of Decision Tree classifers with various depth is shown below.

Accuracy with 'Depth = 4' on Validation set is: 0.9856298785156741
Accuracy with 'Depth = 4' on Testing set is: 0.9859109859566998

Accuracy with 'Depth = 8' on Validation set is: 0.9859978743100078
Accuracy with 'Depth = 8' on Testing set is: 0.9862721255119953

Accuracy with 'Depth = 10' on Validation set is: 0.9872092890366967
Accuracy with 'Depth = 10' on Testing set is: 0.9874652574605032

Accuracy with 'Depth = 15' on Validation set is: 0.9888206991920092
Accuracy with 'Depth = 15' on Testing set is: 0.9890081004973669

Accuracy with 'Depth = 20' on Validation set is: 0.9877144261208444
Accuracy with 'Depth = 20' on Testing set is: 0.9877692546811001



#### The greatest accuracy is observed for the Decision Tree with depth 15 and the "entropy" criterion.


# Q3 Part-2

### Implementing a Random Forest Algorithm using 100 Decision Tree Classifiers (with max-depth as 3)

In [43]:
# number of trees
num_trees = 100

In [44]:
def train_trees(x_train, y_train, num_trees):
    """Train ``num_trees`` depth-3 entropy decision trees on bootstrap samples.

    Every tree is fitted on its own sample of ``m // 2`` rows drawn with
    replacement from the ``m`` training rows.

    Parameters
    ----------
    x_train : pd.DataFrame
        Training features; rows are selected by position.
    y_train : pd.Series
        Training labels, aligned with ``x_train`` by position.
    num_trees : int
        Number of trees to train.

    Returns
    -------
    list
        The fitted ``DecisionTreeClassifier`` objects, in training order.
    """
    m = x_train.shape[0]
    sample_size = m // 2

    trees = []
    for _ in range(num_trees):
        # Draw row positions from range(m) using NumPy's default integer type.
        # A narrow dtype such as int16 cannot represent positions above 32767,
        # which would restrict the samples to a small part of a large training set.
        random_indices = np.random.choice(m, size=sample_size, replace=True)

        # Bootstrap training data
        x_bootstrap = x_train.iloc[random_indices]
        y_bootstrap = y_train.iloc[random_indices]

        # Train/fit a fresh tree on this bootstrap sample
        tree = DecisionTreeClassifier(max_depth=3, criterion='entropy')
        tree.fit(x_bootstrap, y_bootstrap)
        trees.append(tree)

    return trees

In [53]:
def predict_outputs(trees, x_test, y_test, num_trees):
    """Predict with the first ``num_trees`` trees and combine them by majority vote.

    Parameters
    ----------
    trees : sequence
        Fitted models exposing ``predict``.
    x_test
        Samples to classify.
    y_test
        Not used by this function.
    num_trees : int
        Number of models from ``trees`` that take part in the vote.

    Returns
    -------
    tuple
        ``(y_prediction, trees_predictions)``: the voted label per sample
        (object array of length ``x_test.shape[0]``) and the raw per-tree
        predictions (object array of shape ``(num_trees, x_test.shape[0])``).
    """
    n_samples = x_test.shape[0]

    # Row t holds the predictions of tree t for every sample.
    trees_predictions = np.empty(shape=(num_trees, n_samples), dtype='object')
    for tree_idx in range(num_trees):
        trees_predictions[tree_idx] = trees[tree_idx].predict(x_test)

    # Column s holds all votes for sample s; the most frequent label wins.
    y_prediction = np.empty(shape=(n_samples), dtype='object')
    for sample_idx, votes in enumerate(trees_predictions.T):
        vote_counts = pd.Series(votes).value_counts()
        y_prediction[sample_idx] = vote_counts.index[0]

    return y_prediction, trees_predictions

##### All 100 Decision Trees 

In [46]:
# Fit the `num_trees` bootstrap-trained trees of the forest on the training split
trees = train_trees(x_train=x_train, y_train=y_train, num_trees=num_trees)

#### Reporting Accuracy of Random Forest on Testing Set

In [47]:
y_prediction, trees_prediction = predict_outputs(
                trees=trees, x_test=x_test, y_test=y_test, num_trees=num_trees)

In [48]:
# Accuracy = fraction of testing samples whose voted label equals the true label
accuracy = (y_prediction == y_test).sum() / y_test.shape[0]

print(f"Training a Random Forest with '{num_trees} Decision-Trees' each of depth 3.\n")
print(f'Accuracy of Random Forest on Testing set is: {accuracy}')

Training a Random Forest with '100 Decision-Trees' each of depth 3.

Accuracy of Random Forest on Testing set is: 0.9859109859566998


#### Reporting Accuracy of Random Forest on Validation Set

In [49]:
y_prediction, trees_prediction = predict_outputs(
                trees=trees, x_test=x_valid, y_test=y_valid, num_trees=num_trees)

In [52]:
# Accuracy = fraction of validation samples whose voted label equals the true label
# (`y_prediction` now holds the validation-set votes computed in the previous cell)
accuracy = (y_prediction == y_valid).sum() / y_valid.shape[0]

print(f"Random Forest with '{num_trees} Decision-Trees' each of depth 3.\n")
print(f'Accuracy of Random Forest on Validation Set is: {accuracy}')

Random Forest with '100 Decision-Trees' each of depth 3.

Accuracy of Random Forest on Validation Set is: 0.9856298785156741


#### Performance of Random Forest

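The testing accuracy of the Random Forest is nearly the same as the accuracy we obtained in Part-1.

In Part-1 we used strong classifiers, i.e., Decision Trees with greater depth.

In this Random Forest we used 100 weak Decision Tree classifiers (each of depth 3) and combined their predictions by majority vote into a single ensemble classifier, whose accuracy is nearly the same as that of the strong Decision Tree classifiers. Hence, this shows the power of ensembling.

Note: the Random Forest accuracies reported above (0.98591 on the testing set, 0.98563 on the validation set) are identical, digit for digit, to those of the single depth-4 entropy tree in Part-1. This is what one would see if both models predicted (almost) only the majority label, so the per-class predictions should be inspected before drawing conclusions from accuracy alone.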

# Q3 Part-3

In [55]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [54]:
# No. of estimators
estimators = [4, 8, 10, 15, 20]

### Implementing an AdaBoost ensemble of Decision Tree Classifiers (each of max-depth 15)

#### Training AdaBoost ensembles of Decision Trees (one ensemble per number of estimators)

In [56]:
def train_AdaBoost_classifiers(x_train, y_train, estimators):
    """Train one AdaBoost ensemble per requested number of estimators.

    Each ensemble boosts a decision tree configured with ``max_depth=15`` and
    ``criterion='entropy'``.

    Parameters
    ----------
    x_train, y_train
        Training features and labels passed to ``fit``.
    estimators : iterable of int
        Values used for ``n_estimators``; one ensemble is trained per value.

    Returns
    -------
    list
        The fitted ``AdaBoostClassifier`` objects, in the order of ``estimators``.
    """
    AdaBoosts = []

    for n_estimator in estimators:
        # Base learner boosted by this ensemble
        tree = DecisionTreeClassifier(max_depth=15, criterion='entropy')

        # The base learner is passed positionally: its keyword was renamed from
        # `base_estimator` to `estimator` in newer scikit-learn releases, while
        # the first positional slot is the base learner under either name.
        adaboost = AdaBoostClassifier(tree, n_estimators=n_estimator)

        # Training Adaboost algorithm using fit()
        adaboost.fit(x_train, y_train)

        AdaBoosts.append(adaboost)
    return AdaBoosts

#### Testing the AdaBoost ensembles of Decision Trees

In [72]:
def test_AdaBoost_classifiers(adaboosts, x_test, y_test, estimators):
    
    for i in range(len(estimators)):
        # Computing Training and Testing accuracy
        # y_prediction = adaboosts[i].predict(x_test)  
        # accuracy = accuracy_score(y_test=y_test, y_pred=y_prediction)
        accuracy = adaboosts[i].score(x_test, y_test)
        
        print(f"Accuracy with 'n_estimators = {estimators[i]}' is: {accuracy}")

In [57]:
# One AdaBoost model per entry of `estimators`, each fitted on the training split
adaboosts = train_AdaBoost_classifiers(x_train=x_train, y_train=y_train, estimators=estimators)

#### Reporting Accuracy of the AdaBoost ensembles on Validation Set

In [81]:
print("Accuracy of AdaBoost based Random Forests on Validation Set...\n")
test_AdaBoost_classifiers(adaboosts=adaboosts, x_test=x_valid, y_test=y_valid, estimators=estimators)

Accuracy of AdaBoost based Random Forests on Validation Set...

Accuracy with 'n_estimators = 4' is: 0.9882127061405013
Accuracy with 'n_estimators = 8' is: 0.9872595741762952
Accuracy with 'n_estimators = 10' is: 0.9858584473320305
Accuracy with 'n_estimators = 15' is: 0.9873692872081462
Accuracy with 'n_estimators = 20' is: 0.9874858573044879


#### Reporting Accuracy of the AdaBoost ensembles on Testing Set

In [74]:
print("Accuracy of AdaBoost based Random Forests on Testing Set...\n")
test_AdaBoost_classifiers(adaboosts=adaboosts, x_test=x_test, y_test=y_test, estimators=estimators)

Accuracy of AdaBoost based Random Forests on Testing Set...

Accuracy with 'n_estimators = 4' is: 0.9882538216793446
Accuracy with 'n_estimators = 8' is: 0.9876823983323582
Accuracy with 'n_estimators = 10' is: 0.9860001279988297
Accuracy with 'n_estimators = 15' is: 0.9874675431538912
Accuracy with 'n_estimators = 20' is: 0.9877692546811001


#### Results

The accuracy of the "AdaBoost based Decision Tree Classifier" is nearly the same as that of the "Random Forest Classifier" in Part-2.

AdaBoost does show slightly greater accuracy, though only beyond the second decimal place.

AdaBoost and Bagging are the iterative (sequential) and parallel versions of ensembling, respectively.

Both of them show similar accuracy on the testing set. This shows the benefits of ensembling and how strong classifiers are made from weak classifiers.