In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the BitcoinHeist data from a CSV file located relative to the working directory.
df = pd.read_csv('BitcoinHeistData.csv')

In [3]:
# Preview the first rows to check the column names and value formats.
df.head()

Unnamed: 0,address,year,day,length,weight,count,looped,neighbors,income,label
0,111K8kZAEnJg245r2cM6y9zgJGHZtJPy6,2017,11,18,0.008333,1,0,2,100050000.0,princetonCerber
1,1123pJv8jzeFQaCV4w644pzQJzVWay2zcA,2016,132,44,0.000244,1,0,1,100000000.0,princetonLocky
2,112536im7hy6wtKbpH1qYDWtTyMRAcA2p7,2016,246,0,1.0,1,0,2,200000000.0,princetonCerber
3,1126eDRw2wqSkWosjTCre8cjjQW8sSeWH7,2016,322,72,0.003906,1,0,2,71200000.0,princetonCerber
4,1129TSjKtx65E35GiUo4AYVeyo48twbrGX,2016,238,144,0.072848,456,0,1,200000000.0,princetonLocky


In [4]:
# Count how many rows each distinct address appears in.
df['address'].value_counts()

1LXrSb67EaH1LGc6d6kWHq8rgv4ZBQAcpU    420
16cVG72goMe4sNqZhnpmnqfCMZ1uSFbUit    261
12wQZTDmA8onM3sEt4jwcvzDxnNXxD8Vza    207
12YursV58dRT2c9iuZg3jEWfwgTDamBcnd    183
1LEq4WmpCrqBd7V3PywE2nvFUFC3QTe52x    176
                                     ... 
14m4NjEQjLKrcjtN3doN7TgNZi3nbvPnkL      1
1CJrNRSNJepexvLFt3wSKZkzrHRag2UMCA      1
1Fsi7R5115vXKcSmFEoDUqqmEW4oT2W5AV      1
1GTkpRYXAK71c5DP2V7irDmYtvmhS46h29      1
3LFFBxp15h9KSFtaw55np8eP5fv6kdK17e      1
Name: address, Length: 2631095, dtype: int64

In [5]:
# Summarise row count, column dtypes and memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2916697 entries, 0 to 2916696
Data columns (total 10 columns):
 #   Column     Dtype  
---  ------     -----  
 0   address    object 
 1   year       int64  
 2   day        int64  
 3   length     int64  
 4   weight     float64
 5   count      int64  
 6   looped     int64  
 7   neighbors  int64  
 8   income     float64
 9   label      object 
dtypes: float64(2), int64(6), object(2)
memory usage: 222.5+ MB


No null values are present in the dataset.

In [6]:
# Count missing values per column.
df.isnull().sum()

address      0
year         0
day          0
length       0
weight       0
count        0
looped       0
neighbors    0
income       0
label        0
dtype: int64

#### Encoding 'address' column into numerical column

In [7]:
# Replace every address string with an integer code (one code per distinct address).
# The column is overwritten in place, so the original strings are no longer in `df`.
# NOTE(review): the codes are identifiers rather than meaningful magnitudes —
# confirm that splitting on them is intended.
encoder = LabelEncoder()
df['address'] = encoder.fit_transform(df['address'])

In [8]:
# Confirm that 'address' now holds integer codes instead of strings.
df.head()

Unnamed: 0,address,year,day,length,weight,count,looped,neighbors,income,label
0,23,2017,11,18,0.008333,1,0,2,100050000.0,princetonCerber
1,128,2016,132,44,0.000244,1,0,1,100000000.0,princetonLocky
2,169,2016,246,0,1.0,1,0,2,200000000.0,princetonCerber
3,217,2016,322,72,0.003906,1,0,2,71200000.0,princetonCerber
4,293,2016,238,144,0.072848,456,0,1,200000000.0,princetonLocky


In [9]:
# Row counts per encoded address.
df['address'].value_counts()

1925732    420
481920     261
105390     207
65867      183
1895700    176
          ... 
292374       1
1074094      1
1445117      1
1506008      1
2580865      1
Name: address, Length: 2631095, dtype: int64

#### Randomly shuffle the Dataset

In [10]:
# Shuffle all rows. A fixed seed keeps the row order — and therefore the
# train/validation/test split taken from it — identical across kernel restarts.
df = df.sample(frac=1, random_state=42)

In [11]:
# Display the shuffled frame.
df

Unnamed: 0,address,year,day,length,weight,count,looped,neighbors,income,label
1656082,2541694,2015,165,8,0.125000,1,0,1,6.880000e+07,white
650976,585887,2012,255,0,0.500000,1,0,1,2.682013e+09,white
762950,994136,2013,2,78,0.500000,1,0,2,1.090500e+08,white
2645804,494603,2018,60,0,0.500000,1,0,1,2.735091e+10,white
1578752,121652,2015,88,2,3.666667,7,0,7,5.202173e+08,white
...,...,...,...,...,...,...,...,...,...,...
897631,1883535,2013,136,144,0.295166,3276,0,2,6.890500e+08,white
1501099,1909185,2015,10,8,0.200000,1,0,2,1.605393e+08,white
2434244,230541,2017,213,26,0.007812,1,0,2,7.645797e+09,white
489425,1308439,2012,93,6,0.500000,1,0,1,2.000000e+08,white


# Q3 Part-1

In [12]:
def train_test_validation_split(dataset: pd.DataFrame, size):
    """Cut ``dataset`` row-wise into training, validation and test parts.

    Rows are taken in their current order (nothing is shuffled here): the
    leading rows form the training part, the following rows the validation
    part, and every row after that the test part.

    Parameters
    ----------
    dataset : pd.DataFrame
        Frame holding the feature columns together with a ``'label'`` column.
    size : sequence of three numbers
        Percentages ``[train, valid, test]``. Only the first two decide where
        the cuts fall; the test part always receives the remaining rows.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_valid, y_valid, x_test, y_test)``; every ``x_*``
        is the slice without ``'label'`` and every ``y_*`` is the ``'label'``
        column of that same slice.
    """
    total_rows = dataset.shape[0]

    # All three percentages are read, but the test percentage never influences
    # the result because the test part is simply "whatever is left".
    train_rows, valid_rows, _test_rows = (
        round((size[position]/100) * total_rows) for position in (0, 1, 2)
    )
    valid_stop = train_rows + valid_rows

    parts = (
        dataset.iloc[0:train_rows],
        dataset.iloc[train_rows:valid_stop],
        dataset.iloc[valid_stop:],
    )

    pieces = []
    for part in parts:
        pieces.append(part.drop('label', axis=1))
        pieces.append(part['label'])
    return tuple(pieces)
    

In [13]:
# 70% of the rows for training, 15% for validation, the remaining rows for testing.
x_train, y_train, x_valid, y_valid, x_test, y_test = train_test_validation_split(df, [70,15,15])

In [14]:
def DecisionTreeAlgorithm(split_criteria, x_train, y_train, x_valid, y_valid, x_test, y_test,
                          depths=(4, 8, 10, 15, 20), random_state=None):
    """Fit one decision tree per depth and print its validation and test accuracy.

    Parameters
    ----------
    split_criteria : str
        Value forwarded to ``DecisionTreeClassifier(criterion=...)``.
    x_train, y_train
        Features and labels the trees are fitted on.
    x_valid, y_valid
        Features and labels of the validation split.
    x_test, y_test
        Features and labels of the test split.
    depths : iterable of int, optional
        ``max_depth`` values to try, in order. Defaults to ``(4, 8, 10, 15, 20)``.
    random_state : int or None, optional
        Forwarded to every ``DecisionTreeClassifier``; pass an integer to make
        repeated runs produce the same trees. ``None`` keeps the estimator's
        default behaviour.

    Returns
    -------
    None
        Results are only printed.
    """
    print(f"Training, Validating & Testing the Decision Tree with '{split_criteria}' criteria...")
    print("Accuracy of Decision Tree classifers with various depth is shown below.\n")

    for depth in depths:
        tree = DecisionTreeClassifier(max_depth=depth, criterion=split_criteria,
                                      random_state=random_state)
        tree.fit(x_train, y_train)

        # Accuracy = share of rows whose predicted label equals the true label.
        y_pred_valid = tree.predict(x_valid)
        print(f"Accuracy with 'Depth = {depth}' on Validation set is: {(y_pred_valid == y_valid).sum() / y_valid.shape[0]}")

        y_pred_test = tree.predict(x_test)
        print(f"Accuracy with 'Depth = {depth}' on Testing set is: {(y_pred_test == y_test).sum() / y_test.shape[0]} \n")

### Training Decision trees using the Gini index (for various depths)

In [15]:
# Evaluate trees split with the Gini criterion on the validation and test splits.
DecisionTreeAlgorithm(split_criteria='gini', x_train=x_train, y_train=y_train,
                    x_valid=x_valid, y_valid=y_valid, x_test=x_test, y_test=y_test)

Training, Validating & Testing the Decision Tree with 'gini' criteria...
Accuracy of Decision Tree classifers with various depth is shown below.

Accuracy with 'Depth = 4' on Validation set is: 0.9856801636552726
Accuracy with 'Depth = 4' on Testing set is: 0.9860252706260971 

Accuracy with 'Depth = 8' on Validation set is: 0.9864915829533377
Accuracy with 'Depth = 8' on Testing set is: 0.986752121123464 

Accuracy with 'Depth = 10' on Validation set is: 0.9867795796619467
Accuracy with 'Depth = 10' on Testing set is: 0.9871018322118198 

Accuracy with 'Depth = 15' on Validation set is: 0.9883704186237872
Accuracy with 'Depth = 15' on Testing set is: 0.9884663911644236 

Accuracy with 'Depth = 20' on Validation set is: 0.9877349973143164
Accuracy with 'Depth = 20' on Testing set is: 0.9876823983323582 



### Training Decision trees using the Entropy (for various depths)

In [36]:
# Evaluate trees split with the entropy criterion on the validation and test splits.
DecisionTreeAlgorithm(split_criteria='entropy', x_train=x_train, y_train=y_train,
                    x_valid=x_valid, y_valid=y_valid, x_test=x_test, y_test=y_test)

Training, Validating & Testing the Decision Tree with 'entropy' criteria...
Accuracy of Decision Tree classifers with various depth is shown below.

Accuracy with 'Depth = 4' on Validation set is: 0.9856230214511834
Accuracy with 'Depth = 4' on Testing set is: 0.9859589855178467 

Accuracy with 'Depth = 8' on Validation set is: 0.986002445686335
Accuracy with 'Depth = 8' on Testing set is: 0.9863384106202457 

Accuracy with 'Depth = 10' on Validation set is: 0.9874172866595811
Accuracy with 'Depth = 10' on Testing set is: 0.9876252559976595 

Accuracy with 'Depth = 15' on Validation set is: 0.9888435560736448
Accuracy with 'Depth = 15' on Testing set is: 0.9888983872147454 

Accuracy with 'Depth = 20' on Validation set is: 0.9879132809910743
Accuracy with 'Depth = 20' on Testing set is: 0.9878858250438853 



The highest accuracy is observed for the decision tree with depth 15 and the "entropy" criterion (validation accuracy ≈ 0.9888, test accuracy ≈ 0.9889).


# Q3 Part-2

In [17]:
# Number of decision trees in the hand-built ensemble.
num_trees = 100

In [18]:
def train_trees(x_train, y_train, num_trees):
    """Fit ``num_trees`` shallow decision trees, each on its own bootstrap sample.

    Every tree is a ``DecisionTreeClassifier(max_depth=3, criterion='entropy')``
    fitted on ``len(x_train) // 2`` rows drawn uniformly *with replacement* from
    the training data.

    Parameters
    ----------
    x_train : pd.DataFrame
        Training features; rows are selected by position with ``.iloc``.
    y_train : pd.Series
        Training labels aligned row-by-row with ``x_train``.
    num_trees : int
        Number of trees to fit.

    Returns
    -------
    list
        The fitted trees, in the order they were trained.
    """
    n_rows = x_train.shape[0]
    sample_size = n_rows // 2

    trees = []
    for _ in range(num_trees):
        # Draw row positions over the full range [0, n_rows) using the default
        # integer dtype. A 16-bit index array cannot represent positions above
        # 32767, which would confine the sample to a small part of a large
        # training set.
        random_indices = np.random.randint(0, n_rows, size=sample_size)

        # Bootstrap training data
        x_bootstrap = x_train.iloc[random_indices]
        y_bootstrap = y_train.iloc[random_indices]

        tree = DecisionTreeClassifier(max_depth=3, criterion='entropy')
        tree.fit(x_bootstrap, y_bootstrap)
        trees.append(tree)

    return trees

In [19]:
def predict_outputs(trees, x_test, y_test, num_trees):
    """Predict with every tree and combine the predictions by majority vote.

    Parameters
    ----------
    trees : sequence
        Fitted estimators exposing ``predict``; the first ``num_trees`` are used.
    x_test
        Feature rows to classify; must expose ``shape[0]`` and be accepted by
        each tree's ``predict``.
    y_test
        Not used; kept so existing calls keep working.
    num_trees : int
        Number of trees from ``trees`` that take part in the vote.

    Returns
    -------
    y_prediction : np.ndarray of object, shape (n_samples,)
        The label that received the most votes for each row. When several
        labels tie, the one that sorts first is chosen.
    trees_predictions : np.ndarray of object, shape (num_trees, n_samples)
        The raw prediction of every tree for every row.

    Raises
    ------
    ValueError
        If there are rows to classify but ``num_trees`` is smaller than 1.
    """
    n_samples = x_test.shape[0]

    # Row i holds the labels predicted by tree i for all samples.
    trees_predictions = np.empty(shape=(num_trees, n_samples), dtype='object')
    for i in range(num_trees):
        trees_predictions[i] = trees[i].predict(x_test)

    y_prediction = np.empty(shape=(n_samples,), dtype='object')
    if n_samples == 0:
        return y_prediction, trees_predictions
    if num_trees < 1:
        raise ValueError("at least one tree is required to take a majority vote")

    # Map every predicted label to an integer code once, so votes can be counted
    # with array operations instead of building one pandas object per sample.
    codes, classes = pd.factorize(trees_predictions.ravel(), sort=True)
    codes = codes.reshape(trees_predictions.shape)

    # votes[c, j] = number of trees that predicted class c for sample j.
    votes = np.zeros((len(classes), n_samples), dtype=np.int64)
    sample_positions = np.arange(n_samples)
    for tree_codes in codes:
        # Within one tree every sample appears exactly once, so the fancy-indexed
        # increment touches each (class, sample) cell at most once.
        votes[tree_codes, sample_positions] += 1

    # argmax returns the first maximum, i.e. the smallest label among tied classes.
    y_prediction[:] = np.asarray(classes, dtype='object')[votes.argmax(axis=0)]
    return y_prediction, trees_predictions

In [20]:
# Fit the ensemble of bootstrap-trained trees on the training split.
trees = train_trees(x_train=x_train, y_train=y_train, num_trees=num_trees)

#### Reporting Accuracy on Testing Set

In [21]:
# Majority-vote predictions of the ensemble on the test split.
y_prediction, trees_prediction = predict_outputs(
                trees=trees, x_test=x_test, y_test=y_test, num_trees=num_trees)

In [22]:
# Accuracy = number of rows whose voted label matches the true label / number of rows.
accuracy = np.sum(y_prediction == y_test) / len(y_test)

print(
    f"Random Forest with '{num_trees} Decision-Trees' each of depth 3.",
    f'Accuracy of Random Forest on Testing set is: {accuracy}',
    sep='\n',
)

Random Forest with '100 Decision-Trees' each of depth 3.
Accuracy of Random Forest on Testing set is: 0.9859589855178467


#### Reporting Accuracy on Validation Set

In [23]:
# Majority-vote predictions of the ensemble on the validation split
# (this overwrites the test-split predictions held in the same variables).
y_prediction, trees_prediction = predict_outputs(
                trees=trees, x_test=x_valid, y_test=y_valid, num_trees=num_trees)

In [24]:
# Accuracy = number of rows whose voted label matches the true label / number of rows.
accuracy = np.sum(y_prediction == y_valid) / len(y_valid)

print(
    f"Random Forest with '{num_trees} Decision-Trees' each of depth 3.",
    f'Accuracy of Random Forest on Validation Set is: {accuracy}',
    sep='\n',
)

Random Forest with '100 Decision-Trees' each of depth 3.
Accuracy of Random Forest on Validation Set is: 0.9856230214511834


# Q3 Part-3

In [33]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [26]:
# Ensemble sizes (n_estimators) to try for AdaBoost.
estimators = [4, 8, 10, 15, 20]

In [27]:
def train_AdaBoost_classifiers(x_train, y_train, estimators):
    """Fit one AdaBoost ensemble per requested number of estimators.

    Each ensemble boosts a ``DecisionTreeClassifier(max_depth=15,
    criterion='entropy')``.

    Parameters
    ----------
    x_train
        Training features.
    y_train
        Training labels.
    estimators : iterable of int
        ``n_estimators`` values; one ensemble is fitted for each, in order.

    Returns
    -------
    list
        The fitted ``AdaBoostClassifier`` objects, in the order of ``estimators``.
    """
    AdaBoosts = []

    for n_estimator in estimators:
        tree = DecisionTreeClassifier(max_depth=15, criterion='entropy')
        # The weak learner is passed positionally on purpose: its keyword was
        # renamed from `base_estimator` to `estimator` in newer scikit-learn
        # releases, while the first positional slot means the same in both.
        adaboost = AdaBoostClassifier(tree, n_estimators=n_estimator)
        adaboost.fit(x_train, y_train)

        AdaBoosts.append(adaboost)

    return AdaBoosts

In [34]:
def test_AdaBoost_classifiers(adaboosts, x_test, y_test, estimators):
    for i in range(len(adaboosts)):
        # y_prediction = adaboosts[i].predict(x_test)  
        # accuracy = accuracy_score(y_test=y_test, y_pred=y_prediction)
        print(f"Accuracy with 'n_estimators = {estimators[i]}' is: {adaboosts[i].score(x_test, y_test)}")

In [30]:
# Fit one AdaBoost model per ensemble size on the training split.
adaboosts = train_AdaBoost_classifiers(x_train=x_train, y_train=y_train, estimators=estimators)

In [35]:
# Report the test-split accuracy of every fitted AdaBoost model.
print("Accuracy of AdaBoost with Base as Decision Tree Classifier...\n")
test_AdaBoost_classifiers(adaboosts=adaboosts, x_test=x_test, y_test=y_test, estimators=estimators)

Accuracy of AdaBoost with Base as Decision Tree Classifier...

Accuracy with 'n_estimators = 4' is: 0.9886949605032183
Accuracy with 'n_estimators = 8' is: 0.9872801162960796
Accuracy with 'n_estimators = 10' is: 0.9862081260971328
Accuracy with 'n_estimators = 15' is: 0.986880119953189
Accuracy with 'n_estimators = 20' is: 0.987899539204213
