# Finetuning
# Malicious Smart Contract Detection Training Dataset Collection Notebook

This notebook collects smart contract creation bytecode and decompiled opcodes for malicious contract classification. 
Benign contracts are gathered from blockchain explorers, and malicious contracts from [Forta Network's labelled-datasets GitHub repo](https://github.com/forta-network/labelled-datasets).

# Code provided by the Forta project

In [None]:
import logging
import pickle
import os
import time

from evmdasm import EvmBytecode
import pandas as pd
from tqdm import tqdm
import requests

# Hook tqdm into pandas; the `progress_apply` calls used further down rely on this.
tqdm.pandas()
# disable warning logs from evmdasm tool
logging.getLogger("evmdasm").setLevel(logging.CRITICAL)

# Chains for which verified (benign) contracts are collected.
blockchains = ["ethereum", "polygon", "bsc"]
# Chain that get_contract_bytecode queries; get_benign_contracts reassigns it
# while iterating over `blockchains`.
current_blockchain = "ethereum"

# Prefer the ZETTABLOCK_API_KEY environment variable so a real key never has to
# be written into the notebook; the placeholder is kept as a fallback so the
# previous "paste your key here" workflow still works.
ZETTABLOCK_API_KEY = os.environ.get(
    "ZETTABLOCK_API_KEY",
    "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
)
# Endpoint used to register a SQL query (returns a query id).
ZETTABLOCK_URL = "https://api.zettablock.com/api/v1/databases/realtimeDB/queries"
# Prefix of the endpoint used to execute a previously registered query.
EXECUTE_URL = "https://api.zettablock.com/api/v1/queries/"

# Headers sent with every Zettablock request.
headers = {
    "accept": "application/json",
    "X-API-KEY": ZETTABLOCK_API_KEY,
    "content-type": "application/json",
}

# Module-level dictionaries; not referenced by the code visible in this notebook section.
TRACES = {}
CONTRACT_DATA = {}

In [None]:
def get_verified_smart_contracts():
    """Load the per-chain CSV exports of verified smart contracts.

    Returns:
        A dict keyed by chain name ("ethereum", "polygon", "bsc") whose values
        are the DataFrames read from the corresponding CSV file.
    """
    # Each CSV holds 5k verified SCs downloaded from the chain's explorer:
    #   ethereum: https://etherscan.io/exportData?type=open-source-contract-codes
    #   polygon:  https://polygonscan.com/exportData?type=open-source-contract-codes
    #   bsc:      https://bscscan.com/exportData?type=open-source-contract-codes
    csv_by_chain = {
        "ethereum": "/data/forta/ethereum/text/pretraining/raw/ethereum-verified.csv",
        "polygon": "/data/forta/ethereum/text/pretraining/raw/polygon-verified.csv",
        "bsc": "/data/forta/ethereum/text/pretraining/raw/bsc-verified.csv",
    }
    return {chain: pd.read_csv(csv_path) for chain, csv_path in csv_by_chain.items()}

In [None]:
def get_contract_bytecode(contract):
    """Best-effort lookup of a contract's creation bytecode.

    Args:
        contract: Mapping-like row exposing a "contract_address" entry.

    Returns:
        Whatever get_contract_data_from_zettablock_API returns for the address
        on the module-level ``current_blockchain``; an empty list if anything
        raises along the way (the error is printed, not propagated).
    """
    try:
        return get_contract_data_from_zettablock_API(
            contract["contract_address"],
            current_blockchain,
        )
    except Exception as error:
        # Deliberately swallow the failure so one bad row does not abort a
        # whole DataFrame apply.
        print(error)
    return []

def get_contract_data_from_zettablock_API(contract, blockchain):
    """Query the Zettablock API for the creation bytecode of a contract.

    Two POST requests are issued: the first registers a SQL query against
    ``<blockchain>_mainnet.contract_creations`` and yields a query id, the
    second executes that query.

    Args:
        contract: Contract address to look up.
        blockchain: Chain name used as the table prefix (e.g. "ethereum").

    Returns:
        The raw text body of the execute response.

    Raises:
        requests.HTTPError: If either request returns a non-2xx status.
        requests.Timeout: If either request exceeds the timeout.
        KeyError: If the query-creation response contains no "id" field.
    """
    # Generous upper bound (seconds) so a stalled connection cannot hang the
    # notebook forever.
    request_timeout = 120

    # Connect to Zettablock API
    # Its free version only allows 1 request per second
    time.sleep(1)
    # NOTE: the address and chain are interpolated straight into the SQL text;
    # only pass values from trusted sources.
    payload = {"query": "SELECT bytecode FROM %s_mainnet.contract_creations WHERE address = '%s'" % (blockchain, contract)}
    creation = requests.post(ZETTABLOCK_URL, json=payload, headers=headers, timeout=request_timeout)
    # Fail loudly on HTTP errors instead of tripping over a missing "id" later.
    creation.raise_for_status()
    query_id = creation.json()['id']

    execution = requests.post(
        EXECUTE_URL + query_id + "/execute?includeColumnName=false&includeMetadata=false",
        headers=headers,
        timeout=request_timeout,
    )
    # Without this check an error body would be returned as if it were bytecode.
    execution.raise_for_status()
    return execution.text

In [None]:
def get_opcodes(creation_bytecode) -> str:
    """Disassemble creation bytecode into a space-separated opcode string.

    Args:
        creation_bytecode: Bytecode handed to ``EvmBytecode``, or None.

    Returns:
        The stripped string form of every disassembled instruction joined by
        single spaces; an empty string when the input is None or disassembly
        raises.
    """
    if creation_bytecode is None:
        return ''

    try:
        instructions = EvmBytecode(creation_bytecode).disassemble()
    except Exception:
        # Undecodable bytecode is treated the same as missing bytecode.
        return ''

    return " ".join(str(instruction).strip() for instruction in instructions)

In [None]:
def get_malicious_contracts() -> pd.DataFrame:
    """Collect malicious contracts from Forta's labelled dataset plus their opcodes.

    If a pickled cache exists at the hard-coded data path it is loaded and
    returned as is. Otherwise the labelled CSV is downloaded from the
    forta-network/labelled-datasets GitHub repo, rows whose
    ``contract_creator_etherscan_label`` equals 'phish-hack' are dropped,
    creation bytecode is fetched per row with ``get_contract_bytecode`` (which
    queries the module-level ``current_blockchain``), opcodes are derived with
    ``get_opcodes``, and the result is pickled for later runs.

    Returns:
        DataFrame of malicious contracts with ``creation_bytecode`` and
        ``decompiled_opcodes`` columns.
    """
    data_path = '/data/forta/ethereum/text/pretraining/raw/malicious-data.pkl'

    if os.path.exists(data_path):
        # NOTE: unpickling can execute arbitrary code; only load cache files
        # that this notebook wrote itself.
        with open(data_path, "rb") as data_file:
            return pickle.load(data_file)

    # csv from https://github.com/forta-network/labelled-datasets
    github_url = 'https://raw.githubusercontent.com/forta-network/labelled-datasets/main/labels/1/malicious_smart_contracts.csv'
    malicious = pd.read_csv(github_url)
    # exclude phishing hack related contracts; copy so the column assignments
    # below write to an independent frame rather than to a filtered slice
    # (avoids pandas' SettingWithCopyWarning).
    malicious = malicious[malicious['contract_creator_etherscan_label'] != 'phish-hack'].copy()
    malicious['creation_bytecode'] = malicious.progress_apply(get_contract_bytecode, axis=1)
    malicious['decompiled_opcodes'] = malicious['creation_bytecode'].progress_apply(get_opcodes)
    # Store data so we don't have to download it all the time
    malicious.to_pickle(data_path)
    return malicious

In [None]:
def get_benign_contracts() -> pd.DataFrame:
    """Collect verified contracts from every configured chain plus their opcodes.

    If a pickled cache exists at the hard-coded data path it is loaded and
    returned as is. Otherwise the per-chain CSVs are loaded with
    ``get_verified_smart_contracts``, addresses are lower-cased, creation
    bytecode is fetched per row with ``get_contract_bytecode``, the per-chain
    frames are concatenated with a fresh 0..n-1 index, opcodes are derived
    with ``get_opcodes``, and the result is pickled for later runs.

    ``get_contract_bytecode`` reads the module-level ``current_blockchain``,
    so it is switched per chain while fetching and restored afterwards.

    Returns:
        DataFrame of benign contracts with ``creation_bytecode`` and
        ``decompiled_opcodes`` columns.
    """
    global current_blockchain
    data_path = '/data/forta/ethereum/text/pretraining/raw/benign-data.pkl'

    if os.path.exists(data_path):
        # NOTE: unpickling can execute arbitrary code; only load cache files
        # that this notebook wrote itself.
        with open(data_path, "rb") as data_file:
            return pickle.load(data_file)

    contracts_by_chain = get_verified_smart_contracts()
    previous_blockchain = current_blockchain
    try:
        for blockchain in blockchains:
            current_blockchain = blockchain
            contracts = contracts_by_chain[blockchain]
            contracts['contract_address'] = contracts['contract_address'].progress_apply(str.lower)
            contracts['creation_bytecode'] = contracts.progress_apply(get_contract_bytecode, axis=1)
    finally:
        # Restore the global so later lookups (e.g. for malicious contracts)
        # are not silently issued against the last chain of the loop.
        current_blockchain = previous_blockchain

    benign = pd.concat(
        [contracts_by_chain[blockchain] for blockchain in blockchains],
        ignore_index=True,
    )
    benign['decompiled_opcodes'] = benign['creation_bytecode'].progress_apply(get_opcodes)
    # Store data so we don't have to download it all the time
    benign.to_pickle(data_path)
    return benign

In [None]:
# Load (or build) the malicious set and label every row as malicious.
malicious_contracts = get_malicious_contracts().assign(malicious=True)

In [None]:
# Load (or build) the benign set and label every row as not malicious.
benign_contracts = get_benign_contracts().assign(malicious=False)

In [None]:
# Stack malicious rows first, then benign rows, into one labelled dataset.
dataset = pd.concat((malicious_contracts, benign_contracts), axis=0)

In [None]:
# Keep only rows that actually have opcodes, then keep the first row per
# contract address. Chained, non-inplace calls are used so that no in-place
# mutation happens on a filtered slice (avoids pandas' SettingWithCopyWarning).
has_opcodes = dataset['decompiled_opcodes'].notna() & (dataset['decompiled_opcodes'] != '')
dataset = dataset[has_opcodes].drop_duplicates('contract_address')

In [None]:
# Show how many rows carry each value of the `malicious` label.
dataset['malicious'].value_counts()

In [None]:
# Replace missing values with empty strings and write the dataset to Parquet.
dataset.fillna('').to_parquet('/data/forta/ethereum/text/pretraining/raw/verified-smart-contracts.parquet', index=None)

In [None]:
# Pie chart of the `malicious` label distribution.
dataset['malicious'].value_counts().plot(kind='pie', figsize=(7, 7))