In [1]:
import numpy as np
import pandas as pd
import requests as req
import time
import re
import glob
import os
import subprocess
from bs4 import BeautifulSoup

# 1. Load smart contract addresses from contract-library.com

In [2]:
### Download Smart Contract Meta Data
# Disabled by default; flip the guard to re-crawl the contract-library.com API.
if False:
    MAX_PAGE = 1000
    AT_PAGE = 0
    base_url = "https://contract-library.com/api/contracts?"
    # Column buffers for the fragment that is currently being collected.
    contracts = {"address": [], "block_number": [], "ether": [], "has_source": [] }
    # Pages are 1-based; AT_PAGE is the last page that is skipped.
    for page_number in range(AT_PAGE + 1, MAX_PAGE + 1):
        ### Start Downloading
        end_point = "{0}n=Ethereum&q=&t=address&s=block_number&o=desc&p={1}&c=100&w=".format(base_url, page_number)
        r = req.get(end_point)
        assert(r.status_code == 200)
        print("=> page_number: {}".format(page_number))
        payload = r.json()
        for contract in payload["contracts"]:
            # The buffer keys double as the field names of each API record.
            for column in contracts:
                contracts[column].append(contract[column])
        ## Write Fragment
        # Only every 100th page flushes the buffers to disk.
        if page_number % 100 != 0:
            continue
        csv_file = "assets/addr_{}.csv".format(page_number // 100)
        df = pd.DataFrame(contracts)
        df.to_csv(csv_file, header=None, index=False)
        print("=> write to {}".format(csv_file))
        print("=> sleep 2 seconds")
        time.sleep(2)
        ### Clear Contract Data
        for column in contracts:
            contracts[column] = []

# 2. Compiler versions supported by SymEVM

In [3]:
## Supported compiler versions
# Builds the list of solc releases that are accepted: 0.4.18-0.4.25,
# 0.5.0-0.5.16 and 0.6.0-0.6.3.
if False:
    versions = (
        ["0.4.{}".format(patch) for patch in range(18, 25 + 1)]
        + ["0.5.{}".format(patch) for patch in range(0, 16 + 1)]
        + ["0.6.{}".format(patch) for patch in range(0, 3 + 1)]
    )
    print("=> supported version: {}".format(len(versions)))

# 3. Download source code from etherscan

In [4]:
### Filter contract source code by version
# For every address flagged as having source, fetch its etherscan page, read
# the Solidity source out of the element with id "editor", and save it as
# contracts/<address>_<version> when the pragma version is in `versions`
# (the list built by the supported-versions cell).
if False:
    df = pd.read_csv(
        "assets/addr.csv",
        header=None,
        names=["address", "block_number", "ether", "has_source"],
    )
    df_with_source = df[(df.has_source == True)]
    print(df_with_source.describe())
    # Loop-invariant pieces: compile the pragma pattern and build headers once.
    # Group index 2 of the pattern is the dotted version number.
    pragma_re = re.compile(r"pragma(\s+)solidity(\s+)[^\d]*((\d+\.?)+)")
    headers = {"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36"}
    for address in df_with_source["address"]:
        end_point = "https://etherscan.io/address/{}#code".format(address)
        r = req.get(end_point, headers=headers)
        assert(r.status_code == 200)
        print("=> address: {}".format(address))
        soup = BeautifulSoup(r.text, 'html.parser')
        editor = soup.find(id="editor")
        if editor is None:
            # find() returns None when the page has no such element; skip the
            # address instead of crashing the whole crawl on `.text`.
            print("=> no source editor found for {}".format(address))
            continue
        source = editor.text
        match = pragma_re.findall(source)
        print(match)
        if len(match) > 0 and len(match[0]) > 2:
            # Only the first pragma in the file decides the version.
            version = match[0][2]
            if version in versions:
                # Explicit UTF-8 so non-ASCII source text does not depend on
                # the platform's default encoding.
                with open("contracts/{}_{}".format(address, version), "w", encoding="utf-8") as f:
                    f.write(source)

# 4. Filter smart contracts by source code and version

In [22]:
### Report 1
if False:
    num_addr = !cat assets/addr.csv | wc -l
    num_addr = int(num_addr[0])
    files = !find contracts/ -type f -not -name "*.csv" -not -name "*.json"
    print("=> contracts {}".format(num_addr))
    print("=> sup_contracts {}".format(len(files)))
    print("=> % {}".format(len(files) / num_addr * 100))

# 5. Use local compiler to filter smart contracts

In [23]:
### Filter smart contract if it can not be compiled 
if False:
    files = !find contracts/ -not -name "*.csv" -not -name "*.json"
    error_files = []
    for idx, file in enumerate(files):
        version = file.split('_')[1]
        os.environ["SYM_SOLC_VERSION"] = version
        os.environ["SYM_FILE"] = file
        !solc use "$SYM_SOLC_VERSION" > /dev/null 2>&1
        output = !solc $SYM_FILE
        if "Error" in ",".join(output):
            print("=> FAIL {}".format(file))
            error_files.append(file)
        else:
            print("=> OK {}".format(file))
    df = pd.DataFrame(error_files)
    csv_file = "contracts/errors.csv"
    df.to_csv(csv_file, header=None, index=False)

# 6. Show executable smart contracts

In [24]:
### Report 2
if False:
    files = !find contracts/ -type f -not -name "*.csv" -not -name "*.json"
    num_errors = !cat contracts/errors.csv | wc -l
    num_errors = int(num_errors[0])
    print("=> sup_contracts: {}".format(len(files)))
    print("=> error: {}".format(num_errors))
    print("=> comp : {}".format(len(files) - num_errors))
    print("=> % {}".format((len(files) - num_errors)/len(files) * 100))

# 7. Use local compiler to compile smart contracts

In [28]:
if False:
    files = !find contracts/ -type f -not -name "*.csv" -not -name "*.json"
    df = pd.read_csv("contracts/errors.csv", header=None, names=["contract_path"])
    for contract_path in df["contract_path"]:
        if contract_path in files:
            files.remove(contract_path)
    print("=> comp: {}".format(len(files)))
    compiled_files = []
    for idx, file in enumerate(files):
        version = file.split('_')[1]
        os.environ["SYM_SOLC_VERSION"] = version
        os.environ["SYM_FILE"] = file
        os.environ["SYM_FILE_OUTPUT"] = "{}.json".format(file)
        print("=> {} / {} {}".format(idx + 1, len(files), file))
        !solc use $SYM_SOLC_VERSION > /dev/null 2>&1
        !solc --combined-json bin-runtime,srcmap-runtime,ast,asm $SYM_FILE > $SYM_FILE_OUTPUT
        compiled_files.append(file)
    df = pd.DataFrame(compiled_files)
    csv_file = "contracts/compiled.csv"
    df.to_csv(csv_file, header=None, index=False)

# 8. Show smart contract list

In [29]:
### Report 3
# Counts the contracts listed in contracts/compiled.csv whose companion
# <path>.json file is non-empty.
if True:
    df = pd.read_csv("contracts/compiled.csv", header=None, names=["contract_path"])
    num_ok = 0
    for contract_path in df["contract_path"]:
        with open(contract_path + ".json", "r") as f:
            # Empty content means there is nothing usable for this contract.
            if f.read():
                num_ok += 1
    print("=> num_ok: " + str(num_ok))

=> num_ok: 1749


# 9. Set configuration for SymEVM

In [30]:
# Settings written to the .env file in the working directory, one
# KEY=value pair per line and no trailing newline.
env_content = "\n".join([
    "dataload=02",
    "expectCoverage=0.95",
    "maxVisitedBlock=30",
    "maxVisitedBlockBound=100",
    "maxVisitedBlockStep=10",
    'allocatedRange="a0"',
])
with open(".env", "w") as env_file:
    env_file.write(env_content)

# 10. Execute smart contract by SymEVM

In [26]:
### Run symEvm for coverage
if True:
    exceptions = {
        "address": [],
        "heap": [],
        "timeout": [],
        "others": [],
    }
    result = {
        # Concolic 
        "address": [], 
        "contract_name": [],
        "endpoints": [], 
        "covered_jumpis": [], 
        "total_jumpis": [],
        "bytelen": [],
        # Tainting analysis
        "success_sloads": [],
        "success_mloads": [],
        "success_mstores": [],
        "success_sstores": [],
        "failed_sloads": [],
        "failed_mloads": [],
        "failed_mstores": [],
        "failed_sstores": [],
        # Scanner
        "address": [],
        "integer": [],
        "disorder": [],
        "frez": [],
        "reentrancy": [],
        "delegate": [],
        "number": [],
        "timestamp": [],
        # Duration
        "concolic_duration": [],
        "tainting_duration": [],
        "scanner_duration": [],
    }
    df = pd.read_csv("contracts/compiled.csv", header=None, names=["contract_path"])
    files = ["results/sym_evm.csv", "results/exception.csv"]
    for file in files:
        if os.path.exists(file):
             os.remove(file)
        with open(file, "w") as f:
            pass
    contract_paths = df["contract_path"][0:1]
    for idx, contract_path in enumerate(contract_paths):
        os.environ["SYM_CONTRACT_FILE"] = contract_path
        os.environ["SYM_JSON_FILE"] = "{}.json".format(contract_path)
        print("{} / {} - {}".format(idx + 1, len(contract_paths), contract_path))
        ## if json file is empty
        with open(contract_path + ".json", "r") as f:
            if len(f.read()) == 0:
                continue
        ## execute SymEvm
        addr = contract_path.split('/')[1].split('_')[0]
        output = !timeout 120 node index.js
        output = '\n'.join(output)
        print(output)
        ## contract
        heap_error = "heap out of memory" in output
        has_error = "error" in output
        is_ok = !has_error
        if is_ok:
            #--------------------------------#
            #           Concolic
            #--------------------------------#
            if True:
                match = re.findall(r"Start Analyzing Contract:\s+([^\n]+)", output)
                is_ok = is_ok and len(match) > 0
                if is_ok:   
                    contract_name = match[0]
                ## endpoints
                match = re.findall(r"endpoints\s+:\s+(\d+)", output)
                is_ok = is_ok and len(match) > 0
                if is_ok:
                    endpoints = int(match[0])
                ## cjumpis
                match = re.findall(r"cjumpis\s+:\s+(\d+)", output)
                is_ok = is_ok and len(match) > 0
                if is_ok:
                    cjumpis = int(match[0])
                ## njumpis
                match = re.findall(r"njumpis\s+:\s+(\d+)", output)
                is_ok = is_ok and len(match) > 0
                if is_ok:
                    njumpis = int(match[0])
                ## byte len
                match = re.findall(r"bytelen\s+:\s+(\d+)", output)
                is_ok = is_ok and len(match) > 0
                if is_ok:
                    bytelen = int(match[0])
                ## concolic duration
                match = re.findall(r"concolic\s+:\s+(\d+)", output)
                is_ok = is_ok and len(match) > 0
                if is_ok:
                    concolic_duration = int(match[0])
            #--------------------------------#
            #           Tainting
            #--------------------------------#
            if True:
                ## success
                match = re.findall(r"success\s+:\s+\[([^\]]+)", output)
                is_ok = is_ok and len(match) > 0
                if is_ok:
                    success = list(map(lambda x: int(x), match[0].split(',')))
                ## failed
                match = re.findall(r"failed\s+:\s+\[([^\]]+)", output)
                is_ok = is_ok and len(match) > 0
                if is_ok:
                    failed = list(map(lambda x: int(x), match[0].split(',')))
                ## tainting duration
                match = re.findall(r"tainting\s+:\s+(\d+)", output)
                is_ok = is_ok and len(match) > 0
                if is_ok:
                    tainting_duration = int(match[0])
            #--------------------------------#
            #           Scanner
            #--------------------------------#
            if True:
                ## integer
                match = re.findall(r"integer\s+:\s+(false|true)", output)
                is_ok = is_ok and len(match) > 0
                if is_ok:
                    integer = 0 if match[0] == 'false' else 1
                ## disorder
                match = re.findall(r"disorder\s+:\s+(false|true)", output)
                is_ok = is_ok and len(match) > 0
                if is_ok:
                    disorder = 0 if match[0] == 'false' else 1
                ## frez
                match = re.findall(r"frez\s+:\s+(false|true)", output)
                is_ok = is_ok and len(match) > 0
                if is_ok:
                    frez = 0 if match[0] == 'false' else 1
                ## reentrancy
                match = re.findall(r"reentrancy\s+:\s+(false|true)", output)
                is_ok = is_ok and len(match) > 0
                if is_ok:
                    reentrancy = 0 if match[0] == 'false' else 1
                ## number
                match = re.findall(r"number\s+:\s+(false|true)", output)
                is_ok = is_ok and len(match) > 0
                if is_ok:
                    number = 0 if match[0] == 'false' else 1
                ## timestamp
                match = re.findall(r"timestamp\s+:\s+(false|true)", output)
                is_ok = is_ok and len(match) > 0
                if is_ok:
                    timestamp = 0 if match[0] == 'false' else 1
                ## delegate
                match = re.findall(r"delegate\s+:\s+(false|true)", output)
                is_ok = is_ok and len(match) > 0
                if is_ok:
                    delegate = 0 if match[0] == 'false' else 1
                ## tainting duration
                match = re.findall(r"scanner\s+:\s+(\d+)", output)
                is_ok = is_ok and len(match) > 0
                if is_ok:
                    scanner_duration = int(match[0])
        if is_ok:
            ## Concolic
            result["address"].append(addr)
            result["endpoints"].append(endpoints)
            result["covered_jumpis"].append(cjumpis)
            result["total_jumpis"].append(njumpis)
            result["contract_name"].append(contract_name)
            result["bytelen"].append(bytelen)
            result["concolic_duration"].append(concolic_duration)
            ## Tainting
            result["success_sloads"].append(success[0])
            result["success_mloads"].append(success[1])
            result["success_mstores"].append(success[2])
            result["success_sstores"].append(success[3])
            result["failed_sloads"].append(failed[0])
            result["failed_mloads"].append(failed[1])
            result["failed_mstores"].append(failed[2])
            result["failed_sstores"].append(failed[3])
            result["tainting_duration"].append(tainting_duration)
            ## Scanner
            result["integer"].append(integer)
            result["disorder"].append(disorder)
            result["frez"].append(frez)
            result["reentrancy"].append(reentrancy)
            result["number"].append(number)
            result["timestamp"].append(timestamp)
            result["delegate"].append(delegate)
            result["scanner_duration"].append(scanner_duration)
        else:
            exceptions["address"].append(addr)
            if heap_error:
                exceptions["heap"].append(1)
                exceptions["others"].append(0)
                exceptions["timeout"].append(0)
            elif has_error:
                exceptions["others"].append(1)
                exceptions["heap"].append(0)
                exceptions["timeout"].append(0)
            else:
                exceptions["timeout"].append(1)
                exceptions["others"].append(0)
                exceptions["heap"].append(0)
    df = pd.DataFrame(result)
    df.to_csv(files[0], index=None)
    df = pd.DataFrame(exceptions)
    df.to_csv(files[1], index=None)