In [1]:
import os
import pandas as pd
import glob

# Helpers

In [2]:
# Get number of TP and FP among the violation
def count_tp_fp(column_name, df):
    return len(df[(df[column_name] == 'TP')]), len(df[(df[column_name] == 'FP')])

def get_tp_fp_stats(df):
    stats = {}
    stats['i1_tp'], stats['i1_fp'] = count_tp_fp("i1", df)
    stats['i2_tp'], stats['i2_fp']  = count_tp_fp("i2", df)
    stats['i6_tp'], stats['i6_fp']  = count_tp_fp("i6", df)
    stats['i7_tp'], stats['i7_fp']  = count_tp_fp("i7", df)
    stats['i8_tp'], stats['i8_fp']  = count_tp_fp("i8", df)
    stats['i9_tp'], stats['i9_fp']  = count_tp_fp("i9", df)
    stats['total_tp'] = stats['i1_tp'] + stats['i2_tp'] + stats['i6_tp'] + stats['i7_tp'] + stats['i8_tp'] + stats['i9_tp']
    stats['total_fp'] = stats['i1_fp'] + stats['i2_fp'] + stats['i6_fp'] + stats['i7_fp'] + stats['i8_fp'] + stats['i9_fp']
    return stats

### Load manual analysis CSV

In [3]:
manual_analysis = '../experiments_manual_analysis/manual-analysis.csv'

In [4]:
manual_analysis_df = pd.read_csv(manual_analysis)
manual_analysis_df.columns = [x.lower() for x in manual_analysis_df.columns]
manual_analysis_df = manual_analysis_df.drop(columns=[x for x in manual_analysis_df.columns if 'unnamed' in x])
manual_analysis_df = manual_analysis_df[manual_analysis_df['experiment type'] != "no-one"]
manual_analysis_df.head()

Unnamed: 0,address,token name,experiment type,invariants violated,i1,i2,i6,i7,i8,i9,explanation,report,note
0,0xf3e70642c28f3f707408c56624c2f30ea9f9fce3,AlbosToken,testfull10,I2,,FP,,,,68.0,violation is caused by their business logic,FP,
1,0xf7d3320c4676d11d67338b766a9df99996d19777,MKC,testfull10,I7,,,,TP,,,allowance -= safeSub(x - y),TP,
2,0xb57919aebb30812ae188dbe238bc907d56ba4a3a,EveryCoin,testfull10,"I2, I7",,TP,,FP,,,"constructor name mismatch (I2), burnFrom inter...",TP,
3,0x45e5997a4a69ca3d8eb38892bd7d5dad8eadea2b,TCZToken,testfull10,I6,,,TP,,,,"transferFrom with their own implementation, no...",TP,
4,0x4594a218d3149743758b08574f1f532cb790e268,VTEXP,testfull10,I2,,TP,,,,,"wrong transfer logic, money should not be subt...",TP,


### How many contracts have been already manually checked

In [5]:
top_contracts_sources_dir = "/home/eviglianisi/blockchain/blockchain/etherscan-scraper/21_02_toptoken/"
topcontracts_path = glob.glob(top_contracts_sources_dir + "*.sol")
topcontracts_list = [{'address': x.split('-')[-2].split('/')[-1].lower()} for x in topcontracts_path]

In [6]:
topcontractslist_df = pd.DataFrame(topcontracts_list)
topcontractslist_df.head()

Unnamed: 0,address
0,0x0142c3b2fc51819b5af5dfc4aa52df9722790851
1,0x607f4c5bb672230e8672085532f7e901544a7375
2,0x998b3b82bc9dba173990be7afb772788b5acb8bd
3,0x1014613e2b3cbc4d575054d4982e580d9b99d7b1
4,0x8db54ca569d3019a2ba126d03c37c44b5ef81ef6


In [7]:
analyzed_top_contracts = manual_analysis_df.merge(topcontractslist_df, on=['address'], how='inner')

In [8]:
print(analyzed_top_contracts.shape[0], "have been already analyzed from 10000 list")
analyzed_top_contracts_stats = get_tp_fp_stats(analyzed_top_contracts)
print("False positive:", analyzed_top_contracts_stats['total_fp'])
print("True positive:", analyzed_top_contracts_stats['total_tp'])
print("Other (eg. hardcoded)", analyzed_top_contracts.shape[0] - (analyzed_top_contracts_stats['total_tp'] + analyzed_top_contracts_stats['total_fp']))

41 have been already analyzed from 10000 list
False positive: 10
True positive: 22
Other (eg. hardcoded) 9


# _

# Load Kore Test for top 800

In [9]:
csv_path = "../experiments_raw_csvs/testfull10top.csv"
testfull10_df = pd.read_csv(csv_path)
testfull10_df['address'].str.lower()
print(testfull10_df.shape)
testfull10_df.head()

(846, 8)


Unnamed: 0,address,name,I1,I2,I6,I7,I8,I9
0,0x0142c3b2fc51819b5af5dfc4aa52df9722790851,PynToken,0,0,0,0,0,0
1,0x607F4C5BB672230e8672085532f7e901544a7375,RLC,0,0,0,0,0,0
2,0x998b3b82bc9dba173990be7afb772788b5acb8bd,BANCA,0,0,0,0,0,0
3,0x1014613e2b3cbc4d575054d4982e580d9b99d7b1,BCV,0,0,0,0,0,0
4,0x8db54ca569d3019a2ba126d03c37c44b5ef81ef6,DataWalletToken,0,0,0,0,0,0


In [10]:
def get_lines_with_not_zero_count(target_df, cols=['I1', 'I2', 'I6', 'I7', 'I8', 'I9']):
    """
    Returns a new dataframe with only the rows containing at least 1 violation
    """
    return target_df[(target_df[cols].sum(axis=1) > 0)]

In [11]:
testfull10_violated_df = get_lines_with_not_zero_count(testfull10_df, cols=['I1', 'I2', 'I6', 'I7', 'I8', 'I9'])
print("Contracts with at least 1 violation:", testfull10_violated_df.shape[0])

testfull10_violated_I1 = get_lines_with_not_zero_count(testfull10_violated_df, ['I1'])
testfull10_violated_I2 = get_lines_with_not_zero_count(testfull10_violated_df, ['I2'])
testfull10_violated_I6 = get_lines_with_not_zero_count(testfull10_violated_df, ['I6'])
testfull10_violated_I7 = get_lines_with_not_zero_count(testfull10_violated_df, ['I7'])
testfull10_violated_I8 = get_lines_with_not_zero_count(testfull10_violated_df, ['I8'])
testfull10_violated_I9 = get_lines_with_not_zero_count(testfull10_violated_df, ['I9'])

print("Contracts with violation I1:", testfull10_violated_I1.shape[0])
print("Contracts with violation I2:", testfull10_violated_I2.shape[0])
print("Contracts with violation I6:", testfull10_violated_I6.shape[0])
print("Contracts with violation I7:", testfull10_violated_I7.shape[0])
print("Contracts with violation I8:", testfull10_violated_I8.shape[0])
print("Contracts with violation I9:", testfull10_violated_I9.shape[0])

Contracts with at least 1 violation: 95
Contracts with violation I1: 8
Contracts with violation I2: 78
Contracts with violation I6: 4
Contracts with violation I7: 7
Contracts with violation I8: 1
Contracts with violation I9: 12


What are the new contracts to check?

In [12]:
to_check = testfull10_violated_df.merge(analyzed_top_contracts, on=['address'], how='outer', indicator=True)
to_check = to_check[to_check['_merge'] == 'left_only']
to_check.drop(columns=['_merge'])
print(to_check.shape)
to_check.head()

(59, 21)


Unnamed: 0,address,name,I1,I2,I6,I7,I8,I9,token name,experiment type,...,i1,i2,i6,i7,i8,i9,explanation,report,note,_merge
0,0x92e52a1a235d9a103d970901066ce910aacefd37,UCASH,0.0,0.0,0.0,1.0,0.0,0.0,,,...,,,,,,,,,,left_only
1,0x8606a8f28e1e2fd50b9074d65c01548b1f040b32,CryptrustToken,0.0,10.0,0.0,0.0,0.0,0.0,,,...,,,,,,,,,,left_only
2,0xfa456cf55250a839088b27ee32a424d7dacb54ff,blocktrade,0.0,10.0,0.0,0.0,0.0,0.0,,,...,,,,,,,,,,left_only
5,0xffe02ee4c69edf1b340fcad64fbd6b37a7b9e265,NANJCOIN,0.0,10.0,0.0,0.0,0.0,0.0,,,...,,,,,,,,,,left_only
8,0xB70835D7822eBB9426B56543E391846C107bd32C,GameICO,0.0,10.0,0.0,0.0,0.0,0.0,,,...,,,,,,,,,,left_only


# Filter by pattern

### Check how many contracts have the pattern
{  
    balance* += ... ;  
    totalSupply += ...;  

In [13]:
import glob, re

regex = r"{(?:\n|\r\n?)\D*balance.*\[.*\].?\+=.?\w*;(?:\n|\r\n?)\D*totalSupply.?\+=.?.*;"
input_folder = "/home/eviglianisi/blockchain/blockchain/etherscan-scraper/21_02_toptoken/"

sources = glob.glob(input_folder + "*.sol")
mint_pattern = []

for s in sources:
    with open(s) as f:
        content = f.read()
        matches = re.search(regex, content, re.MULTILINE)
        if matches:
            splitted = s.split('-')
            token_name = splitted[-1].split('.')[0]
            token_addr = splitted[-2].split('/')[-1]
            mint_pattern.append(token_addr)

In [14]:
to_check.loc[to_check['address'].isin(mint_pattern)]

Unnamed: 0,address,name,I1,I2,I6,I7,I8,I9,token name,experiment type,...,i1,i2,i6,i7,i8,i9,explanation,report,note,_merge
10,0x621d78f2ef2fd937bfca696cabaf9a779f59b3ed,DRPToken,4.0,1.0,0.0,0.0,0.0,0.0,,,...,,,,,,,,,,left_only
19,0xd7631787b4dcc87b1254cfd1e5ce48e96823dee8,SCLToken,7.0,5.0,0.0,0.0,0.0,0.0,,,...,,,,,,,,,,left_only


# Hardcoded addresses

Check how many contracts with hardcoded address.
NOTE. Check manually because the regex match also addresses that inside comments

In [15]:
regex = r"0x([0-9a-zA-Z]*[1-9a-bA-Z][0-9a-bA-Z]*)"
input_folder = "/home/eviglianisi/blockchain/blockchain/etherscan-scraper/21_02_toptoken/"

sources = glob.glob(input_folder + "*.sol")
hardcoded_addr = []

for s in sources:
    with open(s) as f:
        content = f.read()
        matches = re.search(regex, content, re.MULTILINE)
        if matches:
            splitted = s.split('-')
            token_name = splitted[-1].split('.')[0]
            token_addr = splitted[-2].split('/')[-1]
            hardcoded_addr.append(token_addr)

In [16]:
to_check.loc[to_check['address'].isin(hardcoded_addr)]

Unnamed: 0,address,name,I1,I2,I6,I7,I8,I9,token name,experiment type,...,i1,i2,i6,i7,i8,i9,explanation,report,note,_merge
2,0xfa456cf55250a839088b27ee32a424d7dacb54ff,blocktrade,0.0,10.0,0.0,0.0,0.0,0.0,,,...,,,,,,,,,,left_only
5,0xffe02ee4c69edf1b340fcad64fbd6b37a7b9e265,NANJCOIN,0.0,10.0,0.0,0.0,0.0,0.0,,,...,,,,,,,,,,left_only
8,0xB70835D7822eBB9426B56543E391846C107bd32C,GameICO,0.0,10.0,0.0,0.0,0.0,0.0,,,...,,,,,,,,,,left_only
9,0x8727c112c712c4a03371ac87a74dd6ab104af768,JetCoin,0.0,10.0,0.0,0.0,0.0,0.0,,,...,,,,,,,,,,left_only
10,0x621d78f2ef2fd937bfca696cabaf9a779f59b3ed,DRPToken,4.0,1.0,0.0,0.0,0.0,0.0,,,...,,,,,,,,,,left_only
11,0x7a07e1a0c2514d51132183ecfea2a880ec3b7648,IXEToken,0.0,10.0,0.0,0.0,0.0,0.0,,,...,,,,,,,,,,left_only
12,0x6b193e107a773967bd821bcf8218f3548cfa2503,PossContract,0.0,10.0,0.0,0.0,0.0,0.0,,,...,,,,,,,,,,left_only
15,0x1602af2c782cc03f9241992e243290fccf73bb13,QBITToken,0.0,10.0,0.0,0.0,0.0,0.0,,,...,,,,,,,,,,left_only
16,0x39013f961c378f02c2b82a6e1d31e9812786fd9d,SMSCoin,8.0,0.0,0.0,0.0,0.0,0.0,,,...,,,,,,,,,,left_only
17,0x0e0989b1f9b8a38983c2ba8053269ca62ec9b195,PoetToken,0.0,10.0,0.0,0.0,0.0,0.0,,,...,,,,,,,,,,left_only


### All the other addresses to check manually

In [17]:
to_check.loc[~to_check['address'].isin(hardcoded_addr)]

Unnamed: 0,address,name,I1,I2,I6,I7,I8,I9,token name,experiment type,...,i1,i2,i6,i7,i8,i9,explanation,report,note,_merge
0,0x92e52a1a235d9a103d970901066ce910aacefd37,UCASH,0.0,0.0,0.0,1.0,0.0,0.0,,,...,,,,,,,,,,left_only
1,0x8606a8f28e1e2fd50b9074d65c01548b1f040b32,CryptrustToken,0.0,10.0,0.0,0.0,0.0,0.0,,,...,,,,,,,,,,left_only
14,0x1a95b271b0535d15fa49932daba31ba612b52946,minereum,0.0,10.0,0.0,0.0,0.0,0.0,,,...,,,,,,,,,,left_only
18,0xe30e02f049957e2a5907589e06ba646fb2c321ba,DRPUToken,1.0,1.0,0.0,0.0,0.0,0.0,,,...,,,,,,,,,,left_only
19,0xd7631787b4dcc87b1254cfd1e5ce48e96823dee8,SCLToken,7.0,5.0,0.0,0.0,0.0,0.0,,,...,,,,,,,,,,left_only
24,0xd26114cd6EE289AccF82350c8d8487fedB8A0C07,OMGToken,0.0,3.0,0.0,0.0,0.0,0.0,,,...,,,,,,,,,,left_only
25,0x0ea984e789302b7b612147e4e4144e64f21425eb,WaleTokeN,0.0,10.0,0.0,0.0,0.0,0.0,,,...,,,,,,,,,,left_only
37,0x43567eb78638A55bbE51E9f9FB5B2D7AD1F125aa,HacToken,0.0,10.0,10.0,0.0,10.0,0.0,,,...,,,,,,,,,,left_only
39,0xd2946be786f35c3cc402c29b323647abda799071,VikkyToken,0.0,10.0,0.0,0.0,0.0,0.0,,,...,,,,,,,,,,left_only
46,0x13f25cd52b21650caa8225c9942337d914c9b030,TokenERC20,0.0,0.0,0.0,0.0,0.0,10.0,,,...,,,,,,,,,,left_only


In [20]:
top_manual_analysis = '../experiments_manual_analysis/top-manual-analysis.csv'
top_manual_analysis_df = pd.read_csv(top_manual_analysis)
top_manual_analysis_df.columns = [x.lower() for x in top_manual_analysis_df.columns]
top_manual_analysis_df = top_manual_analysis_df.drop(columns=[x for x in top_manual_analysis_df.columns if 'unnamed' in x])
top_manual_analysis_df = top_manual_analysis_df[top_manual_analysis_df['experiment type'] != "no-one"]

In [22]:
to_check = to_check.drop(columns=['_merge'], errors='ignore')
to_check['address'] = to_check['address'].str.lower()
top_manual_analysis_df['address'] = top_manual_analysis_df['address'].str.lower()
remaining_addresses_to_analyze = top_manual_analysis_df.merge(to_check, on=['address'], how='outer', indicator=True)
remaining_addresses_to_analyze[remaining_addresses_to_analyze['_merge'] == 'right_only']

Unnamed: 0,address,experiment type_x,invariants violated_x,i1_x,i2_x,i6_x,i7_x,i8_x,i9_x,explanation_x,...,i1_y,i2_y,i6_y,i7_y,i8_y,i9_y,explanation_y,report,note_y,_merge


# Helpers

In [23]:
def get_stats(manual_analysis_df, target_df, name):
    cols=['I1', 'I2', 'I6', 'I7', 'I8', 'I9']
    target_violations = get_lines_with_not_zero_count(target_df, cols=cols)
    
    res = {}
    res['name'] = name
    res['contracts_number'] = target_df.shape[0]

    counter_fp = 0
    counter_tp = 0
    number_violations = 0
    
    for c in cols:
        violation_col = get_lines_with_not_zero_count(target_df, [c])
        merged_df = manual_analysis_df.merge(violation_col, on=['address'], how='inner', indicator=True)
        
        res[c] = violation_col.shape[0]
        c_index = c.lower()
        res[c + "_TP"] = len(merged_df[(merged_df[c_index] == "TP")])
        res[c + "_FP"] = len(merged_df[(merged_df[c_index] == "FP")])
        res[c + "_X"] = len(merged_df[(merged_df[c_index] != "FP") & (merged_df[c_index] != "TP")])
        
        counter_fp += res[c + "_FP"]
        counter_tp += res[c + "_TP"]
        number_violations += res[c]
        
        # DEBUG
        try:
            assert(res[c + "_FP"] + res[c + "_TP"] + res[c + "_X"] == res[c])
        except:
            merged_df = manual_analysis_df.merge(violation_col, on=['address'], how='outer', indicator=True)
            print(name, merged_df[merged_df['_merge'] == 'right_only'])       

        
    res['total_TP'] = counter_tp
    res['total_FP'] = counter_fp
    res['violation_number'] = number_violations


    return res

In [24]:
# Merge both manual analysis together
total_manual_analysis = pd.concat([manual_analysis_df, top_manual_analysis_df])
total_manual_analysis['address'] = total_manual_analysis['address'].str.lower()
total_manual_analysis = total_manual_analysis.drop_duplicates(subset='address', keep="first")
testfull10_df['address'] = testfull10_df['address'].str.lower()
testfull10top_stat = get_stats(total_manual_analysis, testfull10_df, 'testfull10')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [26]:
table = pd.DataFrame.from_dict([testfull10top_stat])
table.to_csv('../experiments_stats_csvs/testfull10top_stat.csv')
table

Unnamed: 0,I1,I1_FP,I1_TP,I1_X,I2,I2_FP,I2_TP,I2_X,I6,I6_FP,...,I8_X,I9,I9_FP,I9_TP,I9_X,contracts_number,name,total_FP,total_TP,violation_number
0,8,1,7,0,78,2,33,43,4,3,...,0,12,1,11,0,846,testfull10,13,53,110
