# Demo data

In [1]:
import pandas as pd

# Raw test split as shipped in CSV form.
test_sol_csv = "./data/solfile/test_sol_file.csv"
test_sol_files = pd.read_csv(test_sol_csv)


In [2]:
# Schema check: column names, dtypes and non-null counts of the loaded CSV.
test_sol_files.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18639 entries, 0 to 18638
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Unnamed: 0        18639 non-null  int64 
 1   contract_address  18639 non-null  object
 2   source_code       18639 non-null  object
dtypes: int64(1), object(2)
memory usage: 437.0+ KB


In [3]:
# Pull the source text of the row labelled 0 to use as a hand-inspectable sample.
first_row_label = 0
sol_file = test_sol_files.at[first_row_label, "source_code"]

In [4]:
# Dump the sampled contract to disk so it can be parsed and inspected later.
# The encoding is pinned because the platform default is not always UTF-8 and
# contract sources may contain non-ASCII text. newline="" writes the existing
# line endings verbatim (the dataset contains both "\r\n" and "\n") instead of
# letting text mode translate "\n" to the OS line separator.
with open("sample.sol", "w", encoding="utf-8", newline="") as f:
    f.write(sol_file)

In [5]:
import pandas as pd

train = pd.read_parquet("./out/data_train.parquet", engine="fastparquet")

# Whitespace-delimited token count of every function body, then its distribution.
train["num_token"] = [len(body.split()) for body in train["function_body"]]
train["num_token"].describe()

count    740796.000000
mean         18.397077
std          29.428295
min           0.000000
25%           3.000000
50%           9.000000
75%          22.000000
max        1457.000000
Name: num_token, dtype: float64

In [7]:
# Fraction of training rows whose token count is below 50.
train_short_mask = train["num_token"] < 50
int(train_short_mask.sum()) / len(train)

0.9137832817671802

In [12]:
test = pd.read_parquet("./out/data_test.parquet", engine="fastparquet")

# Whitespace-delimited token count of every masked contract, then its distribution.
test["num_token"] = [len(contract.split()) for contract in test["contract_masked"]]
test["num_token"].describe()

count    76178.000000
mean       815.031374
std        530.507409
min         11.000000
25%        356.000000
50%        763.000000
75%       1206.000000
max       2048.000000
Name: num_token, dtype: float64

In [13]:
# Fraction of test rows whose token count is below 1024.
test_short_mask = test["num_token"] < 1024
int(test_short_mask.sum()) / len(test)

0.634028197117278

In [10]:
# List the function bodies that are exactly three whitespace tokens long.
# The count is derived from "function_body" right here: the shared
# test["num_token"] column is computed from "contract_masked" in another cell,
# so filtering on it would select rows by masked-contract length instead and
# this cell would only work when cells are executed out of order.
body_token_count = test["function_body"].str.split().str.len()
test.loc[body_token_count == 3, "function_body"]

9                   \r\n    pendingOwner = newOwner;\r\n  
17             \r\n        migrationPhase = false;\r\n    
18       \r\n        return ERC20Basic(tokenAddress).tr...
19       \r\n        return super.transfer(_to, _value)...
20       \n        return pool.underlyingBalance(addres...
                               ...                        
76123                   \r\n        god = _newGod;\r\n    
76126    \r\n        interfaceContract = _interfaceCont...
76133    \r\n        frozenHoldings[_owner] += _tokens;...
76134    \r\n        frozenHoldings[_owner] -= _tokens;...
76175           \r\n        _decimals = decimals_;\r\n    
Name: function_body, Length: 5672, dtype: object

# Check variable accessibility

In [1]:
from solidity_parser import parser

# Load the small test contract that is parsed in the following cell.
with open("./test.sol") as sol_handle:
    sol_file = sol_handle.read()

# Echo the raw text to confirm what was read.
sol_file

'// SPDX-License-Identifier: MIT\npragma solidity ^0.8.9;\n\nstruct testType {\n    uint testInt;\n    string testString;\n}\n\ncontract Test {\n    uint x = 0;\n    uint num = x;\n    testType test;\n    mapping (uint => string) map;\n\n    \n    function something(testType memory x_, uint y) public pure {\n        x_.testInt = 1;\n        y = 0;\n    }\n}'

In [2]:
import json

# Parse the test contract with loc=True and store the parser output as JSON.
with open("test.json", "w") as parsed_out:
    test_parsed = parser.parse(sol_file, loc=True)
    parsed_out.write(json.dumps(test_parsed))

In [4]:
# Same round trip for sample.sol, this time parsed with loc=False.
with open("./sample.sol") as sample_handle:
    sample_file = sample_handle.read()

with open("sample.json", "w") as sample_out:
    sample_parsed = parser.parse(sample_file, loc=False)
    sample_out.write(json.dumps(sample_parsed))

# Check data

In [1]:
import pandas as pd

# Raw CSV splits of the Solidity source files.
original_train = pd.read_csv("./data/solfile/train_sol_file.csv")
original_valid = pd.read_csv("./data/solfile/valid_sol_file.csv")
original_test = pd.read_csv("./data/solfile/test_sol_file.csv")

# Fold the validation split into training. reset_index() without drop=True
# keeps the previous row labels as an extra "index" column.
train_valid_parts = [original_train, original_valid]
train_data = pd.concat(train_valid_parts).reset_index()

# Persist both splits as parquet.
for split_frame, split_path in (
    (train_data, "./data/solfile/train_file.parquet"),
    (original_test, "./data/solfile/test_file.parquet"),
):
    split_frame.to_parquet(split_path, engine="fastparquet")

In [2]:
# Reload the combined training split from parquet and inspect its schema.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly rather than wrapped in print(), which would also emit "None".

train_data = pd.read_parquet("./data/solfile/train_file.parquet", engine="fastparquet")
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 167758 entries, 0 to 167757
Data columns (total 4 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   index             167758 non-null  int64 
 1   Unnamed: 0        167758 non-null  int64 
 2   contract_address  167758 non-null  object
 3   source_code       167758 non-null  object
dtypes: int64(2), object(2)
memory usage: 5.1+ MB
None


In [3]:
# Rebuild a default positional index, discarding the current row labels.
# The existing "index" column is an ordinary column and is not affected.
train_data = train_data.reset_index(drop=True)
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 167758 entries, 0 to 167757
Data columns (total 4 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   index             167758 non-null  int64 
 1   Unnamed: 0        167758 non-null  int64 
 2   contract_address  167758 non-null  object
 3   source_code       167758 non-null  object
dtypes: int64(2), object(2)
memory usage: 5.1+ MB


In [4]:
# Remove the "Unnamed: 0" column carried over from the CSV, then re-check the schema.
train_data = train_data.drop(columns=["Unnamed: 0"])
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 167758 entries, 0 to 167757
Data columns (total 3 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   index             167758 non-null  int64 
 1   contract_address  167758 non-null  object
 2   source_code       167758 non-null  object
dtypes: int64(1), object(2)
memory usage: 3.8+ MB


In [5]:
# Overwrite the training parquet file with the current state of train_data.
train_data.to_parquet("./data/solfile/train_file.parquet", engine="fastparquet")

In [6]:
# Schema of the frame that was just written out.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 167758 entries, 0 to 167757
Data columns (total 3 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   index             167758 non-null  int64 
 1   contract_address  167758 non-null  object
 2   source_code       167758 non-null  object
dtypes: int64(1), object(2)
memory usage: 3.8+ MB


In [7]:
# Load the test split back from parquet. The engine is pinned to fastparquet
# to match every other parquet read/write in this notebook, rather than
# depending on whichever engine pandas selects by default.
test_data = pd.read_parquet("./data/solfile/test_file.parquet", engine="fastparquet")
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18639 entries, 0 to 18638
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Unnamed: 0        18639 non-null  int64 
 1   contract_address  18639 non-null  object
 2   source_code       18639 non-null  object
dtypes: int64(1), object(2)
memory usage: 437.0+ KB


In [12]:
# NOTE(review): a bare mapping is applied to the row labels (axis 0), not to
# the columns, so the "Unnamed: 0" column is NOT renamed here; the info()
# output that follows still lists it. Renaming the column would need
# columns={...}.
test_data.rename({"Unnamed: 0": "index"}, inplace=True)

In [13]:
# Check whether the rename had any effect on the columns.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 18639 entries, 0 to 18638
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Unnamed: 0        18639 non-null  int64 
 1   contract_address  18639 non-null  object
 2   source_code       18639 non-null  object
dtypes: int64(1), object(2)
memory usage: 582.5+ KB


In [14]:
# NOTE(review): index={0: "index"} relabels the row whose label is 0 as
# "index"; it neither names the index nor touches any column. The report
# below accordingly shows entries running from "index" to 18638.
test_data.rename(index={0: "index"}, inplace=True)
test_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 18639 entries, index to 18638
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Unnamed: 0        18639 non-null  int64 
 1   contract_address  18639 non-null  object
 2   source_code       18639 non-null  object
dtypes: int64(1), object(2)
memory usage: 582.5+ KB


In [15]:
# Set the name of the index axis; row labels and columns stay as they are.
test_data.index.name = "index"

In [16]:
# Re-inspect the frame after naming the index; "Unnamed: 0" is still a column.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 18639 entries, index to 18638
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Unnamed: 0        18639 non-null  int64 
 1   contract_address  18639 non-null  object
 2   source_code       18639 non-null  object
dtypes: int64(1), object(2)
memory usage: 582.5+ KB


In [17]:
# Same row-label mapping as the earlier inplace call, applied by reassignment.
# NOTE(review): presumably a no-op at this point, since the label 0 was
# already replaced by "index" - confirm before relying on it.
test_data = test_data.rename(index={0: "index"})
test_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 18639 entries, index to 18638
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Unnamed: 0        18639 non-null  int64 
 1   contract_address  18639 non-null  object
 2   source_code       18639 non-null  object
dtypes: int64(1), object(2)
memory usage: 582.5+ KB


In [18]:
# Throw away the modified row labels and return to a default RangeIndex.
test_data = test_data.reset_index(drop=True)
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18639 entries, 0 to 18638
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Unnamed: 0        18639 non-null  int64 
 1   contract_address  18639 non-null  object
 2   source_code       18639 non-null  object
dtypes: int64(1), object(2)
memory usage: 437.0+ KB


In [19]:
# Remove the "Unnamed: 0" column carried over from the CSV and show the result.
test_data = test_data.drop(columns=["Unnamed: 0"])
test_data

Unnamed: 0,contract_address,source_code
0,0x39176c7d2fe75fbcd0bfaece2fd58472b15b5a53,/**\r\n *Submitted for verification at Ethersc...
1,0x8efe2c82bd31b67fa262c0d364773629f6ea828a,// hevm: flattened sources of src/DssSpell.sol...
2,0x286708f069225905194673755f12359e6aff6fe1,pragma solidity 0.4.25;\r\n\r\ncontract ERC20B...
3,0xb374387a340e6aa7d78385c4a4aac6b425a685b0,contract Strategy is BaseStrategyInitializable...
4,0x05b55fd424765323aaefa26ae46d5a88c4e8abd2,pragma solidity >=0.4.22 <0.7.0;\r\n\r\nabstra...
...,...,...
18634,0x48a7c27fca93b3282fc403cfa849b0ac3fe26645,// SPDX-License-Identifier: GPL-3.0\r\n\r\npra...
18635,0xb518575fef12793c0d3b703f35a75b8c9e09cb71,// SPDX-License-Identifier: MIT\n\npragma soli...
18636,0x805b28b949fc66fb9bb73e5c029d5a06ec83f322,// SPDX-License-Identifier: MIT\r\n\r\npragma ...
18637,0x9815c5a8626ec80352c3abf6f138a094d51f276d,pragma solidity ^0.6.12;\r\n\r\n// uncomment i...


In [20]:
# Confirm only contract_address and source_code remain.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18639 entries, 0 to 18638
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   contract_address  18639 non-null  object
 1   source_code       18639 non-null  object
dtypes: object(2)
memory usage: 291.4+ KB


In [21]:
# Overwrite the test parquet file with the cleaned two-column frame.
test_data.to_parquet("./data/solfile/test_file.parquet", engine="fastparquet")

In [22]:
# Back to the training frame: it still carries the extra "index" column.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 167758 entries, 0 to 167757
Data columns (total 3 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   index             167758 non-null  int64 
 1   contract_address  167758 non-null  object
 2   source_code       167758 non-null  object
dtypes: int64(1), object(2)
memory usage: 3.8+ MB


In [23]:
# Remove the "index" column so the training frame has the same two columns as test.
train_data = train_data.drop(columns=["index"])
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 167758 entries, 0 to 167757
Data columns (total 2 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   contract_address  167758 non-null  object
 1   source_code       167758 non-null  object
dtypes: object(2)
memory usage: 2.6+ MB


In [24]:
# Overwrite the training parquet file with the cleaned two-column frame.
train_data.to_parquet("./data/solfile/train_file.parquet", engine="fastparquet")

In [25]:
# Round-trip check: read the training file back and confirm the saved schema.
train_data = pd.read_parquet("./data/solfile/train_file.parquet", engine="fastparquet")
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 167758 entries, 0 to 167757
Data columns (total 2 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   contract_address  167758 non-null  object
 1   source_code       167758 non-null  object
dtypes: object(2)
memory usage: 2.6+ MB


In [26]:
# Round-trip check: read the test file back and confirm the saved schema.
test_data = pd.read_parquet("./data/solfile/test_file.parquet", engine="fastparquet")
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18639 entries, 0 to 18638
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   contract_address  18639 non-null  object
 1   source_code       18639 non-null  object
dtypes: object(2)
memory usage: 291.4+ KB


# Make data with accessible variable information

In [29]:
import pandas as pd

# Load ./data/data/train_data.parquet and inspect its schema. This is a
# different file from ./data/solfile/train_file.parquet, and the assignment
# rebinds the train_data name used in the previous section.
train_data = pd.read_parquet("./data/data/train_data.parquet", engine="fastparquet")
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 141329 entries, 0 to 141328
Data columns (total 5 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   file_address     141329 non-null  object
 1   contract_name    141329 non-null  object
 2   func_name        141329 non-null  object
 3   masked_contract  141329 non-null  object
 4   func_body        141329 non-null  object
dtypes: object(5)
memory usage: 5.4+ MB
