# Remove uncompilable file

## Load solidity file

In [94]:
import os
os.chdir("/home/hieuvd/lvdthieu/CodeGen/")

In [95]:
import pandas as pd
train_file = pd.read_parquet("./data/solfile/train_file.parquet", engine="fastparquet")
train_file.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 167310 entries, 0 to 167309
Data columns (total 6 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   file_name         167310 non-null  object
 1   file_address      167310 non-null  object
 2   source_code       167310 non-null  object
 3   abi               167310 non-null  object
 4   compiler_version  167310 non-null  object
 5   library           167310 non-null  object
dtypes: object(6)
memory usage: 7.7+ MB


In [96]:
test_file = pd.read_parquet("./data/solfile/test_file.parquet", engine="fastparquet")
test_file.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18594 entries, 0 to 18593
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   file_name         18594 non-null  object
 1   file_address      18594 non-null  object
 2   source_code       18594 non-null  object
 3   abi               18594 non-null  object
 4   compiler_version  18594 non-null  object
 5   library           18594 non-null  object
dtypes: object(6)
memory usage: 871.7+ KB


## Filter out file with "import" word

In [97]:
test_file = test_file[~(test_file["source_code"].str.contains("import", regex=False))]\
    [["file_name", "file_address", "source_code", "compiler_version"]]\
        .reset_index(drop=True)
                        
test_file.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13642 entries, 0 to 13641
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   file_name         13642 non-null  object
 1   file_address      13642 non-null  object
 2   source_code       13642 non-null  object
 3   compiler_version  13642 non-null  object
dtypes: object(4)
memory usage: 426.4+ KB


In [98]:
train_file = train_file[~(train_file["source_code"].str.contains("import", regex=False))]\
    [["file_name", "file_address", "source_code", "compiler_version"]]\
        .reset_index(drop=True)
train_file.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 127351 entries, 0 to 127350
Data columns (total 4 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   file_name         127351 non-null  object
 1   file_address      127351 non-null  object
 2   source_code       127351 non-null  object
 3   compiler_version  127351 non-null  object
dtypes: object(4)
memory usage: 3.9+ MB


## Standardize compiler_version name

In [99]:
train_file["compiler_version"] = train_file["compiler_version"].apply(lambda x: x.split('+')[0].split('-')[0][1:])
train_file["compiler_version"]

0         0.4.20
1         0.8.11
2         0.6.12
3          0.8.7
4         0.4.24
           ...  
127346    0.6.11
127347    0.5.17
127348    0.4.19
127349    0.6.12
127350    0.5.17
Name: compiler_version, Length: 127351, dtype: object

In [100]:
test_file["compiler_version"] = test_file["compiler_version"].apply(lambda x: x.split('+')[0].split('-')[0][1:])
test_file["compiler_version"]

0        0.4.12
1        0.4.25
2        0.6.12
3        0.6.11
4         0.7.0
          ...  
13637     0.5.9
13638    0.6.12
13639    0.6.12
13640     0.8.1
13641     0.6.6
Name: compiler_version, Length: 13642, dtype: object

## Filter compiler with version < 0.4.11

In [101]:
def filter(stdized_version):
    version = list(map(int, stdized_version.split('.')))
    if version[0] == 0:
        if version[1] == 4:
            if version[2] < 11:
                return False
        if version[2] < 4:
            return False
    return True

In [102]:
test_file = test_file[test_file["compiler_version"].apply(filter)].reset_index(drop=True)
test_file.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12078 entries, 0 to 12077
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   file_name         12078 non-null  object
 1   file_address      12078 non-null  object
 2   source_code       12078 non-null  object
 3   compiler_version  12078 non-null  object
dtypes: object(4)
memory usage: 377.6+ KB


In [103]:
train_file = train_file[train_file["compiler_version"].apply(filter)].reset_index(drop=True)
train_file.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113462 entries, 0 to 113461
Data columns (total 4 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   file_name         113462 non-null  object
 1   file_address      113462 non-null  object
 2   source_code       113462 non-null  object
 3   compiler_version  113462 non-null  object
dtypes: object(4)
memory usage: 3.5+ MB


## Test compile

In [104]:
os.chdir("/home/hieuvd/lvdthieu/CodeGen")

In [105]:
compiler_versions = list(set(train_file["compiler_version"].tolist() + test_file["compiler_version"].tolist()))
len(compiler_versions)

58

In [106]:
compiler_config = ""
for version in compiler_versions:
    compiler_config += '{version: "' + version + '"},\n'
with open("compiler_config.txt", "w") as f:
    f.write(compiler_config)

In [107]:
for i in range(100):
    source = test_file.loc[i, "source_code"]
    with open(f"./hardhat/contracts/sample_{i}.sol", "w") as f:
        f.write(source)

# Remove duplicates data

In [35]:
import pandas as pd

mask_all_func_body = pd.read_parquet("/home/hieuvd/lvdthieu/CodeGen/data/data/mask_all_func_body.parquet", engine="fastparquet")
mask_all_func_body.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1251883 entries, 0 to 1251882
Data columns (total 6 columns):
 #   Column                     Non-Null Count    Dtype 
---  ------                     --------------    ----- 
 0   source_idx                 1251883 non-null  object
 1   contract_name              1251883 non-null  object
 2   func_name                  1180119 non-null  object
 3   masked_contract            1251883 non-null  object
 4   func_body                  1251883 non-null  object
 5   func_body_removed_comment  1251883 non-null  object
dtypes: object(6)
memory usage: 57.3+ MB


In [36]:
mask_all_func_body.head()

Unnamed: 0,source_idx,contract_name,func_name,masked_contract,func_body,func_body_removed_comment
0,0,Ownable,Ownable,contract\tOwnable\t\t{\t\t\t\t\t\t\n\taddress\...,\t\t\t\t\t\t\r\n\t\towner\t= msg.sender;\t\t\t...,\t\t\t\t\t\t\n\t\towner\t= msg.sender;\t\t\t\t...
1,0,Ownable,transfertOwnership,contract\tOwnable\t\t{\t\t\t\t\t\t\n\taddress\...,\t\r\n\t\towner\t=\tnewOwner\t;\t\t\t\t\r\n\t,\t\n\t\towner\t=\tnewOwner\t;\t\t\t\t\n\t
2,1,SCT,SCT,contract SCT is SafeMath{\n string public n...,\r\n balanceOf[msg.sender] = initialSup...,\n balanceOf[msg.sender] = initialSuppl...
3,1,SCT,transfer,contract SCT is SafeMath{\n string public n...,\r\n require (_value > 0) ;\r\n re...,\n require (_value > 0) ;\n requir...
4,1,SCT,approve,contract SCT is SafeMath{\n string public n...,\r\n allowance[msg.sender][_spender] ...,\n allowance[msg.sender][_spender] = ...


In [37]:
mask_all_func_body = mask_all_func_body.sample(frac=1, random_state=29, ignore_index=True)

In [39]:
mask_all_func_body.drop_duplicates(subset=["func_body_removed_comment"], inplace=True, ignore_index=True)
mask_all_func_body.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 308807 entries, 0 to 308806
Data columns (total 6 columns):
 #   Column                     Non-Null Count   Dtype 
---  ------                     --------------   ----- 
 0   source_idx                 308807 non-null  object
 1   contract_name              308807 non-null  object
 2   func_name                  279223 non-null  object
 3   masked_contract            308807 non-null  object
 4   func_body                  308807 non-null  object
 5   func_body_removed_comment  308807 non-null  object
dtypes: object(6)
memory usage: 14.1+ MB


In [40]:
mask_all_func_body = mask_all_func_body.sample(frac=1, random_state=11, ignore_index=True)

In [41]:
mask_all_func_body.drop_duplicates(subset=["source_idx", "contract_name"], inplace=True, ignore_index=True)

In [43]:
mask_all_func_body.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74288 entries, 0 to 74287
Data columns (total 6 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   source_idx                 74288 non-null  object
 1   contract_name              74288 non-null  object
 2   func_name                  58521 non-null  object
 3   masked_contract            74288 non-null  object
 4   func_body                  74288 non-null  object
 5   func_body_removed_comment  74288 non-null  object
dtypes: object(6)
memory usage: 3.4+ MB


In [44]:
mask_all_func_body.to_parquet("/home/hieuvd/lvdthieu/CodeGen/data/data/masked_all_func_body_v1.parquet", engine="fastparquet")

In [51]:
def remove_duplicates(df, duplicate_cols):
    df = df.sample(frac=1, random_state=29, ignore_index=True)
    df.drop_duplicates(subset=duplicate_cols, inplace=True, ignore_index=True)
    return df

In [52]:
mask_func_has_req_body = pd.read_parquet("/home/hieuvd/lvdthieu/CodeGen/data/data/mask_func_has_req_body.parquet", engine="fastparquet")
mask_func_has_req_body.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 346379 entries, 0 to 346378
Data columns (total 7 columns):
 #   Column                     Non-Null Count   Dtype 
---  ------                     --------------   ----- 
 0   source_idx                 346379 non-null  object
 1   contract_name              346379 non-null  object
 2   func_name                  328075 non-null  object
 3   masked_contract            346379 non-null  object
 4   func_body                  346379 non-null  object
 5   func_requirement           346379 non-null  object
 6   func_body_removed_comment  346379 non-null  object
dtypes: object(7)
memory usage: 18.5+ MB


In [53]:
mask_func_has_req_body = remove_duplicates(mask_func_has_req_body, ["func_body_removed_comment"])
mask_func_has_req_body.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113542 entries, 0 to 113541
Data columns (total 7 columns):
 #   Column                     Non-Null Count   Dtype 
---  ------                     --------------   ----- 
 0   source_idx                 113542 non-null  object
 1   contract_name              113542 non-null  object
 2   func_name                  105100 non-null  object
 3   masked_contract            113542 non-null  object
 4   func_body                  113542 non-null  object
 5   func_requirement           113542 non-null  object
 6   func_body_removed_comment  113542 non-null  object
dtypes: object(7)
memory usage: 6.1+ MB


In [54]:
mask_func_has_req_body = remove_duplicates(mask_func_has_req_body, ["source_idx", "contract_name"])
mask_func_has_req_body.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35475 entries, 0 to 35474
Data columns (total 7 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   source_idx                 35475 non-null  object
 1   contract_name              35475 non-null  object
 2   func_name                  29902 non-null  object
 3   masked_contract            35475 non-null  object
 4   func_body                  35475 non-null  object
 5   func_requirement           35475 non-null  object
 6   func_body_removed_comment  35475 non-null  object
dtypes: object(7)
memory usage: 1.9+ MB


In [55]:
masked_entire_function_has_req_v1 = pd.read_parquet("/home/hieuvd/lvdthieu/CodeGen/data/data/masked_entire_fuction_has_req_v1.parquet", engine="fastparquet")
masked_entire_function_has_req_v1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40787 entries, 0 to 40786
Data columns (total 7 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   source_idx                40787 non-null  object
 1   contract_name             40787 non-null  object
 2   func_name                 34723 non-null  object
 3   masked_contract           40787 non-null  object
 4   function                  40787 non-null  object
 5   func_requirement          40787 non-null  object
 6   function_removed_comment  40787 non-null  object
dtypes: object(7)
memory usage: 2.2+ MB
