# Merge training and validation data

In [27]:
import pandas as pd

# Load the train and validation splits and stack them row-wise into one frame
# with a fresh 0..n-1 index.
data_train = pd.read_parquet("./out/all_data_train.parquet", engine="fastparquet")
data_valid = pd.read_parquet("./out/all_data_valid.parquet", engine="fastparquet")
data_train = pd.concat([data_train, data_valid], axis=0).reset_index(drop=True)
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the cell output.
data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 907368 entries, 0 to 907367
Data columns (total 7 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   index                 907368 non-null  int64 
 1   file_address          907368 non-null  object
 2   contract_name         907368 non-null  object
 3   function_name         840591 non-null  object
 4   contract_masked       907368 non-null  object
 5   function_body         880821 non-null  object
 6   function_requirement  907368 non-null  object
dtypes: int64(1), object(6)
memory usage: 48.5+ MB
None


In [28]:
# Discard every row that has a missing value in any column.
data_train = data_train.dropna(axis=0)
# info() prints its own report and returns None; calling it directly avoids
# the trailing "None" that print(info()) produced.
data_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 819390 entries, 0 to 907367
Data columns (total 7 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   index                 819390 non-null  int64 
 1   file_address          819390 non-null  object
 2   contract_name         819390 non-null  object
 3   function_name         819390 non-null  object
 4   contract_masked       819390 non-null  object
 5   function_body         819390 non-null  object
 6   function_requirement  819390 non-null  object
dtypes: int64(1), object(6)
memory usage: 50.0+ MB
None


In [29]:
# Number of distinct (file_address, contract_name) pairs left in the frame.
data_train.groupby(by=["file_address", "contract_name"]).ngroups

144608

In [30]:
# Size of each masked contract, measured as its count of whitespace-separated
# tokens. The vectorised .str accessor replaces a lambda whose parameter was
# named `str`, which shadowed the built-in type inside the callback.
data_train["num_token"] = data_train["contract_masked"].str.split().str.len()

In [31]:
# Largest, then smallest, whitespace-token count, one value per line.
print(data_train["num_token"].max(), data_train["num_token"].min(), sep="\n")

14254
10


In [32]:
# Fraction of rows whose masked contract has at most 4096 whitespace tokens.
len(data_train.loc[data_train["num_token"] <= 4096]) / len(data_train)

0.9797129571998682

In [33]:
# Fraction of rows whose token count lies in the inclusive range [100, 4096].
(
    data_train.loc[
        (data_train["num_token"] >= 100) & (data_train["num_token"] <= 4096)
    ].shape[0]
    / data_train.shape[0]
)

0.939235284785023

In [38]:
# Masked source of the rows that have exactly 10 whitespace tokens.
data_train.loc[data_train["num_token"] == 10, "contract_masked"]

252310    contract ArtemineICOFactory {\r\n    /* Constr...
624843    contract MyContract {\n    /* Constructor */\n...
635434    contract MinereumTokenCreationService {\r\n   ...
Name: contract_masked, dtype: object

In [39]:
# Full masked source of row label 624843, looked up among the 10-token rows.
data_train.loc[data_train["num_token"] == 10, "contract_masked"].loc[624843]

'contract MyContract {\n    /* Constructor */\n   \r\n function MyContract() {<FILL_FUNCTION_BODY>}\r\n}'

In [40]:
# Full masked source of row label 635434, looked up among the 10-token rows.
data_train.loc[data_train["num_token"] == 10, "contract_masked"].loc[635434]

'contract MinereumTokenCreationService {\r\n    /* Constructor */\r\n    function MinereumTokenCreationService() {<FILL_FUNCTION_BODY>}\r\n}'

In [43]:
# Keep only rows with at most 2048 whitespace tokens, then renumber the rows
# from 0 (the old labels are discarded).
data_train = data_train.loc[data_train["num_token"] <= 2048].reset_index(drop=True)

In [44]:
# Print the frame summary (row count, columns, non-null counts, dtypes).
data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 740796 entries, 0 to 740795
Data columns (total 8 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   index                 740796 non-null  int64 
 1   file_address          740796 non-null  object
 2   contract_name         740796 non-null  object
 3   function_name         740796 non-null  object
 4   contract_masked       740796 non-null  object
 5   function_body         740796 non-null  object
 6   function_requirement  740796 non-null  object
 7   num_token             740796 non-null  int64 
dtypes: int64(2), object(6)
memory usage: 45.2+ MB


In [47]:
# Remove the helper token-count column from the frame.
data_train = data_train.drop("num_token", axis=1)

In [48]:
# Print the frame summary again to confirm which columns remain.
data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 740796 entries, 0 to 740795
Data columns (total 7 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   index                 740796 non-null  int64 
 1   file_address          740796 non-null  object
 2   contract_name         740796 non-null  object
 3   function_name         740796 non-null  object
 4   contract_masked       740796 non-null  object
 5   function_body         740796 non-null  object
 6   function_requirement  740796 non-null  object
dtypes: int64(1), object(6)
memory usage: 39.6+ MB


In [49]:
# Persist the frame as a parquet file using the fastparquet engine.
data_train.to_parquet(path="./out/data_train.parquet", engine="fastparquet")

# Make test data

In [50]:
import pandas as pd

# Read the CSV shards ./out/data0.csv .. ./out/data18.csv into a list of frames.
# Loading stays best-effort: a shard that cannot be read is reported and skipped.
data = []
for i in range(19):
    shard_path = f"./out/data{i}.csv"
    try:
        data.append(pd.read_csv(shard_path))
    except Exception as exc:
        # `except Exception` (rather than a bare `except:`) lets
        # KeyboardInterrupt / SystemExit propagate, and the message now says
        # why the shard was skipped instead of printing only its number.
        print(f"{i}: skipped {shard_path} ({exc!r})")

In [51]:
# Stack all loaded shards row-wise into one frame with a fresh 0..n-1 index,
# then print its summary.
all_data = pd.concat(data, axis=0, ignore_index=True)
all_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92821 entries, 0 to 92820
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   file_address          92821 non-null  object
 1   contract_name         92821 non-null  object
 2   function_name         86163 non-null  object
 3   contract_masked       92821 non-null  object
 4   function_body         90218 non-null  object
 5   function_requirement  92821 non-null  object
dtypes: object(6)
memory usage: 4.2+ MB


In [53]:
# Discard every row that has a missing value in any column, then print the
# summary of what is left.
all_data = all_data.dropna(axis="index", how="any")
all_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 84066 entries, 0 to 92820
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   file_address          84066 non-null  object
 1   contract_name         84066 non-null  object
 2   function_name         84066 non-null  object
 3   contract_masked       84066 non-null  object
 4   function_body         84066 non-null  object
 5   function_requirement  84066 non-null  object
dtypes: object(6)
memory usage: 4.5+ MB


## Check unique contracts

In [54]:
# Number of distinct (file_address, contract_name) pairs in the frame.
all_data.groupby(by=["file_address", "contract_name"]).ngroups

14648

In [55]:
# Add a column holding each masked contract's count of whitespace-separated
# tokens, then print the frame summary.
all_data["num_token"] = all_data["contract_masked"].map(lambda text: len(text.split()))
all_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 84066 entries, 0 to 92820
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   file_address          84066 non-null  object
 1   contract_name         84066 non-null  object
 2   function_name         84066 non-null  object
 3   contract_masked       84066 non-null  object
 4   function_body         84066 non-null  object
 5   function_requirement  84066 non-null  object
 6   num_token             84066 non-null  int64 
dtypes: int64(1), object(6)
memory usage: 5.1+ MB


In [56]:
# Summary statistics (count, mean, std, min, quartiles, max) of the token counts.
all_data["num_token"].describe()

count    84066.000000
mean      1049.983144
std        986.414891
min         11.000000
25%        395.000000
50%        852.000000
75%       1418.000000
max      10343.000000
Name: num_token, dtype: float64

In [57]:
# Fraction of rows whose masked contract has at most 2048 whitespace tokens.
all_data.loc[all_data["num_token"] <= 2048].shape[0] / all_data.shape[0]

0.9061689624818595

In [58]:
# Keep only rows with at most 2048 whitespace tokens (row labels are left
# untouched here), then print the summary.
all_data = all_data.loc[all_data["num_token"] <= 2048]
all_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 76178 entries, 0 to 92820
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   file_address          76178 non-null  object
 1   contract_name         76178 non-null  object
 2   function_name         76178 non-null  object
 3   contract_masked       76178 non-null  object
 4   function_body         76178 non-null  object
 5   function_requirement  76178 non-null  object
 6   num_token             76178 non-null  int64 
dtypes: int64(1), object(6)
memory usage: 4.6+ MB


In [59]:
# Distinct (file_address, contract_name) pairs remaining after the length filter.
all_data.groupby(by=["file_address", "contract_name"]).ngroups

14332

In [60]:
# Masked source of the rows that have exactly 11 whitespace tokens.
all_data.loc[all_data["num_token"] == 11, "contract_masked"]

45576    contract Initializer is StorageV1 {\n\n    //c...
Name: contract_masked, dtype: object

In [61]:
# Print the frame summary (row count, columns, non-null counts, dtypes).
all_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 76178 entries, 0 to 92820
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   file_address          76178 non-null  object
 1   contract_name         76178 non-null  object
 2   function_name         76178 non-null  object
 3   contract_masked       76178 non-null  object
 4   function_body         76178 non-null  object
 5   function_requirement  76178 non-null  object
 6   num_token             76178 non-null  int64 
dtypes: int64(1), object(6)
memory usage: 4.6+ MB


In [63]:
# Remove the helper token-count column and renumber the rows from 0.
# drop=False keeps the previous row labels as a new "index" column.
all_data = all_data.drop("num_token", axis=1).reset_index(drop=False)

In [64]:
# Print the frame summary to confirm the final set of columns.
all_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76178 entries, 0 to 76177
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   index                 76178 non-null  int64 
 1   file_address          76178 non-null  object
 2   contract_name         76178 non-null  object
 3   function_name         76178 non-null  object
 4   contract_masked       76178 non-null  object
 5   function_body         76178 non-null  object
 6   function_requirement  76178 non-null  object
dtypes: int64(1), object(6)
memory usage: 4.1+ MB


In [65]:
# Persist the frame as a parquet file using the fastparquet engine.
all_data.to_parquet(path="./out/data_test.parquet", engine="fastparquet")

In [70]:

# Read the file that was just written back into a new frame.
test_data = pd.read_parquet(path="./out/data_test.parquet", engine="fastparquet")


In [71]:
import random

# Show the masked source of one randomly chosen row as a spot check.
# The position is drawn from the frame's actual length instead of a
# hard-coded 0..2000 label range, so it cannot raise KeyError on a shorter
# frame and can reach every row of a longer one.
test_data["contract_masked"].iloc[random.randrange(len(test_data))]

'contract Controller {\r\n\r\n    struct Scheme {\r\n        bytes32 paramsHash;  // a hash "configuration" of the scheme\r\n        bytes4  permissions; // A bitwise flags of permissions,\r\n                             // All 0: Not registered,\r\n                             // 1st bit: Flag if the scheme is registered,\r\n                             // 2nd bit: Scheme can register other schemes\r\n                             // 3rd bit: Scheme can add/remove global constraints\r\n                             // 4th bit: Scheme can upgrade the controller\r\n                             // 5th bit: Scheme can call genericCall on behalf of\r\n                             //          the organization avatar\r\n    }\r\n\r\n    struct GlobalConstraint {\r\n        address gcAddress;\r\n        bytes32 params;\r\n    }\r\n\r\n    struct GlobalConstraintRegister {\r\n        bool isRegistered; //is registered\r\n        uint256 index;    //index at globalConstraints\r\n    }\r\n\r\n    