In [None]:
import os
import re
import tiktoken

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split

In [None]:
# Print the installed scikit-learn version.
# NOTE(review): this import sits outside the notebook's main import cell.
import sklearn
print(sklearn.__version__)

In [None]:
# Columns read from the parquet file; nothing else is loaded.
COLS = [
    'contract_creator',
    'contract_address',
    'contract_name',
    'decompiled_opcodes',
    'malicious',
]

# Location of the labelled contract dataset.
PRETRAINING_PARQUET = '/data/forta/ethereum/text/pretraining/malicious_contract_training_dataset_final.parquet'

pretraining_data = pd.read_parquet(PRETRAINING_PARQUET, columns=COLS)

In [None]:
# Count how many rows carry each value of the `malicious` label.
pretraining_data['malicious'].value_counts()

In [None]:
# Display the column labels currently present on the loaded frame.
pretraining_data.columns

In [None]:
def get_exp_2_features(row):
    """Build the space-separated feature string for one contract.

    ``row['decompiled_opcodes']`` is split on whitespace. Tokens that start
    with ``0x`` are treated as operands; every other token is treated as an
    opcode mnemonic and is emitted as a feature. Mnemonics beginning with
    ``UNKNOWN`` or ``INVALID`` are collapsed to the text before their first
    underscore. For three push opcodes the operand that immediately follows
    is turned into an extra feature:

    * ``PUSH4``  -> the operand itself.
    * ``PUSH20`` -> ``'creator'`` if the operand equals
      ``row['contract_creator']``, the mask literal if it equals the
      all-ones 20-byte mask, otherwise the placeholder ``'address'``.
    * ``PUSH32`` -> the operand itself.

    Operands of every other opcode are dropped.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing the keys
            ``'contract_creator'`` and ``'decompiled_opcodes'``.

    Returns:
        str: The features joined by single spaces; ``''`` when there are no
        opcode tokens.
    """
    creator = row['contract_creator']
    opcodes = row['decompiled_opcodes'].split()
    mask = '0xffffffffffffffffffffffffffffffffffffffff'
    features = []
    # Iterate over every token (including the last one) so a trailing opcode
    # is not lost.
    for i, first in enumerate(opcodes):
        if first.startswith('0x'):
            # Operand token: it is handled together with the opcode before it.
            continue

        token = first
        if first.startswith('UNKNOWN') or first.startswith('INVALID'):
            token = first.split('_')[0]
        features.append(token)

        # The operand checks must run *after* the mnemonic is recorded; as an
        # `elif` of the `0x` test they could never be reached, because push
        # mnemonics never start with '0x'.
        second = opcodes[i + 1] if i + 1 < len(opcodes) else ''
        if not second.startswith('0x'):
            # No operand follows (end of stream or another mnemonic).
            continue

        if first == 'PUSH4':
            features.append(second)
        elif first == 'PUSH20':
            # NOTE(review): comparison is case-sensitive; confirm the creator
            # column and the disassembly use the same hex casing.
            if second == creator:
                features.append('creator')
            elif second == mask:
                features.append(mask)
            else:
                features.append('address')
        elif first == 'PUSH32':
            features.append(second)
    return " ".join(features)

In [None]:
# Prepare data for pretraining phase
# First clean and process the opcode data
pretraining_data['experiment_2_opcodes'] = pretraining_data.apply(get_exp_2_features, axis=1)

# Files to store the data
train_file_path = '/data/forta/ethereum/text/pretraining/small_pretraining_train.txt'
val_file_path = '/data/forta/ethereum/text/pretraining/small_pretraining_val.txt'

# Fixed seed so the shuffle, and therefore the train/validation membership,
# is the same on every Restart & Run All.
SHUFFLE_SEED = 42
# Single split point used by both slices below, so no row can fall between them.
TRAIN_SIZE = 500

# Shuffle data
# Keep the first 499 rows plus everything from position 5000 onward.
# NOTE(review): the reason for this particular subset is not visible here;
# confirm that 499 (rather than 500) and 5000 are the intended cut points.
# NOTE: `pretraining_data` is rebound here, so re-running this cell without
# re-running the load cell subsets the already-subset frame again.
pretraining_data = pd.concat([pretraining_data.iloc[:499], pretraining_data.iloc[5000:]])
pretraining_data = pretraining_data.sample(frac=1, random_state=SHUFFLE_SEED)

# Save the data to disk
# Positional slices are half-open, so [:TRAIN_SIZE] and [TRAIN_SIZE:] together
# cover every row exactly once.
training_data = pretraining_data.iloc[:TRAIN_SIZE]
validation_data = pretraining_data.iloc[TRAIN_SIZE:]
training_data['experiment_2_opcodes'].to_csv(train_file_path, sep='\t', index=False)
validation_data['experiment_2_opcodes'].to_csv(val_file_path, sep='\t', index=False)

In [None]:
# Prepare data for finetuning phase: one text file per (split, class) pair.
# Each entry is (source frame, value of `malicious` to keep, output path),
# listed in the order the files are written: training first, then validation.
FINETUNING_EXPORTS = [
    (training_data, False, '/data/forta/ethereum/text/finetuning/training/normal/normal.txt'),
    (training_data, True, '/data/forta/ethereum/text/finetuning/training/malicious/malicious.txt'),
    (validation_data, False, '/data/forta/ethereum/text/finetuning/validation/normal/normal.txt'),
    (validation_data, True, '/data/forta/ethereum/text/finetuning/validation/malicious/malicious.txt'),
]

for split_frame, malicious_flag, export_path in FINETUNING_EXPORTS:
    # Select the rows of this class, then write only the processed opcode
    # text: tab-separated, without the index.
    class_rows = split_frame.loc[split_frame['malicious'] == malicious_flag]
    class_rows.to_csv(export_path, columns=['experiment_2_opcodes'], sep='\t', index=False)