In [1]:
import data_process as process
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
### Load the bids and payloads CSV files ###

# MEV-Boost bids data from Eden Public Data
origin_bids_df = pd.read_csv("data/Eden_MEV-Boost_bid_20240404.csv")
# MEV-Boost Winning Bid Data
origin_payload_df = pd.read_csv("data/mev_blocks_19580000_to_19589999.csv")

# The processing pipeline only runs when matched_df is not yet defined in the
# current namespace; re-running this cell in a live kernel reuses the existing frame.
# NOTE(review): a stale matched_df is therefore kept even if the CSVs changed —
# restart the kernel (or `del matched_df`) to force reprocessing.
if 'matched_df' in locals():
    print("matched_df already exists. Skipping processing steps.")
else:
    # cleaning -> transformation -> matching, each step feeding the next
    bids_df, payload_df = process.cleaning(origin_bids_df, origin_payload_df)
    bids_df, payload_df = process.transformation(bids_df, payload_df)
    matched_df, origin_matched_df = process.get_matched_df(bids_df, payload_df)

Data cleaning has completed
Data transformation has completed
Amount of distinct block_number in bids_df: 1206
Amount of matched block_number in payload_df:  1036
Amount of distinct winner block_hash: 1036
Got matched_df (winner bids data)


In [4]:
# Candidate column sets for the feature-selection experiment.
# Set 1 lists the `normalised_*` columns where set 2 lists the corresponding
# `num_tx`, `value` and `time_difference` columns; the other four names are shared.
parameters1 = [
    'base_fee_per_gas',
    'normalised_num_tx',
    'normalised_value',
    'gasUsedRatio',
    'normalised_t_diff',
    'time_difference_max',
    'bids_count',
]
parameters2 = [
    'base_fee_per_gas',
    'num_tx',
    'value',
    'gasUsedRatio',
    'time_difference',
    'time_difference_max',
    'bids_count',
]

# Order matters: the sets are numbered 1, 2 in the order they appear here.
parameters = [parameters1, parameters2]

# Filled in by the feature-selection cell (target name -> score dict).
results = dict()

In [5]:
### Feature selection ###
# For each parameter set, every column takes one turn as the regression target
# while the remaining columns of the same set are used as predictors. A random
# forest is fitted on an 80/20 train/test split and scored two ways:
#   * R^2 on the held-out test split (a score: higher is better, can be negative)
#   * mean 5-fold cross-validated negative MSE on the training split
#
# Scores are stored per parameter set in `results_by_set[set_number][target]`.
# The two sets share several column names (e.g. 'base_fee_per_gas', 'bids_count'),
# so a dict keyed by target alone cannot hold both sets' scores: the second set
# would overwrite the first.
results_by_set = {}

for set_number, feature_set in enumerate(parameters, start=1):
    set_results = {}

    for target in feature_set:
        predictors = [p for p in feature_set if p != target]

        X = matched_df[predictors]
        y = matched_df[target]

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        rf = RandomForestRegressor(
            n_estimators=200,
            max_depth=20,
            min_samples_split=2,
            min_samples_leaf=1,
            random_state=42
        )

        rf.fit(X_train, y_train)

        y_pred = rf.predict(X_test)

        r2 = r2_score(y_test, y_pred)

        # cross_val_score refits the estimator on each fold of the training split,
        # so the held-out test split is never seen during cross-validation.
        cross_val_scores = cross_val_score(rf, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
        avg_cross_val_score = cross_val_scores.mean()

        scores = {
            'r2_score': r2,
            'cross_val_score': avg_cross_val_score
        }
        set_results[target] = scores
        # Flat, target-keyed view kept for code that reads `results[target]`;
        # for targets present in several sets the last set wins, as before.
        results[target] = dict(scores)

        print(f"\n({set_number}) Results for target variable '{target}':")
        print(f"R squared error: {r2}")
        print(f"Cross-Validation Score (Negative MSE): {avg_cross_val_score}")

    results_by_set[set_number] = set_results

    # Summarise only the set that was just evaluated, so rows from different
    # parameter sets are never mixed in one summary.
    print(f"\nSummary of results for parameter set ({set_number}):")
    for summary_target, summary_scores in set_results.items():
        print(f"Target variable '{summary_target}': R squared error = {summary_scores['r2_score']}, Cross-Validation Score = {summary_scores['cross_val_score']}")



(1) Results for target variable 'base_fee_per_gas':
R squared error: 0.08314124015594804
Cross-Validation Score (Negative MSE): -9.95077938098865e+19

(1) Results for target variable 'normalised_num_tx':
R squared error: 0.46596289468959173
Cross-Validation Score (Negative MSE): -0.012860044833661769

(1) Results for target variable 'normalised_value':
R squared error: -0.3222176707529676
Cross-Validation Score (Negative MSE): -0.01694690719609069

(1) Results for target variable 'gasUsedRatio':
R squared error: 0.30336360960933284
Cross-Validation Score (Negative MSE): -295.4087842959914

(1) Results for target variable 'normalised_t_diff':
R squared error: 0.35139177748030737
Cross-Validation Score (Negative MSE): -0.0009395777706437654

(1) Results for target variable 'time_difference_max':
R squared error: 0.44558083016995875
Cross-Validation Score (Negative MSE): -0.1556194192122567

(1) Results for target variable 'bids_count':
R squared error: 0.33239628515017405
Cross-Validati