## Import Libraries


In [1]:
import certifi
import os
import polars as pl
import pandas as pd
import numpy as np
import torch 
os.environ['SSL_CERT_FILE'] = certifi.where()
from giza_datasets import DatasetsLoader, DatasetsHub



# Data Collection


In [2]:
wstETH_daily_APY = pd.read_parquet('../data/weETH_daily_yield.parquet')
wstETH_daily_APY.head()


Unnamed: 0,day,lido_yield
0,2023-01-01,0.043986
1,2023-01-02,0.045363
2,2023-01-03,0.047311
3,2023-01-04,0.049922
4,2023-01-05,0.056912


In [3]:
loader = DatasetsLoader()

df_daily_token = loader.load('tokens-daily-prices-mcap-volume').to_pandas()
stETH_data = df_daily_token[df_daily_token['token'] == 'STETH']
stETH_data.head()

Unnamed: 0,date,price,market_cap,volumes_last_24h,token
14240,2020-12-22,617.164997,0.0,137563.026557,STETH
14241,2020-12-23,617.164997,0.0,137563.026557,STETH
14242,2020-12-24,596.767711,4424223.0,67102.259216,STETH
14243,2020-12-25,611.504712,5651613.0,54336.794215,STETH
14244,2020-12-26,624.80042,7284744.0,45933.064541,STETH


In [4]:
merged_data = pd.merge(wstETH_daily_APY, stETH_data,left_on='day',right_on='date', how='outer').drop(columns=['day','token'])

In [5]:
merged_data.head()

Unnamed: 0,lido_yield,date,price,market_cap,volumes_last_24h
0,0.043986,2023-01-01,1180.264013,5707855000.0,171250700.0
1,0.045363,2023-01-02,1184.713344,5734759000.0,6866893.0
2,0.047311,2023-01-03,1199.652682,5812986000.0,12774850.0
3,0.049922,2023-01-04,1199.474802,5814007000.0,14679410.0
4,0.056912,2023-01-05,1241.457359,6013456000.0,15954030.0


In [7]:
protocol_trade_df = pd.read_parquet('../data/protocol_trades_liquidity.parquet')
protocol_trade_df.head()

Unnamed: 0,time,project,tvl,liquidity_usd,steth_amount,trading_volume,liquidity_utilization,Liquidity utilization (ma_30),Trading volume (ma_30)
0,2024-04-19,curve,375178000.0,152321000.0,72439.378535,21565960.0,0.057482,0.182963,77001600.0
1,2024-04-19,uniswap_v3,63503780.0,11278950.0,16786.824563,68047760.0,1.071554,1.405975,85107650.0
2,2024-04-19,balancer,35673390.0,10842140.0,8057.973942,22387120.0,0.627558,0.431717,18641390.0
3,2024-04-19,aerodrome,21664960.0,9606449.0,3912.618499,278330.5,0.012847,0.054155,1128186.0
4,2024-04-19,velodrome_v2,14210420.0,7090750.0,2310.182895,302434.8,0.021283,0.041766,941222.9


In [8]:
# Pivot the dataframe
df_pivot = protocol_trade_df.pivot(index='time', columns='project', values='liquidity_utilization')

# Rename the columns
df_pivot.columns = [f'{col}_liquidity' for col in df_pivot.columns]

df_pivot["average_liquidity_utilization"] = df_pivot.mean(axis=1)

# Reset the index
df_pivot = df_pivot.reset_index()

# Print the modified dataframe
df_pivot.head()




Unnamed: 0,time,aerodrome_liquidity,balancer_liquidity,beethoven_x_liquidity,camelot_liquidity,curve_liquidity,kyberswap_liquidity,maverick_liquidity,pancakeswap_liquidity,ramses_liquidity,solidly_liquidity,synkswap_liquidity,uniswap_v2_liquidity,uniswap_v3_liquidity,velodrome_liquidity,velodrome_v2_liquidity,wombat_liquidity,average_liquidity_utilization
0,2023-04-01,,0.020318,0.045924,0.002815,0.006956,0.230702,0.904569,,,,,0.038565,0.054271,0.006175,,,0.145588
1,2023-04-02,,0.012283,0.0194,0.002324,0.004144,0.179021,0.943711,,,,,0.093076,0.051226,0.007171,,,0.145817
2,2023-04-03,,0.03492,0.153568,0.007185,0.018795,0.406682,1.116773,136.591153,,,,0.024879,0.223456,0.012876,,,13.859029
3,2023-04-04,,0.025758,0.025106,0.00445,0.017779,0.373843,1.429629,0.016077,,,,0.081125,0.157517,0.009976,,,0.214126
4,2023-04-05,,0.026831,0.051435,0.011451,0.0134,0.373309,0.743393,0.430509,,,,0.125411,0.121602,0.012321,,,0.190966


In [9]:
# Pivot the dataframe
df_pivot2 = protocol_trade_df.pivot(index='time', columns='project', values='trading_volume')

# Rename the columns
df_pivot2.columns = [f'{col}_trade_volume' for col in df_pivot2.columns]

df_pivot2["total_trade_volume"] = df_pivot2.sum(axis=1)

# Reset the index
df_pivot2 = df_pivot2.reset_index()

# Print the modified dataframe
df_pivot2.head()

Unnamed: 0,time,aerodrome_trade_volume,balancer_trade_volume,beethoven_x_trade_volume,camelot_trade_volume,curve_trade_volume,kyberswap_trade_volume,maverick_trade_volume,pancakeswap_trade_volume,ramses_trade_volume,solidly_trade_volume,synkswap_trade_volume,uniswap_v2_trade_volume,uniswap_v3_trade_volume,velodrome_trade_volume,velodrome_v2_trade_volume,wombat_trade_volume,total_trade_volume
0,2023-04-01,,8454003.0,347055.1,24559.746851,12659140.0,8085850.0,7476432.0,,,,,124788.661512,1019266.0,163775.045996,,,38354870.0
1,2023-04-02,,5070993.0,145454.0,20120.713855,7457547.0,6568809.0,7756407.0,,,,,298886.911597,941479.4,188471.487918,,,28448170.0
2,2023-04-03,,14328750.0,1143170.0,61965.551126,33432270.0,14395550.0,8542356.0,81472.549362,,,,79326.044844,4096361.0,337226.210283,,,76498450.0
3,2023-04-04,,10587080.0,191495.9,39211.823481,32242950.0,13396000.0,11200780.0,280.817245,,,,265074.131597,2944226.0,267583.920214,,,71134680.0
4,2023-04-05,,11241800.0,400262.4,104030.755685,25123040.0,13831790.0,7722024.0,87977.748846,,,,425600.84953,2293711.0,341430.542341,,,61571670.0


In [10]:
# Pivot the dataframe
df_pivot3 = protocol_trade_df.pivot(index='time', columns='project', values='Liquidity utilization (ma_30)')

# Rename the columns
df_pivot3.columns = [f'{col}_liquidity_utilization_30day_avg' for col in df_pivot3.columns]

df_pivot3["average_liquidity_utilization_30day_avg"] = df_pivot3.mean(axis=1)

# Reset the index
df_pivot3 = df_pivot3.reset_index()

# Print the modified dataframe
df_pivot.head()

Unnamed: 0,time,aerodrome_liquidity,balancer_liquidity,beethoven_x_liquidity,camelot_liquidity,curve_liquidity,kyberswap_liquidity,maverick_liquidity,pancakeswap_liquidity,ramses_liquidity,solidly_liquidity,synkswap_liquidity,uniswap_v2_liquidity,uniswap_v3_liquidity,velodrome_liquidity,velodrome_v2_liquidity,wombat_liquidity,average_liquidity_utilization
0,2023-04-01,,0.020318,0.045924,0.002815,0.006956,0.230702,0.904569,,,,,0.038565,0.054271,0.006175,,,0.145588
1,2023-04-02,,0.012283,0.0194,0.002324,0.004144,0.179021,0.943711,,,,,0.093076,0.051226,0.007171,,,0.145817
2,2023-04-03,,0.03492,0.153568,0.007185,0.018795,0.406682,1.116773,136.591153,,,,0.024879,0.223456,0.012876,,,13.859029
3,2023-04-04,,0.025758,0.025106,0.00445,0.017779,0.373843,1.429629,0.016077,,,,0.081125,0.157517,0.009976,,,0.214126
4,2023-04-05,,0.026831,0.051435,0.011451,0.0134,0.373309,0.743393,0.430509,,,,0.125411,0.121602,0.012321,,,0.190966


In [11]:
# Pivot the dataframe
df_pivot4 = protocol_trade_df.pivot(index='time', columns='project', values='Trading volume (ma_30)')

# Rename the columns
df_pivot4.columns = [f'{col}_trade_volume_30day_avg' for col in df_pivot4.columns]

df_pivot4["total_trade_volume_30day_avg"] = df_pivot4.sum(axis=1)

# Reset the index
df_pivot4 = df_pivot4.reset_index()

# Print the modified dataframe
df_pivot4.head()

Unnamed: 0,time,aerodrome_trade_volume_30day_avg,balancer_trade_volume_30day_avg,beethoven_x_trade_volume_30day_avg,camelot_trade_volume_30day_avg,curve_trade_volume_30day_avg,kyberswap_trade_volume_30day_avg,maverick_trade_volume_30day_avg,pancakeswap_trade_volume_30day_avg,ramses_trade_volume_30day_avg,solidly_trade_volume_30day_avg,synkswap_trade_volume_30day_avg,uniswap_v2_trade_volume_30day_avg,uniswap_v3_trade_volume_30day_avg,velodrome_trade_volume_30day_avg,velodrome_v2_trade_volume_30day_avg,wombat_trade_volume_30day_avg,total_trade_volume_30day_avg
0,2023-04-01,,8454003.0,347055.080303,24559.746851,12659140.0,8085850.0,7476432.0,,,,,124788.661512,1019266.0,163775.045996,,,38354870.0
1,2023-04-02,,6762498.0,246254.519378,22340.230353,10058340.0,7327330.0,7616419.0,,,,,211837.786555,980372.6,176123.266957,,,33401520.0
2,2023-04-03,,9284582.0,545226.347598,35548.670611,17849650.0,9683405.0,7925065.0,81472.549362,,,,167667.205984,2019035.0,229824.248066,,,47821480.0
3,2023-04-04,,9610206.0,456793.739887,36464.458828,21447980.0,10611550.0,8743992.0,40876.683303,,,,192018.937388,2250333.0,239264.166103,,,53629480.0
4,2023-04-05,,9936524.0,445487.47566,49977.7182,22182990.0,11255600.0,8539599.0,56577.038484,,,,238735.319816,2259009.0,259697.441351,,,55224200.0


In [12]:

# Merge df_pivot and df_pivot2
merged_df = pd.merge(df_pivot, df_pivot2, on="time", how="outer")

# Merge merged_df and df_pivot3
merged_df = pd.merge(merged_df, df_pivot3, on="time", how="outer")

# Merge merged_df and df_pivot4
merged_protocol_trade_df = pd.merge(merged_df, df_pivot4, on="time", how="outer")

merged_protocol_trade_df.fillna(0, inplace=True)

merged_protocol_trade_df.head()


Unnamed: 0,time,aerodrome_liquidity,balancer_liquidity,beethoven_x_liquidity,camelot_liquidity,curve_liquidity,kyberswap_liquidity,maverick_liquidity,pancakeswap_liquidity,ramses_liquidity,...,pancakeswap_trade_volume_30day_avg,ramses_trade_volume_30day_avg,solidly_trade_volume_30day_avg,synkswap_trade_volume_30day_avg,uniswap_v2_trade_volume_30day_avg,uniswap_v3_trade_volume_30day_avg,velodrome_trade_volume_30day_avg,velodrome_v2_trade_volume_30day_avg,wombat_trade_volume_30day_avg,total_trade_volume_30day_avg
0,2023-04-01,0.0,0.020318,0.045924,0.002815,0.006956,0.230702,0.904569,0.0,0.0,...,0.0,0.0,0.0,0.0,124788.661512,1019266.0,163775.045996,0.0,0.0,38354870.0
1,2023-04-02,0.0,0.012283,0.0194,0.002324,0.004144,0.179021,0.943711,0.0,0.0,...,0.0,0.0,0.0,0.0,211837.786555,980372.6,176123.266957,0.0,0.0,33401520.0
2,2023-04-03,0.0,0.03492,0.153568,0.007185,0.018795,0.406682,1.116773,136.591153,0.0,...,81472.549362,0.0,0.0,0.0,167667.205984,2019035.0,229824.248066,0.0,0.0,47821480.0
3,2023-04-04,0.0,0.025758,0.025106,0.00445,0.017779,0.373843,1.429629,0.016077,0.0,...,40876.683303,0.0,0.0,0.0,192018.937388,2250333.0,239264.166103,0.0,0.0,53629480.0
4,2023-04-05,0.0,0.026831,0.051435,0.011451,0.0134,0.373309,0.743393,0.430509,0.0,...,56577.038484,0.0,0.0,0.0,238735.319816,2259009.0,259697.441351,0.0,0.0,55224200.0


In [13]:
merged_protocol_trade_df.describe()

Unnamed: 0,aerodrome_liquidity,balancer_liquidity,beethoven_x_liquidity,camelot_liquidity,curve_liquidity,kyberswap_liquidity,maverick_liquidity,pancakeswap_liquidity,ramses_liquidity,solidly_liquidity,...,pancakeswap_trade_volume_30day_avg,ramses_trade_volume_30day_avg,solidly_trade_volume_30day_avg,synkswap_trade_volume_30day_avg,uniswap_v2_trade_volume_30day_avg,uniswap_v3_trade_volume_30day_avg,velodrome_trade_volume_30day_avg,velodrome_v2_trade_volume_30day_avg,wombat_trade_volume_30day_avg,total_trade_volume_30day_avg
count,385.0,385.0,385.0,385.0,385.0,385.0,385.0,385.0,385.0,385.0,...,385.0,385.0,385.0,385.0,385.0,385.0,385.0,385.0,385.0,385.0
mean,0.017519,0.245775,0.165082,0.006574,0.048418,5.063398,1.028614,0.65006,1.350181,1.777435,...,347046.3,1004951.0,2360998.0,27464.915721,488405.5,36425570.0,114440.591577,291542.2,27871.466197,87433770.0
std,0.068574,0.366493,0.181016,0.010804,0.065192,96.001983,1.127091,6.976651,1.725002,5.46436,...,361221.5,1482085.0,3455042.0,65329.702702,246366.2,27962790.0,177805.965062,328042.6,63082.478336,46157300.0
min,0.0,0.00282,0.001871,0.0,0.001523,0.0,2.7e-05,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,124788.7,980372.6,420.323733,0.0,0.0,32187600.0
25%,0.0,0.034708,0.057913,0.001222,0.013226,0.000235,0.230623,0.008435,0.003088,0.0,...,77005.72,23399.59,0.0,0.0,280841.6,13150710.0,1042.865176,46404.81,175.51758,52903410.0
50%,0.0,0.131638,0.110453,0.003225,0.025998,0.093138,0.741606,0.079556,0.702236,0.0,...,317046.2,127862.8,0.0,0.0,413730.5,27546710.0,2916.746406,111231.8,519.217005,69291160.0
75%,0.017376,0.307587,0.215948,0.00721,0.052017,0.253179,1.383267,0.254655,2.065668,2.569607,...,485736.7,1434785.0,6225047.0,0.0,718107.5,54857970.0,245344.367006,526100.8,3431.971058,112668800.0
max,1.230141,4.425931,1.689594,0.099551,0.416641,1883.852156,6.377461,136.591153,9.908926,97.439051,...,1993394.0,5211081.0,9834001.0,248701.816109,1071626.0,102600200.0,522830.968696,1010911.0,259311.737157,211257700.0


In [14]:
merged_protocol_trade_df['time'] = pd.to_datetime(merged_protocol_trade_df['time'])

merged_data = pd.merge(merged_data,merged_protocol_trade_df, left_on='date', right_on='time', how='inner')

merged_data.head()

Unnamed: 0,lido_yield,date,price,market_cap,volumes_last_24h,time,aerodrome_liquidity,balancer_liquidity,beethoven_x_liquidity,camelot_liquidity,...,pancakeswap_trade_volume_30day_avg,ramses_trade_volume_30day_avg,solidly_trade_volume_30day_avg,synkswap_trade_volume_30day_avg,uniswap_v2_trade_volume_30day_avg,uniswap_v3_trade_volume_30day_avg,velodrome_trade_volume_30day_avg,velodrome_v2_trade_volume_30day_avg,wombat_trade_volume_30day_avg,total_trade_volume_30day_avg
0,0.045177,2023-04-01,1820.918138,10743770000.0,20932350.0,2023-04-01,0.0,0.020318,0.045924,0.002815,...,0.0,0.0,0.0,0.0,124788.661512,1019266.0,163775.045996,0.0,0.0,38354870.0
1,0.039801,2023-04-02,1817.016461,10718410000.0,11485370.0,2023-04-02,0.0,0.012283,0.0194,0.002324,...,0.0,0.0,0.0,0.0,211837.786555,980372.6,176123.266957,0.0,0.0,33401520.0
2,0.041943,2023-04-03,1790.21015,10558500000.0,7915934.0,2023-04-03,0.0,0.03492,0.153568,0.007185,...,81472.549362,0.0,0.0,0.0,167667.205984,2019035.0,229824.248066,0.0,0.0,47821480.0
3,0.046121,2023-04-04,1806.825427,10676880000.0,34873270.0,2023-04-04,0.0,0.025758,0.025106,0.00445,...,40876.683303,0.0,0.0,0.0,192018.937388,2250333.0,239264.166103,0.0,0.0,53629480.0
4,0.044591,2023-04-05,1867.159722,11026190000.0,32511900.0,2023-04-05,0.0,0.026831,0.051435,0.011451,...,56577.038484,0.0,0.0,0.0,238735.319816,2259009.0,259697.441351,0.0,0.0,55224200.0


In [15]:
merged_data['lagged_yield'] = merged_data['lido_yield'].shift(-7)


merged_data = merged_data.dropna(subset=['lagged_yield']).drop(columns=['time','lido_yield'])

merged_data.head()

Unnamed: 0,date,price,market_cap,volumes_last_24h,aerodrome_liquidity,balancer_liquidity,beethoven_x_liquidity,camelot_liquidity,curve_liquidity,kyberswap_liquidity,...,ramses_trade_volume_30day_avg,solidly_trade_volume_30day_avg,synkswap_trade_volume_30day_avg,uniswap_v2_trade_volume_30day_avg,uniswap_v3_trade_volume_30day_avg,velodrome_trade_volume_30day_avg,velodrome_v2_trade_volume_30day_avg,wombat_trade_volume_30day_avg,total_trade_volume_30day_avg,lagged_yield
0,2023-04-01,1820.918138,10743770000.0,20932350.0,0.0,0.020318,0.045924,0.002815,0.006956,0.230702,...,0.0,0.0,0.0,124788.661512,1019266.0,163775.045996,0.0,0.0,38354870.0,0.039954
1,2023-04-02,1817.016461,10718410000.0,11485370.0,0.0,0.012283,0.0194,0.002324,0.004144,0.179021,...,0.0,0.0,0.0,211837.786555,980372.6,176123.266957,0.0,0.0,33401520.0,0.090548
2,2023-04-03,1790.21015,10558500000.0,7915934.0,0.0,0.03492,0.153568,0.007185,0.018795,0.406682,...,0.0,0.0,0.0,167667.205984,2019035.0,229824.248066,0.0,0.0,47821480.0,0.040085
3,2023-04-04,1806.825427,10676880000.0,34873270.0,0.0,0.025758,0.025106,0.00445,0.017779,0.373843,...,0.0,0.0,0.0,192018.937388,2250333.0,239264.166103,0.0,0.0,53629480.0,0.045267
4,2023-04-05,1867.159722,11026190000.0,32511900.0,0.0,0.026831,0.051435,0.011451,0.0134,0.373309,...,0.0,0.0,0.0,238735.319816,2259009.0,259697.441351,0.0,0.0,55224200.0,0.044622


### Test-Training Split

In [16]:


# Calculate the index of the last day in the dataset
last_day = merged_data.date.max()

# Calculate the index of the first day in the test set
test_set_start = last_day - pd.DateOffset(days=30)

# Split the dataset into training and test sets
test_set = merged_data[merged_data['date'] >= test_set_start ]
train_set = merged_data[merged_data['date'] < test_set_start ]

# Print the shapes of the training and test sets
print("Training set shape:", train_set.shape)
print("Test set shape:", test_set.shape)

test_set.head()


Training set shape: (273, 73)
Test set shape: (31, 73)


Unnamed: 0,date,price,market_cap,volumes_last_24h,aerodrome_liquidity,balancer_liquidity,beethoven_x_liquidity,camelot_liquidity,curve_liquidity,kyberswap_liquidity,...,ramses_trade_volume_30day_avg,solidly_trade_volume_30day_avg,synkswap_trade_volume_30day_avg,uniswap_v2_trade_volume_30day_avg,uniswap_v3_trade_volume_30day_avg,velodrome_trade_volume_30day_avg,velodrome_v2_trade_volume_30day_avg,wombat_trade_volume_30day_avg,total_trade_volume_30day_avg,lagged_yield
273,2023-12-30,2295.99917,21005540000.0,20156890.0,0.053367,0.117359,0.017637,0.000781,0.043956,0.061888,...,1002103.0,3286379.0,0.0,736552.233475,62229290.0,2940.977931,738274.614788,358.270306,117689700.0,0.033167
274,2023-12-31,2295.35384,21123660000.0,18188960.0,0.009361,0.14356,0.007936,0.000298,0.017734,0.011167,...,1018956.0,3496360.0,0.0,721031.124588,62054450.0,3052.128654,751619.424436,357.944499,116867800.0,0.03182
275,2024-01-01,2277.753314,20988970000.0,9363343.0,0.006999,0.342659,0.013578,0.000343,0.031979,0.03266,...,1059514.0,3833198.0,0.0,720322.450841,61686630.0,3188.370228,775331.059688,357.944499,116817500.0,0.034341
276,2024-01-02,2345.476128,21626040000.0,17260180.0,0.028162,0.780432,0.09581,0.01237,0.059002,0.047785,...,1148289.0,4173345.0,0.0,808354.714752,62722620.0,3238.239648,790884.166229,418.745598,119598300.0,0.035887
277,2024-01-03,2354.26124,21698140000.0,28720130.0,0.008899,0.245947,0.01627,0.001632,0.054634,0.060176,...,1216938.0,4397041.0,0.0,844979.237844,60572970.0,3357.006133,817338.071528,324.969871,118261600.0,0.041281


### Preprocessing

Rescaling the features were going to train to [0,1]

In [17]:
from sklearn.preprocessing import MinMaxScaler


def minmax_fit_scale(columns, df):

    scaler = MinMaxScaler()

    scaled_df = df.copy()
    scaled_df[columns] = scaler.fit_transform(scaled_df[columns])
    return scaled_df, scaler 

def minmax_scale(columns, df, scaler):

    scaled_df = df.copy()
    scaled_df[columns] = scaler.transform(scaled_df[columns])
    return scaled_df

In [18]:

column_names = merged_data.columns.tolist()

# Define the columns to be normalized
columns_to_scale = column_names.copy() 

# Remove the columns you don't want
columns_to_scale.remove('date')
columns_to_scale.remove('lagged_yield') 


# Fit the scaler on the training set
train_set_scaled,scaler = minmax_fit_scale(columns_to_scale, train_set)

# Apply the scaler to the test set
test_set_scaled = minmax_scale(columns_to_scale,test_set, scaler)

# Print the scaled training set
print("Scaled Training Set:")
print(train_set_scaled.head())

# Print the scaled test set
print("\nScaled Test Set:")
print(test_set_scaled.head())

Scaled Training Set:
        date     price  market_cap  volumes_last_24h  aerodrome_liquidity  \
0 2023-04-01  0.336892    0.016554          0.170027                  0.0   
1 2023-04-02  0.332234    0.014288          0.088115                  0.0   
2 2023-04-03  0.300231    0.000000          0.057166                  0.0   
3 2023-04-04  0.320067    0.010577          0.290903                  0.0   
4 2023-04-05  0.392099    0.041789          0.270428                  0.0   

   balancer_liquidity  beethoven_x_liquidity  camelot_liquidity  \
0            0.011633               0.024455           0.025376   
1            0.006291               0.008713           0.020429   
2            0.021341               0.088344           0.069406   
3            0.015250               0.012100           0.041842   
4            0.015963               0.027727           0.112380   

   curve_liquidity  kyberswap_liquidity  ...  ramses_trade_volume_30day_avg  \
0         0.018567             0.0

Since we are going to create two models, one that takes the 3 day lagged features, and the other taking the 7 day lagged data, lets divide the input features into 2 (both for test and training)

In [19]:
train_set_scaled.describe()

Unnamed: 0,price,market_cap,volumes_last_24h,aerodrome_liquidity,balancer_liquidity,beethoven_x_liquidity,camelot_liquidity,curve_liquidity,kyberswap_liquidity,maverick_liquidity,...,ramses_trade_volume_30day_avg,solidly_trade_volume_30day_avg,synkswap_trade_volume_30day_avg,uniswap_v2_trade_volume_30day_avg,uniswap_v3_trade_volume_30day_avg,velodrome_trade_volume_30day_avg,velodrome_v2_trade_volume_30day_avg,wombat_trade_volume_30day_avg,total_trade_volume_30day_avg,lagged_yield
count,273.0,273.0,273.0,273.0,273.0,273.0,273.0,273.0,273.0,273.0,...,273.0,273.0,273.0,273.0,273.0,273.0,273.0,273.0,273.0,273.0
mean,0.379794,0.371143,0.150949,0.008492,0.086622,0.097474,0.064512,0.101955,0.003789,0.173471,...,0.179228,0.074961,0.0,0.370566,0.350782,0.306655,0.165142,0.150664,0.372304,0.041944
std,0.226801,0.246168,0.150482,0.063023,0.115478,0.116871,0.110054,0.134247,0.060515,0.168492,...,0.30617,0.177884,0.0,0.176503,0.273583,0.369496,0.230746,0.277693,0.242728,0.008635
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.032065
25%,0.21324,0.203628,0.057676,0.0,0.015194,0.029765,0.0137,0.031126,4.1e-05,0.059514,...,0.0,0.0,0.0,0.232066,0.050945,0.001172,0.0,4e-06,0.202081,0.037085
50%,0.373178,0.318216,0.10571,0.0,0.043976,0.063096,0.031957,0.064093,9.5e-05,0.130915,...,0.02871,0.0,0.0,0.343961,0.357015,0.013473,0.07569,0.002322,0.286446,0.039638
75%,0.446854,0.417579,0.189214,0.0,0.115115,0.12458,0.06965,0.111452,0.000171,0.23258,...,0.125345,0.0,0.0,0.489962,0.452736,0.723975,0.186872,0.198613,0.469583,0.042915
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.090548


In [20]:
X_train = train_set_scaled[['price', 'market_cap', 'volumes_last_24h','average_liquidity_utilization','total_trade_volume', 'average_liquidity_utilization_30day_avg','total_trade_volume_30day_avg']]

X_extended_train = train_set_scaled[['price', 'market_cap', 'volumes_last_24h', 'aerodrome_liquidity', 'balancer_liquidity', 'beethoven_x_liquidity', 'camelot_liquidity', 'curve_liquidity', 'kyberswap_liquidity', 'maverick_liquidity', 'pancakeswap_liquidity', 'ramses_liquidity', 'solidly_liquidity', 'synkswap_liquidity', 'uniswap_v2_liquidity', 'uniswap_v3_liquidity', 'velodrome_liquidity', 'velodrome_v2_liquidity', 'wombat_liquidity', 'average_liquidity_utilization', 'aerodrome_trade_volume', 'balancer_trade_volume', 'beethoven_x_trade_volume', 'camelot_trade_volume', 'curve_trade_volume', 'kyberswap_trade_volume', 'maverick_trade_volume', 'pancakeswap_trade_volume', 'ramses_trade_volume', 'solidly_trade_volume', 'synkswap_trade_volume', 'uniswap_v2_trade_volume', 'uniswap_v3_trade_volume', 'velodrome_trade_volume', 'velodrome_v2_trade_volume', 'wombat_trade_volume', 'total_trade_volume', 'aerodrome_liquidity_utilization_30day_avg', 'balancer_liquidity_utilization_30day_avg', 'beethoven_x_liquidity_utilization_30day_avg', 'camelot_liquidity_utilization_30day_avg', 'curve_liquidity_utilization_30day_avg', 'kyberswap_liquidity_utilization_30day_avg', 'maverick_liquidity_utilization_30day_avg', 'pancakeswap_liquidity_utilization_30day_avg', 'ramses_liquidity_utilization_30day_avg', 'solidly_liquidity_utilization_30day_avg', 'synkswap_liquidity_utilization_30day_avg', 'uniswap_v2_liquidity_utilization_30day_avg', 'uniswap_v3_liquidity_utilization_30day_avg', 'velodrome_liquidity_utilization_30day_avg', 'velodrome_v2_liquidity_utilization_30day_avg', 'wombat_liquidity_utilization_30day_avg', 'average_liquidity_utilization_30day_avg', 'aerodrome_trade_volume_30day_avg', 'balancer_trade_volume_30day_avg', 'beethoven_x_trade_volume_30day_avg', 'camelot_trade_volume_30day_avg', 'curve_trade_volume_30day_avg', 'kyberswap_trade_volume_30day_avg', 'maverick_trade_volume_30day_avg', 'pancakeswap_trade_volume_30day_avg', 'ramses_trade_volume_30day_avg', 'solidly_trade_volume_30day_avg', 'synkswap_trade_volume_30day_avg', 'uniswap_v2_trade_volume_30day_avg', 'uniswap_v3_trade_volume_30day_avg', 'velodrome_trade_volume_30day_avg', 'velodrome_v2_trade_volume_30day_avg', 'wombat_trade_volume_30day_avg', 'total_trade_volume_30day_avg']]

Y_train = train_set_scaled[['lagged_yield']]

X_test = test_set_scaled[['price', 'market_cap', 'volumes_last_24h','average_liquidity_utilization','total_trade_volume', 'average_liquidity_utilization_30day_avg','total_trade_volume_30day_avg']]

X_extended_test = test_set_scaled[['price', 'market_cap', 'volumes_last_24h', 'aerodrome_liquidity', 'balancer_liquidity', 'beethoven_x_liquidity', 'camelot_liquidity', 'curve_liquidity', 'kyberswap_liquidity', 'maverick_liquidity', 'pancakeswap_liquidity', 'ramses_liquidity', 'solidly_liquidity', 'synkswap_liquidity', 'uniswap_v2_liquidity', 'uniswap_v3_liquidity', 'velodrome_liquidity', 'velodrome_v2_liquidity', 'wombat_liquidity', 'average_liquidity_utilization', 'aerodrome_trade_volume', 'balancer_trade_volume', 'beethoven_x_trade_volume', 'camelot_trade_volume', 'curve_trade_volume', 'kyberswap_trade_volume', 'maverick_trade_volume', 'pancakeswap_trade_volume', 'ramses_trade_volume', 'solidly_trade_volume', 'synkswap_trade_volume', 'uniswap_v2_trade_volume', 'uniswap_v3_trade_volume', 'velodrome_trade_volume', 'velodrome_v2_trade_volume', 'wombat_trade_volume', 'total_trade_volume', 'aerodrome_liquidity_utilization_30day_avg', 'balancer_liquidity_utilization_30day_avg', 'beethoven_x_liquidity_utilization_30day_avg', 'camelot_liquidity_utilization_30day_avg', 'curve_liquidity_utilization_30day_avg', 'kyberswap_liquidity_utilization_30day_avg', 'maverick_liquidity_utilization_30day_avg', 'pancakeswap_liquidity_utilization_30day_avg', 'ramses_liquidity_utilization_30day_avg', 'solidly_liquidity_utilization_30day_avg', 'synkswap_liquidity_utilization_30day_avg', 'uniswap_v2_liquidity_utilization_30day_avg', 'uniswap_v3_liquidity_utilization_30day_avg', 'velodrome_liquidity_utilization_30day_avg', 'velodrome_v2_liquidity_utilization_30day_avg', 'wombat_liquidity_utilization_30day_avg', 'average_liquidity_utilization_30day_avg', 'aerodrome_trade_volume_30day_avg', 'balancer_trade_volume_30day_avg', 'beethoven_x_trade_volume_30day_avg', 'camelot_trade_volume_30day_avg', 'curve_trade_volume_30day_avg', 'kyberswap_trade_volume_30day_avg', 'maverick_trade_volume_30day_avg', 'pancakeswap_trade_volume_30day_avg', 'ramses_trade_volume_30day_avg', 'solidly_trade_volume_30day_avg', 'synkswap_trade_volume_30day_avg', 'uniswap_v2_trade_volume_30day_avg', 'uniswap_v3_trade_volume_30day_avg', 'velodrome_trade_volume_30day_avg', 'velodrome_v2_trade_volume_30day_avg', 'wombat_trade_volume_30day_avg', 'total_trade_volume_30day_avg']]

Y_test = test_set_scaled[['lagged_yield']]

In [21]:
print("X_train shape:", X_train.shape)
print("X_extended_train shape:", X_extended_train.shape)
print("X_test shape:", X_test.shape)
print("X_extended_test shape:", X_extended_test.shape)

X_train shape: (273, 7)
X_extended_train shape: (273, 71)
X_test shape: (31, 7)
X_extended_test shape: (31, 71)


### Model Development

In [22]:
import torch
import torch.nn as nn
import torch.optim as optim

class FeedForwardNN(nn.Module):
    def __init__(self, input_size, hidden_size1, hidden_size2, output_size):
        super(FeedForwardNN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size1)
        self.relu = nn.ReLU()   
        self.fc2 = nn.Linear(hidden_size1, hidden_size2)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(hidden_size2, output_size)
        
    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.fc3(x)
        return x

input_size = 7
input_size_extended = 71  
hidden_size1 = 32
hidden_size2 = 16
output_size = 1  

# Create an instance of the feedforward neural network
model_simple = FeedForwardNN(input_size, hidden_size1, hidden_size2, output_size)
model_extended = FeedForwardNN(input_size_extended, hidden_size1, hidden_size2, output_size)

# Print the model architecture
print(model_simple)
print(model_extended)


FeedForwardNN(
  (fc1): Linear(in_features=7, out_features=32, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=32, out_features=16, bias=True)
  (relu2): ReLU()
  (fc3): Linear(in_features=16, out_features=1, bias=True)
)
FeedForwardNN(
  (fc1): Linear(in_features=71, out_features=32, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=32, out_features=16, bias=True)
  (relu2): ReLU()
  (fc3): Linear(in_features=16, out_features=1, bias=True)
)


Training in a K-folds cross validation scheme

In [23]:
from sklearn.model_selection import KFold


def train_model(model, X, Y, num_epochs, batch_size, num_folds):
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.0001)
    
    # Convert X and Y to tensors
    X_tensor = torch.tensor(X.values, dtype=torch.float32)
    Y_tensor = torch.tensor(Y.values, dtype=torch.float32)
    
    # Create a dataset from X and Y tensors
    dataset = torch.utils.data.TensorDataset(X_tensor, Y_tensor)
    
    # Create a data loader for the dataset
    train_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
    
    # Perform cross-validation
    kf = KFold(n_splits=num_folds, shuffle=False)
    fold = 1
    
    # Array to store cross-validation errors
    cv_errors = []
    
    for train_index, val_index in kf.split(X):
        print(f"Fold {fold}/{num_folds}")
        
        # Split the data into training and validation sets
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        Y_train, Y_val = Y.iloc[train_index], Y.iloc[val_index]
        
        # Convert training and validation sets to tensors
        X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
        Y_train_tensor = torch.tensor(Y_train.values, dtype=torch.float32)
        X_val_tensor = torch.tensor(X_val.values, dtype=torch.float32)
        Y_val_tensor = torch.tensor(Y_val.values, dtype=torch.float32)
        
        # Create datasets and data loaders for training and validation sets
        train_dataset = torch.utils.data.TensorDataset(X_train_tensor, Y_train_tensor)
        val_dataset = torch.utils.data.TensorDataset(X_val_tensor, Y_val_tensor)
        train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
        
        fold_errors = []  # Array to store errors for each fold
        
        for epoch in range(num_epochs):
            model.train()
            running_loss = 0.0
            
            for inputs, labels in train_loader:
                optimizer.zero_grad()
                
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                
                loss.backward()
                optimizer.step()
                
                running_loss += loss.item()
            
            epoch_loss = running_loss / len(train_loader)
            print(f"Fold {fold}/{num_folds}, Epoch {epoch+1}/{num_epochs}, Training Loss: {epoch_loss:.4f}")
        
        # Evaluate the model on the validation set after the last batch is trained
        model.eval()
        val_loss = 0.0
        
        with torch.no_grad():
            for inputs, labels in val_loader:
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                val_loss += loss.item()
        
        val_loss /= len(val_loader)
        print(f"Fold {fold}/{num_folds}, Validation Loss: {val_loss:.4f}")
        
        fold_errors.append(val_loss)  # Save validation loss for the fold
        cv_errors.append(fold_errors)  # Save fold errors to cross-validation errors array
        fold += 1
    
    return cv_errors



Lets save an example input data to use with verifiable inference in the second part.

In [24]:
# Transform the data into a numpy array
data_array = X_test.iloc[0]
data_extended_array = X_extended_test.iloc[0]

# Save the array as a numpy file
np.save('../data/data_extended_array.npy', data_extended_array)
np.save('../data/data_array.npy', data_array)


In [25]:
from sklearn.metrics import root_mean_squared_error


def train_and_evaluate(model, X_train, Y_train, X_test, Y_test, num_epochs, batch_size):

    # Train the model with X_train
    cv = train_model(model, X_train, Y_train, num_epochs, batch_size, num_folds=5)

    # Set the model to evaluation mode
    model.eval()

    X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
    pred = model(X_test_tensor)

    pred = pred.detach().numpy()

    rmse = root_mean_squared_error(Y_test, pred)



    return {'RMSE': rmse}

In [26]:
num_epochs = 10
batch_size = 5

results_simple = train_and_evaluate(model_simple, X_train, Y_train, X_test, Y_test, num_epochs, batch_size)
results_extended = train_and_evaluate(model_extended, X_extended_train, Y_train, X_extended_test, Y_test, num_epochs, batch_size)

Fold 1/5
Fold 1/5, Epoch 1/10, Training Loss: 0.0001
Fold 1/5, Epoch 2/10, Training Loss: 0.0000
Fold 1/5, Epoch 3/10, Training Loss: 0.0000
Fold 1/5, Epoch 4/10, Training Loss: 0.0000
Fold 1/5, Epoch 5/10, Training Loss: 0.0000
Fold 1/5, Epoch 6/10, Training Loss: 0.0000
Fold 1/5, Epoch 7/10, Training Loss: 0.0000
Fold 1/5, Epoch 8/10, Training Loss: 0.0000
Fold 1/5, Epoch 9/10, Training Loss: 0.0000
Fold 1/5, Epoch 10/10, Training Loss: 0.0000
Fold 1/5, Validation Loss: 0.0002
Fold 2/5
Fold 2/5, Epoch 1/10, Training Loss: 0.0001
Fold 2/5, Epoch 2/10, Training Loss: 0.0001
Fold 2/5, Epoch 3/10, Training Loss: 0.0001
Fold 2/5, Epoch 4/10, Training Loss: 0.0001
Fold 2/5, Epoch 5/10, Training Loss: 0.0001
Fold 2/5, Epoch 6/10, Training Loss: 0.0000
Fold 2/5, Epoch 7/10, Training Loss: 0.0000
Fold 2/5, Epoch 8/10, Training Loss: 0.0000
Fold 2/5, Epoch 9/10, Training Loss: 0.0000
Fold 2/5, Epoch 10/10, Training Loss: 0.0000
Fold 2/5, Validation Loss: 0.0000
Fold 3/5
Fold 3/5, Epoch 1/10, T

In [27]:
results_extended = train_and_evaluate(model_extended, X_extended_train, Y_train, X_extended_test, Y_test, num_epochs, batch_size)

Fold 1/5
Fold 1/5, Epoch 1/10, Training Loss: 0.0000
Fold 1/5, Epoch 2/10, Training Loss: 0.0000
Fold 1/5, Epoch 3/10, Training Loss: 0.0000
Fold 1/5, Epoch 4/10, Training Loss: 0.0000
Fold 1/5, Epoch 5/10, Training Loss: 0.0000
Fold 1/5, Epoch 6/10, Training Loss: 0.0000
Fold 1/5, Epoch 7/10, Training Loss: 0.0000
Fold 1/5, Epoch 8/10, Training Loss: 0.0000
Fold 1/5, Epoch 9/10, Training Loss: 0.0000
Fold 1/5, Epoch 10/10, Training Loss: 0.0000
Fold 1/5, Validation Loss: 0.0001
Fold 2/5
Fold 2/5, Epoch 1/10, Training Loss: 0.0000
Fold 2/5, Epoch 2/10, Training Loss: 0.0000
Fold 2/5, Epoch 3/10, Training Loss: 0.0000
Fold 2/5, Epoch 4/10, Training Loss: 0.0000
Fold 2/5, Epoch 5/10, Training Loss: 0.0000
Fold 2/5, Epoch 6/10, Training Loss: 0.0000
Fold 2/5, Epoch 7/10, Training Loss: 0.0000
Fold 2/5, Epoch 8/10, Training Loss: 0.0000
Fold 2/5, Epoch 9/10, Training Loss: 0.0000
Fold 2/5, Epoch 10/10, Training Loss: 0.0000
Fold 2/5, Validation Loss: 0.0000
Fold 3/5
Fold 3/5, Epoch 1/10, T

## Export as ONNX Model

In [28]:
import torch.onnx as onnx

def onnx_export(model, filename, input_size):
    dummy_input = torch.randn(1, input_size)
    onnx.export(model, dummy_input, filename, opset_version=11)
    print(f"{filename} exported to ONNX successfully!")

In [29]:

onnx_export(model_simple,"model_simple.onnx",input_size)
onnx_export(model_extended,"model_extended.onnx",input_size_extended)

model_simple.onnx exported to ONNX successfully!
model_extended.onnx exported to ONNX successfully!


Awesome, now that we have succesfully exported the model to the ONNX, we can start using the giza-cli and giza-actions to make the model verifiable!