In [95]:
import mt.ml_model_training as training
import mt.resources.ml_funcs as mlf
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import MinMaxScaler, QuantileTransformer
from sklearn.metrics import fbeta_score
from sklearn.metrics import confusion_matrix, roc_auc_score, make_scorer
from sklearn.inspection import permutation_importance
from sklearn.feature_selection import SelectKBest, mutual_info_classif, f_classif, chi2
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from imblearn.under_sampling import RandomUnderSampler, ClusterCentroids

In [96]:
# --- strategy being modelled ---
strat_name, side, timeframe = 'ChannelRun', 'long', '1h'
strat_params = (200, 'edge')

# --- trading-pair universe ---
selection_method, num_pairs = '1w_volumes', 150
pairs = mlf.get_margin_pairs(selection_method, num_pairs)

# --- dataset settings ---
# `thresh` is only consumed by the commented-out create_risk_dataset call below.
thresh = 0.4
data_len = 2500

# Alternative dataset builder, kept for reference:
# results = training.create_risk_dataset(strat_name, side, timeframe, strat_params, num_pairs, selection_method, thresh)
results = training.generate_channel_run_dataset(pairs, side, timeframe, strat_params, data_len)

data generation began: 2023/11/09 11:43


In [97]:
# Display the dataset returned by training.generate_channel_run_dataset.
results

Unnamed: 0,timestamp,open,high,low,close,base_vol,quote_vol,num_trades,taker_buy_base_vol,taker_buy_quote_vol,...,weekly_open_ratio,entry_l,entry_s,atr-10,r_pct,rr,lifespan,pnl_pct,pnl_r,pnl_cat
0,2023-07-27 13:00:00+00:00,0.0523,0.0525,0.0521,0.0523,534300.0,27936.0,276.0,330100.0,17267.0,...,1.0077,False,False,0.00045483,0.00047631,13.1769,120,-0.0256,-53.792,0
1,2023-07-27 15:00:00+00:00,0.942,0.945,0.938,0.939,95730.0,90011.0,454.0,36850.0,34680.0,...,0.9269,False,False,0.0055733,0.0053634,8.9711,2,-0.013,-2.4205,0
2,2023-07-27 16:00:00+00:00,0.939,0.94,0.936,0.938,84080.0,78837.0,281.0,18625.0,17470.0,...,0.926,True,False,0.0053634,0.0051889,11.9629,1,-0.0107,-2.0583,0
3,2023-07-27 17:00:00+00:00,0.1916,0.1916,0.1907,0.1907,107310.0,20506.0,124.0,42739.0,8171.4,...,0.9321,False,False,0.0012588,0.0013996,8.4362,142,-0.0151,-10.823,0
4,2023-07-27 17:00:00+00:00,0.0525,0.0527,0.0524,0.0524,164400.0,8637.5,85.0,59936.0,3148.6,...,1.0096,False,False,0.00048016,0.00051963,12.7088,142,-0.0265,-50.91,0
5,2023-07-27 17:00:00+00:00,0.0521,0.0522,0.0518,0.0521,3349300.0,174020.0,587.0,1292400.0,67205.0,...,1.0296,False,False,0.00048106,0.00049574,10.0428,104,-0.0223,-44.903,0
6,2023-07-27 17:00:00+00:00,0.4272,0.4277,0.4254,0.4256,367390.0,156630.0,740.0,155240.0,66188.0,...,0.964,False,False,0.0027107,0.003055,8.8036,104,-0.0138,-4.5177,0
7,2023-07-27 17:00:00+00:00,0.938,0.938,0.934,0.936,71630.0,67035.0,286.0,39605.0,37087.0,...,0.924,True,False,0.0051889,0.0061528,11.8977,74,-0.0118,-1.9201,0
8,2023-07-27 18:00:00+00:00,0.0525,0.0525,0.0517,0.052,719850.0,37475.0,315.0,231660.0,12081.0,...,1.0019,True,False,0.00051963,0.00049338,14.2677,115,-0.0241,-48.78,0
9,2023-07-27 19:00:00+00:00,1.906,1.914,1.902,1.909,66654.0,127200.0,553.0,31237.0,59616.0,...,0.8769,False,False,0.016612,0.016889,9.4683,72,-0.0187,-1.1076,0


In [102]:
# Split features from labels. `pnl_cat` is the label; `timestamp` and the other
# pnl columns are removed so they cannot be used as features.
# NOTE(review): `lifespan` stays in X. If it is only known once a trade has
# closed, it leaks the outcome into the features -- confirm and drop if so.
X = results.drop(columns=['timestamp', 'pnl_pct', 'pnl_r', 'pnl_cat'])
y = results.pnl_cat  # pnl > threshold

# Balance classes by under-sampling.
# us = RandomUnderSampler(random_state=0)
us = ClusterCentroids(random_state=0)
X, y = us.fit_resample(X, y)

original_cols = list(X.columns)  # list of strings, names of all features

# Seed the transformer as well as the sampler: QuantileTransformer subsamples
# rows when estimating quantiles, so without a seed the result can vary
# between runs.
# NOTE(review): the transformer is fitted on every row; if a train/test split
# follows, fit it on the training rows only.
X = pd.DataFrame(
    QuantileTransformer(random_state=0).fit_transform(X),
    columns=original_cols,
)

In [103]:
# Feature elimination, step 1: keep columns with fewer than 10% missing values.
null_share = X.isnull().mean(axis=0)
nan_condition = X.columns[null_share < 0.1]
X = X.loc[:, nan_condition]

# Step 2: of the remaining columns, keep those whose variance exceeds 0.001.
col_variance = X.var()
variance_condition = X.columns[col_variance > 0.001]
X = X.loc[:, variance_condition]

In [114]:
# Collinearity search: pairwise correlations between the remaining features.
corr_thresh = 0.5
corr_matrix = X.corr()

# Keep only the entries strictly above the diagonal, so each feature pair
# appears once and self-correlations are excluded; everything else becomes NaN.
strict_upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(strict_upper_mask)

In [115]:
# Display the upper-triangle correlation matrix (NaN on and below the diagonal).
upper

Unnamed: 0,open,quote_vol,atr_z_25,atr_100_pct,ats_z_12,bullish_bar,day_of_week,day_of_week_180,dd_z_12,dd_z_25,...,rsi_100_above_30,rsi_14_above_50,rsi_timing_l_3_14,skew_6,skew_100,ema_48_above_192,vol_delta,recent_vd_div_1,entry_l,lifespan
open,,0.1605,0.0114,-0.0852,-0.0381,0.0375,0.0445,0.0035,0.0168,0.0208,...,-0.066542,0.099266,0.055,-0.0201,-0.0365,0.0095,0.2384,0.0633,0.0175,0.1988
quote_vol,,,0.2588,0.3166,0.3019,-0.0092,-0.0336,-0.0254,0.1745,0.1253,...,-0.056191,0.042374,0.1232,-0.068,-0.1806,0.1814,-0.1512,0.0256,0.0285,-0.3363
atr_z_25,,,,-0.1393,0.2601,-0.0767,-0.2948,0.0268,0.255,0.4007,...,0.024987,-0.16406,0.1404,-0.1194,-0.2233,0.0744,-0.0678,-0.0463,-0.0808,-0.0757
atr_100_pct,,,,,0.0821,0.1523,-0.0098,-0.0676,-0.0776,-0.1031,...,-0.076346,0.14946,0.0346,0.0303,0.0182,0.2898,-0.0287,0.0753,0.2217,-0.0517
ats_z_12,,,,,,-0.0361,-0.0644,-0.0068,0.0959,0.1297,...,-0.032437,-0.044562,0.0485,0.0148,-0.021,0.108,-0.0694,-0.0247,0.0042,-0.1189
bullish_bar,,,,,,,-0.0323,-0.0999,-0.2785,-0.1853,...,9.7834e-05,0.2949,0.0986,-0.0542,0.0079,0.1703,0.2749,0.3506,-0.0038,-0.0105
day_of_week,,,,,,,,-0.1666,-0.1234,-0.1409,...,-0.10261,0.01429,-0.1007,0.0492,0.2347,-0.1161,0.0026,-0.0555,0.0069,0.1267
day_of_week_180,,,,,,,,,0.0076,0.0095,...,0.031983,-0.14036,-0.1142,-0.0459,-0.1923,-0.1024,0.0221,-0.0469,-0.0415,0.0273
dd_z_12,,,,,,,,,,0.2951,...,0.043679,-0.12328,0.1718,-0.1735,-0.0271,0.0278,-0.2229,-0.0968,0.1053,-0.1135
dd_z_25,,,,,,,,,,,...,0.074043,-0.25963,0.2547,-0.1736,-0.0851,0.0635,-0.1185,-0.0747,0.065,-0.0297


In [116]:

# Flag every column that has at least one correlation whose absolute value
# exceeds the threshold. NaN entries (diagonal and below) never compare as
# greater, so they are ignored.
exceeds_thresh = upper.abs().gt(corr_thresh).any(axis=0)
to_drop = exceeds_thresh.index[exceeds_thresh].tolist()

In [117]:
# Display the columns flagged for removal by the correlation threshold.
to_drop

[]

In [108]:

# Record every (dropped feature, correlated feature, correlation value) triple.
#
# The previous version appended lists to `record_collinear` and then called
# set() on it, which raises "TypeError: unhashable type: 'list'" as soon as
# `to_drop` is non-empty, and it threw away the correlated partner and value.
# The following cells read `collinear_features.drop_feature` and
# `collinear_features.corr_feature`, so the result must be a DataFrame.
record_collinear = []
for column in to_drop:
    # Rows of `upper` whose correlation with `column` is beyond the threshold
    over_thresh = upper[column].abs() > corr_thresh

    # One record per correlated partner of this column
    for corr_feature, corr_value in upper.loc[over_thresh, column].items():
        record_collinear.append({'drop_feature': column,
                                 'corr_feature': corr_feature,
                                 'corr_value': corr_value})

# Build the frame once from the collected records; passing `columns` keeps the
# expected schema even when nothing exceeded the threshold.
collinear_features = pd.DataFrame(
    record_collinear,
    columns=['drop_feature', 'corr_feature', 'corr_value'],
)

In [109]:
# Display the collinearity record built in the previous cell.
collinear_features

Unnamed: 0,drop_feature,corr_feature,corr_value
0,high,open,1.0
1,low,open,0.9999
2,low,high,0.9999
3,close,open,0.9999
4,close,high,0.9999
5,close,low,1.0
6,base_vol,open,-0.7445
7,base_vol,high,-0.7435
8,base_vol,low,-0.7469
9,base_vol,close,-0.7458


In [110]:
# Distinct features marked for removal, in order of first appearance.
drop = collinear_features['drop_feature'].unique().tolist()
len(drop)

161

In [111]:
# Distinct features that the dropped ones were correlated with.
corr = collinear_features['corr_feature'].unique().tolist()
len(corr)

164

In [113]:
# Remove the collinear features. errors='ignore' makes the cell safe to re-run:
# once the columns are gone, a plain drop would raise KeyError on the second run.
X = X.drop(columns=drop, errors='ignore')