In [95]:
import mt.ml_model_training as training
import mt.resources.ml_funcs as mlf
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import MinMaxScaler, QuantileTransformer
from sklearn.metrics import fbeta_score
from sklearn.metrics import confusion_matrix, roc_auc_score, make_scorer
from sklearn.inspection import permutation_importance
from sklearn.feature_selection import SelectKBest, mutual_info_classif, f_classif, chi2
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from imblearn.under_sampling import RandomUnderSampler, ClusterCentroids

In [96]:
# Run configuration for the dataset build at the bottom of this cell.
strat_name = 'ChannelRun'  # within this cell, only referenced by the commented-out create_risk_dataset call
side = 'long'
timeframe = '1h'
strat_params = (200, 'edge')  # NOTE(review): meaning of the two entries is defined by the training module - confirm there
num_pairs = 150
selection_method = '1w_volumes'
thresh = 0.4  # within this cell, only referenced by the commented-out create_risk_dataset call
data_len = 2500
# presumably returns the `num_pairs` pairs chosen by `selection_method` - verify in mt.resources.ml_funcs
pairs = mlf.get_margin_pairs(selection_method, num_pairs)
# Alternative dataset builder, currently disabled:
# results = training.create_risk_dataset(strat_name, side, timeframe, strat_params, num_pairs, selection_method, thresh)
# `results` is used below as a DataFrame that contains 'win' and 'pnl' columns.
results = training.generate_channel_run_dataset(pairs, side, timeframe, strat_params, data_len)

data generation began: 2023/11/09 11:43


In [79]:
# Display the generated dataset for a quick visual check.
results

Unnamed: 0,conf_l,conf_s,inval_dist,mkt_rank_1d,mkt_rank_1w,mkt_rank_1m,pnl,win
0,0.3588,1.0,0.0508,0.0254,0.0881,0.963,True,True
1,0.7476,1.0,0.0164,0.0623,0.0399,0.0257,False,False
2,0.8338,1.0,0.0158,0.2918,0.0741,0.1457,False,False
3,0.8407,1.0,0.0147,0.1785,0.037,0.0571,True,True
4,0.2949,0.0277,0.0343,0.1756,0.0285,0.9486,True,True
5,0.4048,0.5052,0.0214,0.1558,0.0399,0.02,True,True
6,0.6952,1.0,0.0333,0.0763,0.0142,0.9031,True,True
7,0.6229,1.0,0.0381,0.1586,0.0085,0.9857,True,True
8,0.3269,1.0,0.0502,0.0989,0.0114,0.0028,True,True
9,0.1905,0.5052,0.0537,0.0085,0.0142,0.6571,True,True


In [80]:
# Split features from labels: 'win' is the target, and 'pnl' is removed from
# the feature matrix together with it.
X = results.drop(columns=['win', 'pnl'])
y = results['win']  # pnl > threshold

# Balance the classes by under-sampling with ClusterCentroids.
# RandomUnderSampler(random_state=0) is the alternative that was tried here.
us = ClusterCentroids(random_state=0)
X, y = us.fit_resample(X, y)

original_cols = list(X.columns)  # list of strings, names of all features

# fit_transform returns a bare array, so the column names are re-attached.
# The transformer is seeded like the sampler above so that re-running the cell
# gives the same result even when the transformer subsamples the data.
X = pd.DataFrame(
    QuantileTransformer(random_state=0).fit_transform(X),
    columns=original_cols,
)

In [81]:
# Feature elimination, step 1: keep columns with fewer than 10% missing values.
missing_share = X.isnull().mean(axis=0)
nan_condition = X.columns[missing_share < 0.1]
X = X.loc[:, nan_condition]

# Step 2: of the remaining columns, keep those whose variance exceeds 0.001.
feature_variance = X.var()
variance_condition = X.columns[feature_variance > 0.001]
X = X.loc[:, variance_condition]

In [91]:
# Collinearity check: pairwise correlations of the remaining features.
corr_thresh = 0.5
corr_matrix = X.corr()

# Boolean mask that is True strictly above the diagonal, so each feature pair
# appears exactly once and the self-correlations on the diagonal are excluded.
above_diagonal = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(above_diagonal)

In [92]:
# Inspect the upper triangle of the correlation matrix (all other entries are NaN).
upper

Unnamed: 0,conf_l,mkt_rank_1d
conf_l,,0.4
mkt_rank_1d,,


In [93]:

# A column is flagged for removal when any entry above the diagonal in that
# column has an absolute correlation greater than the threshold (the sign of
# the correlation is irrelevant, hence abs()).
exceeds_thresh = upper.abs() > corr_thresh
to_drop = [column for column in upper.columns if exceeds_thresh[column].any()]

In [94]:
# Show the column names flagged for removal by the correlation threshold.
to_drop

[]

In [86]:

# For every feature in `to_drop`, record which features it is correlated with
# and how strongly, so the reason for each removal can be inspected afterwards.
collinear_columns = ['drop_feature', 'corr_feature', 'corr_value']
record_collinear = []
for column in to_drop:
    # Entries of this column whose absolute correlation exceeds the threshold;
    # the index labels are the features `column` is correlated with.
    column_corr = upper[column]
    strong = column_corr[column_corr.abs() > corr_thresh]

    # One row per correlated pair, with `column` repeated as the feature to drop.
    record_collinear.append(pd.DataFrame({
        'drop_feature': [column] * len(strong),
        'corr_feature': list(strong.index),
        'corr_value': list(strong),
    }))

if record_collinear:
    collinear_features = pd.concat(record_collinear, axis=0, ignore_index=True)
else:
    # Nothing exceeded the threshold. Use an empty frame with the same columns
    # rather than None, so that the cells below that read
    # `collinear_features.drop_feature` / `.corr_feature` keep working.
    collinear_features = pd.DataFrame(columns=collinear_columns)

In [87]:
# Show the recorded collinear pairs (one row per drop_feature / corr_feature pair).
collinear_features

Unnamed: 0,drop_feature,corr_feature,corr_value
0,conf_s,conf_l,0.9467
1,inval_dist,conf_l,-1.0
2,inval_dist,conf_s,-0.9467
3,mkt_rank_1w,conf_l,0.8
4,mkt_rank_1w,conf_s,0.7746
5,mkt_rank_1w,inval_dist,-0.8
6,mkt_rank_1m,conf_s,-0.6025


In [88]:
# Unique features flagged for removal; an empty list when no collinear pairs
# were recorded (collinear_features may be None in that case).
[] if collinear_features is None else list(collinear_features['drop_feature'].unique())

['conf_s', 'inval_dist', 'mkt_rank_1w', 'mkt_rank_1m']

In [89]:
# Unique features that the flagged ones are correlated with; an empty list when
# no collinear pairs were recorded (collinear_features may be None in that case).
[] if collinear_features is None else list(collinear_features['corr_feature'].unique())

['conf_l', 'conf_s', 'inval_dist']

In [90]:
# Remove the features flagged as collinear. This is skipped when no pairs were
# recorded (collinear_features may be None). errors='ignore' keeps the cell
# re-runnable: on a second run the columns are already gone and a plain drop
# would raise KeyError.
if collinear_features is not None:
    X = X.drop(columns=collinear_features['drop_feature'].unique(), errors='ignore')