In [1]:
import pandas as pd
import yfinance as yf
from tuneta.tune_ta import TuneTA
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score

  from .autonotebook import tqdm as notebook_tqdm
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [2]:
# Step 1: Data Collection
start_date = '2020-01-01'
end_date = '2024-01-01'
btc_data = yf.download('BTC-USD', start=start_date, end=end_date)

# Ensure the dataframe contains the necessary columns: 'Open', 'High', 'Low', 'Close', 'Volume'
btc_data = btc_data[['Open', 'High', 'Low', 'Close', 'Volume']]


[*********************100%%**********************]  1 of 1 completed


In [3]:
# Step 2: Identify Significant Moves
btc_data['Pct_Change'] = btc_data['Close'].pct_change() * 100
btc_data['Significant_Move'] = (btc_data['Pct_Change'].abs() >= 5).astype(int)
btc_data['Direction'] = (btc_data['Pct_Change'] >= 0).astype(int)  # 1 for up days, 0 for down days

# Filter for significant moves only
significant_moves_data = btc_data[btc_data['Significant_Move'] == 1]

# Drop unnecessary columns
significant_moves_data = significant_moves_data.drop(columns=['Pct_Change', 'Significant_Move'])


In [4]:
from tuneta.tune_ta import TuneTA

# Initialize TuneTA
tuner = TuneTA(n_jobs=4, verbose=1)
# Adjust the ranges to be within the length of the dataset
adjusted_ranges = [(4, 30), (31, min(60, len(significant_moves_data)-1))]
# Fit TuneTA with selected indicators
tuner.fit(
    X=significant_moves_data.drop(columns=['Direction']), 
    y=significant_moves_data['Direction'],
    indicators=['tta.RSI', 'tta.MACD', 'tta.MOM'],
    ranges=adjusted_ranges,
    trials=100,
    early_stop=20
)

# Report correlations
tuner.report(target_corr=True, features_corr=True)

# Transform the data to get the optimized features
optimized_features = tuner.transform(significant_moves_data.drop(columns=['Direction']))


[I 2024-07-09 11:40:37,224] A new study created in memory with name: tta.RSI(X.close, timeperiod=trial.suggest_int('timeperiod', 4, 30), )
[I 2024-07-09 11:40:37,225] A new study created in memory with name: tta.MACD(X.close, fastperiod=trial.suggest_int('fastperiod', 4, 30), slowperiod=trial.suggest_int('slowperiod', 4, 30), signalperiod=trial.suggest_int('signalperiod', 4, 30), )
[I 2024-07-09 11:40:37,226] A new study created in memory with name: tta.MOM(X.close, timeperiod=trial.suggest_int('timeperiod', 4, 30), )
[I 2024-07-09 11:40:37,227] A new study created in memory with name: tta.RSI(X.close, timeperiod=trial.suggest_int('timeperiod', 31, 60), )
[I 2024-07-09 11:40:37,246] Trial 0 finished with value: 0.05916647395594693 and parameters: {'fastperiod': 14, 'slowperiod': 29, 'signalperiod': 23}. Best is trial 0 with value: 0.05916647395594693.
[I 2024-07-09 11:40:37,247] Trial 0 finished with value: 0.144716389655318 and parameters: {'timeperiod': 42}. Best is trial 0 with valu


Indicator Correlation to Target:

                                                        Correlation
----------------------------------------------------  -------------
tta_RSI_timeperiod_7                                       0.443373
tta_MACD_fastperiod_45_slowperiod_57_signalperiod_53       0.396583
tta_MOM_timeperiod_12                                      0.269353
tta_MACD_fastperiod_8_slowperiod_7_signalperiod_25         0.258742
tta_RSI_timeperiod_33                                      0.227708
tta_MOM_timeperiod_34                                      0.10946


OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.



Indicator Correlation to Each Other:

                                                        tta_RSI_timeperiod_7    tta_MACD_fastperiod_45_slowperiod_57_signalperiod_53    tta_MOM_timeperiod_12    tta_MACD_fastperiod_8_slowperiod_7_signalperiod_25    tta_RSI_timeperiod_33    tta_MOM_timeperiod_34
----------------------------------------------------  ----------------------  ------------------------------------------------------  -----------------------  ----------------------------------------------------  -----------------------  -----------------------
tta_RSI_timeperiod_7                                                0                                                       0.398417                 0.807807                                              0.893205                 0.594713                 0.489753
tta_MACD_fastperiod_45_slowperiod_57_signalperiod_53                0.398417                                                0                        0.390359                  

In [None]:


# Split data into training and testing sets
X = significant_moves_data.drop(columns=['Direction'])
y = significant_moves_data['Direction']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Validate that there are no NaN values
assert not X_train.isnull().values.any(), "X_train contains NaN values"
assert not y_train.isnull().values.any(), "y_train contains NaN values"

# Validate the ranges to be within the dataset length
valid_ranges = [(4, min(30, len(X_train)-1))]

try:
    tt = TuneTA(n_jobs=6, verbose=True)
    # Optimize indicators
    tt.fit(
        X=X_train, 
        y=y_train,
        indicators=['all'],
        ranges=valid_ranges,
        trials=100,
        early_stop=10,
    )

    # Show time duration in seconds per indicator
    tt.fit_times()
except KeyError as e:
    print(f"Error: {e}")
    print("Possible causes might be related to the data indexing or specific columns in the dataset.")
# # Show correlation of indicators to target
# tt.report(target_corr=True, features_corr=True)

# # Select features with at most x correlation between each other
# tt.prune(max_inter_correlation=.7)

# # Show correlation of indicators to target and among themselves
# tt.report(target_corr=True, features_corr=True)

# # Add indicators to X_train
# features = tt.transform(X_train)
# X_train = pd.concat([X_train, features], axis=1)

# # Add same indicators to X_test
# features = tt.transform(X_test)
# X_test = pd.concat([X_test, features], axis=1)

[I 2024-07-09 11:40:58,113] A new study created in memory with name: tta.BBANDS(X.close, timeperiod=trial.suggest_int('timeperiod', 4, 30), )
[I 2024-07-09 11:40:58,113] A new study created in memory with name: tta.DEMA(X.close, timeperiod=trial.suggest_int('timeperiod', 4, 30), )
[I 2024-07-09 11:40:58,114] A new study created in memory with name: tta.EMA(X.close, timeperiod=trial.suggest_int('timeperiod', 4, 30), )
[I 2024-07-09 11:40:58,115] A new study created in memory with name: tta.HT_TRENDLINE(X.close, )
[I 2024-07-09 11:40:58,116] A new study created in memory with name: tta.KAMA(X.close, timeperiod=trial.suggest_int('timeperiod', 4, 30), )
[I 2024-07-09 11:40:58,118] A new study created in memory with name: tta.MA(X.close, timeperiod=trial.suggest_int('timeperiod', 4, 30), )
[I 2024-07-09 11:40:58,129] Trial 0 finished with value: 0.1866572236802032 and parameters: {}. Best is trial 0 with value: 0.1866572236802032.
[I 2024-07-09 11:40:58,130] Trial 0 finished with value: 0.3

Error: False
Possible causes might be related to the data indexing or specific columns in the dataset.


[I 2024-07-09 11:41:08,722] Trial 14 finished with value: 0.834607267869486 and parameters: {'fast': 20, 'medium': 16, 'slow': 25}. Best is trial 14 with value: 0.834607267869486.


[I 2024-07-09 11:41:08,725] Trial 23 finished with value: 0.3934206313612763 and parameters: {'length': 30, 'signal': 10}. Best is trial 11 with value: 0.3934206313612763.
[I 2024-07-09 11:41:08,727] Trial 8 finished with value: 0.14680481092042927 and parameters: {'length': 20}. Best is trial 6 with value: 0.30012592227622936.
[I 2024-07-09 11:41:08,733] Trial 13 finished with value: 0.08449842125172813 and parameters: {'length': 25}. Best is trial 2 with value: 0.27284382848690597.
[I 2024-07-09 11:41:08,734] Trial 0 finished with value: 0.09943649495536612 and parameters: {'length': 14}. Best is trial 0 with value: 0.09943649495536612.
[I 2024-07-09 11:41:08,743] Trial 9 finished with value: 0.25214638950310436 and parameters: {'length': 23}. Best is trial 6 with value: 0.30012592227622936.
[I 2024-07-09 11:41:08,747] Trial 1 finished with value: 0.16204412029017728 and parameters: {'length': 29}. Best is trial 1 with value: 0.16204412029017728.
[I 2024-07-09 11:41:08,750] Trial 15 

Error:  Function: pta.mcgd(X.close, length=trial.suggest_int('length', 4, 30), lookahead=False, )  Parameters: {'length': 14}

[I 2024-07-09 11:41:09,851] Trial 22 finished with value: 0.09748720110059439 and parameters: {'length': 15}. Best is trial 15 with value: 0.36755534554517355.





[I 2024-07-09 11:41:09,852] Trial 22 finished with value: 0.19800605891042097 and parameters: {'high_length': 6, 'low_length': 9, 'mamode': 't3'}. Best is trial 11 with value: 0.36633185584983635.
[W 2024-07-09 11:41:09,853] Trial 0 failed with parameters: {'length': 14} because of the following error: Exception(AttributeError("'Series' object has no attribute 'append'")).
Traceback (most recent call last):
  File "/Users/ericervin/miniforge3/envs/tuneta_py39/lib/python3.9/site-packages/tuneta/optimize.py", line 128, in eval_res
    res = eval(function)
  File "<string>", line 1, in <module>
  File "/Users/ericervin/miniforge3/envs/tuneta_py39/lib/python3.9/site-packages/pandas_ta/overlap/mcgd.py", line 24, in mcgd
    mcg_ds = close[:1].append(mcg_cell[1:])
  File "/Users/ericervin/miniforge3/envs/tuneta_py39/lib/python3.9/site-packages/pandas/core/generic.py", line 6299, in __getattr__
    return object.__getattribute__(self, name)
AttributeError: 'Series' object has no attribute 'ap

[!] VWAP volume series is not datetime ordered. Results may not be as expected.


[I 2024-07-09 11:41:16,512] Trial 11 finished with value: 0.14918243427101077 and parameters: {'fast': 29, 'slow': 14, 'signal': 15}. Best is trial 4 with value: 0.5777524892836698.


[!] VWAP price series is not datetime ordered. Results may not be as expected.


[W 2024-07-09 11:41:16,517] Trial 9 failed with parameters: {'width': 23} because of the following error: The value nan is not acceptable.
[W 2024-07-09 11:41:16,518] Trial 9 failed with value nan.
[I 2024-07-09 11:41:16,524] Trial 0 finished with value: 0.0660019806718709 and parameters: {}. Best is trial 0 with value: 0.0660019806718709.
[I 2024-07-09 11:41:16,523] Trial 29 finished with value: 0.08584170022931523 and parameters: {'fast': 15, 'slow': 9, 'signal': 28, 'mamode': 'ema'}. Best is trial 23 with value: 0.6372159570220692.
[I 2024-07-09 11:41:16,529] A new study created in memory with name: fta.SMA(X, period=trial.suggest_int('period', 4, 30), )
[W 2024-07-09 11:41:16,532] Trial 10 failed with parameters: {'width': 4} because of the following error: The value nan is not acceptable.
[I 2024-07-09 11:41:16,534] Trial 12 finished with value: 0.2619755586683056 and parameters: {'fast': 10, 'slow': 4, 'signal': 25}. Best is trial 4 with value: 0.5777524892836698.
[W 2024-07-09 1

Error:  Function: fta.KAMA(X, er=trial.suggest_int('er', 4, 30), ema_fast=trial.suggest_int('ema_fast', 4, 30), ema_slow=trial.suggest_int('ema_slow', 4, 30), period=trial.suggest_int('period', 4, 30), )  Parameters: {'er': 14, 'ema_fast': 29, 'ema_slow': 23, 'period': 20}

[W 2024-07-09 11:41:17,631] Trial 62 failed with parameters: {'width': 26} because of the following error: The value nan is not acceptable.





[W 2024-07-09 11:41:17,633] Trial 62 failed with value nan.
[W 2024-07-09 11:41:17,634] Trial 0 failed with parameters: {'er': 14, 'ema_fast': 29, 'ema_slow': 23, 'period': 20} because of the following error: Exception(AttributeError("'Series' object has no attribute 'iteritems'")).
Traceback (most recent call last):
  File "/Users/ericervin/miniforge3/envs/tuneta_py39/lib/python3.9/site-packages/tuneta/optimize.py", line 128, in eval_res
    res = eval(function)
  File "<string>", line 1, in <module>
  File "/Users/ericervin/miniforge3/envs/tuneta_py39/lib/python3.9/site-packages/finta/finta.py", line 34, in wrap
    return func(*args, **kwargs)
  File "/Users/ericervin/miniforge3/envs/tuneta_py39/lib/python3.9/site-packages/finta/finta.py", line 292, in KAMA
    sc.iteritems(), sma.shift().iteritems(), ohlc[column].iteritems()
  File "/Users/ericervin/miniforge3/envs/tuneta_py39/lib/python3.9/site-packages/pandas/core/generic.py", line 6299, in __getattr__
    return object.__getattr

Error:  Function: fta.EVWMA(X, period=trial.suggest_int('period', 4, 30), )  Parameters: {'period': 14}


[I 2024-07-09 11:41:18,020] Trial 7 finished with value: 0.15771481456033404 and parameters: {'period': 27}. Best is trial 2 with value: 0.29078345351781476.
[W 2024-07-09 11:41:18,022] Trial 0 failed with parameters: {'period': 14} because of the following error: Exception(AttributeError("'Series' object has no attribute 'iteritems'")).
Traceback (most recent call last):
  File "/Users/ericervin/miniforge3/envs/tuneta_py39/lib/python3.9/site-packages/tuneta/optimize.py", line 128, in eval_res
    res = eval(function)
  File "<string>", line 1, in <module>
  File "/Users/ericervin/miniforge3/envs/tuneta_py39/lib/python3.9/site-packages/finta/finta.py", line 34, in wrap
    return func(*args, **kwargs)
  File "/Users/ericervin/miniforge3/envs/tuneta_py39/lib/python3.9/site-packages/finta/finta.py", line 34, in wrap
    return func(*args, **kwargs)
  File "/Users/ericervin/miniforge3/envs/tuneta_py39/lib/python3.9/site-packages/finta/finta.py", line 399, in EVWMA
    for x, y in zip(x.fi

Error:  Function: fta.EV_MACD(X, period_fast=trial.suggest_int('period_fast', 4, 30), period_slow=trial.suggest_int('period_slow', 4, 30), signal=trial.suggest_int('signal', 4, 30), )  Parameters: {'period_fast': 14, 'period_slow': 29, 'signal': 23}


[W 2024-07-09 11:41:18,506] Trial 0 failed with parameters: {'period_fast': 14, 'period_slow': 29, 'signal': 23} because of the following error: Exception(AttributeError("'Series' object has no attribute 'iteritems'")).
Traceback (most recent call last):
  File "/Users/ericervin/miniforge3/envs/tuneta_py39/lib/python3.9/site-packages/tuneta/optimize.py", line 128, in eval_res
    res = eval(function)
  File "<string>", line 1, in <module>
  File "/Users/ericervin/miniforge3/envs/tuneta_py39/lib/python3.9/site-packages/finta/finta.py", line 34, in wrap
    return func(*args, **kwargs)
  File "/Users/ericervin/miniforge3/envs/tuneta_py39/lib/python3.9/site-packages/finta/finta.py", line 34, in wrap
    return func(*args, **kwargs)
  File "/Users/ericervin/miniforge3/envs/tuneta_py39/lib/python3.9/site-packages/finta/finta.py", line 634, in EV_MACD
    evwma_slow = cls.EVWMA(ohlcv, period_slow)
  File "/Users/ericervin/miniforge3/envs/tuneta_py39/lib/python3.9/site-packages/finta/finta.py

AttributeError: 'ApplyResult' object has no attribute 'function'

In [6]:
# Step 4: Generate and Optimize Features without excluding highly correlated features
optimized_features = tuner.tune(high_corr=True, normalize=True)

# Extract the optimized features
features = optimized_features.columns
X = optimized_features[features]
y = significant_moves_data['Direction']


AttributeError: 'TuneTA' object has no attribute 'tune'

In [None]:
# Step 5: Split Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Step 6: Model Training
xgb = XGBClassifier()
xgb.fit(X_train, y_train)

In [None]:
# Step 7: Model Evaluation
y_pred = xgb.predict(X_test)
print(classification_report(y_test, y_pred))
print("ROC-AUC Score:", roc_auc_score(y_test, y_pred))


In [38]:
from tuneta.tune_ta import TuneTA
import pandas as pd
from pandas_ta import percent_return
from sklearn.model_selection import train_test_split
import yfinance as yf

if __name__ == "__main__":
    # Download data set from yahoo, calculate next day return and split into train and test
    X = yf.download("SPY", period="10y", interval="1d", auto_adjust=True)
    y = percent_return(X['Close'], offset=-1)
    
    # Ensure y does not have NaN values after dropping from X
    y.dropna(inplace=True)
    
    # Align the indices of X and y and make sure they line up
    compatible_index = X.index.intersection(y.index)
    X = X.loc[compatible_index]
    y = y.loc[compatible_index]
    
    # Remove NaN values
    X.dropna(inplace=True)
    y.dropna(inplace=True)
    
    # Debug: print shapes and head of dataframes
    print(f"X shape: {X.shape}")
    print(f"y shape: {y.shape}")
    print(f"X head:\n{X.head()}")
    print(f"y head:\n{y.head()}")
    
    # Check for NaNs in X and y explicitly
    if X.isna().sum().sum() > 0:
        raise ValueError("X contains NaN values")
    if y.isna().sum() > 0:
        raise ValueError("y contains NaN values")
    
    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=False)
    
    # Debug: print shapes after split
    print(f"X_train shape: {X_train.shape}")
    print(f"y_train shape: {y_train.shape}")
    print(f"X_test shape: {X_test.shape}")
    print(f"y_test shape: {y_test.shape}")
    
    # Initialize TuneTA with x cores and show trial results
    tt = TuneTA(n_jobs=-1)  # Assuming 'n_jobs=-1' utilizes all available cores
    
    # Convert 'y_train' series to a dataframe
    y_train = pd.DataFrame(y_train)
    
    # Debug: print head of y_train dataframe
    print(f"y_train head:\n{y_train.head()}")
    
    # Ensure X_train and y_train do not contain NaN values
    assert not X_train.isna().sum().sum() > 0, "X_train contains NaN values"
    assert not y_train.isna().sum().sum() > 0, "y_train contains NaN values"
    
    # Debug: print the first few rows of X_train and y_train
    print(f"X_train first few rows:\n{X_train.head()}")
    print(f"y_train first few rows:\n{y_train.head()}")

    # Fit TuneTA with selected indicators
    tt.fit(X_train, y_train,
           indicators=['all'],
           ranges=[(4, 30)],
           trials=100,
           early_stop=10,
           )
    
    # Select features with at most x correlation between each other
    tt.prune(max_inter_correlation=.7)
    
    # Show correlation of indicators to target and among themselves
    tt.report(target_corr=True, features_corr=True)
    
    # Add indicators to X_train
    features = tt.transform(X_train)
    X_train = pd.concat([X_train, features], axis=1)
    
    # Add same indicators to X_test
    features = tt.transform(X_test)
    X_test = pd.concat([X_test, features], axis=1)

    # Debug: print transformed X_train and X_test
    print(f"Transformed X_train:\n{X_train.head()}")
    print(f"Transformed X_test:\n{X_test.head()}")


[*********************100%%**********************]  1 of 1 completed

X shape: (2515, 5)
y shape: (2515,)
X head:
                  Open        High         Low       Close    Volume
Date                                                                
2014-06-27  163.200936  163.954256  163.125608  163.904037  71445100
2014-06-30  163.803584  164.196981  163.661293  163.820328  70201200
2014-07-01  164.222117  165.419053  164.163532  164.916840  90470000
2014-07-02  164.933603  165.293513  164.858274  165.084259  52475000
2014-07-03  165.552955  165.971462  165.427408  165.896133  52938800
y head:
Date
2014-06-27   -0.000511
2014-06-30    0.006693
2014-07-01    0.001015
2014-07-02    0.004918
2014-07-03   -0.003481
Name: PCTRET_1, dtype: float64
X_train shape: (1760, 5)
y_train shape: (1760,)
X_test shape: (755, 5)
y_test shape: (755,)
y_train head:
            PCTRET_1
Date                
2014-06-27 -0.000511
2014-06-30  0.006693
2014-07-01  0.001015
2014-07-02  0.004918
2014-07-03 -0.003481
X_train first few rows:
                  Open        High   




ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().