In [70]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import precision_score, recall_score
from sklearn.preprocessing import StandardScaler
import joblib  # For saving and loading models
import backtrader as bt  # For backtesting
from ta import add_all_ta_features  # For technical indicators

In [71]:
data = pd.read_csv('../data/NVDA_2020-04-01_2024-09-29_peak_valley.csv', parse_dates=['Date'], index_col='Date')
data = data[['Open', 'High', 'Low', 'Close', 'Volume', 'train_target']]

data.index = pd.to_datetime(data.index)
data.sort_index(inplace=True)

data

Unnamed: 0_level_0,Open,High,Low,Close,Volume,train_target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-04-01,6.391250,6.538250,6.032000,6.076750,656912000,0
2020-04-02,6.106000,6.400000,6.057750,6.386750,675764000,0
2020-04-03,6.349000,6.390750,5.959750,6.097750,663212000,0
2020-04-06,6.381000,6.747000,6.323250,6.710000,727884000,0
2020-04-07,6.932500,6.956250,6.432500,6.475750,784520000,0
...,...,...,...,...,...,...
2024-09-23,116.550003,116.989998,114.860001,116.260002,206228500,0
2024-09-24,116.519997,121.800003,115.379997,120.870003,354966800,0
2024-09-25,122.019997,124.940002,121.610001,123.510002,284692900,0
2024-09-26,126.800003,127.669998,121.800003,124.040001,302582900,-1


In [72]:
# Feature Engineering
# Compute technical indicators using 'ta' library
data = add_all_ta_features(
    data,
    open="Open",
    high="High",
    low="Low",
    close="Close",
    volume="Volume",
    fillna=True
)

# Ensure data types are correct for numeric columns
numeric_columns = ['Open', 'High', 'Low', 'Close', 'Volume']
data[numeric_columns] = data[numeric_columns].apply(pd.to_numeric, errors='coerce')

# Drop rows with NaN or infinite values
data.replace([np.inf, -np.inf], np.nan, inplace=True)
data.dropna(inplace=True)
data

  self._psar[i] = high2


Unnamed: 0_level_0,Open,High,Low,Close,Volume,train_target,volume_adi,volume_obv,volume_cmf,volume_fi,...,momentum_ppo,momentum_ppo_signal,momentum_ppo_hist,momentum_pvo,momentum_pvo_signal,momentum_pvo_hist,momentum_kama,others_dr,others_dlr,others_cr
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-04-01,6.391250,6.538250,6.032000,6.076750,656912000,0,-5.407771e+08,656912000,-0.823211,0.000000e+00,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,6.076750,0.000000,0.000000,0.000000
2020-04-02,6.106000,6.400000,6.057750,6.386750,675764000,0,8.266379e+07,1332676000,0.062028,2.094871e+08,...,0.405419,0.081084,0.324335,0.228444,0.045689,0.182755,6.201035,5.101418,4.975559,5.101418
2020-04-03,6.349000,6.390750,5.959750,6.097750,663212000,0,-1.558461e+08,669464000,-0.078084,1.521792e+08,...,0.340488,0.132965,0.207523,0.252579,0.087067,0.165512,6.161110,-4.524994,-4.630569,0.345586
2020-04-06,6.381000,6.747000,6.323250,6.710000,727884000,0,4.449261e+08,1397348000,0.163349,1.941032e+08,...,1.078454,0.322062,0.756391,1.043830,0.278419,0.765411,6.385160,10.040586,9.567908,10.420871
2020-04-07,6.932500,6.956250,6.432500,6.475750,784520000,0,-2.100262e+08,612828000,-0.059866,1.401207e+08,...,1.336866,0.525023,0.811843,2.302989,0.683333,1.619655,6.421036,-3.491059,-3.553453,6.566013
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-09-23,116.550003,116.989998,114.860001,116.260002,206228500,0,3.637431e+10,49520614800,-0.114186,7.000997e+06,...,-0.273221,-0.604087,0.330866,-6.281854,-3.937607,-2.344246,114.768189,0.224140,0.223889,1813.193828
2024-09-24,116.519997,121.800003,115.379997,120.870003,354966800,0,3.662644e+10,49875581600,-0.056892,2.397719e+08,...,0.112538,-0.460762,0.573300,-4.499850,-4.050056,-0.449794,115.467970,3.965251,3.888653,1889.056760
2024-09-25,122.019997,124.940002,121.610001,123.510002,284692900,0,3.666662e+10,50160274500,-0.080962,3.128886e+08,...,0.591240,-0.250362,0.841602,-4.854849,-4.211015,-0.643835,115.955243,2.184164,2.160653,1932.501027
2024-09-26,126.800003,127.669998,121.800003,124.040001,302582900,-1,3.659497e+10,50462857400,-0.096678,2.911000e+08,...,0.991598,-0.001970,0.993568,-4.623327,-4.293477,-0.329850,116.314254,0.429114,0.428196,1941.222775


In [73]:
# Prepare features and target variable
# Exclude 'train_target' from features
X = data.drop(['train_target'], axis=1)
y = data['train_target']

# Optionally, you can exclude 'Open', 'High', 'Low', 'Volume' if not needed
X_model = X.drop(['Open', 'High', 'Low', 'Volume'], axis=1)

# Feature scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_model)

In [74]:
# Cross-validation using TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=5)
model = LogisticRegression(multi_class='multinomial', max_iter=1000)

models = []

# Loop through each train-test split
for split_number, (train_index, test_index) in enumerate(tscv.split(X_scaled), start=1):
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Train the model
    model.fit(X_train, y_train)
    models.append(model)
    
    # Predictions
    y_pred = model.predict(X_test)
    
    # Calculate precision and recall
    precision = precision_score(y_test, y_pred, average=None)  # Use 'micro', 'macro', or None as needed
    recall = recall_score(y_test, y_pred, average=None)

    # Print results
    print(f"Split {split_number}:")
    for idx, label in enumerate(set(y)):
        print(f"Class {label} - Precision: {precision[idx]:.4f}, Recall: {recall[idx]:.4f}")
    print("-------------")

Split 1:
Class 0 - Precision: 0.1724, Recall: 0.2778
Class 1 - Precision: 0.8592, Recall: 0.7974
Class -1 - Precision: 0.5294, Recall: 0.5294
-------------
Split 2:
Class 0 - Precision: 0.5000, Recall: 0.0278
Class 1 - Precision: 0.6309, Recall: 0.8103
Class -1 - Precision: 0.4324, Recall: 0.4444
-------------
Split 3:
Class 0 - Precision: 0.3158, Recall: 0.4286
Class 1 - Precision: 0.7050, Recall: 0.7481
Class -1 - Precision: 0.3636, Recall: 0.1379
-------------
Split 4:
Class 0 - Precision: 0.2346, Recall: 0.5938
Class 1 - Precision: 0.6304, Recall: 0.4640
Class -1 - Precision: 0.2667, Recall: 0.1290
-------------
Split 5:
Class 0 - Precision: 0.3083, Recall: 0.8810
Class 1 - Precision: 0.6667, Recall: 0.2500
Class -1 - Precision: 0.6897, Recall: 0.4762
-------------


In [75]:
# Choose the best model based on cross-validation results
# For simplicity, we'll use the last model (You can implement model selection logic)
best_model = models[-1]

# Save the trained model and scaler for future use
joblib.dump(best_model, '../model/best_model.pkl')
joblib.dump(scaler, '../model/scaler.pkl')

['../model/scaler.pkl']

In [76]:
# Add the features used in the model to the DataFrame
# Assume 'X_model.columns' contains the feature names
feature_columns = X_model.columns.tolist()
data_features = data[feature_columns]

# Merge the features back into the main DataFrame
data_bt = pd.concat([data[['Open', 'High', 'Low', 'Close', 'Volume']], data_features], axis=1)

# Ensure that the DataFrame index is a DatetimeIndex
data_bt.index = pd.to_datetime(data_bt.index)
data_bt.sort_index(inplace=True)

# Drop any rows with NaN values
data_bt.dropna(inplace=True)

In [78]:
class MLStrategy(bt.Strategy):
    def __init__(self):
        self.model = best_model
        self.scaler = scaler
        self.features = feature_columns  # Use the same features as in training
        
        # Create lines for each feature
        self.feature_lines = {}
        for i, feature in enumerate(self.features):
            line = self.datas[0].lines[6 + i]  # Skip the first 6 default lines
            self.feature_lines[feature] = line
    
    def next(self):
        # Prepare the feature set for the current date
        current_features = {}
        for feature in self.features:
            current_features[feature] = self.feature_lines[feature][0]
        
        # Convert to DataFrame for scaling and prediction
        df = pd.DataFrame([current_features])
        
        # Scale features
        X_real_time_scaled = self.scaler.transform(df)
        
        # Predict probabilities
        probs = self.model.predict_proba(X_real_time_scaled)[0]
        classes = self.model.classes_
        
        # Create a mapping of class probabilities
        prob_dict = dict(zip(classes, probs))
        
        # Trading logic based on predicted probabilities
        if prob_dict.get(1, 0) > 0.8 and not self.position:
            self.order = self.buy()
        elif prob_dict.get(-1, 0) > 0.8 and not self.position:
            self.order = self.sell()
        elif prob_dict.get(0, 0) >= 0.2 and self.position:
            self.order = self.close()


In [79]:
# Define a custom data feed class
class CustomPandasData(bt.feeds.PandasData):
    # Add lines for each additional feature
    lines = tuple(feature_columns)
    params = dict(
        # Default parameters from PandasData
        datetime=None,
        open='Open',
        high='High',
        low='Low',
        close='Close',
        volume='Volume',
        openinterest=-1,
        # Map feature columns
        **{feature: feature for feature in feature_columns}
    )


In [80]:
# Initialize Cerebro engine
cerebro = bt.Cerebro()
cerebro.addstrategy(MLStrategy)

# Convert data to Backtrader feed using the custom data feed
data_feed = CustomPandasData(dataname=data_bt)

# Add data to Cerebro
cerebro.adddata(data_feed)

# Set initial cash
cerebro.broker.setcash(100000.0)

# Run backtesting
print('Starting Portfolio Value: %.2f' % cerebro.broker.getvalue())
cerebro.run()
print('Final Portfolio Value: %.2f' % cerebro.broker.getvalue())


Starting Portfolio Value: 100000.00
Final Portfolio Value: 100000.00
