In [5]:
import os
import pandas as pd
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# A helper to find the project root
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), '..', '..', '..'))

In [6]:
# --- Configuration ---
feature_file = os.path.join(PROJECT_ROOT, "data/features/ml_feature_data.parquet")
model_output_dir = os.path.join(PROJECT_ROOT, "qmind_quant/ml_models/models")
model_output_path = os.path.join(model_output_dir, "random_forest_v1.joblib")

# Load the feature-rich data
df = pd.read_parquet(feature_file)
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 560 entries, 0 to 559
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   date            560 non-null    datetime64[ns]
 1   ticker          560 non-null    object        
 2   open            560 non-null    float64       
 3   high            560 non-null    float64       
 4   low             560 non-null    float64       
 5   close           560 non-null    float64       
 6   volume          560 non-null    int64         
 7   returns_1d      560 non-null    float64       
 8   returns_5d      560 non-null    float64       
 9   returns_21d     560 non-null    float64       
 10  volatility_21d  560 non-null    float64       
 11  rsi_14d         560 non-null    float64       
 12  target          560 non-null    int64         
dtypes: datetime64[ns](1), float64(9), int64(2), object(1)
memory usage: 57.0+ KB


Unnamed: 0,date,ticker,open,high,low,close,volume,returns_1d,returns_5d,returns_21d,volatility_21d,rsi_14d,target
0,2024-02-01,AAPL,182.445359,185.380501,182.276788,185.29126,64885400,0.013341,-0.037648,0.006572,0.013361,48.296784,1
1,2024-02-02,AAPL,178.350031,185.75732,177.745152,184.289749,102551700,-0.005405,-0.034144,0.008684,0.013307,46.465574,1
2,2024-02-05,AAPL,186.570441,187.661212,184.279836,186.104385,69668800,0.009847,-0.021123,0.031719,0.013097,50.153429,0
3,2024-02-06,AAPL,185.291252,187.72068,185.202011,187.71077,43490800,0.008632,0.006701,0.044817,0.013119,53.225256,0
4,2024-02-07,AAPL,189.039509,189.44607,187.026553,187.819839,53439000,0.000581,0.027169,0.020748,0.012112,53.435082,0


In [7]:
# Define the features we want to use for prediction
# We exclude non-feature columns like date, ticker, and OHLCV
features = [
    'returns_1d', 
    'returns_5d', 
    'returns_21d', 
    'volatility_21d', 
    'rsi_14d'
]

X = df[features]
y = df['target']

print("Feature matrix (X) shape:", X.shape)
print("Target vector (y) shape:", y.shape)

Feature matrix (X) shape: (560, 5)
Target vector (y) shape: (560,)


In [8]:
# We'll use 80% of the data for training and 20% for testing
train_size = int(0.8 * len(X))
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

print(f"Training set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")

Training set size: 448
Testing set size: 112


In [9]:
# Initialize and train the classifier
# n_estimators is the number of trees in the forest
# random_state ensures our results are reproducible
model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)

print("Training the model...")
model.fit(X_train, y_train)
print("Model training complete.")

Training the model...
Model training complete.


In [10]:
# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy on Test Set: {accuracy:.4f}\n")

# Print a detailed classification report
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=['Down', 'Up']))

Model Accuracy on Test Set: 0.5714

Classification Report:
              precision    recall  f1-score   support

        Down       0.41      0.47      0.44        40
          Up       0.68      0.62      0.65        72

    accuracy                           0.57       112
   macro avg       0.55      0.55      0.55       112
weighted avg       0.59      0.57      0.58       112



In [11]:
# Ensure the output directory exists
os.makedirs(model_output_dir, exist_ok=True)

# Save the model
joblib.dump(model, model_output_path)
print(f"Model saved to: {model_output_path}")


Model saved to: /Users/enisyasaroglu/qmind_quant_platform/qmind_quant/ml_models/models/random_forest_v1.joblib
