In [94]:
#%pip install -r ../requirements.txt



In [95]:
def calculate_rsi(data, window=14):
    delta = data['Close'].diff(1)
    gain = (delta.where(delta > 0, 0)).rolling(window=window).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=window).mean()
    rs = gain / loss
    rsi = 100 - (100 / (1 + rs))
    return rsi

def calculate_macd(data, short_window=12, long_window=26, signal_window=9):
    # Short-term EMA (12-period)
    short_ema = data['Close'].ewm(span=short_window, adjust=False).mean()
    
    # Long-term EMA (26-period)
    long_ema = data['Close'].ewm(span=long_window, adjust=False).mean()
    
    # MACD Line
    macd_line = short_ema - long_ema
    
    # Signal Line (9-period EMA of MACD Line)
    signal_line = macd_line.ewm(span=signal_window, adjust=False).mean()
    
    # MACD Histogram (difference between MACD Line and Signal Line)
    macd_histogram = macd_line - signal_line
    
    return macd_line, signal_line, macd_histogram


In [96]:
import pandas as pd

# Load the CSV file, assuming it's saved as 'data.csv'
#data = pd.read_csv('../sampledata/MSFT_5d_1m_intraday_last_week.csv', parse_dates=['Datetime'], index_col='Datetime')
data = pd.read_csv('../sampledata/AAPL_intraday_data.csv', parse_dates=['Datetime'], index_col='Datetime')

data = data[(data.index.time >= pd.to_datetime('09:30:00').time()) &
            (data.index.time <= pd.to_datetime('16:00:00').time())]

# Display the data to ensure it's loaded correctly
print(data)


                         Open     High     Low     Close   Volume
Datetime                                                         
2024-08-15 09:30:00  419.8000  420.430  418.92  419.0450   407933
2024-08-15 09:31:00  419.0400  419.165  418.11  418.9818    83780
2024-08-15 09:32:00  419.1250  419.567  418.36  418.7850    64848
2024-08-15 09:33:00  418.8100  419.110  418.28  418.4500    43177
2024-08-15 09:34:00  418.4600  418.990  418.41  418.9350    44487
...                       ...      ...     ...       ...      ...
2024-09-13 15:56:00  430.9102  431.190  430.76  431.1350   843444
2024-09-13 15:57:00  431.1350  431.140  430.70  431.0350    74687
2024-09-13 15:58:00  431.0200  431.200  430.97  431.0700   105582
2024-09-13 15:59:00  431.0600  431.100  430.48  430.5700   251679
2024-09-13 16:00:00  430.5700  430.590  430.16  430.5900  6527477

[8211 rows x 5 columns]


2. Feature Engineering
With your data, the next step is to create features that your machine learning model can learn from. These might include:

Moving averages
Price changes over time (e.g., close-to-close percentage change)
High-Low spread
Here's how to create some simple features:

In [97]:
# Feature: 10-period moving average
data['MA10'] = data['Close'].rolling(window=10).mean()

# Feature: 50-period moving average
data['MA50'] = data['Close'].rolling(window=50).mean()

# Feature: Close-to-Close percentage change
data['Pct_Change'] = data['Close'].pct_change()

# Feature: High-Low spread
data['HL_Spread'] = data['High'] - data['Low']

# Add RSI as a feature
data['RSI'] = calculate_rsi(data)

# Add MACD Line, Signal Line, and MACD Histogram as features
data['MACD_Line'], data['Signal_Line'], data['MACD_Histogram'] = calculate_macd(data)

# Calculate the percentage above/below current price for moving averages
data['MA10_pct'] = (data['MA10'] - data['Close']) / data['Close'] * 100
data['MA50_pct'] = (data['MA50'] - data['Close']) / data['Close'] * 100

# Calculate the MACD and Signal Line as a percentage of the current price
data['MACD_pct'] = data['MACD_Line'] / data['Close'] * 100
data['Signal_pct'] = data['Signal_Line'] / data['Close'] * 100


# Drop any rows with missing values due to moving averages
data = data.dropna()



Scale the features

In [98]:

from sklearn.preprocessing import StandardScaler

# Define the new percentage-based features
features = ['MA10_pct', 'MA50_pct', 'Pct_Change', 'HL_Spread', 'RSI', 'MACD_pct', 'Signal_pct', 'MACD_Histogram']

# Scale the features
scaler = StandardScaler()
data_scaled = data.copy()
data_scaled[features] = scaler.fit_transform(data[features])


In [99]:
data_scaled[features]

Unnamed: 0_level_0,MA10_pct,MA50_pct,Pct_Change,HL_Spread,RSI,MACD_pct,Signal_pct,MACD_Histogram
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2024-08-15 10:19:00,-0.557081,-0.122028,0.465720,-0.429551,0.074833,-0.196602,-0.218299,0.041321
2024-08-15 10:20:00,-0.717053,-0.234759,0.488761,-0.221987,0.846189,-0.118283,-0.199401,0.264167
2024-08-15 10:21:00,-1.081366,-0.455344,0.969411,-0.170095,1.377224,0.009828,-0.157322,0.589780
2024-08-15 10:22:00,-0.475011,-0.278324,-0.710072,0.110117,1.193966,0.062202,-0.112775,0.626399
2024-08-15 10:23:00,-0.498364,-0.306723,0.187739,-0.014422,1.183766,0.115205,-0.065947,0.657658
...,...,...,...,...,...,...,...,...
2024-09-13 15:56:00,-0.412410,-0.333022,0.802163,0.525246,0.449120,0.223765,0.257415,-0.080242
2024-09-13 15:57:00,-0.169810,-0.233103,-0.381274,0.566759,0.080595,0.226528,0.253648,-0.056121
2024-09-13 15:58:00,-0.219459,-0.251002,0.125771,-0.305013,-0.120564,0.234571,0.252253,-0.020543
2024-09-13 15:59:00,0.880942,0.198295,-1.883823,1.313992,-0.798066,0.111276,0.225505,-0.391606


3. Define the Target
You can define the target variable for training your model. A common approach is to predict the price direction (1 for upward movement, 0 for downward movement).

In [100]:
def calculate_target(data, n_periods):
    """
    Create a target column based on the price performance over N periods.
    
    data: DataFrame
        The price data.
    n_periods: int
        Number of periods into the future to evaluate the target.
    """
    # Target: if Close price N periods ahead is greater than current Close price -> 1 (buy), else 0 (sell)
    data['Target'] = (data['Close'].shift(-n_periods) > (data['Close']*1.001)).astype(int)
    return data

# Choose the number of periods, e.g., N=5
N = 4
data_scaled = calculate_target(data_scaled, N)
print(data_scaled['Target'].value_counts())

# Drop the last N rows as they won't have valid targets
data_scaled = data_scaled.dropna()


Target
0    7133
1    1029
Name: count, dtype: int64


4. Train-Test Split
Before training the model, split your data into training and testing sets:

In [101]:
# Define feature columns including RSI and MACD-related features
features = ['MA10_pct', 'MA50_pct', 'Pct_Change', 'HL_Spread', 'RSI', 'MACD_pct', 'Signal_pct', 'MACD_Histogram']

X = data_scaled[features]
y = data_scaled['Target']

# Split the data into training and testing sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)


5. Train a Logistic Regression Model
Logistic regression is a simple model you can use as a starting point:

In [102]:

# Train the logistic regression model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Increase the number of iterations
model = LogisticRegression(max_iter=500)
model.fit(X_train, y_train)
print(y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy for N={N} periods: {accuracy:.2f}")


Datetime
2024-08-15 10:19:00    0
2024-08-15 10:20:00    0
2024-08-15 10:21:00    0
2024-08-15 10:22:00    1
2024-08-15 10:23:00    1
                      ..
2024-09-09 14:47:00    0
2024-09-09 14:48:00    0
2024-09-09 14:49:00    0
2024-09-09 14:50:00    0
2024-09-09 14:51:00    0
Name: Target, Length: 6529, dtype: int64
Model Accuracy for N=4 periods: 0.82


In [103]:
import joblib

# Save the model
joblib.dump(model, 'logistic_regression_model.pkl')
joblib.dump(scaler, 'logistic_regression_scaler.pkl')

['logistic_regression_scaler.pkl']