In [67]:
#%pip install -r ../requirements.txt



In [68]:
def calculate_rsi(data, window=14):
    delta = data['Close'].diff(1)
    gain = (delta.where(delta > 0, 0)).rolling(window=window).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=window).mean()
    rs = gain / loss
    rsi = 100 - (100 / (1 + rs))
    return rsi

def calculate_macd(data, short_window=12, long_window=26, signal_window=9):
    # Short-term EMA (12-period)
    short_ema = data['Close'].ewm(span=short_window, adjust=False).mean()
    
    # Long-term EMA (26-period)
    long_ema = data['Close'].ewm(span=long_window, adjust=False).mean()
    
    # MACD Line
    macd_line = short_ema - long_ema
    
    # Signal Line (9-period EMA of MACD Line)
    signal_line = macd_line.ewm(span=signal_window, adjust=False).mean()
    
    # MACD Histogram (difference between MACD Line and Signal Line)
    macd_histogram = macd_line - signal_line
    
    return macd_line, signal_line, macd_histogram


In [91]:
import pandas as pd

# Load the CSV file, assuming it's saved as 'data.csv'
#data = pd.read_csv('../sampledata/MSFT_5d_1m_intraday_last_week.csv', parse_dates=['Datetime'], index_col='Datetime')
data = pd.read_csv('../sampledata/msft_intraday_data.csv', parse_dates=['Datetime'], index_col='Datetime')


#data = data[(data['Datetime'].dt.time >= pd.to_datetime('09:30:00').time()) &
#        (data['Datetime'].dt.time <= pd.to_datetime('16:00:00').time())]


# Display the data to ensure it's loaded correctly
print(data.head())


                       Open    High     Low   Close  Volume
Datetime                                                   
2024-08-15 04:00:00  417.31  418.75  416.84  418.00     473
2024-08-15 04:01:00  418.09  418.29  418.01  418.27     665
2024-08-15 04:02:00  418.11  418.27  417.77  417.86     459
2024-08-15 04:03:00  418.11  418.11  417.80  417.80     183
2024-08-15 04:04:00  418.00  418.00  417.82  418.00      40


2. Feature Engineering
With your data, the next step is to create features that your machine learning model can learn from. These might include:

Moving averages
Price changes over time (e.g., close-to-close percentage change)
High-Low spread
Here's how to create some simple features:

In [79]:
# Feature: 10-period moving average
data['MA10'] = data['Close'].rolling(window=10).mean()

# Feature: 50-period moving average
data['MA50'] = data['Close'].rolling(window=50).mean()

# Feature: Close-to-Close percentage change
data['Pct_Change'] = data['Close'].pct_change()

# Feature: High-Low spread
data['HL_Spread'] = data['High'] - data['Low']

# Add RSI as a feature
data['RSI'] = calculate_rsi(data)

# Add MACD Line, Signal Line, and MACD Histogram as features
data['MACD_Line'], data['Signal_Line'], data['MACD_Histogram'] = calculate_macd(data)

# Calculate the percentage above/below current price for moving averages
data['MA10_pct'] = (data['MA10'] - data['Close']) / data['Close'] * 100
data['MA50_pct'] = (data['MA50'] - data['Close']) / data['Close'] * 100

# Calculate the MACD and Signal Line as a percentage of the current price
data['MACD_pct'] = data['MACD_Line'] / data['Close'] * 100
data['Signal_pct'] = data['Signal_Line'] / data['Close'] * 100


# Drop any rows with missing values due to moving averages
data = data.dropna()



Scale the features

In [81]:

from sklearn.preprocessing import StandardScaler

# Define the new percentage-based features
features = ['MA10_pct', 'MA50_pct', 'Pct_Change', 'HL_Spread', 'RSI', 'MACD_pct', 'Signal_pct', 'MACD_Histogram']

# Scale the features
scaler = StandardScaler()
data_scaled = data.copy()
data_scaled[features] = scaler.fit_transform(data[features])


In [82]:
data_scaled[features]

Unnamed: 0_level_0,MA10_pct,MA50_pct,Pct_Change,HL_Spread,RSI,MACD_pct,Signal_pct,MACD_Histogram
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2024-08-15 04:50:00,-0.015878,-0.007233,-0.010317,-0.191348,1.756685,0.130720,-0.039030,0.397822
2024-08-15 04:51:00,0.102173,0.071726,-0.085515,-0.165835,1.000377,0.104964,-0.007528,0.268410
2024-08-15 04:52:00,0.081538,0.040033,0.014752,-0.162190,0.941388,0.089994,0.014281,0.185803
2024-08-15 04:53:00,-0.045666,-0.070770,0.098320,-0.176769,1.308554,0.116791,0.037800,0.200500
2024-08-15 04:54:00,-0.053684,-0.091650,0.014747,-0.133032,1.456131,0.142907,0.062544,0.210897
...,...,...,...,...,...,...,...,...
2024-09-13 19:55:00,-0.183983,-0.117268,0.219439,-0.091846,0.474647,-0.007806,-0.032716,0.051190
2024-09-13 19:56:00,0.098407,0.108620,-0.215119,-0.191348,-0.450841,-0.043597,-0.036177,-0.030343
2024-09-13 19:57:00,-0.012255,0.019365,0.083190,-0.149433,-0.090651,-0.037558,-0.037566,-0.012261
2024-09-13 19:58:00,-0.060062,-0.016874,0.034523,-0.118453,0.035145,-0.018123,-0.034271,0.028900


3. Define the Target
You can define the target variable for training your model. A common approach is to predict the price direction (1 for upward movement, 0 for downward movement).

In [84]:
def calculate_target(data, n_periods):
    """
    Create a target column based on the price performance over N periods.
    
    data: DataFrame
        The price data.
    n_periods: int
        Number of periods into the future to evaluate the target.
    """
    # Target: if Close price N periods ahead is greater than current Close price -> 1 (buy), else 0 (sell)
    data['Target'] = (data['Close'].shift(-n_periods) > (data['Close']*1.001)).astype(int)
    return data

# Choose the number of periods, e.g., N=5
N = 4
data_scaled = calculate_target(data_scaled, N)
print(data_scaled['Target'].value_counts())

# Drop the last N rows as they won't have valid targets
data_scaled = data_scaled.dropna()


Target
0    18149
1     1649
Name: count, dtype: int64


4. Train-Test Split
Before training the model, split your data into training and testing sets:

In [85]:
# Define feature columns including RSI and MACD-related features
features = ['MA10_pct', 'MA50_pct', 'Pct_Change', 'HL_Spread', 'RSI', 'MACD_pct', 'Signal_pct', 'MACD_Histogram']

X = data_scaled[features]
y = data_scaled['Target']

# Split the data into training and testing sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)


5. Train a Logistic Regression Model
Logistic regression is a simple model you can use as a starting point:

In [87]:

# Train the logistic regression model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Increase the number of iterations
model = LogisticRegression(max_iter=500)
model.fit(X_train, y_train)
print(y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy for N={N} periods: {accuracy:.2f}")


Datetime
2024-08-15 04:50:00    0
2024-08-15 04:51:00    0
2024-08-15 04:52:00    0
2024-08-15 04:53:00    0
2024-08-15 04:54:00    0
                      ..
2024-09-09 16:04:00    1
2024-09-09 16:05:00    1
2024-09-09 16:06:00    1
2024-09-09 16:07:00    1
2024-09-09 16:08:00    0
Name: Target, Length: 15838, dtype: int64
Model Accuracy for N=4 periods: 0.90


In [89]:
import joblib

# Save the model
joblib.dump(model, 'logistic_regression_model.pkl')
joblib.dump(scaler, 'logistic_regression_scaler.pkl')

['logistic_regression_scaler.pkl']