In [36]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib
import warnings

In [2]:
# Suppress warnings
warnings.filterwarnings('ignore')

In [4]:
# Load the wide price/indicator table and preview its first five rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a
# row index that was written into the CSV. It is kept as a regular column here
# because later cells select columns by position.
df = pd.read_csv(filepath_or_buffer='stock_data_with_indicators.csv')
df.head()

Unnamed: 0,Stock_1,Stock_2,Stock_3,Stock_4,Stock_5,Stock_6,Stock_7,Stock_8,Stock_9,Stock_10,...,Stock_50_SMA10,Stock_50_EMA20,Stock_50_EMA50,Stock_50_EMA10,Stock_50_RSI14,Stock_50_BB_middle,Stock_50_BB_upper,Stock_50_BB_lower,Stock_50_MACD,Stock_50_MACD_signal
0,13.46,71.65,48.46,50.52,52.1,13.0,18.98,47.71,69.49,49.96,...,0.0,56.09,56.09,56.09,0.0,0.0,0.0,0.0,0.0,0.0
1,13.48,72.1,48.52,50.5,52.06,12.95,18.95,47.84,69.73,49.93,...,0.0,56.09,56.09,56.09,0.0,0.0,0.0,0.0,-0.0,-0.0
2,13.47,72.35,48.48,50.62,51.8,12.79,18.98,47.98,69.6,49.33,...,0.0,56.07,56.08,56.05,0.0,0.0,0.0,0.0,-0.02,-0.0
3,13.53,72.51,48.42,50.75,51.66,12.66,18.96,48.74,69.54,49.67,...,0.0,56.08,56.08,56.07,0.0,0.0,0.0,0.0,-0.01,-0.0
4,13.64,71.99,48.4,50.65,51.97,12.62,18.89,48.88,69.68,49.46,...,0.0,56.06,56.08,56.04,0.0,0.0,0.0,0.0,-0.02,-0.01


In [11]:
# Show rows 46-51 for column positions 48-54, i.e. the last raw price columns
# and the first Stock_1 indicator columns.
print(df.iloc[46:52].iloc[:, 48:55])

    Stock_49  Stock_50  Stock_1_SMA20  Stock_1_SMA50  Stock_1_SMA10  \
46     43.36     56.81          13.34           0.00          13.21   
47     43.01     57.13          13.32           0.00          13.18   
48     43.28     57.16          13.30           0.00          13.17   
49     43.47     57.12          13.28          13.47          13.15   
50     43.47     57.23          13.26          13.47          13.14   
51     43.63     56.86          13.24          13.46          13.12   

    Stock_1_EMA20  Stock_1_EMA50  
46          13.31          13.42  
47          13.29          13.41  
48          13.28          13.40  
49          13.26          13.39  
50          13.24          13.37  
51          13.22          13.36  


In [20]:
# Chronological split by row position: rows 49-374 are used for training and
# rows 375 onward for testing.
# Row 49 is the first previewed row with a non-zero Stock_1_SMA50 -- presumably
# why the training window starts there (TODO confirm).
training_org_df = df.iloc[49:375]
testing_org_df = df.iloc[375:]

# Sanity check on the first training row: a few values around the boundary
# between price columns and indicator columns, then the Stock_1 price itself.
first_training_row = training_org_df.iloc[0]
print(first_training_row.iloc[47:52])
print(first_training_row['Stock_1'])

Stock_48         35.41
Stock_49         43.47
Stock_50         57.12
Stock_1_SMA20    13.28
Stock_1_SMA50    13.47
Name: 49, dtype: float64
13.07


In [22]:
def create_X_Y(df, num_stocks=50):
    """Build pooled next-row regression samples from a wide per-stock table.

    For every row of ``df`` except the last, and for every stock id in
    ``1..num_stocks``, one sample is produced:

    * features: that row's values in every column whose name starts with
      ``Stock_<id>_`` (kept in ``df`` column order), followed by the row's
      value in ``Stock_<id>``;
    * target: the value of ``Stock_<id>`` in the following row.

    Samples are ordered row by row: all stocks of the first row, then all
    stocks of the second row, and so on.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing a ``Stock_<id>`` column and zero or more
        ``Stock_<id>_*`` columns for each stock id.
    num_stocks : int, default 50
        Highest stock id to include; ids ``1..num_stocks`` are used.

    Returns
    -------
    X : numpy.ndarray
        Shape ``((len(df) - 1) * num_stocks, n_features)``.
    Y : numpy.ndarray
        Shape ``((len(df) - 1) * num_stocks,)``.
        Both are empty 1-D arrays when there is nothing to build (fewer than
        two rows, or ``num_stocks < 1``).

    Raises
    ------
    KeyError
        If a required ``Stock_<id>`` column is missing.
    ValueError
        If the stocks do not all have the same number of feature columns.
    """
    n_rows = len(df) - 1
    if n_rows <= 0 or num_stocks < 1:
        return np.array([]), np.array([])

    feature_blocks = []
    target_blocks = []
    # The column selection depends only on the stock id, so it is resolved once
    # per stock and whole columns are pulled out as arrays instead of reading
    # one scalar at a time through df.iloc[row][col].
    for stock_id in range(1, num_stocks + 1):
        price_column = f'Stock_{stock_id}'
        # The trailing underscore keeps 'Stock_1_' from matching 'Stock_10_...'.
        prefix = f'{price_column}_'
        stock_columns = [col for col in df.columns if col.startswith(prefix)]
        stock_columns.append(price_column)

        # Features come from rows 0..n-2, targets from rows 1..n-1.
        feature_blocks.append(df[stock_columns].to_numpy()[:-1])
        target_blocks.append(df[price_column].to_numpy()[1:])

    # Stack to (row, stock, feature) and flatten the first two axes so that
    # samples are ordered row-major: row 0 / stock 1, row 0 / stock 2, ...
    X = np.stack(feature_blocks, axis=1).reshape(n_rows * num_stocks, -1)
    Y = np.stack(target_blocks, axis=1).reshape(-1)
    return X, Y

In [30]:
# Turn the training rows into pooled (features, next-row price) samples and
# report the resulting array shapes, one per line.
X_train, Y_train = create_X_Y(df=training_org_df)
print(X_train.shape, Y_train.shape, sep='\n')

(16250, 13)
(16250,)


In [31]:
# Spot-check the sample layout: with 50 stocks per row, samples 0 and 50 are
# the first stock at two consecutive rows, so the target of sample 0 should
# equal the last feature (the price) of sample 50.
print(X_train[0], X_train[50], Y_train[0], sep='\n')

[13.28 13.47 13.15 13.26 13.39 13.17 31.18 13.28 13.61 12.96 -0.11 -0.1
 13.07]
[13.26 13.47 13.14 13.24 13.37 13.16 39.02 13.26 13.57 12.95 -0.11 -0.1
 13.1 ]
13.1


In [32]:
# Build the held-out samples from the test rows in the same way and report the
# resulting array shapes, one per line.
X_test, Y_test = create_X_Y(df=testing_org_df)
print(X_test.shape, Y_test.shape, sep='\n')

(6200, 13)
(6200,)


In [35]:
# --- Training -------------------------------------------------------------
# Wrap the pooled training samples in LightGBM's Dataset container.
train_data = lgb.Dataset(X_train, label=Y_train)

# Booster hyper-parameters.
# NOTE(review): no validation set is passed to lgb.train below, so 'metric'
# presumably has no visible effect here -- confirm.
params = dict(
    objective='regression',
    metric='mse',
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.9,
)

# Fit 100 boosting rounds on the training samples.
model = lgb.train(params, train_data, num_boost_round=100)

# --- Evaluation -----------------------------------------------------------
# Score the held-out samples. All stocks are pooled into one set of metrics.
# NOTE(review): the current price is itself a feature, so comparing against a
# "next price == current price" baseline would help put these numbers in context.
Y_pred = model.predict(X_test)

mse = mean_squared_error(Y_test, Y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(Y_test, Y_pred)
mae = mean_absolute_error(Y_test, Y_pred)

metric_report = (
    ("Mean Squared Error", mse),
    ("Root Mean Squared Error", rmse),
    ("R-squared Score", r2),
    ("Mean Absolute Error", mae),
)
for metric_label, metric_value in metric_report:
    print(f"{metric_label}: {metric_value:.4f}")

# --- Feature importance -----------------------------------------------------
# The model was trained on a bare array, so features are reported under the
# model's own names. create_X_Y appends the stock's price column last, so the
# final feature index is the current price.
feature_importance = model.feature_importance()
feature_names = model.feature_name()

ranked_features = sorted(
    zip(feature_names, feature_importance),
    key=lambda pair: pair[1],
    reverse=True,
)
for name, importance in ranked_features:
    print(f"{name}: {importance}")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001313 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3314
[LightGBM] [Info] Number of data points in the train set: 16250, number of used features: 13
[LightGBM] [Info] Start training from score 41.243282
Mean Squared Error: 0.1479
Root Mean Squared Error: 0.3846
R-squared Score: 0.9996
Mean Absolute Error: 0.2617
Column_12: 1713
Column_5: 223
Column_6: 215
Column_11: 151
Column_10: 129
Column_9: 116
Column_4: 113
Column_8: 92
Column_1: 81
Column_2: 66
Column_0: 57
Column_3: 44
Column_7: 0


In [37]:
# Persist the trained booster to disk with joblib. The call's return value is
# left as the last expression so the notebook displays it.
# NOTE: joblib files are pickle-based -- only load them from trusted sources.
model_filename = 'lightgbm_model_1stock_and_all_indicators.joblib'
joblib.dump(value=model, filename=model_filename)

['lightgbm_model_1stock_and_all_indicators.joblib']