In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
import os

# Load your stock data
data = pd.read_csv('/content/stock_data.csv')
data['date'] = pd.to_datetime(data['date'])
data.set_index('date', inplace=True)

# Step 1: Calculate stock-specific features
stock_features = data.groupby('context_id').agg({
    'volume': ['mean'],
    'close': ['std', 'mean'],
}).reset_index()

# Flatten column names
stock_features.columns = ['context_id', 'avg_volume', 'volatility', 'avg_price']

# Step 2: Normalize numerical features
scaler = StandardScaler()
stock_features[['avg_volume', 'volatility', 'avg_price']] = scaler.fit_transform(
    stock_features[['avg_volume', 'volatility', 'avg_price']]
)

# Step 3: Cluster stocks using K-means
X_cluster = stock_features[['avg_volume', 'volatility', 'avg_price']]
num_clusters = 5  # Adjust the number of clusters to speed up training
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
stock_features['cluster'] = kmeans.fit_predict(X_cluster)

# Merge cluster labels back into the main data
data = data.reset_index().merge(stock_features[['context_id', 'cluster']], on='context_id', how='left')

# Step 4: Prepare the time-series data for each cluster with additional seasonal features
n_past = 252
X_dict = {}
y_dict = {}

for cluster in range(num_clusters):
    cluster_data = data[data['cluster'] == cluster].copy()  # Use `.copy()` to avoid the warning

    # Add day of the week and day of the year as additional features using .loc
    cluster_data.loc[:, 'day_of_week'] = cluster_data['date'].dt.dayofweek
    cluster_data.loc[:, 'day_of_year'] = cluster_data['date'].dt.dayofyear

    # Create a DataFrame of average daily close prices for all stocks in the cluster
    avg_cluster_prices = cluster_data.groupby('date')['close'].mean().values.reshape(-1, 1)

    # Scale the close prices for the LSTM
    scaler = StandardScaler()
    prices_scaled = scaler.fit_transform(avg_cluster_prices).flatten()

    # Prepare time-series sequences for LSTM
    X, y = [], []
    for i in range(n_past, len(prices_scaled)):
        past_prices = prices_scaled[i - n_past:i]
        day_of_week = cluster_data['day_of_week'].values[i]
        day_of_year = cluster_data['day_of_year'].values[i]

        # Combine past prices with the seasonal features
        X.append(np.concatenate([past_prices, [day_of_week, day_of_year]]))
        y.append(prices_scaled[i])

    X_dict[cluster] = np.array(X)
    y_dict[cluster] = np.array(y)


# Step 5: Train LSTM models without hyperparameter tuning

# Directory to save the models
save_dir = 'trained_lstm_models'
os.makedirs(save_dir, exist_ok=True)

lstm_models = {}
for cluster in range(num_clusters):
    X_train = X_dict[cluster]
    y_train = y_dict[cluster]

    # Reshape input data for LSTM
    X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)

    # Split into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, shuffle=False)

    # Define the LSTM model
    model = Sequential()
    model.add(LSTM(units=128, return_sequences=True, input_shape=(n_past + 2, 1)))
    model.add(Dropout(0.2))
    model.add(LSTM(units=64))
    model.add(Dropout(0.2))
    model.add(Dense(1))
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_absolute_error'])

    # Add EarlyStopping callback
    early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

    # Train the model
    history = model.fit(X_train, y_train, epochs=12, batch_size=128, validation_data=(X_test, y_test), callbacks=[early_stopping])

    # Store the trained model for this cluster
    lstm_models[cluster] = model

    # Save the trained model for future use
    model_path = os.path.join(save_dir, f'lstm_model_cluster_{cluster}.h5')
    model.save(model_path)
    print(f"Model for cluster {cluster} saved to {model_path}")

    print(f"Trained LSTM model for cluster {cluster}")
    model.summary()

    # Make predictions on the test set
    predictions_test = model.predict(X_test)

    # Calculate performance metrics
    mae_test = mean_absolute_error(y_test, predictions_test)
    mse_test = mean_squared_error(y_test, predictions_test)
    r2_test = r2_score(y_test, predictions_test)
    mape_test = mean_absolute_percentage_error(y_test, predictions_test)

    print(f'Testing Set Metrics for Cluster {cluster} - MAE: {mae_test}, MSE: {mse_test}, R2: {r2_test}, MAPE: {mape_test}')

# At this point, each cluster's model has been trained and evaluated.


Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Model for cluster 0 saved to trained_lstm_models/lstm_model_cluster_0.h5
Trained LSTM model for cluster 0
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_2 (LSTM)               (None, 254, 128)          66560     
                                                                 
 dropout_2 (Dropout)         (None, 254, 128)          0         
                                                                 
 lstm_3 (LSTM)               (None, 64)                49408     
                                                                 
 dropout_3 (Dropout)         (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 116033 (453.25 KB)
Tra

  saving_api.save_model(


Testing Set Metrics for Cluster 0 - MAE: 0.23257471949578712, MSE: 0.10968451993091594, R2: 0.7511358516214363, MAPE: 0.125239178104965
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12
Model for cluster 1 saved to trained_lstm_models/lstm_model_cluster_1.h5
Trained LSTM model for cluster 1
Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_4 (LSTM)               (None, 254, 128)          66560     
                                                                 
 dropout_4 (Dropout)         (None, 254, 128)          0         
                                                                 
 lstm_5 (LSTM)               (None, 64)                49408     
                                                                 
 dropout_5 (Dropout)         (None, 64)                0         
       

  saving_api.save_model(


Testing Set Metrics for Cluster 1 - MAE: 0.12638275203477586, MSE: 0.02897712037178297, R2: 0.888370988457519, MAPE: 0.06702674027341333
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Model for cluster 2 saved to trained_lstm_models/lstm_model_cluster_2.h5
Trained LSTM model for cluster 2
Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_6 (LSTM)               (None, 254, 128)          66560     
                                                                 
 dropout_6 (Dropout)         (None, 254, 128)          0         
                                                                 
 lstm_7 (LSTM)               (None, 64)                49408     
                                                                 
 dropout_7 (Dropout)         (None, 64)                0         
                                                                 
 dense_3 (Dense)             (N

  saving_api.save_model(


Testing Set Metrics for Cluster 2 - MAE: 0.5614537565328929, MSE: 0.6623605118640655, R2: 0.190415135882965, MAPE: 0.247432189744245
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Model for cluster 3 saved to trained_lstm_models/lstm_model_cluster_3.h5
Trained LSTM model for cluster 3
Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_8 (LSTM)               (None, 254, 128)          66560     
                                                                 
 dropout_8 (Dropout)         (None, 254, 128)          0         
                                                                 
 lstm_9 (LSTM)               (None, 64)                49408     
                                                                 
 dropout_9 (Dropout)         (None, 64)                0         
                                                                 
 dense_4 (Dense)             (None,

  saving_api.save_model(


Testing Set Metrics for Cluster 3 - MAE: 0.26111213615513457, MSE: 0.18219778588027907, R2: 0.6918125207825216, MAPE: 0.13884640306852544
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12
Model for cluster 4 saved to trained_lstm_models/lstm_model_cluster_4.h5
Trained LSTM model for cluster 4
Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_10 (LSTM)              (None, 254, 128)          66560     
                                                                 
 dropout_10 (Dropout)        (None, 254, 128)          0         
                                                                 
 lstm_11 (LSTM)              (None, 64)                49408     
                                                                 
 dropout_11 (Dropout)        (None, 64)                0         
     

  saving_api.save_model(


Testing Set Metrics for Cluster 4 - MAE: 0.13595019500814254, MSE: 0.02797635279429286, R2: 0.8789739484556198, MAPE: 0.08543988440226882


In [4]:
from tensorflow.keras.models import load_model
num_clusters = 5
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
import os

save_dir = 'trained_lstm_models'
os.makedirs(save_dir, exist_ok=True)

# Load the saved models back for inference
lstm_models = {}
for cluster in range(num_clusters):
    model_path = os.path.join(save_dir, f'lstm_model_cluster_{cluster}.h5')
    model = load_model(model_path)
    lstm_models[cluster] = model
    print(f"Model for cluster {cluster} loaded from {model_path}")


Model for cluster 0 loaded from trained_lstm_models/lstm_model_cluster_0.h5
Model for cluster 1 loaded from trained_lstm_models/lstm_model_cluster_1.h5
Model for cluster 2 loaded from trained_lstm_models/lstm_model_cluster_2.h5
Model for cluster 3 loaded from trained_lstm_models/lstm_model_cluster_3.h5
Model for cluster 4 loaded from trained_lstm_models/lstm_model_cluster_4.h5


In [6]:
import pandas as pd

# Load the new data (replace 'path_to_new_data.csv' with your file path)
new_data = pd.read_csv('/content/stock_data.csv')

# Ensure 'date' column is in datetime format and set it as the index if necessary
new_data['date'] = pd.to_datetime(new_data['date'])
new_data.set_index('date', inplace=True)


In [7]:
# Perform clustering again if necessary
stock_features = new_data.groupby('context_id').agg({
    'volume': ['mean'],
    'close': ['std', 'mean'],
}).reset_index()

# Flatten column names
stock_features.columns = ['context_id', 'avg_volume', 'volatility', 'avg_price']

# Normalize numerical features
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
stock_features[['avg_volume', 'volatility', 'avg_price']] = scaler.fit_transform(
    stock_features[['avg_volume', 'volatility', 'avg_price']]
)

# Apply clustering using the same number of clusters as used during training
from sklearn.cluster import KMeans
num_clusters = 5  # Set the same number of clusters as used in training
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
stock_features['cluster'] = kmeans.fit_predict(stock_features[['avg_volume', 'volatility', 'avg_price']])

# Merge the cluster labels back into `new_data`
new_data = new_data.reset_index().merge(stock_features[['context_id', 'cluster']], on='context_id', how='left')


In [11]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from datetime import timedelta
import os

n_future = 10  # Number of future days you want to predict
n_past = 252
# Create lists to store predictions
all_predictions = []

for cluster in range(num_clusters):
    cluster_data = new_data[new_data['cluster'] == cluster]
    stocks_in_cluster = cluster_data['context_id'].unique()
    cluster_data['day_of_week'] = cluster_data['date'].dt.dayofweek
    cluster_data['day_of_year'] = cluster_data['date'].dt.dayofyear

    for stock in stocks_in_cluster:
        stock_data = cluster_data[cluster_data['context_id'] == stock]
        last_sequence = stock_data.iloc[-n_past:].copy()

        if len(last_sequence) < n_past:
            print(f"Not enough data for stock {stock} in cluster {cluster}. Skipping.")
            continue

        # Scale the last sequence
        last_prices = last_sequence['close'].values.reshape(-1, 1)
        scaler = StandardScaler()
        last_prices_scaled = scaler.fit_transform(last_prices).flatten()

        # Prepare input for the model
        X = np.concatenate([last_prices_scaled, [last_sequence['day_of_week'].iloc[-1], last_sequence['day_of_year'].iloc[-1]]])
        X = X.reshape(1, n_past + 2, 1)

        # Predict future prices
        future_predictions = []
        future_dates = []
        last_date = last_sequence['date'].iloc[-1]

        for _ in range(n_future):
            predicted_scaled_price = lstm_models[cluster].predict(X, verbose=0)
            predicted_price = scaler.inverse_transform(predicted_scaled_price)
            future_predictions.append(predicted_price.flatten()[0])

            new_day_of_week = (last_sequence['day_of_week'].iloc[-1] + 1) % 7
            new_day_of_year = (last_sequence['day_of_year'].iloc[-1] + 1) % 365

            last_prices_scaled = np.append(last_prices_scaled[1:], predicted_scaled_price.flatten())
            X = np.concatenate([last_prices_scaled, [new_day_of_week, new_day_of_year]]).reshape(1, n_past + 2, 1)

            last_date += timedelta(days=1)
            future_dates.append(last_date)

        # Collect predictions for the stock
        stock_predictions = pd.DataFrame({
            'date': future_dates,
            'context_id': stock,
            'predicted_close': future_predictions
        })
        all_predictions.append(stock_predictions)

# Combine all predictions into a single DataFrame and save
predictions_df = pd.concat(all_predictions, ignore_index=True)
predictions_df.to_csv('stock_predictions.csv', index=False)
print("Predictions saved to 'stock_predictions.csv'")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cluster_data['day_of_week'] = cluster_data['date'].dt.dayofweek
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cluster_data['day_of_year'] = cluster_data['date'].dt.dayofyear
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cluster_data['day_of_week'] = cluster_data['date'].dt.dayofweek
A value is tr

Not enough data for stock GEV in cluster 1. Skipping.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cluster_data['day_of_week'] = cluster_data['date'].dt.dayofweek
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cluster_data['day_of_year'] = cluster_data['date'].dt.dayofyear
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cluster_data['day_of_week'] = cluster_data['date'].dt.dayofweek
A value is tr

Not enough data for stock SOLV in cluster 4. Skipping.
Not enough data for stock SW in cluster 4. Skipping.
Predictions saved to 'stock_predictions.csv'


In [12]:
import pandas as pd

# Load historical stock data
historical_data = pd.read_csv('/content/stock_data.csv')
historical_data['date'] = pd.to_datetime(historical_data['date'])

# Load the predicted data (next 30 days)
predicted_data = pd.read_csv('/content/stock_predictions.csv')
predicted_data['date'] = pd.to_datetime(predicted_data['date'])

# Select relevant columns from both datasets, ensuring the same structure
# Assuming 'context_id' represents stock symbol and 'close' for historical data corresponds to 'predicted_close' for predicted data
predicted_data = predicted_data.rename(columns={'predicted_close': 'close'})

# Combine the datasets while keeping all the historical data and adding the new predicted rows
combined_data = pd.concat([historical_data, predicted_data], ignore_index=True)

# Sort the combined data by 'date' and 'context_id' to maintain chronological order
combined_data = combined_data.sort_values(by=['date', 'context_id'])

# Save the combined data to a new CSV file
combined_data.to_csv('combined_stock_data.csv', index=False)

print("Combined dataset saved as 'combined_stock_data.csv'.")


Combined dataset saved as 'combined_stock_data.csv'.
