In [17]:
import pandas as pd
import numpy as np

# Load the correct dataset for forecasting
df = pd.read_csv("../data/processed_data/forecasting_data.csv")

# Ensure data is sorted by date if not ready
df = df.sort_values(by=["year", "month", "day"])

# Create a new 'date' column for time-series indexing
df["date"] = pd.to_datetime(df[["year", "month", "day"]])

# Set date as index and drop redundant columns
df.set_index("date", inplace=True)
df.drop(columns=["year", "month", "day"], inplace=True)

print(" Data Loaded from forecasting_data.csv!")

 Data Loaded from forecasting_data.csv!


In [18]:
def create_sequences(data, target_col, window_size=6):
    X, y = [], []
    for i in range(len(data) - window_size):
        X.append(data.iloc[i:i+window_size].values)  # Features: Past 6 months
        y.append(data.iloc[i+window_size][target_col])  # Target: Next month’s spending
    return np.array(X), np.array(y)

# Define look-back window size (6 months)
window_size = 6

# Generate sequences from forecasting_data.csv
X, y = create_sequences(df, target_col="amount_log", window_size=window_size)

print(f" Created {X.shape[0]} training sequences!")

 Created 149994 training sequences!


In [19]:
from sklearn.model_selection import train_test_split

# Split into training (80%) and testing (20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

print(f" Training Set: {X_train.shape[0]} samples")
print(f" Testing Set: {X_test.shape[0]} samples")

 Training Set: 119995 samples
 Testing Set: 29999 samples


In [20]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

# Build LSTM Model
lstm_model = Sequential([
    LSTM(50, activation='relu', return_sequences=True, input_shape=(window_size, X_train.shape[2])),
    LSTM(50, activation='relu'),
    Dense(25, activation='relu'),
    Dense(1)  # Output layer predicts log-transformed spending
])

# Compile the model
lstm_model.compile(optimizer='adam', loss='mse')

# Train the model
history = lstm_model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test))

print(" LSTM Model Training Complete!")

Epoch 1/20


  super().__init__(**kwargs)


[1m3750/3750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - loss: 0.1437 - val_loss: 0.1181
Epoch 2/20
[1m3750/3750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - loss: 0.1182 - val_loss: 0.1239
Epoch 3/20
[1m3750/3750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - loss: 0.1188 - val_loss: 0.1148
Epoch 4/20
[1m3750/3750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - loss: 0.1175 - val_loss: 0.1153
Epoch 5/20
[1m3750/3750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - loss: 0.1173 - val_loss: 0.1152
Epoch 6/20
[1m3750/3750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - loss: 0.1170 - val_loss: 0.1154
Epoch 7/20
[1m3750/3750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - loss: 0.1171 - val_loss: 0.1160
Epoch 8/20
[1m3750/3750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - loss: 0.1163 - val_loss: 0.1170
Epoch 9/20
[1m3750/3750[0m [32m━

In [21]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Make predictions
y_train_pred = lstm_model.predict(X_train)
y_test_pred = lstm_model.predict(X_test)

# Compute MAE & RMSE
train_mae = mean_absolute_error(y_train, y_train_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

print(f"\n LSTM Model Performance:")
print(f"Train MAE: {train_mae:.4f}, Test MAE: {test_mae:.4f}")
print(f"Train RMSE: {train_rmse:.4f}, Test RMSE: {test_rmse:.4f}")

[1m3750/3750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 592us/step
[1m938/938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 575us/step

 LSTM Model Performance:
Train MAE: 0.2706, Test MAE: 0.2689
Train RMSE: 0.3417, Test RMSE: 0.3398


In [22]:
def predict_future_spending(recent_data, model, window_size=6):
    """
    Predict future spending using the trained LSTM model.
    `recent_data`: Last 6 months of spending.
    """
    # Reshape for LSTM input
    input_seq = np.expand_dims(recent_data.values, axis=0)

    # Predict log spending amount
    predicted_log_spending = model.predict(input_seq)[0][0]

    # Convert back to original scale
    predicted_spending = np.exp(predicted_log_spending)

    return predicted_spending

# Predict next month's spending using last 6 months' data
last_6_months = df.iloc[-6:]
predicted_value = predict_future_spending(last_6_months, lstm_model)

print(f"\n Predicted Spending for Next Month: ${predicted_value:.2f}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step

 Predicted Spending for Next Month: $4.05


In [23]:
import numpy as np

# Reverse log transformation to get actual spending values
df["actual_spending"] = np.exp(df["amount_log"])

# Display statistics
df["actual_spending"].describe()

count    150000.000000
mean          4.348479
std           1.240324
min           1.000000
25%           3.542389
50%           4.571221
75%           5.276666
max           7.873368
Name: actual_spending, dtype: float64

In [24]:
df.describe()

Unnamed: 0,weekday_num,mcc_freq,merchant_category_encoded,is_refund,per_capita_income,yearly_income,total_debt,amount_log,actual_spending
count,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0
mean,3.001693,0.061277,58.476447,0.04954,-1.114131e-16,2.782959e-17,-2.946384e-17,1.419006,4.348479
std,1.998532,0.040969,28.844909,0.216993,1.000003,1.000003,1.000003,0.340785,1.240324
min,0.0,2.4e-05,0.0,0.0,-2.572172,-2.438146,-1.207768,0.0,1.0
25%,1.0,0.035727,34.0,0.0,-0.7388124,-0.6957376,-0.8573371,1.264801,3.542389
50%,3.0,0.050664,61.0,0.0,-0.2265618,-0.2122563,-0.1059912,1.51978,4.571221
75%,5.0,0.107074,87.0,0.0,0.5519426,0.5428254,0.6033125,1.663294,5.276666
max,6.0,0.11969,107.0,1.0,2.219384,2.20851,2.772522,2.063486,7.873368


In [25]:
# Predict on training data and compare
train_preds = lstm_model.predict(X_train)
train_actuals = np.exp(y_train)  # Convert from log-scale

# Compare the mean predicted vs actual spending
print(f"Mean Predicted Spending: ${np.mean(np.exp(train_preds)):.2f}")
print(f"Mean Actual Spending: ${np.mean(train_actuals):.2f}")

[1m3750/3750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 579us/step
Mean Predicted Spending: $4.05
Mean Actual Spending: $4.35


In [26]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Define an improved LSTM Model
def build_lstm_model():
    model = Sequential([
        LSTM(units=128, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])),
        Dropout(0.2),  # Reduce overfitting
        LSTM(units=64, return_sequences=False),
        Dropout(0.2),
        Dense(units=32, activation='relu'),
        Dense(units=1)  # Final output
    ])

    # Compile the model with optimized learning rate
    optimizer = Adam(learning_rate=0.001)
    model.compile(loss='mse', optimizer=optimizer, metrics=['mae'])
    return model

# Train the new model
lstm_model = build_lstm_model()
history = lstm_model.fit(X_train, y_train, epochs=50, batch_size=64, validation_data=(X_test, y_test), verbose=1)

Epoch 1/50


  super().__init__(**kwargs)


[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 6ms/step - loss: 0.1543 - mae: 0.2968 - val_loss: 0.1229 - val_mae: 0.2916
Epoch 2/50
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 6ms/step - loss: 0.1204 - mae: 0.2700 - val_loss: 0.1214 - val_mae: 0.2884
Epoch 3/50
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - loss: 0.1186 - mae: 0.2679 - val_loss: 0.1188 - val_mae: 0.2819
Epoch 4/50
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 7ms/step - loss: 0.1177 - mae: 0.2670 - val_loss: 0.1227 - val_mae: 0.2913
Epoch 5/50
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 7ms/step - loss: 0.1180 - mae: 0.2670 - val_loss: 0.1219 - val_mae: 0.2896
Epoch 6/50
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 7ms/step - loss: 0.1169 - mae: 0.2662 - val_loss: 0.1248 - val_mae: 0.2958
Epoch 7/50
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 7

In [27]:
# Predict train and test data
train_preds = lstm_model.predict(X_train)
test_preds = lstm_model.predict(X_test)

# Convert predictions from log scale back to actual spending
train_actuals = np.exp(y_train)
test_actuals = np.exp(y_test)
train_preds = np.exp(train_preds)
test_preds = np.exp(test_preds)

# Compute performance metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error

train_mae = mean_absolute_error(train_actuals, train_preds)
test_mae = mean_absolute_error(test_actuals, test_preds)

train_rmse = np.sqrt(mean_squared_error(train_actuals, train_preds))
test_rmse = np.sqrt(mean_squared_error(test_actuals, test_preds))

print(f"LSTM Model Performance After Tuning:")
print(f"Train MAE: {train_mae:.4f}, Test MAE: {test_mae:.4f}")
print(f"Train RMSE: {train_rmse:.4f}, Test RMSE: {test_rmse:.4f}")

[1m3750/3750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step
[1m938/938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
LSTM Model Performance After Tuning:
Train MAE: 1.0634, Test MAE: 1.0543
Train RMSE: 1.2676, Test RMSE: 1.2591


In [28]:
# Save the trained LSTM forecasting model
lstm_model.save("../data/models/lstm_forecasting_model.h5")  # Save in .h5 format
print("LSTM Forecasting Model Saved Successfully!")



LSTM Forecasting Model Saved Successfully!


In [6]:
import numpy as np
import pandas as pd
import pickle
from tensorflow.keras.models import load_model
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [7]:
# Load the trained LSTM forecasting model with explicit loss function
lstm_model = load_model(
    "../data/models/lstm_forecasting_model.h5",
    custom_objects={"mse": MeanSquaredError()}
)
print(" LSTM Forecasting Model Loaded Successfully!")

# Load the classification model for merchant category prediction (LSTM)
lstm_classification_model = load_model("../data/models/lstm_model.h5")
print(" LSTM Classification Model Loaded Successfully!")

# Load tokenizer for text processing
with open("../data/models/tokenizer.pkl", "rb") as file:
    tokenizer = pickle.load(file)
print(" Tokenizer Loaded Successfully!")

# Load label encoder (for merchant category classification)
with open("../data/models/label_encoder.pkl", "rb") as file:
    label_encoder = pickle.load(file)
print(" Label Encoder Loaded Successfully!")

# Load processed dataset (ensure it contains the required features)
df = pd.read_csv("../data/processed_data/forecasting_data.csv")

# Define Features Used in Forecasting Model (Fixed to 8 Features)
feature_columns = [
    "year", "month", "day", "weekday_num", "mcc_freq",
    "merchant_category_encoded", "yearly_income", "total_debt"
]

print(" All Required Data & Models Loaded Successfully!")



 LSTM Forecasting Model Loaded Successfully!
 LSTM Classification Model Loaded Successfully!
 Tokenizer Loaded Successfully!
 Label Encoder Loaded Successfully!
 All Required Data & Models Loaded Successfully!


In [8]:
def get_predicted_category(description):
    """
    Predicts the merchant category based on transaction description using the classification model.
    """
    print(f"\n Processing Description: '{description}'")

    # Convert text to numerical sequence
    desc_sequence = tokenizer.texts_to_sequences([description])
    desc_padded = pad_sequences(desc_sequence, maxlen=20)  # Ensure fixed input size

    # Predict category using LSTM classification model
    predicted_category = lstm_classification_model.predict(desc_padded)
    predicted_category = np.argmax(predicted_category, axis=1)[0]  # Get highest probability class

    # Convert category index to actual label
    category_label = label_encoder.inverse_transform([predicted_category])[0]

    print(f" Predicted Category: {category_label} (Encoded: {predicted_category})")
    return predicted_category

In [15]:
import datetime
def predict_future_spending(year, month, day, amount, description, lstm_model, df, feature_columns, user_profile, window_size=6):
    """
    Predicts future spending using the trained LSTM model based on user input and historical patterns.
    """
    print("\nRunning Future Spending Prediction...")

    # Step 1: Predict merchant category from description
    predicted_category = get_predicted_category(description)

    # Step 2: Extract weekday and encode spending amount
    weekday_num = datetime.date(year, month, day).weekday()
    amount_log = np.log(amount)

    # Step 3: Handle income & debt
    if "yearly_income" not in user_profile or "total_debt" not in user_profile:
        print("First-time user: Income & debt stored for future use.")
        user_profile["yearly_income"] = float(input("Enter your yearly income: "))
        user_profile["total_debt"] = float(input("Enter your total debt: "))
    else:
        print("Returning user detected, fetching stored income & debt.")

    yearly_income = user_profile["yearly_income"]
    total_debt = user_profile["total_debt"]

    # Step 4: Filter last N transactions for that merchant category
    category_history = df[df["merchant_category_encoded"] == predicted_category].copy()
    category_history.sort_values(by=["year", "month", "day"], inplace=True)

    recent_history = category_history.tail(window_size)

    # Step 5: Pad if fewer than window_size transactions exist
    if len(recent_history) < window_size:
        print("Insufficient history. Padding with median values.")
        pad_rows = pd.DataFrame([df[feature_columns].median()] * (window_size - len(recent_history)))
        recent_history = pd.concat([pad_rows, recent_history], ignore_index=True)

    # Step 6: Replace static features in the sequence with current user values
    recent_history["year"] = year
    recent_history["month"] = month
    recent_history["day"] = day
    recent_history["weekday_num"] = weekday_num
    recent_history["merchant_category_encoded"] = predicted_category
    recent_history.loc[:, "yearly_income"] = yearly_income
    recent_history.loc[:, "total_debt"] = total_debt

    # Step 7: Prepare input sequence
    input_seq = recent_history[feature_columns].values.reshape(1, window_size, len(feature_columns))
    print(f"Input Sequence Shape: {input_seq.shape}")

    # Step 8: Predict and return output
    predicted_log_spending = lstm_model.predict(input_seq)[0][0]
    predicted_spending = np.exp(predicted_log_spending)

    print(f" Predicted Spending (log-scale): {predicted_log_spending}")
    print(f" Final Predicted Spending: ${predicted_spending:.2f}")

    return predicted_spending

In [17]:
user_profile = {}

# First-time user input
year = 2025
month = 3
day = 21
amount = 15.00
description = "Uber ride home"

predicted_value = predict_future_spending(
    year, month, day, amount, description,
    lstm_model, df, feature_columns, user_profile
)

print(f"\n Predicted Spending for Next Month: ${predicted_value:.2f}")


Running Future Spending Prediction...

 Processing Description: 'Uber ride home'
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
 Predicted Category: 10 (Encoded: 10)
First-time user: Income & debt stored for future use.
Input Sequence Shape: (1, 6, 8)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
 Predicted Spending (log-scale): 1.4330188035964966
 Final Predicted Spending: $4.19

 Predicted Spending for Next Month: $4.19


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recent_history["year"] = year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recent_history["month"] = month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recent_history["day"] = day
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value 

In [18]:
# Initialize user profile with stored income & debt
user_profile = {
    "yearly_income": 75000,  # Example income
    "total_debt": 5000       # Example debt
}

# Define test input for a returning user
year = 2025
month = 4
day = 10
amount = 30.75  # Amount in dollars
description = "Starbucks coffee"

# Run the prediction function
predicted_value = predict_future_spending(year, month, day, amount, description, lstm_model, df, feature_columns, user_profile)

# Print the prediction result
print(f"\n Predicted Spending for Next Month: ${predicted_value:.2f}")


Running Future Spending Prediction...

 Processing Description: 'Starbucks coffee'
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
 Predicted Category: 10 (Encoded: 10)
Returning user detected, fetching stored income & debt.
Input Sequence Shape: (1, 6, 8)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
 Predicted Spending (log-scale): 1.4225270748138428
 Final Predicted Spending: $4.15

 Predicted Spending for Next Month: $4.15


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recent_history["year"] = year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recent_history["month"] = month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recent_history["day"] = day
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value 