In [6]:
import pandas as pd
import numpy as np
import pickle
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
import datetime

In [7]:
# Read the preprocessed budget-model dataset and report its dimensions
budget_csv = "../data/processed_data/budget_model_data.csv"
df = pd.read_csv(budget_csv)
print(f"\n Budget Model Dataset Loaded! Shape: {df.shape}")


 Budget Model Dataset Loaded! Shape: (131472, 8)


In [8]:
# Choose the regression target and the predictor columns fed to the model
target = "recommended_budget"
features = [
    "merchant_category_encoded",
    "year",
    "month",
    "yearly_income",
    "total_debt",
    "per_capita_income",
]

# X holds the predictor columns, y the column to be predicted
X = df[features]
y = df[target]

In [9]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
split_kwargs = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)
print(f"\n Train shape: {X_train.shape}, Test shape: {X_test.shape}")


 Train shape: (105177, 6), Test shape: (26295, 6)


In [10]:
# Fit the final random-forest regressor on the training split
rf_params = {
    "n_estimators": 100,
    "max_depth": 15,
    "random_state": 42,
    "n_jobs": -1,
}
model = RandomForestRegressor(**rf_params)
model.fit(X_train, y_train)

In [11]:
# Score the fitted model on the held-out test split
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

# Emit the report one line at a time, header first
report_lines = (
    "\n Final Model Evaluation:",
    f"Mean Absolute Error (MAE): {mae:.4f}",
    f"Root Mean Squared Error (RMSE): {rmse:.4f}",
    f"R² Score: {r2:.4f}",
)
for report_line in report_lines:
    print(report_line)


 Final Model Evaluation:
Mean Absolute Error (MAE): 0.9225
Root Mean Squared Error (RMSE): 1.3673
R² Score: 0.3296


In [12]:
# Persist the fitted regressor to disk as a pickle file
model_path = "../data/models/budget_recommendation_model_rf.pkl"
with open(model_path, "wb") as model_file:
    pickle.dump(model, model_file)
print("\n Model Saved Successfully!")


 Model Saved Successfully!


In [14]:
# Bring the text-classification artifacts into memory: the saved LSTM model,
# the tokenizer pickle and the label-encoder pickle
lstm_classification_model = load_model("../data/models/lstm_model.h5")

with open("../data/models/tokenizer.pkl", "rb") as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

with open("../data/models/label_encoder.pkl", "rb") as encoder_file:
    label_encoder = pickle.load(encoder_file)




In [32]:
import pandas as pd
import numpy as np
import pickle
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from datetime import datetime

In [33]:
# Random Forest regressor that produces the raw budget prediction
with open("../data/models/budget_recommendation_model_rf.pkl", "rb") as rf_file:
    budget_model = pickle.load(rf_file)

# Tokenizer that turns a description string into an integer sequence
with open("../data/models/tokenizer.pkl", "rb") as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

# Label encoder that maps a predicted class index back to its category label
with open("../data/models/label_encoder.pkl", "rb") as encoder_file:
    label_encoder = pickle.load(encoder_file)

# LSTM classifier that predicts the merchant category from the description
lstm_classification_model = load_model("../data/models/lstm_model.h5")



In [39]:
# Inference Function
def recommend_budget(date_str, description, yearly_income, total_debt, per_capita_income, verbose=True,
                     budget_multiplier=50, max_sequence_length=20):
    """Recommend a budget for a transaction description and a financial profile.

    The description is classified into a merchant category with the
    notebook-level ``tokenizer``, ``lstm_classification_model`` and
    ``label_encoder``. That category, the year and month of ``date_str`` and
    the three financial figures are then passed to the notebook-level
    ``budget_model`` regressor, and its prediction is scaled by
    ``budget_multiplier``.

    Parameters
    ----------
    date_str : str
        Date in ``YYYY-MM-DD`` format. Only the year and month are used.
    description : str
        Free-text transaction description to classify.
    yearly_income, total_debt, per_capita_income : int or float
        Financial figures passed to the regressor unchanged and printed as
        dollar amounts in the summary.
    verbose : bool, default True
        When true, print a formatted summary of the inputs and the result.
    budget_multiplier : int or float, default 50
        Factor applied to the raw regressor prediction. The default keeps the
        previously hard-coded scaling.
    max_sequence_length : int, default 20
        ``maxlen`` handed to ``pad_sequences``. The default keeps the
        previously hard-coded value.
        NOTE(review): presumably this must equal the sequence length the LSTM
        was trained with -- confirm before changing it.

    Returns
    -------
    The regressor's prediction for the single input row multiplied by
    ``budget_multiplier``.

    Raises
    ------
    ValueError
        If ``date_str`` does not match the ``%Y-%m-%d`` format.
    """
    # The notebook binds the name `datetime` twice: to the module in the first
    # import cell and to the class in the later one. Importing the class here
    # keeps the function working regardless of which cell ran last.
    from datetime import datetime as _datetime

    # Step 1: Parse date
    dt = _datetime.strptime(date_str, "%Y-%m-%d")
    year, month = dt.year, dt.month

    # Step 2: Tokenize and predict merchant category
    sequence = tokenizer.texts_to_sequences([description])
    padded_seq = pad_sequences(sequence, maxlen=max_sequence_length)
    category_probs = lstm_classification_model.predict(padded_seq, verbose=0)
    # The highest-probability class index is what the regressor receives
    predicted_category = int(np.argmax(category_probs, axis=1)[0])
    category_label = label_encoder.inverse_transform([predicted_category])[0]

    # Step 3: Create input for the regression model (a single-row frame whose
    # columns follow the order of the training feature list)
    input_data = pd.DataFrame([{
        "merchant_category_encoded": predicted_category,
        "year": year,
        "month": month,
        "yearly_income": yearly_income,
        "total_debt": total_debt,
        "per_capita_income": per_capita_income
    }])

    # Step 4: Predict budget
    predicted_budget = budget_model.predict(input_data)[0]

    # Step 5: Apply scaling for realistic output
    recommended_budget = predicted_budget * budget_multiplier

    # Step 6: Display results
    if verbose:
        print("\n======= Budget Recommendation Summary =======")
        print(f"Date                  : {date_str}")
        print(f"Description           : {description}")
        print(f"Predicted Category    : {category_label} (encoded: {predicted_category})")
        print(f"Yearly Income         : ${yearly_income:,.2f}")
        print(f"Total Debt            : ${total_debt:,.2f}")
        print(f"Per Capita Income     : ${per_capita_income:,.2f}")
        print("---------------------------------------------")
        print("Model Input:")
        print(input_data.to_string(index=False))
        print("---------------------------------------------")
        # Thousands separator added so this line matches the other dollar amounts
        print(f"Recommended Budget    : ${recommended_budget:,.2f}")
        print("=============================================\n")

    return recommended_budget

In [40]:
# Example request: one grocery purchase plus the financial profile to score
example_request = {
    "date_str": "2025-03-26",
    "description": "Grocery shopping at Walmart",
    "yearly_income": 65000,
    "total_debt": 15000,
    "per_capita_income": 32000,
    "verbose": True,
}
recommend_budget(**example_request)


Date                  : 2025-03-26
Description           : Grocery shopping at Walmart
Predicted Category    : 10 (encoded: 10)
Yearly Income         : $65,000.00
Total Debt            : $15,000.00
Per Capita Income     : $32,000.00
---------------------------------------------
Model Input:
 merchant_category_encoded  year  month  yearly_income  total_debt  per_capita_income
                        10  2025      3          65000       15000              32000
---------------------------------------------
Recommended Budget    : $301.70



np.float64(301.70415701419154)