In [None]:
import json
import math
import os
from datetime import datetime, timezone

import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load dataset
# Reads the finance records from a CSV resolved relative to the working
# directory. `df` is the shared frame that later cells extend in place with
# derived columns (encoded categories, totals, labels).
df = pd.read_csv('finance_data.csv')
# Row count is printed as a quick check that the load produced data.
print(f"Dataset loaded: {df.shape[0]} records")

In [None]:
# Integer-encode the two categorical columns.
# Both fitted encoders stay at module level because run_user_prediction()
# reuses them to translate user-supplied labels into the same codes.
le_occ, le_city = LabelEncoder(), LabelEncoder()
occupation_codes = le_occ.fit_transform(df['Occupation'])
city_tier_codes = le_city.fit_transform(df['City_Tier'])
df['Occupation_encoded'] = occupation_codes
df['City_Tier_encoded'] = city_tier_codes

In [None]:
# Create expense columns
# `expense_cols` lists every per-category spending column; generate_chart_data()
# reuses it for the pie-chart averages.
expense_cols = ['Rent','Loan_Repayment','Insurance','Groceries','Transport','Eating_Out',
                'Entertainment','Utilities','Healthcare','Education','Miscellaneous']
# Row-wise total across all categories (DataFrame.sum skips NaN by default).
df['Total_Expenses'] = df[expense_cols].sum(axis=1)

In [None]:
# Train savings prediction model
# The order of `features` matters: run_user_prediction() builds its single-row
# input frame from this same list, so its columns line up with the training data.
features = [
    'Income', 'Age', 'Dependents', 'Occupation_encoded', 'City_Tier_encoded',
    'Total_Expenses', 'Desired_Savings_Percentage', 'Disposable_Income'
]
X = df[features]
y = df['Desired_Savings']
# Fixed seed keeps the 80/20 split (and therefore the fitted model) reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model_savings = GradientBoostingRegressor(random_state=42)
model_savings.fit(X_train, y_train)
print("Savings model trained")
# Use the held-out 20% that was previously split off and then ignored, so the
# notebook reports how well the model generalises instead of only that it fit.
savings_r2 = model_savings.score(X_test, y_test)
print(f"Savings model held-out R^2: {savings_r2:.4f}")

In [None]:
# Train overspending risk model
# A record counts as overspending when its expenses plus its desired savings
# add up to more than its income.
df['Required_Total'] = df['Total_Expenses'].add(df['Desired_Savings'])
df['Overspend'] = df['Required_Total'].gt(df['Income']).astype(int)

# Column order here must match the frame run_user_prediction() passes to
# clf_risk.predict_proba().
risk_feature_cols = [
    'Income',
    'Age',
    'Dependents',
    'Occupation_encoded',
    'City_Tier_encoded',
    'Total_Expenses',
    'Desired_Savings_Percentage',
]
X_risk = df[risk_feature_cols]
y_risk = df['Overspend']

# Fitted on every row; no hold-out split is made for this classifier.
clf_risk = RandomForestClassifier(random_state=42)
clf_risk.fit(X_risk, y_risk)
print("Risk model trained")

In [None]:
# User prediction function
# Fallback values for every numeric model input. _normalize_payload() starts
# from a copy of this dict and overrides only the keys for which the caller
# supplies a float-convertible value.
_default_payload = {
    "Income": 60000.0,
    "Age": 30.0,
    "Dependents": 1.0,
    "Occupation_encoded": 1.0,
    "City_Tier_encoded": 1.0,
    "Total_Expenses": 25000.0,
    "Desired_Savings_Percentage": 15.0,
    "Disposable_Income": 35000.0,  # equals the default Income minus the default Total_Expenses
}
# Category labels assumed when the payload omits "Occupation" / "City_Tier";
# run_user_prediction() tries to turn them into codes with the fitted encoders.
# NOTE(review): presumably both labels occur in the training data - confirm,
# otherwise encoding fails silently and the numeric codes above are used.
_default_labels = {
    "Occupation": "Salaried",
    "City_Tier": "Tier 2",
}

def _normalize_payload(payload: dict) -> dict:
    """Coerce the numeric fields of a user payload to floats, falling back to defaults.

    Only the keys of ``_default_payload`` are returned; any other keys in
    ``payload`` are ignored. ``_default_payload`` itself is never modified.

    Args:
        payload: Mapping of field name to raw value (number or numeric
            string). An object without a ``get`` method, such as ``None``,
            is treated as an empty payload.

    Returns:
        A new dict with one float per default key. A field keeps its default
        when the payload omits it, when the value cannot be converted to a
        float, or when the converted value is NaN or infinite.
    """
    source = payload if hasattr(payload, "get") else {}
    values = _default_payload.copy()
    for key, default in _default_payload.items():
        try:
            candidate = float(source.get(key, default))
        except (TypeError, ValueError, OverflowError):
            # Unparseable input (None, "abc", an int too large for a float):
            # keep the default already stored in `values`.
            continue
        # NaN/inf survive float() but would poison the model inputs and are
        # not valid JSON when the result is serialised, so reject them too.
        if math.isfinite(candidate):
            values[key] = candidate
    return values

def _project_savings(monthly_deposit, annual_rate=0.06, months=12):
    """Return the projected balance at the end of each month for a fixed monthly deposit.

    Each deposit is made at the start of its month and the whole balance then
    earns ``annual_rate / 12`` interest for that month (monthly compounding).
    A deposit that is not strictly positive (including NaN) yields all zeros.
    """
    if not (monthly_deposit > 0):
        return [0] * months
    growth = 1 + annual_rate / 12
    balance = 0.0
    balances = []
    for _ in range(months):
        balance = (balance + monthly_deposit) * growth
        balances.append(round(balance, 2))
    return balances


def generate_chart_data(user_values):
    """Generate chart data from the module-level dataset and the user's prediction.

    Args:
        user_values: Mapping read only for 'predicted_savings' (monthly
            amount; missing or negative values are treated as 0).

    Returns:
        Dict with five JSON-serialisable entries:
        'scatter'    - income, total expenses, city tier and savings
                       percentage for up to 10 sampled records (fixed seed),
        'pie'        - mean of every column in ``expense_cols``,
        'bar'        - mean total expenses per occupation, highest first,
        'projection' - balance after each of 12 months of saving the
                       predicted amount at 6% annual interest, compounded
                       monthly,
        'heatmap'    - correlation matrix of five numeric columns, with
                       undefined correlations reported as None.
    """
    # Sample 10 records from dataset for scatter plot
    sample_df = df.sample(n=min(10, len(df)), random_state=42)

    # Scatter: Income vs Total Expenses
    scatter_data = {
        "income": sample_df['Income'].tolist(),
        "totalExpenses": sample_df['Total_Expenses'].tolist(),
        "cityTier": sample_df['City_Tier'].tolist(),
        "savingsPct": sample_df['Desired_Savings_Percentage'].tolist()
    }

    # Pie: Average expense breakdown
    pie_data = {
        # Copy so callers cannot mutate the module-level column list.
        "labels": list(expense_cols),
        "values": [round(df[col].mean(), 2) for col in expense_cols]
    }

    # Bar: Average expenses by occupation
    occupation_stats = df.groupby('Occupation')['Total_Expenses'].mean().sort_values(ascending=False)
    bar_data = {
        "labels": occupation_stats.index.tolist(),
        "values": [round(v, 2) for v in occupation_stats.values.tolist()]
    }

    # Projection: 12-month savings based on user's predicted savings.
    # The balance compounds month over month; previously a single month of
    # interest was applied to the running total regardless of elapsed time.
    projection_months = 12
    predicted_monthly = max(user_values.get('predicted_savings', 0), 0)  # Don't project negative savings
    projection_data = {
        "months": list(range(1, projection_months + 1)),
        "values": _project_savings(predicted_monthly, annual_rate=0.06, months=projection_months)
    }

    # Heatmap: Correlation matrix
    corr_cols = ['Income', 'Total_Expenses', 'Desired_Savings_Percentage', 'Disposable_Income', 'Dependents']
    corr_matrix = df[corr_cols].corr()
    heatmap_data = {
        "labels": corr_cols,
        # A constant column has no defined correlation (NaN); NaN is not valid
        # JSON, so emit None (null) for those cells instead.
        "matrix": [
            [None if pd.isna(cell) else cell for cell in row]
            for row in corr_matrix.values.tolist()
        ]
    }

    return {
        "scatter": scatter_data,
        "pie": pie_data,
        "bar": bar_data,
        "projection": projection_data,
        "heatmap": heatmap_data
    }

def generate_insights(user_values, predicted_savings, overspend_prob):
    """Build human-readable budgeting insights for one user.

    Args:
        user_values: Mapping read for 'Income', 'Total_Expenses' and
            'Desired_Savings_Percentage'; a missing key counts as 0.
        predicted_savings: Predicted monthly savings amount. A negative value
            is reported as a deficit; exactly 0 adds no savings summary.
        overspend_prob: Overspending probability as a fraction (it is
            multiplied by 100 for display), or None to skip the risk insight.

    Returns:
        A list of message strings in a fixed order: expense ratio and
        shortfall checks, savings-capacity assessment, overspending risk,
        then the predicted savings (or deficit) summary. Sections whose
        preconditions are not met are omitted.
    """
    insights = []

    income = user_values.get('Income', 0)
    expenses = user_values.get('Total_Expenses', 0)
    savings_pct = user_values.get('Desired_Savings_Percentage', 0)
    desired_savings_amount = income * (savings_pct / 100)
    available_after_expenses = income - expenses

    # Insight 1: Expense ratio and reality check
    if income > 0:
        expense_ratio = (expenses / income) * 100
        insights.append(f"Your expenses are {expense_ratio:.1f}% of your income (₹{expenses:,.0f} / ₹{income:,.0f}).")

        # Reality check: Can they actually save?
        if available_after_expenses <= 0:
            insights.append("⚠️ Critical: Expenses equal or exceed income. No funds available for savings.")
        elif available_after_expenses < desired_savings_amount:
            shortfall = desired_savings_amount - available_after_expenses
            insights.append(f"⚠️ Gap: Only ₹{available_after_expenses:,.0f} available, but ₹{desired_savings_amount:,.0f} needed for {savings_pct}% savings goal (shortfall: ₹{shortfall:,.0f}).")

    # Insight 2: Savings rate assessment (based on ACTUAL possibility, not just target)
    if predicted_savings < 0:
        insights.append(f"❌ Negative predicted savings: Your {savings_pct}% target is unachievable with current expenses. Reduce expenses or adjust savings goal.")
    elif available_after_expenses > 0 and income > 0:
        # The income > 0 guard matters: zero income with negative expenses
        # makes the surplus positive and would otherwise divide by zero.
        actual_savings_pct = (available_after_expenses / income) * 100
        if actual_savings_pct >= 20:
            insights.append(f"✓ Strong position: {actual_savings_pct:.1f}% of income available for savings.")
        elif actual_savings_pct >= 15:
            insights.append(f"✓ Good position: {actual_savings_pct:.1f}% of income available for savings.")
        elif actual_savings_pct >= 10:
            insights.append(f"⚠️ Tight budget: Only {actual_savings_pct:.1f}% available for savings. Consider expense reduction.")
        else:
            insights.append(f"⚠️ Low margin: Only {actual_savings_pct:.1f}% available for savings. Budget adjustment recommended.")

    # Insight 3: Overspending risk (accounts for savings goals)
    if overspend_prob is not None:
        if overspend_prob < 0.3:
            insights.append(f"✓ Low overspending risk ({overspend_prob*100:.1f}%). Income covers expenses + savings goals.")
        elif overspend_prob < 0.6:
            insights.append(f"⚠️ Moderate risk ({overspend_prob*100:.1f}%). Expenses + savings goals strain your income.")
        else:
            insights.append(f"❌ High risk ({overspend_prob*100:.1f}%). Expenses + savings goals exceed income. Budget adjustment critical.")

    # Insight 4: Predicted savings context
    if predicted_savings > 0:
        monthly_savings = predicted_savings
        annual_savings = monthly_savings * 12
        insights.append(f"📊 Predicted monthly savings: ₹{monthly_savings:,.2f} (₹{annual_savings:,.2f} annually).")
    elif predicted_savings < 0:
        deficit = abs(predicted_savings)
        annual_deficit = deficit * 12
        insights.append(f"📊 Monthly deficit: -₹{deficit:,.2f} (-₹{annual_deficit:,.2f} annually). Immediate action required.")

    return insights

def _encode_label(encoder, label, fallback):
    """Translate a category label into its fitted numeric code, or return ``fallback``."""
    if not label:
        return fallback
    try:
        return float(encoder.transform([label])[0])
    except Exception:
        # Best-effort by design: a label the encoder cannot handle must not
        # abort the prediction, so the numeric code already in hand is kept.
        return fallback


def run_user_prediction(payload: dict) -> dict:
    """Run both models for one user payload and persist the result.

    Args:
        payload: Mapping with any of the numeric keys of ``_default_payload``
            plus optional 'Occupation' / 'City_Tier' labels. Missing or
            unparseable numeric fields fall back to defaults; ``None`` is
            treated as an empty payload.

    Returns:
        Dict with 'predicted_desired_savings' (rounded to 2 decimals),
        'overspend_probability' (float, or None if the classifier call
        failed), 'input' (the normalised values, including the added
        'predicted_savings'), 'generated_at' (UTC ISO timestamp ending in
        'Z'), 'charts' and 'insights'.

    Side effects:
        Overwrites ``user_prediction.json`` in the working directory with the
        returned dict.
    """
    payload = payload or {}
    values = _normalize_payload(payload)

    # Keep Disposable_Income consistent with the user's own figures. Without
    # this, a payload giving only Income and Total_Expenses was scored with the
    # unrelated 35000 default.
    # NOTE(review): assumes Disposable_Income = Income - Total_Expenses, which
    # is the relation the built-in defaults follow (60000 - 25000) - confirm
    # against the dataset definition.
    try:
        disposable_supplied = math.isfinite(float(payload.get("Disposable_Income")))
    except (TypeError, ValueError, OverflowError):
        disposable_supplied = False
    if not disposable_supplied:
        values["Disposable_Income"] = values["Income"] - values["Total_Expenses"]

    # Encode categorical values; on failure the numeric code from the payload
    # (or the default) stays in place.
    values["Occupation_encoded"] = _encode_label(
        le_occ,
        payload.get("Occupation", _default_labels["Occupation"]),
        values["Occupation_encoded"],
    )
    values["City_Tier_encoded"] = _encode_label(
        le_city,
        payload.get("City_Tier", _default_labels["City_Tier"]),
        values["City_Tier_encoded"],
    )

    # Predict savings
    feature_row = pd.DataFrame([{f: values.get(f, _default_payload[f]) for f in features}])
    predicted = float(model_savings.predict(feature_row)[0])

    # Predict overspend probability. Column order mirrors the frame the risk
    # classifier was fitted on.
    risk_columns = [
        "Income",
        "Age",
        "Dependents",
        "Occupation_encoded",
        "City_Tier_encoded",
        "Total_Expenses",
        "Desired_Savings_Percentage",
    ]
    overspend_payload = pd.DataFrame([{col: values[col] for col in risk_columns}])
    try:
        # Column 1 is read as the probability of the positive (overspend) class.
        overspend_prob = float(clf_risk.predict_proba(overspend_payload)[0][1])
    except Exception:
        # Risk is optional output: report "unknown" rather than fail the run.
        overspend_prob = None

    # Generate chart data and insights
    values['predicted_savings'] = predicted
    chart_data = generate_chart_data(values)
    insights = generate_insights(values, predicted, overspend_prob)

    # datetime.utcnow() is deprecated; take an aware UTC "now" and drop the
    # tzinfo so the serialised form stays "<iso timestamp>Z" exactly as before.
    generated_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"

    result = {
        "predicted_desired_savings": round(predicted, 2),
        "overspend_probability": overspend_prob,
        "input": values,
        "generated_at": generated_at,
        "charts": chart_data,
        "insights": insights
    }

    # Write result to file
    with open("user_prediction.json", "w", encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False, indent=2)

    return result

In [None]:
# Execute user prediction if payload provided
# The payload arrives as a JSON string in the USER_INPUT_PAYLOAD environment
# variable; an unset or empty variable means there is nothing to predict.
raw_payload = os.environ.get("USER_INPUT_PAYLOAD")
if not raw_payload:
    print("No user input payload provided")
else:
    payload = json.loads(raw_payload)
    user_result = run_user_prediction(payload)
    # Single tagged line carrying the full result as JSON.
    print("USER_RESULT", json.dumps(user_result))