In [None]:
import json
import os
from datetime import datetime, timezone

import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load dataset
# Every later cell reads from (and adds columns to) this frame.
df = pd.read_csv('finance_data.csv')
print(f"Dataset loaded: {len(df)} records")

In [None]:
# Encode categorical columns
# One fitted encoder per column; the encoders are kept so that labels in a
# user request can be mapped with the same encoding later on.
le_occ = LabelEncoder().fit(df['Occupation'])
le_city = LabelEncoder().fit(df['City_Tier'])
df['Occupation_encoded'] = le_occ.transform(df['Occupation'])
df['City_Tier_encoded'] = le_city.transform(df['City_Tier'])

In [None]:
# Create expense columns
# Individual spending categories; their row-wise sum becomes Total_Expenses.
expense_cols = [
    'Rent',
    'Loan_Repayment',
    'Insurance',
    'Groceries',
    'Transport',
    'Eating_Out',
    'Entertainment',
    'Utilities',
    'Healthcare',
    'Education',
    'Miscellaneous',
]
df['Total_Expenses'] = df.loc[:, expense_cols].sum(axis='columns')

In [None]:
# Train savings prediction model
# Columns fed to the regressor; the target is the 'Desired_Savings' column.
features = [
    'Income', 'Age', 'Dependents', 'Occupation_encoded', 'City_Tier_encoded',
    'Total_Expenses', 'Desired_Savings_Percentage', 'Disposable_Income'
]
X = df[features]
y = df['Desired_Savings']
# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model_savings = GradientBoostingRegressor(random_state=42)
model_savings.fit(X_train, y_train)
print("Savings model trained")
# Score on the held-out rows so the split is actually used and a poorly
# fitting model is visible in the notebook output.
print(f"Savings model hold-out R^2: {model_savings.score(X_test, y_test):.3f}")

In [None]:
# Train overspending risk model
# A row is labelled as overspending when its expenses plus its desired
# savings are larger than its income.
df['Required_Total'] = df['Total_Expenses'].add(df['Desired_Savings'])
df['Overspend'] = df['Required_Total'].gt(df['Income']).astype(int)
X_risk = df[[
    'Income',
    'Age',
    'Dependents',
    'Occupation_encoded',
    'City_Tier_encoded',
    'Total_Expenses',
    'Desired_Savings_Percentage',
]]
y_risk = df['Overspend']
# fit() returns the estimator itself, so construction and training can be chained.
clf_risk = RandomForestClassifier(random_state=42).fit(X_risk, y_risk)
print("Risk model trained")

In [None]:
# User prediction function
# Numeric fallbacks: used for any request field that is missing or cannot be
# converted to a float. The key set also defines which fields are read at all.
_default_payload = dict(
    Income=60000.0,
    Age=30.0,
    Dependents=1.0,
    Occupation_encoded=1.0,
    City_Tier_encoded=1.0,
    Total_Expenses=7800.0,
    Desired_Savings_Percentage=15.0,
    Disposable_Income=35000.0,
    Rent=0.0,
    Loan_Repayment=0.0,
    Insurance=0.0,
    Groceries=0.0,
    Transport=0.0,
    Eating_Out=0.0,
    Entertainment=0.0,
    Utilities=0.0,
    Healthcare=0.0,
    Education=0.0,
    Miscellaneous=0.0,
)
# Label fallbacks: used when the request carries no Occupation / City_Tier.
_default_labels = dict(
    Occupation="Salaried",
    City_Tier="Tier 2",
)

def _normalize_payload(payload: dict) -> dict:
    """Coerce the numeric request fields to floats, falling back to defaults.

    Only the keys of ``_default_payload`` are read; any other key in
    ``payload`` is ignored.

    Args:
        payload: Raw request mapping. Anything that is not a dict is treated
            as an empty request.

    Returns:
        A new dict with one entry per ``_default_payload`` key. Each value is
        the request value converted with ``float()``; a missing,
        non-convertible, NaN or infinite value is replaced by the default.
    """
    source = payload if isinstance(payload, dict) else {}
    values = _default_payload.copy()
    for key, fallback in _default_payload.items():
        try:
            number = float(source.get(key, fallback))
        except (TypeError, ValueError, OverflowError):
            # Not numeric (None, "abc", huge int, ...): keep the default.
            continue
        # float() happily parses "nan" / "inf"; such values would poison the
        # later arithmetic and are not valid JSON, so keep the default instead.
        if np.isfinite(number):
            values[key] = number
    return values

def generate_chart_data(user_values):
    """Generate updated chart data based on user input and dataset.

    Every chart except the projection is computed from the module-level
    ``df``; the projection is driven by ``user_values['predicted_savings']``.

    Args:
        user_values: Mapping of user inputs. Only ``'predicted_savings'`` is
            read; a missing, ``None`` or negative value is projected as zero.

    Returns:
        dict with the keys ``scatter``, ``pie``, ``bar``, ``projection`` and
        ``heatmap``, each holding plain lists/dicts for serialisation.
    """
    # Sample 10 records from dataset for scatter plot (fixed seed: same sample every call)
    sample_df = df.sample(n=min(10, len(df)), random_state=42)

    # Scatter: Income vs Total Expenses
    scatter_data = {
        "income": sample_df['Income'].tolist(),
        "totalExpenses": sample_df['Total_Expenses'].tolist(),
        "cityTier": sample_df['City_Tier'].tolist(),
        "savingsPct": sample_df['Desired_Savings_Percentage'].tolist()
    }

    # Pie: Average expense breakdown
    pie_data = {
        "labels": expense_cols,
        "values": [round(df[col].mean(), 2) for col in expense_cols]
    }

    # Bar: Average expenses by occupation
    occupation_stats = df.groupby('Occupation')['Total_Expenses'].mean().sort_values(ascending=False)
    bar_data = {
        "labels": occupation_stats.index.tolist(),
        "values": [round(v, 2) for v in occupation_stats.values.tolist()]
    }

    # Projection: 12-month savings based on user's predicted savings
    # `or 0` covers an explicit None; max() keeps negative savings from being projected.
    predicted_monthly = max(user_values.get('predicted_savings') or 0, 0)
    interest_rate = 0.06  # 6% annual
    monthly_rate = interest_rate / 12
    projection_values = []
    balance = 0
    for _ in range(12):
        # Each month: add the deposit, then let the whole balance earn one
        # month of interest, so earlier deposits keep compounding. Month 1 is
        # deposit * (1 + monthly_rate), the same as the earlier flat formula.
        balance = (balance + predicted_monthly) * (1 + monthly_rate)
        projection_values.append(round(balance, 2))

    projection_data = {
        "months": list(range(1, 13)),
        "values": projection_values
    }

    # Heatmap: Correlation matrix
    corr_cols = ['Income', 'Total_Expenses', 'Desired_Savings_Percentage', 'Disposable_Income', 'Dependents']
    corr_matrix = df[corr_cols].corr()
    heatmap_data = {
        "labels": corr_cols,
        # A constant column has no defined correlation (NaN); emit None so the
        # payload stays valid JSON (null) instead of a bare NaN token.
        "matrix": [
            [None if pd.isna(cell) else cell for cell in row]
            for row in corr_matrix.values.tolist()
        ]
    }

    return {
        "scatter": scatter_data,
        "pie": pie_data,
        "bar": bar_data,
        "projection": projection_data,
        "heatmap": heatmap_data
    }

def analyze_expense_categories(user_values, income):
    """Break the user's spending down per category and size the possible savings.

    Each non-zero category is compared against a fixed share of ``income``
    and against the mean of that column in the module-level ``df``.

    Args:
        user_values: Mapping holding the user's amount per expense category;
            absent categories count as 0 and are left out of the result.
        income: Income the recommended shares are applied to.

    Returns:
        dict with ``categories`` (one entry per non-zero category, largest
        potential saving first), ``total_potential_savings`` (rounded to two
        decimals) and a one-line ``summary`` string.
    """
    # Recommended share of income per category, in percent
    recommended_pct = {
        'Rent': 30.0,
        'Loan_Repayment': 10.0,
        'Insurance': 5.0,
        'Groceries': 10.0,
        'Transport': 10.0,
        'Eating_Out': 5.0,
        'Entertainment': 5.0,
        'Utilities': 5.0,
        'Healthcare': 5.0,
        'Education': 5.0,
        'Miscellaneous': 5.0,
    }

    # Extra tip per category: (share of income that triggers it, tip text)
    tip_rules = {
        'Rent': (0.30, "üí° Consider relocating or finding roommates"),
        'Eating_Out': (0.05, "üí° Meal prep at home can save 50-70%"),
        'Entertainment': (0.05, "üí° Explore free/low-cost activities"),
        'Transport': (0.10, "üí° Use public transport or carpool"),
        'Groceries': (0.12, "üí° Use shopping lists, buy in bulk, avoid waste"),
        'Utilities': (0.05, "üí° Reduce AC/heating usage, switch to LED lights"),
        'Miscellaneous': (0.05, "üí° Track and eliminate impulse purchases"),
    }

    # Only the categories listed above are read from the user's input
    user_expenses = {name: user_values.get(name, 0) for name in recommended_pct}

    # Dataset mean for every expense column
    avg_expenses = {col: df[col].mean() for col in expense_cols}

    entries = []
    raw_savings = []

    for category in expense_cols:
        amount = user_expenses.get(category, 0)
        if amount == 0:
            # Nothing spent here, so there is nothing to analyse
            continue

        avg_amount = avg_expenses.get(category, 0)
        target_pct = recommended_pct.get(category, 5)
        recommended_amount = income * (target_pct / 100)
        if income > 0:
            user_pct = (amount / income) * 100
        else:
            user_pct = 0

        # Classify from the strictest threshold downwards
        if amount > recommended_amount * 1.3:  # more than 30% over the recommendation
            status = "üî¥ High"
            potential_saving = amount - recommended_amount
            advice = [f"Reduce to recommended ‚Çπ{recommended_amount:,.0f} ({target_pct:.0f}% of income)"]
        elif amount > recommended_amount * 1.1:  # more than 10% over the recommendation
            status = "üü° Moderate"
            potential_saving = amount - recommended_amount
            advice = [f"Consider reducing to ‚Çπ{recommended_amount:,.0f}"]
        elif amount > avg_amount * 1.2:  # more than 20% over the dataset mean
            status = "üü° Above Average"
            potential_saving = amount - avg_amount
            advice = [f"Dataset average is ‚Çπ{avg_amount:,.0f}"]
        else:
            status = "‚úì Good"
            potential_saving = 0
            advice = ["Spending is within healthy range"]

        # Category-specific tip, when one exists and its threshold is exceeded
        rule = tip_rules.get(category)
        if rule is not None:
            share, tip_text = rule
            if amount > income * share:
                advice.append(tip_text)

        raw_savings.append(potential_saving)
        entries.append({
            "category": category.replace('_', ' '),
            "current_amount": round(amount, 2),
            "current_percentage": round(user_pct, 2),
            "recommended_amount": round(recommended_amount, 2),
            "recommended_percentage": target_pct,
            "dataset_average": round(avg_amount, 2),
            "status": status,
            "potential_saving": round(potential_saving, 2),
            "advice": advice
        })

    # Unrounded savings are totalled in category order
    total_potential_savings = sum(raw_savings)

    # Largest potential saving first
    ranked = sorted(entries, key=lambda entry: entry['potential_saving'], reverse=True)

    return {
        "categories": ranked,
        "total_potential_savings": round(total_potential_savings, 2),
        "summary": f"Total potential savings across all categories: ‚Çπ{total_potential_savings:,.2f}/month"
    }

def generate_recommendations(user_values, predicted_savings, overspend_prob, category_breakdown):
    """Generate actionable saving recommendations based on expense analysis.

    Args:
        user_values: Mapping with ``Income``, ``Total_Expenses`` and
            ``Desired_Savings_Percentage`` (each defaults to 0 when absent).
        predicted_savings: Accepted for signature compatibility; not used here.
        overspend_prob: Overspending probability in [0, 1], or ``None`` when
            no estimate is available.
        category_breakdown: dict with ``categories`` (entries carrying
            ``category``, ``current_amount``, ``potential_saving`` and
            ``advice``) and ``total_potential_savings``.

    Returns:
        List of recommendation strings in display order.
    """
    recommendations = []

    income = user_values.get('Income', 0)
    expenses = user_values.get('Total_Expenses', 0)
    savings_pct = user_values.get('Desired_Savings_Percentage', 0)
    desired_savings_amount = income * (savings_pct / 100)
    available_after_expenses = income - expenses

    # Primary recommendation based on shortfall/surplus
    if available_after_expenses < desired_savings_amount:
        shortfall = desired_savings_amount - available_after_expenses
        recommendations.append(f"üéØ Target: Reduce expenses by ‚Çπ{shortfall:,.0f}/month to meet your {savings_pct}% savings goal.")

        # Highlight top savings opportunities from category breakdown
        # (takes the first three in the order given by category_breakdown)
        high_priority = [cat for cat in category_breakdown['categories'] if cat['potential_saving'] > 0][:3]

        if high_priority:
            recommendations.append("üî• Top 3 savings opportunities:")
            cumulative_savings = 0
            for cat in high_priority:
                cumulative_savings += cat['potential_saving']
                # Derive the target from the saving itself. The saving is not
                # always measured against recommended_amount, so printing
                # recommended_amount could show a target above current spend
                # that contradicts the "save" figure on the same line.
                target_amount = cat['current_amount'] - cat['potential_saving']
                recommendations.append(f"   ‚Ä¢ {cat['category']}: ‚Çπ{cat['current_amount']:,.0f} ‚Üí ‚Çπ{target_amount:,.0f} (save ‚Çπ{cat['potential_saving']:,.0f}/month)")

            if cumulative_savings >= shortfall:
                recommendations.append(f"   ‚úì Implementing these changes would cover your ‚Çπ{shortfall:,.0f} shortfall!")
            else:
                remaining = shortfall - cumulative_savings
                recommendations.append(f"   ‚ö†Ô∏è Additional ‚Çπ{remaining:,.0f}/month reduction needed")

        # Quick wins: discretionary categories that still have a saving on offer
        recommendations.append("üìã Quick wins (easiest to implement):")
        quick_wins = []
        for cat in category_breakdown['categories']:
            if cat['category'] in ['Eating Out', 'Entertainment', 'Miscellaneous'] and cat['potential_saving'] > 0:
                quick_wins.append(f"   ‚Ä¢ {cat['category']}: {cat['advice'][0]}")

        if quick_wins:
            recommendations.extend(quick_wins[:3])
        else:
            recommendations.append("   ‚Ä¢ Review all discretionary spending categories")

    else:
        # Surplus scenario
        surplus = available_after_expenses - desired_savings_amount
        recommendations.append(f"‚úì Excellent! You have ‚Çπ{surplus:,.0f}/month surplus after meeting your {savings_pct}% savings goal.")
        recommendations.append("üí∞ Smart moves for your surplus:")
        recommendations.append(f"   ‚Ä¢ Build emergency fund: Target 6-months expenses (‚Çπ{expenses * 6:,.0f})")
        recommendations.append(f"   ‚Ä¢ Invest ‚Çπ{surplus * 0.6:,.0f} in SIP/Mutual Funds for long-term growth")
        recommendations.append(f"   ‚Ä¢ Keep ‚Çπ{surplus * 0.4:,.0f} as buffer for unexpected expenses")

        # Still show optimization opportunities (only savings above 100 are worth listing)
        optimizations = [cat for cat in category_breakdown['categories'] if cat['potential_saving'] > 100]
        if optimizations:
            recommendations.append("üéØ Further optimization opportunities:")
            for cat in optimizations[:2]:
                recommendations.append(f"   ‚Ä¢ {cat['category']}: {cat['advice'][0]}")

    # Risk-based recommendations (skipped when the probability is None or 0)
    if overspend_prob and overspend_prob > 0.5:
        recommendations.append("‚ö†Ô∏è Risk mitigation strategies:")
        recommendations.append("   ‚Ä¢ Set up automatic savings transfer on payday")
        recommendations.append("   ‚Ä¢ Use separate accounts for fixed vs discretionary expenses")
        recommendations.append("   ‚Ä¢ Set spending alerts for high-risk categories")

    # Overall optimisation potential, monthly and annualised
    if category_breakdown['total_potential_savings'] > 0:
        recommendations.append(f"üìä Total optimization potential: ‚Çπ{category_breakdown['total_potential_savings']:,.2f}/month (‚Çπ{category_breakdown['total_potential_savings'] * 12:,.2f}/year)")

    return recommendations

def generate_insights(user_values, predicted_savings, overspend_prob):
    """Generate personalized insights based on user data and predictions.

    Args:
        user_values: Mapping with ``Income``, ``Total_Expenses`` and
            ``Desired_Savings_Percentage`` (each defaults to 0 when absent).
        predicted_savings: Savings figure for the user; its sign selects the
            savings / shortfall / breakeven wording.
        overspend_prob: Overspending probability in [0, 1], or ``None`` to
            omit the risk insight.

    Returns:
        List of insight strings in display order.
    """
    insights = []

    income = user_values.get('Income', 0)
    expenses = user_values.get('Total_Expenses', 0)
    savings_pct = user_values.get('Desired_Savings_Percentage', 0)
    desired_savings_amount = income * (savings_pct / 100)
    available_after_expenses = income - expenses

    # Insight 1: Expense ratio and reality check
    if income > 0:
        expense_ratio = (expenses / income) * 100
        insights.append(f"Your expenses are {expense_ratio:.1f}% of your income (‚Çπ{expenses:,.0f} / ‚Çπ{income:,.0f}).")

        # Reality check: Can they actually save?
        if available_after_expenses <= 0:
            insights.append(f"‚ö†Ô∏è Critical: Expenses equal or exceed income. No funds available for savings.")
        elif available_after_expenses < desired_savings_amount:
            shortfall = desired_savings_amount - available_after_expenses
            insights.append(f"‚ö†Ô∏è Gap: Only ‚Çπ{available_after_expenses:,.0f} available, but ‚Çπ{desired_savings_amount:,.0f} needed for {savings_pct}% savings goal (shortfall: ‚Çπ{shortfall:,.0f}).")
        else:
            surplus = available_after_expenses - desired_savings_amount
            insights.append(f"‚úì Healthy surplus: ‚Çπ{available_after_expenses:,.0f} available after expenses. Your {savings_pct}% goal (‚Çπ{desired_savings_amount:,.0f}) leaves ‚Çπ{surplus:,.0f} buffer.")

    # Insight 2: Savings rate assessment
    if available_after_expenses > 0:
        # A share of income is only defined for positive income. Without this
        # guard, income 0 with negative expenses divides by zero.
        if income > 0:
            actual_savings_pct = (available_after_expenses / income) * 100
            if actual_savings_pct >= 20:
                insights.append(f"‚úì Excellent: {actual_savings_pct:.1f}% of income available for savings.")
            elif actual_savings_pct >= 15:
                insights.append(f"‚úì Good: {actual_savings_pct:.1f}% of income available for savings.")
            elif actual_savings_pct >= 10:
                insights.append(f"‚ö†Ô∏è Moderate: {actual_savings_pct:.1f}% available for savings. Consider expense reduction for better financial cushion.")
            else:
                insights.append(f"‚ö†Ô∏è Tight: Only {actual_savings_pct:.1f}% available for savings. Budget adjustment recommended.")
    elif predicted_savings < 0:
        insights.append(f"‚ùå Deficit situation: Your {savings_pct}% target is unachievable with current expenses. Reduce expenses or adjust savings goal.")

    # Insight 3: Overspending risk
    if overspend_prob is not None:
        if overspend_prob < 0.3:
            if available_after_expenses >= desired_savings_amount:
                insights.append(f"‚úì Low overspending risk ({overspend_prob*100:.1f}%). Income comfortably covers expenses + savings goals.")
            else:
                insights.append(f"‚úì Low overspending risk ({overspend_prob*100:.1f}%), but savings target exceeds available funds.")
        elif overspend_prob < 0.6:
            insights.append(f"‚ö†Ô∏è Moderate risk ({overspend_prob*100:.1f}%). Expenses + savings goals strain your income.")
        else:
            insights.append(f"‚ùå High risk ({overspend_prob*100:.1f}%). Expenses + savings goals exceed income. Budget adjustment critical.")

    # Insight 4: Predicted savings context
    if predicted_savings > 0:
        monthly_savings = predicted_savings
        annual_savings = monthly_savings * 12
        insights.append(f"üìä Achievable monthly savings: ‚Çπ{monthly_savings:,.2f} (‚Çπ{annual_savings:,.2f} annually).")
    elif predicted_savings < 0:
        deficit = abs(predicted_savings)
        annual_deficit = deficit * 12
        insights.append(f"üìä Monthly shortfall: -‚Çπ{deficit:,.2f} (-‚Çπ{annual_deficit:,.2f} annually). Immediate action required.")
    else:
        insights.append(f"üìä Breakeven: Expenses consume all available income after meeting savings target.")

    return insights

def run_user_prediction(payload: dict) -> dict:
    """Run the full prediction pipeline for one user request.

    Normalises the numeric inputs, encodes the occupation / city labels,
    derives the savings figure arithmetically, asks ``clf_risk`` for the
    overspending probability and assembles charts, insights, recommendations
    and the per-category breakdown.

    Side effect: the result is also written to ``user_prediction.json`` in
    the current working directory, overwriting any previous file.

    Args:
        payload: Request mapping. Numeric fields are read by
            ``_normalize_payload``; ``Occupation`` and ``City_Tier`` are read
            as labels. Anything that is not a dict is treated as empty.

    Returns:
        dict with ``predicted_desired_savings``, ``overspend_probability``
        (``None`` if the classifier call failed), ``input``, ``generated_at``,
        ``charts``, ``insights``, ``recommendations`` and ``expense_breakdown``.
    """
    if not isinstance(payload, dict):
        # A JSON list / number / null would otherwise crash on .get below
        payload = {}

    values = _normalize_payload(payload)
    occupation_label = payload.get("Occupation", _default_labels["Occupation"])
    city_label = payload.get("City_Tier", _default_labels["City_Tier"])

    # Encode categorical values.
    # Best effort: if the encoder cannot transform a label, the numeric
    # *_encoded value from the normalised payload is kept as is.
    if occupation_label:
        try:
            values["Occupation_encoded"] = float(le_occ.transform([occupation_label])[0])
        except Exception:
            pass
    if city_label:
        try:
            values["City_Tier_encoded"] = float(le_city.transform([city_label])[0])
        except Exception:
            pass

    # Calculate savings mathematically
    # NOTE: Total_Expenses is used exactly as supplied (or defaulted); it is
    # not recomputed from the itemised expense categories.
    income = values.get("Income", 0)
    expenses = values.get("Total_Expenses", 0)
    savings_pct = values.get("Desired_Savings_Percentage", 0)

    available_after_expenses = income - expenses
    desired_savings_amount = income * (savings_pct / 100)

    # Predicted savings is what's left after expenses, capped by desired target
    predicted = min(available_after_expenses, desired_savings_amount)

    # Predict overspend probability
    risk_features = [
        "Income", "Age", "Dependents", "Occupation_encoded",
        "City_Tier_encoded", "Total_Expenses", "Desired_Savings_Percentage",
    ]
    overspend_payload = pd.DataFrame([{name: values[name] for name in risk_features}])
    overspend_prob = None
    try:
        class_probs = clf_risk.predict_proba(overspend_payload)[0]
        # Columns of predict_proba follow clf_risk.classes_. Look up the
        # column of label 1 instead of assuming it is column 1, which breaks
        # when the training data contained a single class.
        known_classes = list(clf_risk.classes_)
        if 1 in known_classes:
            overspend_prob = float(class_probs[known_classes.index(1)])
        else:
            # The model never saw an overspending row
            overspend_prob = 0.0
    except Exception:
        # Keep the response usable even if the classifier call fails
        overspend_prob = None

    # Generate detailed category breakdown
    category_breakdown = analyze_expense_categories(values, income)

    # Generate chart data, insights, and recommendations
    values['predicted_savings'] = predicted
    chart_data = generate_chart_data(values)
    insights = generate_insights(values, predicted, overspend_prob)
    recommendations = generate_recommendations(values, predicted, overspend_prob, category_breakdown)

    # datetime.utcnow() is deprecated; take an aware UTC time and drop tzinfo
    # so the string keeps the same "<naive ISO timestamp>Z" shape as before.
    generated_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"

    result = {
        "predicted_desired_savings": round(predicted, 2),
        "overspend_probability": overspend_prob,
        "input": values,
        "generated_at": generated_at,
        "charts": chart_data,
        "insights": insights,
        "recommendations": recommendations,
        "expense_breakdown": category_breakdown
    }

    # Write result to file
    with open("user_prediction.json", "w", encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False, indent=2)

    return result


In [None]:
# Execute user prediction if payload provided
# The request is read as a JSON document from the USER_INPUT_PAYLOAD
# environment variable; an unset or empty variable means there is nothing to do.
if not os.environ.get("USER_INPUT_PAYLOAD"):
    print("No user input payload provided")
else:
    payload = json.loads(os.environ["USER_INPUT_PAYLOAD"])
    user_result = run_user_prediction(payload)
    print("USER_RESULT", json.dumps(user_result))