In [None]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
import re

# Load the labelled expense dataset and discard every row that has a missing
# value in any column.
df = pd.read_csv("expenses_dataset_smart.csv", quotechar='"')
df = df.dropna()

# Columns with a single distinct value are removed from the frame.
constant_columns = [col for col in df.columns if df[col].nunique() == 1]
df = df.drop(columns=constant_columns)

# Drop categories represented by fewer than two rows.
# NOTE(review): presumably needed so the stratified train/test split further
# down has at least two samples per class -- confirm.
class_counts = df['category'].value_counts()
classes_to_drop = class_counts[class_counts < 2].index
df = df[~df['category'].isin(classes_to_drop)]

# Show how many rows remain per category after the filtering above.
print(f"Class distribution after filtering: \n{df['category'].value_counts()}")

def preprocess_text(text):
    """Return *text* lowercased with all punctuation removed.

    Any character that is neither a word character (letters, digits,
    underscore) nor whitespace is deleted; whitespace itself is left as is.
    """
    punctuation = re.compile(r'[^\w\s]')
    return punctuation.sub('', text.lower())

# Normalise every description (lowercase, punctuation stripped) in place.
df['description'] = df['description'].apply(preprocess_text)

# Replace the category names with integer codes. The fitted encoder is kept
# so predictions can later be mapped back to names via inverse_transform.
label_encoder = LabelEncoder()
df['category'] = label_encoder.fit_transform(df['category'])

# Features are the cleaned descriptions; targets are the encoded categories.
X = df['description']
y = df['category']

# Split the raw descriptions BEFORE fitting TF-IDF. Fitting the vectorizer on
# the full dataset would let the held-out rows influence the vocabulary and
# the IDF weights, which leaks test information into training and makes the
# reported accuracy optimistic. The split itself is unchanged (same size,
# seed and stratification), so the same rows land in each partition.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Unigrams and bigrams, English stop words removed, at most 5000 features.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000, stop_words='english')
X_train = vectorizer.fit_transform(X_train_text)  # learn vocabulary/IDF on train only
X_test = vectorizer.transform(X_test_text)        # reuse the fitted vocabulary

# Kept for backward compatibility with any later cell that still expects the
# vectorised full dataset under this name.
X_vect = vectorizer.transform(X)

# Random forest of 300 trees, each limited to depth 20, with a fixed seed for
# reproducibility. class_weight='balanced' asks scikit-learn to weight classes
# inversely to their frequency in the training labels.
rf_classifier = RandomForestClassifier(n_estimators=300, max_depth=20, random_state=42, class_weight='balanced')
rf_classifier.fit(X_train, y_train)

# Score the fitted model on the held-out split and report it as a percentage.
y_pred = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy on Test Set: {accuracy * 100:.2f}%")

# Keyword -> category overrides consulted before the classifier.
# Insertion order is the match priority: the first keyword found wins.
# NOTE(review): some labels used here ("Groceries", "Housing",
# "Bills/Subscriptions") do not appear in the class distribution printed for
# the training data, so reports grouped by category can split one real
# category across two names -- confirm the intended label set.
_MANUAL_CORRECTIONS = {
    "grocery": "Groceries",
    "supermarket": "Groceries",
    "food": "Groceries",
    "restaurant": "Groceries",
    "hospital": "Healthcare",
    "clinic": "Healthcare",
    "medical": "Healthcare",
    "rent": "Housing",
    "mortgage": "Housing",
    "apartment": "Housing",
    "house": "Housing",
    "taxi": "Transport",
    "bus": "Transport",
    "car": "Transport",
    "fuel": "Transport",
    "uber": "Transport",
    "train": "Transport",
    # "flight" used to be listed twice (Transport here, Travel further down).
    # In a dict literal the later value silently wins while the key keeps its
    # first position, so the effective entry is written out explicitly.
    "flight": "Travel",
    "movie": "Entertainment",
    "theater": "Entertainment",
    "concert": "Entertainment",
    "event": "Entertainment",
    "show": "Entertainment",
    "sports": "Entertainment",
    "game": "Entertainment",
    "leisure": "Entertainment",
    "electricity": "Utilities",
    "water": "Utilities",
    "gas": "Utilities",
    "internet": "Utilities",
    "phone": "Utilities",
    "cable": "Utilities",
    "tuition": "Education",
    "fees": "Education",
    "college": "Education",
    "school": "Education",
    "textbooks": "Education",
    "study": "Education",
    "online course": "Education",
    "hotel": "Travel",
    "vacation": "Travel",
    "trip": "Travel",
    "accommodation": "Travel",
    "tickets": "Travel",
    "holiday": "Travel",
    "clothes": "Shopping",
    "electronics": "Shopping",
    "shoes": "Shopping",
    "apparel": "Shopping",
    "accessories": "Shopping",
    "online store": "Shopping",
    "purchase": "Shopping",
    "subscription": "Bills/Subscriptions",
    "netflix": "Bills/Subscriptions",
    "spotify": "Bills/Subscriptions",
    "insurance": "Bills/Subscriptions",
    "premium": "Bills/Subscriptions",
    "memberships": "Bills/Subscriptions",
}

# One pre-compiled pattern per keyword. Each matches the keyword as a whole
# word (or phrase), optionally followed by a plural "s"/"es", so "car" matches
# "car" and "cars" but no longer fires on "card" or "scarf".
_MANUAL_PATTERNS = [
    (re.compile(rf"\b{re.escape(keyword)}(?:e?s)?\b"), category)
    for keyword, category in _MANUAL_CORRECTIONS.items()
]


def _match_manual_category(description):
    """Return the override category of the first keyword found, else None."""
    for pattern, category in _MANUAL_PATTERNS:
        if pattern.search(description):
            return category
    return None


def predict_category(description):
    """Predict the expense category for a free-text description.

    The description is normalised with ``preprocess_text``. If it contains one
    of the override keywords as a whole word, that keyword's category is
    returned directly; otherwise the description is vectorised and classified
    by the trained random forest, and the predicted code is decoded back to a
    category name.

    Args:
        description: Free-text description of the expense.

    Returns:
        The category name as a string. Two non-category strings can also be
        returned: a "too short" message when the cleaned description has no
        words, and an "Error in prediction: ..." message when anything raises.
    """
    try:
        description = preprocess_text(description)
        if not description.split():
            return "Input too short. Please provide a little more detail."

        manual_category = _match_manual_category(description)
        if manual_category is not None:
            return manual_category

        description_vector = vectorizer.transform([description])
        pred = rf_classifier.predict(description_vector)
        return label_encoder.inverse_transform(pred)[0]
    except Exception as e:
        # Deliberate best-effort: callers receive a message instead of a crash.
        return f"Error in prediction: {e}"
# A handful of free-text descriptions used to spot-check predict_category.
test_inputs = [
    "spent 500 on food",
    "Paid hostel rent for this month of 12000",
    "grocery",
    "paid 500 for rent",
    "hospital bill payment",
]

print("\nManual Test Cases Predictions:\n")
# map() is lazy, so each prediction is still computed right before its line
# is printed, one input at a time.
for desc, predicted_cat in zip(test_inputs, map(predict_category, test_inputs)):
    print(f"Input: {desc} --> Predicted Category: {predicted_cat}")
Class distribution after filtering:
category
Travel                150
Transport             150
Financial Services    150
Rent & Housing        150
Utilities             150
Shopping              150
Entertainment         150
Education             150
Healthcare            150
Food & Drinks         150
Name: count, dtype: int64
Accuracy on Test Set: 96.67%

Manual Test Cases Predictions:

Input: spent 500 on food --> Predicted Category: Groceries
Input: Paid hostel rent for this month of 12000 --> Predicted Category: Housing
Input: grocery --> Predicted Category: Groceries
Input: paid 500 for rent --> Predicted Category: Housing
Input: hospital bill payment --> Predicted Category: Healthcare
import pandas as pd

# Module-level log of recorded expenses, one row per expense; starts empty
# with the three columns the helper functions below read and write.
expense_history = pd.DataFrame(columns=["description", "category", "amount"])

def add_expense(description, amount):
    """Classify an expense and append it to the global ``expense_history``.

    Args:
        description: Free-text description; passed to ``predict_category`` to
            obtain the category stored alongside it.
        amount: Amount spent, stored as given.

    Returns:
        None. The module-level ``expense_history`` frame is rebound to a new
        frame that includes the added row.
    """
    global expense_history
    category = predict_category(description)
    entry = pd.DataFrame(
        [{"description": description, "category": category, "amount": amount}]
    )
    if expense_history.empty:
        # Concatenating onto the empty, untyped frame can leave ``amount`` as
        # object dtype (and newer pandas versions warn about concatenating
        # empty frames), so the first entry simply becomes the history and
        # keeps its numeric dtype.
        expense_history = entry
    else:
        expense_history = pd.concat([expense_history, entry], ignore_index=True)
def total_spent():
    """Return the sum of the ``amount`` column of ``expense_history``."""
    amounts = expense_history["amount"]
    return amounts.sum()

def spent_by_category():
    """Return total amount per category, largest total first.

    The result is a Series indexed by category name.
    """
    per_category = expense_history.groupby("category")["amount"].sum()
    return per_category.sort_values(ascending=False)

def highest_spending_category():
    """Return ``(category, total)`` for the category with the largest total.

    Returns ``(None, 0)`` when no per-category totals exist.
    """
    totals = spent_by_category()
    if totals.empty:
        return None, 0
    # Totals arrive sorted in descending order, so the first row is the maximum.
    return totals.index[0], totals.iloc[0]

def lowest_spending_category():
    """Return ``(category, total)`` for the category with the smallest total.

    Returns ``(None, 0)`` when no per-category totals exist.
    """
    totals = spent_by_category()
    if totals.empty:
        return None, 0
    # Totals arrive sorted in descending order, so the last row is the minimum.
    return totals.index[-1], totals.iloc[-1]

def average_spent():
    """Return the mean of the ``amount`` column of ``expense_history``."""
    amounts = expense_history["amount"]
    return amounts.mean()

def excessive_spends():
    """Return the rows of ``expense_history`` whose amount exceeds the mean."""
    threshold = average_spent()
    above_average = expense_history["amount"] > threshold
    return expense_history[above_average]

def savings_tips(spending_cap=5000):
    """Build a list of short savings suggestions from the recorded expenses.

    Args:
        spending_cap: Total spend above which the overspending tip is added.
            Defaults to 5000, the threshold that was previously hard-coded.

    Returns:
        A list of tip strings; empty when no tip applies.
    """
    tips = []
    # Only the category name is needed here; its total is not used.
    highest, _ = highest_spending_category()
    if highest is not None:
        tips.append(f"You spent the most on {highest}. Consider reducing spending here.")
    if total_spent() > spending_cap:
        tips.append("You're overspending. Try setting a monthly spending cap!")
    return tips
# Seed the expense log with a few sample entries.
for _description, _amount in (
    ("Bought groceries at supermarket", 500),
    ("Paid hostel rent for this month", 1500),
    ("Recharged mobile internet plan", 200),
    ("Bought a new hoodie from Amazon", 1200),
):
    add_expense(_description, _amount)

# Print each summary; every label is paired with the function that computes
# it, and the function is called immediately before its line is printed.
for _label, _summary in (
    ("Total Spent:", total_spent),
    ("Spent by Category:\n", spent_by_category),
    ("Highest Spending Category:", highest_spending_category),
    ("Lowest Spending Category:", lowest_spending_category),
    ("Excessive Spends:\n", excessive_spends),
    ("Savings Tips:", savings_tips),
):
    print(_label, _summary())
Total Spent: 3400
Spent by Category:
 category
Housing          1500
Food & Drinks    1200
Groceries         500
Utilities         200
Name: amount, dtype: object
Highest Spending Category: ('Housing', 1500)
Lowest Spending Category: ('Utilities', 200)
Excessive Spends:
                        description       category amount
1  Paid hostel rent for this month        Housing   1500
3  Bought a new hoodie from Amazon  Food & Drinks   1200
Savings Tips: ['You spent the most on Housing. Consider reducing spending here.']
import joblib

# Save the trained model
joblib.dump(rf_classifier, 'expenses_model2.pkl')

# Save the vectorizer
joblib.dump(vectorizer, 'vectorizers2.pkl')

# Save the label encoder as well: the model predicts integer codes, and
# without the fitted encoder a reloaded model's output cannot be mapped back
# to category names.
joblib.dump(label_encoder, 'label_encoder2.pkl')

print("Model and Vectorizer saved successfully.")
Model and Vectorizer saved successfully.
