In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Training pipeline: load the expenditure data, encode the categorical columns,
# split into train/test, scale the numeric features, fit a Random Forest and
# report its accuracy. The names `label_encoder`, `category_encoder`, `scaler`
# and `clf` are reused by later cells, so they must keep these names.

# Step 1: Load data
data = pd.read_csv("user_expenditure_data_six_months.csv")

# Step 2: Remove unnecessary columns
data = data.drop(['User_ID', 'Transaction_ID', 'Date', 'Total_Spent','Validate', 'Can_Spent'], axis=1)

# Step 3: Convert categorical variables to numeric
# A dedicated encoder is kept for 'Reason' so the same mapping can be applied at inference time.
label_encoder = LabelEncoder()
data['Reason'] = label_encoder.fit_transform(data['Reason'])

# Encode the target variable (kept separately so predictions can be decoded back to text labels)
category_encoder = LabelEncoder()
data['Category'] = category_encoder.fit_transform(data['Category'])

# Step 4: Feature and target split
X = data.drop('Category', axis=1)  # Features
y = data['Category']  # Target

# Step 5: Train-test split
# NOTE(review): the split is not stratified; with a very rare class the test set may
# hold only a couple of its rows. Consider `stratify=y` if every class has >= 2 samples.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Take explicit copies before assigning columns below: the split frames are derived from X,
# and writing into them directly can trigger pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Step 6: Normalize numerical columns (optional)
# The scaler is fitted on the training rows only and then applied to the test rows,
# so no test-set statistics leak into training.
numerical_columns = ['Income', 'Amount_Spent', 'Available_Amount']
scaler = StandardScaler()
X_train[numerical_columns] = scaler.fit_transform(X_train[numerical_columns])
X_test[numerical_columns] = scaler.transform(X_test[numerical_columns])

# Step 7: Train a classifier (Random Forest in this case)
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Step 8: Make predictions and evaluate
y_pred = clf.predict(X_test)

# Step 9: Print accuracy and classification report
print("Accuracy:", accuracy_score(y_test, y_pred))

# Pass the full range of encoded labels so classes missing from y_test / y_pred still get a row.
# The labels are derived from the encoder instead of being hard-coded, so they always line up
# with `target_names` regardless of how many categories the data contains.
class_labels = list(range(len(category_encoder.classes_)))
print(classification_report(y_test, y_pred, labels=class_labels, target_names=category_encoder.classes_, zero_division=0))

Accuracy: 0.9180327868852459
               precision    recall  f1-score   support

          Bad       0.88      0.91      0.90       141
         Good       0.94      0.93      0.94       223
Out of Budget       0.00      0.00      0.00         2

     accuracy                           0.92       366
    macro avg       0.61      0.61      0.61       366
 weighted avg       0.91      0.92      0.92       366



In [7]:
import joblib 

In [8]:
# Persist every fitted object needed at inference time: both label encoders,
# the scaler and the trained model. Keys are the target file names, values are
# the objects to serialize; insertion order determines the write order.
artifacts_by_filename = {
    'label_encoder_reason_class.pkl': label_encoder,      # LabelEncoder for 'Reason'
    'label_encoder_category_class.pkl': category_encoder,  # LabelEncoder for 'Category'
    'scaler.pkl': scaler,                                  # StandardScaler
    'random_forest_model.pkl': clf,                        # Random Forest model
}

for artifact_filename, fitted_object in artifacts_by_filename.items():
    written_files = joblib.dump(fitted_object, artifact_filename)

# Show the return value of the final dump as the cell output.
written_files

['random_forest_model.pkl']

In [9]:
import joblib
import pandas as pd

# Inference on a single hand-written sample: reload the saved artifacts, apply the same
# preprocessing used for training, predict, and decode the prediction to its text label.

# Step 1: Load the pickled files
# NOTE: joblib.load unpickles arbitrary objects; only load files produced by a trusted source.
label_encoder_reason = joblib.load('label_encoder_reason_class.pkl')
category_encoder = joblib.load('label_encoder_category_class.pkl')
scaler = joblib.load('scaler.pkl')
clf = joblib.load('random_forest_model.pkl')

# Step 2: Create a sample data input
sample_data = pd.DataFrame({
    'Income': [20000],  # Replace with appropriate values
    'Amount_Spent': [2500],
    'Available_Amount': [20000],  # Replace with appropriate values
    'Reason': ['Utilities']  # Replace with appropriate values
})

# Step 3: Preprocess the sample data
# Validate 'Reason' first: the encoder only knows the values seen during training, and
# an explicit message listing the allowed values is easier to act on than the encoder's own error.
known_reasons = list(label_encoder_reason.classes_)
unknown_reasons = (
    sample_data.loc[~sample_data['Reason'].isin(known_reasons), 'Reason'].unique().tolist()
)
if unknown_reasons:
    raise ValueError(
        f"Unknown 'Reason' value(s) {unknown_reasons}; expected one of {known_reasons}"
    )

# Encode the 'Reason' column
sample_data['Reason'] = label_encoder_reason.transform(sample_data['Reason'])

# Step 4: Retrieve the feature names used during training
# clf was trained on certain feature names and we need to retrieve them to ensure alignment
model_feature_names = clf.feature_names_in_

# Step 5: Reorder sample_data columns to match the order of feature names used during training
# (.copy() makes the reordered frame independent before columns are assigned below)
sample_data = sample_data[model_feature_names].copy()

# Step 6: Scale the numerical columns
# Prefer the column names recorded by the scaler when it was fitted on a DataFrame, so the
# columns and their order always match training; fall back to the known list otherwise.
numerical_columns = list(
    getattr(scaler, 'feature_names_in_', ['Income', 'Amount_Spent', 'Available_Amount'])
)
sample_data[numerical_columns] = scaler.transform(sample_data[numerical_columns])

# Step 7: Make predictions
predicted_category = clf.predict(sample_data)

# Step 8: Inverse transform the predicted category to the original labels
decoded_category = category_encoder.inverse_transform(predicted_category)

# Print the predicted category (the sample has a single row, so take the first element)
print("Predicted Category:", decoded_category[0])  # Output the predicted category

Predicted Category: Good
