In [1]:
import joblib
import pandas as pd

# Step 1: Load the pickled model and label encoders
label_encoder_reason = joblib.load('label_encoder_reason_reg.pkl')  # Load the LabelEncoder for 'Reason'
label_encoder_category = joblib.load('label_encoder_category_reg.pkl')  # Load the LabelEncoder for 'Category'
model = joblib.load('random_forest_regressor_model.pkl')  # Load the Random Forest Regressor model

# Step 2: Load the user expenditure data
data = pd.read_csv('user_expenditure_data_six_months.csv')

# Step 3: Encode the 'Reason' and 'Category' columns using the saved encoders
data['Reason'] = label_encoder_reason.transform(data['Reason'])
data['Category'] = label_encoder_category.transform(data['Category'])

# Step 4: Select only the columns required by the model (without 'Amount_Spent' if it wasn't used during training)
features = ['Income', 'Reason', 'Category', 'Available_Amount']  # Match training features exactly
input_data = data[features]  # Extract the relevant features for prediction

# Step 5: Make predictions using the model
data['can_spent_predicted'] = model.predict(input_data)

# Step 6: Decode the 'Reason' column back to its original values
data['Reason'] = label_encoder_reason.inverse_transform(data['Reason'])

# Step 7: Save the results with the original columns + predicted column to a new CSV
data.to_csv('user_expenditure_six_months_data_with_predictions.csv', index=False)

print("Predictions saved to 'user_expenditure_six_months_data_with_predictions.csv'")


Predictions saved to 'user_expenditure_six_months_data_with_predictions.csv'


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Step 1: Load data
data = pd.read_csv("user_expenditure_six_months_data_with_predictions.csv")
data.head()

Unnamed: 0,User_ID,Income,Transaction_ID,Reason,Amount_Spent,Date,Category,Total_Spent,Available_Amount,Can_Spent,Validate,can_spent_predicted
0,1,19278,1,Education,409,2024-04-12 02:01:15.637632,1,409,18869,2830,True,2826.11
1,1,19278,2,Travel,102,2024-04-15 02:01:15.637632,1,511,18767,750,True,754.68
2,1,19278,3,Entertainment,507,2024-04-16 02:01:15.637632,1,1018,18260,1826,True,1825.61
3,1,19278,4,Travel,383,2024-04-18 02:01:15.637632,1,1401,17877,715,True,719.13
4,1,19278,5,Travel,376,2024-04-22 02:01:15.637632,1,1777,17501,700,True,699.54


In [3]:
import pandas as pd

# Step 2: Check if Amount_Spent <= can_spent_predicted
data['Category'] = data.apply(lambda row: 'Good' if row['Amount_Spent'] <= row['can_spent_predicted'] else 'Bad', axis=1)

# Step 3: Save the modified DataFrame back to a CSV
data.to_csv("user_expenditure_data_six_months_with_category_update.csv", index=False)

# Optional: Print out the first few rows to verify changes
data.head()

Unnamed: 0,User_ID,Income,Transaction_ID,Reason,Amount_Spent,Date,Category,Total_Spent,Available_Amount,Can_Spent,Validate,can_spent_predicted
0,1,19278,1,Education,409,2024-04-12 02:01:15.637632,Good,409,18869,2830,True,2826.11
1,1,19278,2,Travel,102,2024-04-15 02:01:15.637632,Good,511,18767,750,True,754.68
2,1,19278,3,Entertainment,507,2024-04-16 02:01:15.637632,Good,1018,18260,1826,True,1825.61
3,1,19278,4,Travel,383,2024-04-18 02:01:15.637632,Good,1401,17877,715,True,719.13
4,1,19278,5,Travel,376,2024-04-22 02:01:15.637632,Good,1777,17501,700,True,699.54


In [4]:
data['Category'].value_counts()

Category
Good    2201
Bad     1816
Name: count, dtype: int64

In [5]:
data.head()

Unnamed: 0,User_ID,Income,Transaction_ID,Reason,Amount_Spent,Date,Category,Total_Spent,Available_Amount,Can_Spent,Validate,can_spent_predicted
0,1,19278,1,Education,409,2024-04-12 02:01:15.637632,Good,409,18869,2830,True,2826.11
1,1,19278,2,Travel,102,2024-04-15 02:01:15.637632,Good,511,18767,750,True,754.68
2,1,19278,3,Entertainment,507,2024-04-16 02:01:15.637632,Good,1018,18260,1826,True,1825.61
3,1,19278,4,Travel,383,2024-04-18 02:01:15.637632,Good,1401,17877,715,True,719.13
4,1,19278,5,Travel,376,2024-04-22 02:01:15.637632,Good,1777,17501,700,True,699.54


In [6]:
# Step 2: Remove unnecessary columns
data = data.drop(['User_ID', 'Transaction_ID', 'Date', 'Total_Spent','Validate', 'Can_Spent','can_spent_predicted'], axis=1)

In [7]:
data.head()

Unnamed: 0,Income,Reason,Amount_Spent,Category,Available_Amount
0,19278,Education,409,Good,18869
1,19278,Travel,102,Good,18767
2,19278,Entertainment,507,Good,18260
3,19278,Travel,383,Good,17877
4,19278,Travel,376,Good,17501


In [8]:
data["Category"].value_counts()

Category
Good    2201
Bad     1816
Name: count, dtype: int64

In [9]:
# Step 3: Convert categorical variables to numeric
label_encoder = LabelEncoder()
data['Reason'] = label_encoder.fit_transform(data['Reason'])

# Encode the target variable
category_encoder = LabelEncoder()
data['Category'] = category_encoder.fit_transform(data['Category'])

# Step 4: Feature and target split
X = data.drop('Category', axis=1)  # Features
y = data['Category']  # Target

# Step 5: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 6: Normalize numerical columns (optional)
scaler = StandardScaler()
X_train[['Income', 'Amount_Spent','Available_Amount']] = scaler.fit_transform(X_train[['Income', 'Amount_Spent','Available_Amount']])
X_test[['Income', 'Amount_Spent','Available_Amount']] = scaler.transform(X_test[['Income', 'Amount_Spent','Available_Amount']])

# Step 7: Train a classifier (Random Forest in this case)
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Step 8: Make predictions and evaluate
y_pred = clf.predict(X_test)

# Step 9: Print accuracy and classification report
print("Accuracy:", accuracy_score(y_test, y_pred))

# Specify the full range of labels to handle the missing class issue
print(classification_report(y_test, y_pred, labels=[0, 1, 2], target_names=category_encoder.classes_, zero_division=0))

Accuracy: 0.9614427860696517
              precision    recall  f1-score   support

         Bad       0.96      0.96      0.96       378
        Good       0.96      0.96      0.96       426

    accuracy                           0.96       804
   macro avg       0.64      0.64      0.64       804
weighted avg       0.96      0.96      0.96       804





In [10]:
import joblib

In [11]:
# Save the encoders, scaler, and model
joblib.dump(label_encoder, 'label_encoder_reason_class.pkl')  # Save LabelEncoder for 'Reason'
joblib.dump(category_encoder, 'label_encoder_category_class.pkl')  # Save LabelEncoder for 'Category'
joblib.dump(scaler, 'scaler.pkl')  # Save StandardScaler
joblib.dump(clf, 'random_forest_model.pkl')  # Save Random Forest model

['random_forest_model.pkl']

In [64]:
import joblib
import pandas as pd

# Step 1: Load the pickled files
label_encoder_reason = joblib.load('label_encoder_reason_class.pkl')
category_encoder = joblib.load('label_encoder_category_class.pkl')
scaler = joblib.load('scaler.pkl')
clf = joblib.load('random_forest_model.pkl')

# Step 2: Create a sample data input
sample_data = pd.DataFrame({
    'Income': [25000],  # Replace with appropriate values
    'Amount_Spent': [2350],
    'Available_Amount': [25000],  # Replace with appropriate values
    'Reason': ['Entertainment']  # Replace with appropriate values
})

# Step 3: Preprocess the sample data
# Encode the 'Reason' column
sample_data['Reason'] = label_encoder_reason.transform(sample_data['Reason'])

# Step 4: Retrieve the feature names used during training
# clf was trained on certain feature names and we need to retrieve them to ensure alignment
model_feature_names = clf.feature_names_in_

# Step 5: Reorder sample_data columns to match the order of feature names used during training
sample_data = sample_data[model_feature_names]

# Step 6: Scale the numerical columns (Income, Amount_Spent, Available_Amount)
numerical_columns = ['Income', 'Amount_Spent', 'Available_Amount']
sample_data[numerical_columns] = scaler.transform(sample_data[numerical_columns])

# Step 7: Make predictions
predicted_category = clf.predict(sample_data)

# Step 8: Inverse transform the predicted category to the original labels
decoded_category = category_encoder.inverse_transform(predicted_category)

# Print the predicted category
print("Predicted Category:", decoded_category[0])  # Output the predicted category

Predicted Category: Good
