In [1]:
import pandas as pd

In [2]:
# Load the raw six-month expenditure records; later cells prune and encode this frame.
df = pd.read_csv("user_expenditure_data_six_months.csv")

In [3]:
# Remove the columns that are not used further on (identifiers, date and
# amount columns). Note: running this cell twice raises a KeyError because
# the columns are already gone after the first run.
df = df.drop(
    columns=[
        "User_ID",
        "Transaction_ID",
        "Validate",
        "Date",
        "Total_Spent",
        "Amount_Spent",
    ]
)

In [4]:
from sklearn.preprocessing import LabelEncoder

# 'Reason' and 'Category' hold string labels; each gets its own LabelEncoder
# so the label <-> integer mapping can be reused later on.
# Note: this cell overwrites both columns in place, so re-running it would
# refit the encoders on the integer codes instead of the original strings.
label_encoder_reason_reg = LabelEncoder()
label_encoder_category_reg = LabelEncoder()

# Replace the string labels with their integer codes.
df['Reason'] = label_encoder_reason_reg.fit_transform(df['Reason'])
df['Category'] = label_encoder_category_reg.fit_transform(df['Category'])

# Print the label -> code mapping learned by each encoder.
for heading, fitted_encoder in (
    ("Reason Classes:", label_encoder_reason_reg),
    ("\nCategory Classes:", label_encoder_category_reg),
):
    print(heading)
    for code, label in enumerate(fitted_encoder.classes_):
        print(f"{label}: {code}")


Reason Classes:
Clothing: 0
Dining Out: 1
Education: 2
Entertainment: 3
Groceries: 4
Health: 5
Mall: 6
Travel: 7
Utilities: 8

Category Classes:
Bad: 0
Good: 1
Out of Budget: 2


In [5]:
# Preview the first rows of df to inspect the frame after the column drop
# and the label encoding of 'Reason' and 'Category'.
df.head()

Unnamed: 0,Income,Reason,Category,Available_Amount,Can_Spent
0,18630,8,1,16136,3227
1,18630,1,1,15628,781
2,18630,2,1,14011,2101
3,18630,4,0,10771,3231
4,18630,2,0,8770,1315


In [6]:
# NOTE(review): this repeats the df.head() preview of the previous cell with
# no change to df in between; the cell is redundant and can be removed.
df.head()

Unnamed: 0,Income,Reason,Category,Available_Amount,Can_Spent
0,18630,8,1,16136,3227
1,18630,1,1,15628,781
2,18630,2,1,14011,2101
3,18630,4,0,10771,3231
4,18630,2,0,8770,1315


In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder

# Define X (features) and y (target). Only the target column is dropped here;
# every other column still present in df is used as a feature.
X = df.drop(columns=['Can_Spent'])
y = df['Can_Spent']

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the RandomForestRegressor (seeded so results are reproducible)
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

# Build the side-by-side comparison in a NEW frame instead of adding columns
# to X_test: X_test must keep exactly the feature columns the model was
# fitted on, otherwise a later model.predict(X_test) would fail on the extra
# columns. Predictions are rounded to the nearest integer before the cast,
# because a bare astype(int) truncates toward zero and would bias the
# displayed values downward.
comparison = X_test.assign(
    Predicted_Can_Spent=y_pred.round().astype(int),
    True_Can_Spent=y_test.values,
)

comparison.head(50)


Mean Squared Error: 469.94539016393463
R-squared: 0.9997461167170167


Unnamed: 0,Income,Reason,Category,Available_Amount,Predicted_Can_Spent,True_Can_Spent
247,20160,1,1,14802,744,740
1293,28050,4,0,3225,911,967
1563,28159,2,1,20963,3153,3144
1101,17989,7,1,13509,542,540
1161,24629,5,1,4616,927,923
382,15835,8,1,9267,1852,1853
1197,24629,6,1,9121,452,456
777,15881,3,1,8033,804,803
643,25359,4,1,12640,3796,3792
275,20160,7,0,4476,187,179


In [8]:
import joblib 

# Persist the two fitted label encoders so that new inputs can be encoded
# with the same label -> integer mapping that was used for training.
for fitted_object, target_path in (
    (label_encoder_reason_reg, 'label_encoder_reason_reg.pkl'),
    (label_encoder_category_reg, 'label_encoder_category_reg.pkl'),
):
    joblib.dump(fitted_object, target_path)

# Persist the trained Random Forest Regressor; as the last expression of the
# cell, the list of written file names returned by joblib.dump is displayed.
joblib.dump(model, 'random_forest_regressor_model.pkl')

['random_forest_regressor_model.pkl']

In [11]:
import joblib
import pandas as pd

# Step 1: Load the pickled files written by the save cell.
# NOTE: joblib.load unpickles arbitrary Python objects; only load files that
# come from a trusted source (here: the files this notebook wrote itself).
label_encoder_reason = joblib.load('label_encoder_reason_reg.pkl')  # LabelEncoder for 'Reason'
label_encoder_category = joblib.load('label_encoder_category_reg.pkl')  # LabelEncoder for 'Category'
model = joblib.load('random_forest_regressor_model.pkl')  # Random Forest Regressor model

# Step 2: Create a sample data input (new data for prediction)
new_data = pd.DataFrame({
    'Income': [20000],  # Replace with appropriate values
    'Available_Amount': [18000],  # Replace with appropriate values
    'Reason': ['Entertainment'],  # Must be a label the 'Reason' encoder was fitted on
    'Category': ['Good']  # Must be a label the 'Category' encoder was fitted on
})

# Step 3: Preprocess the sample data
# Encode 'Reason' and 'Category' with the same encoders that were fitted on
# the training data, so the integer codes match what the model was trained on.
new_data['Reason'] = label_encoder_reason.transform(new_data['Reason'])
new_data['Category'] = label_encoder_category.transform(new_data['Category'])

# Step 4: Retrieve the feature names used during training
model_feature_names = model.feature_names_in_

# Print model feature names and new_data columns for debugging
print("Model Feature Names:", model_feature_names)
print("New Data Columns:", new_data.columns)

# Step 5: Reorder new_data columns to match the order of feature names used
# during training. Fail fast with a descriptive error when a required column
# is absent: continuing to model.predict with an incomplete or mis-ordered
# frame would only surface a less helpful error further down.
missing_columns = [name for name in model_feature_names if name not in new_data.columns]
if missing_columns:
    raise KeyError(
        f"new_data is missing feature column(s) required by the model: {missing_columns}"
    )
new_data = new_data[model_feature_names]

# Step 6: Make predictions (no scaling needed)
predicted_can_spent = model.predict(new_data)

# Print the predicted value
print("Predicted Can_Spent:", predicted_can_spent[0])  # Output the predicted value


Model Feature Names: ['Income' 'Reason' 'Category' 'Available_Amount']
New Data Columns: Index(['Income', 'Available_Amount', 'Reason', 'Category'], dtype='object')
Predicted Can_Spent: 1800.44
