In [11]:
import pandas as pd
import numpy as np
import glob
import os
import psutil
import time

start_time = 0


def log_resource_usage(scenario, timer_end):
    global start_time
    cpu_percent = psutil.cpu_percent(interval=1)
    ram_mb = psutil.virtual_memory().used / (1024 ** 2)
    print(f"{scenario}: CPU Usage: {cpu_percent}%")
    print(f"{scenario}: RAM Usage: {round(ram_mb, 2)} MB")
    if timer_end:
        print(
            f"{scenario}: Execution time: {round(time.time() - start_time, 2)} seconds")
        start_time = 0
    elif start_time == 0:
        start_time = time.time()
    return cpu_percent, ram_mb


all_files = glob.glob(os.path.join('../data/generated/', "generated_*.csv"))
print(all_files)

dataset = pd.DataFrame()
for idx, filename in enumerate(all_files):
    df = pd.read_csv(filename, index_col=None, header=0)
    dataset = pd.concat([dataset, df])

print(dataset.info())

['../data/generated/generated_data_2.csv', '../data/generated/generated_data_3.csv', '../data/generated/generated_data_1.csv']
<class 'pandas.core.frame.DataFrame'>
Index: 32404 entries, 0 to 10854
Data columns (total 20 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Date                              32404 non-null  object 
 1   Transaction Category              32404 non-null  object 
 2   Amount                            32404 non-null  float64
 3   Credit/Debit                      32404 non-null  object 
 4   Payment Method                    32404 non-null  object 
 5   Inflation Rate                    32404 non-null  float64
 6   Dependent Family Size             32404 non-null  int64  
 7   Age                               32404 non-null  int64  
 8   Months with Higher Spending       32404 non-null  object 
 9   Number of Expenses a Month        32404 non-null  int64  
 10  Most Fre

### Encode the data

In [12]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
import pandas as pd

# Assume df is your DataFrame
X = dataset.drop(['Budget', 'Date', 'Year-Month', 'Credit/Debit', 'Most Frequent Expense Categories',
                 'Months with Higher Spending', 'Number of Expenses a Month', 'Payment Method'], axis=1)  # Features
y = dataset['Budget']  # Labels

# Perform Z-score normalization
numeric_cols = ['Amount', 'Month', 'Year', 'Day', 'Last Month Budget',
                'Cumulative Monthly Spending', 'Average Monthly Budget']
for col in numeric_cols:
    X[col] = (X[col] - X[col].mean()) / X[col].std()

# If you want to normalize the label as well
mean_budget = y.mean()
std_budget = y.std()
y = (y - mean_budget) / std_budget
print(mean_budget)
print(X.info())
print(y.info())

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Initialize OneHotEncoder
onehot_encoder = OneHotEncoder(handle_unknown='ignore')

# Create ColumnTransformer to apply OneHotEncoding only to the 'Description' column
preprocessor = ColumnTransformer(
    transformers=[
        ('desc_onehot', onehot_encoder, ['Transaction Category'])
    ],
    remainder='passthrough'  # keep remaining columns as is
)

# Fit and transform the training data and transform testing data
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# X_train_onehot and X_test_onehot now contain the one-hot encoded 'Description' column along with other features.

207458.55789408716
<class 'pandas.core.frame.DataFrame'>
Index: 32404 entries, 0 to 10854
Data columns (total 12 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Transaction Category         32404 non-null  object 
 1   Amount                       32404 non-null  float64
 2   Inflation Rate               32404 non-null  float64
 3   Dependent Family Size        32404 non-null  int64  
 4   Age                          32404 non-null  int64  
 5   Estimated Monthly Expenses   32404 non-null  int64  
 6   Day                          32404 non-null  float64
 7   Month                        32404 non-null  float64
 8   Year                         32404 non-null  float64
 9   Cumulative Monthly Spending  32404 non-null  float64
 10  Last Month Budget            32404 non-null  float64
 11  Average Monthly Budget       32404 non-null  float64
dtypes: float64(8), int64(3), object(1)
memory usage: 3.2+ MB
Non

### Random Forest

In [13]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error


# Initialize the Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize Grid Search
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2, scoring='neg_mean_squared_error')

# Fit Grid Search to the data
grid_search.fit(X_train_transformed, y_train)

# Get the best parameters
best_params = grid_search.best_params_

# Train and evaluate the model with the best parameters
best_rf_model = RandomForestRegressor(**best_params, random_state=42)
log_resource_usage("Pre Training", False)
best_rf_model.fit(X_train_transformed, y_train)
log_resource_usage("Post Training", True)
y_pred = best_rf_model.predict(X_test_transformed)
mse = mean_squared_error(y_test, y_pred)

print(f'Best Parameters: {best_params}')
print(f'Mean Squared Error with Best Parameters: {mse}')


Fitting 3 folds for each of 108 candidates, totalling 324 fits


[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=   3.8s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   3.9s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   4.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=   4.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   4.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=   4.5s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   7.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   7.5s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   7.5s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators

### Test the model

In [14]:
new_data = pd.DataFrame({
    'Amount': [1800],
    'Transaction Category': ['transport'],
    'Year': [2019],
    'Month': [5],
    'Day': [15],
    'Age': [25],
    'Inflation Rate': [0.03],
    'Dependent Family Size': [3],
    'Last Month Budget': [197000],
    'Estimated Monthly Expenses': [200000],
    'Cumulative Monthly Spending': [56000],
    'Average Monthly Budget': [110000]
})


In [15]:

# Transform the new data point in the same way as the training data
new_data_transformed = preprocessor.transform(new_data)

predicted_budget = best_rf_model.predict(new_data_transformed)

print(f'Predicted budget: {predicted_budget[0]}')
actual_prediction = (predicted_budget * std_budget) + mean_budget

# Display the actual predicted Budget value
print(f'The actual predicted Budget value is: {np.round(actual_prediction, 2)}')

Predicted budget: 1.103618641688797
The actual predicted Budget value is: [253390.]
