In [4]:
import pandas as pd
import numpy as np
import glob
import os
import psutil
import GPUtil
import time

gpu = GPUtil.getGPUs()[0]
print(gpu.name)

start_time = 0


def log_resource_usage(scenario, timer_end):
    global start_time
    cpu_percent = psutil.cpu_percent(interval=1)
    ram_mb = psutil.virtual_memory().used / (1024 ** 2)
    gpu_usage = gpu.load * 100
    gpu_ram_usage = gpu.memoryUsed
    print(f"{scenario}: CPU Usage: {cpu_percent}%")
    print(f"{scenario}: RAM Usage: {round(ram_mb, 2)} MB")
    print(f"{scenario}: GPU Usage: {round(gpu_usage, 2)}%")
    print(f"{scenario}: GPU RAM Usage: {round(gpu_ram_usage, 2)} MB")
    if timer_end:
        print(
            f"{scenario}: Execution time: {round(time.time() - start_time, 2)} seconds")
        start_time = 0
    elif start_time == 0:
        start_time = time.time()
    return cpu_percent, ram_mb

all_files = glob.glob(os.path.join('../data/generated/' , "generated_*.csv"))
print(all_files)

dataset = pd.DataFrame()
for idx, filename in enumerate(all_files):
    df = pd.read_csv(filename, index_col=None, header=0)
    dataset = pd.concat([dataset, df])

print(dataset.info())



NVIDIA GeForce RTX 3080 Laptop GPU
['../data/generated/generated_data_2.csv', '../data/generated/generated_data_3.csv', '../data/generated/generated_data_1.csv']
<class 'pandas.core.frame.DataFrame'>
Index: 32404 entries, 0 to 10854
Data columns (total 20 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Date                              32404 non-null  object 
 1   Transaction Category              32404 non-null  object 
 2   Amount                            32404 non-null  float64
 3   Credit/Debit                      32404 non-null  object 
 4   Payment Method                    32404 non-null  object 
 5   Inflation Rate                    32404 non-null  float64
 6   Dependent Family Size             32404 non-null  int64  
 7   Age                               32404 non-null  int64  
 8   Months with Higher Spending       32404 non-null  object 
 9   Number of Expenses a Month        32

### Encode the data

In [5]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
import pandas as pd

# Assume df is your DataFrame
X = dataset.drop(['Budget', 'Date', 'Year-Month', 'Credit/Debit', 'Most Frequent Expense Categories',
                 'Months with Higher Spending', 'Number of Expenses a Month', 'Payment Method'], axis=1)  # Features
y = dataset['Budget']  # Labels

# Perform Z-score normalization
numeric_cols = ['Amount', 'Month', 'Year', 'Day', 'Last Month Budget',
                'Cumulative Monthly Spending', 'Average Monthly Budget']
for col in numeric_cols:
    X[col] = (X[col] - X[col].mean()) / X[col].std()

# If you want to normalize the label as well
mean_budget = y.mean()
std_budget = y.std()
y = (y - mean_budget) / std_budget
print(mean_budget)
print(X.info())
print(y.info())

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Initialize OneHotEncoder
onehot_encoder = OneHotEncoder(handle_unknown='ignore')

# Create ColumnTransformer to apply OneHotEncoding only to the 'Description' column
preprocessor = ColumnTransformer(
    transformers=[
        ('desc_onehot', onehot_encoder, ['Transaction Category'])
    ],
    remainder='passthrough'  # keep remaining columns as is
)

# Fit and transform the training data and transform testing data
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# X_train_onehot and X_test_onehot now contain the one-hot encoded 'Description' column along with other features.

207458.55789408716
<class 'pandas.core.frame.DataFrame'>
Index: 32404 entries, 0 to 10854
Data columns (total 12 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Transaction Category         32404 non-null  object 
 1   Amount                       32404 non-null  float64
 2   Inflation Rate               32404 non-null  float64
 3   Dependent Family Size        32404 non-null  int64  
 4   Age                          32404 non-null  int64  
 5   Estimated Monthly Expenses   32404 non-null  int64  
 6   Day                          32404 non-null  float64
 7   Month                        32404 non-null  float64
 8   Year                         32404 non-null  float64
 9   Cumulative Monthly Spending  32404 non-null  float64
 10  Last Month Budget            32404 non-null  float64
 11  Average Monthly Budget       32404 non-null  float64
dtypes: float64(8), int64(3), object(1)
memory usage: 3.2+ MB
Non

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


### Neural Network

In [17]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.regularizers import l1, l2, l1_l2
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from keras_tuner.tuners import RandomSearch

# Assume preprocessed X_train_transformed, X_test_transformed, y_train, y_test

# Standardize the data
scaler_x = StandardScaler().fit(X_train_transformed)
scaler_y = StandardScaler().fit(y_train.values.reshape(-1, 1))

X_train_scaled = scaler_x.transform(X_train_transformed)
X_test_scaled = scaler_x.transform(X_test_transformed)
y_train_scaled = scaler_y.transform(y_train.values.reshape(-1, 1))
y_test_scaled = scaler_y.transform(y_test.values.reshape(-1, 1))

print((X_train_scaled.shape[1],))

# Define the hypermodel
def build_hypermodel(hp):
    model = Sequential()
    model.add(Dense(units=hp.Int('input_units', min_value=32, max_value=256, step=32),
                    activation='relu',
                    input_shape=(X_train_scaled.shape[1],),
                    
                    ))
    # model.add(Dropout(rate=hp.Float('input_dropout', min_value=0.1, max_value=0.5, step=0.1)))

    # Additional hidden layer with regularization
    # model.add(Dense(units=hp.Int('hidden_units', min_value=32, max_value=128, step=32),
    #                 activation='relu',
    #                 kernel_regularizer=l1_l2(l1=hp.Float('l1_value', min_value=1e-5, max_value=1e-2, sampling='LOG'),
    #                                          l2=hp.Float('l2_value', min_value=1e-5, max_value=1e-2, sampling='LOG'))
    #                 ))
    # model.add(Dropout(rate=hp.Float('hidden_dropout', min_value=0.1, max_value=0.5, step=0.1)))

    # Output layer
    model.add(Dense(1))

    model.compile(optimizer='adam',
                  loss='mean_squared_error')
    
    return model


# Initialize the tuner
tuner = RandomSearch(
    build_hypermodel,
    objective='val_loss',
    max_trials=10,
    executions_per_trial=5
)

# Perform hyperparameter search
tuner.search(X_train_scaled, y_train_scaled, epochs=50, validation_split=0.2)

# Get the optimal hyperparameters
best_hp = tuner.get_best_hyperparameters()[0]

# Build the model with the optimal hyperparameters and train it on the data
best_model = tuner.hypermodel.build(best_hp)
log_resource_usage("Pre Training", False)
best_model.fit(X_train_scaled, y_train_scaled, epochs=50, batch_size=32)
log_resource_usage("Post Training", True)

# Evaluate the model
y_pred_scaled = best_model.predict(X_test_scaled)
mse = mean_squared_error(y_test_scaled, y_pred_scaled)

print(f'Best Hyperparameters: {best_hp.values}')
print(f'Mean Squared Error with Best Hyperparameters: {mse}')

(17,)
INFO:tensorflow:Reloading Tuner from ./untitled_project/tuner0.json
INFO:tensorflow:Oracle triggered exit


Pre Training: CPU Usage: 0.6%
Pre Training: RAM Usage: 4136.39 MB
Pre Training: GPU Usage: 0.0%
Pre Training: GPU RAM Usage: 708.0 MB
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Post Training: CPU Usage: 2.7%
Post Training: RAM Usage: 3972.45 MB
Post Training: GPU Usage: 0.0%
Post Training: GPU RAM Usage: 708.0 MB
Post Training: Execution time: 42.58 seconds
Best Hyperparameters: {'input_units': 256}
Mean Squared Error with Best Hyperparameters: 0.0

### Test the model

In [7]:
new_data = pd.DataFrame({
    'Amount': [1800],
    'Transaction Category': ['transport'],
    'Year': [2019],
    'Month': [5],
    'Day': [15],
    'Age': [25],
    'Inflation Rate': [0.03],
    'Dependent Family Size': [3],
    'Last Month Budget': [197000],
    'Estimated Monthly Expenses': [200000],
    'Cumulative Monthly Spending': [56000],
    'Average Monthly Budget': [110000]
})


In [8]:

# Transform the new data point in the same way as the training data
new_data_transformed = preprocessor.transform(new_data)
new_data_scaled = scaler_x.transform(new_data_transformed)  # Scale the features

# Predict the Budget using the trained neural network model
predicted_budget_scaled = best_model.predict(new_data_scaled)

# Inverse scale the predicted Budget
predicted_budget = scaler_y.inverse_transform(predicted_budget_scaled)

print(predicted_budget)

print(f'Predicted budget: {round(predicted_budget[0][0], 2)}')


[[188874.7]]
Predicted budget: 188874.703125


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
