In [9]:
import pandas as pd
import numpy as np
import glob
import os
import psutil
import GPUtil
import time

gpu = GPUtil.getGPUs()[0]
print(gpu.name)

start_time = 0


def log_resource_usage(scenario, timer_end):
    global start_time
    cpu_percent = psutil.cpu_percent(interval=1)
    ram_mb = psutil.virtual_memory().used / (1024 ** 2)
    gpu_usage = gpu.load * 100
    gpu_ram_usage = gpu.memoryUsed
    print(f"{scenario}: CPU Usage: {cpu_percent}%")
    print(f"{scenario}: RAM Usage: {round(ram_mb, 2)} MB")
    print(f"{scenario}: GPU Usage: {round(gpu_usage, 2)}%")
    print(f"{scenario}: GPU RAM Usage: {round(gpu_ram_usage, 2)} MB")
    if timer_end:
        print(
            f"{scenario}: Execution time: {round(time.time() - start_time, 2)} seconds")
        start_time = 0
    elif start_time == 0:
        start_time = time.time()
    return cpu_percent, ram_mb

all_files = glob.glob(os.path.join('../data/generated/' , "generated_*.csv"))
print(all_files)

dataset = pd.DataFrame()
for idx, filename in enumerate(all_files):
    df = pd.read_csv(filename, index_col=None, header=0)
    dataset = pd.concat([dataset, df])

print(dataset.info())



NVIDIA GeForce RTX 3080 Laptop GPU
['../data/generated/generated_data_2.csv', '../data/generated/generated_data_3.csv', '../data/generated/generated_data_1.csv']
<class 'pandas.core.frame.DataFrame'>
Index: 32404 entries, 0 to 10854
Data columns (total 20 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Date                              32404 non-null  object 
 1   Transaction Category              32404 non-null  object 
 2   Amount                            32404 non-null  float64
 3   Credit/Debit                      32404 non-null  object 
 4   Payment Method                    32404 non-null  object 
 5   Inflation Rate                    32404 non-null  float64
 6   Dependent Family Size             32404 non-null  int64  
 7   Age                               32404 non-null  int64  
 8   Months with Higher Spending       32404 non-null  object 
 9   Number of Expenses a Month        32

### Encode the data

In [10]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
import pandas as pd

# Assume df is your DataFrame
X = dataset.drop(['Budget', 'Date', 'Year-Month', 'Credit/Debit', 'Most Frequent Expense Categories',
                 'Months with Higher Spending', 'Number of Expenses a Month', 'Payment Method'], axis=1)  # Features
y = dataset['Budget']  # Labels

# Perform Z-score normalization
numeric_cols = ['Amount', 'Month', 'Year', 'Day', 'Last Month Budget',
                'Cumulative Monthly Spending', 'Average Monthly Budget']
for col in numeric_cols:
    X[col] = (X[col] - X[col].mean()) / X[col].std()

# If you want to normalize the label as well
mean_budget = y.mean()
std_budget = y.std()
y = (y - mean_budget) / std_budget
print(mean_budget)
print(X.info())
print(y.info())

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Initialize OneHotEncoder
onehot_encoder = OneHotEncoder(handle_unknown='ignore')

# Create ColumnTransformer to apply OneHotEncoding only to the 'Transaction Category' column
preprocessor = ColumnTransformer(
    transformers=[
        ('desc_onehot', onehot_encoder, ['Transaction Category'])
    ],
    remainder='passthrough'  # keep remaining columns as is
)

# Fit and transform the training data and transform testing data
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# X_train_onehot and X_test_onehot now contain the one-hot encoded 'Transaction Category' column along with other features.
X_train_reshaped = X_train_transformed.reshape(
    (X_train_transformed.shape[0], 1, X_train_transformed.shape[1]))
X_test_reshaped = X_test_transformed.reshape(
    (X_test_transformed.shape[0], 1, X_test_transformed.shape[1]))

207458.55789408716
<class 'pandas.core.frame.DataFrame'>
Index: 32404 entries, 0 to 10854
Data columns (total 12 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Transaction Category         32404 non-null  object 
 1   Amount                       32404 non-null  float64
 2   Inflation Rate               32404 non-null  float64
 3   Dependent Family Size        32404 non-null  int64  
 4   Age                          32404 non-null  int64  
 5   Estimated Monthly Expenses   32404 non-null  int64  
 6   Day                          32404 non-null  float64
 7   Month                        32404 non-null  float64
 8   Year                         32404 non-null  float64
 9   Cumulative Monthly Spending  32404 non-null  float64
 10  Last Month Budget            32404 non-null  float64
 11  Average Monthly Budget       32404 non-null  float64
dtypes: float64(8), int64(3), object(1)
memory usage: 3.2+ MB
Non

In [11]:
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, LSTM, Dropout
from tensorflow.keras.optimizers import Adam
from scikeras.wrappers import KerasRegressor
from sklearn.model_selection import RandomizedSearchCV

def create_model(learning_rate=0.01, units=50, dropout_rate=0.0):
    model = Sequential()
    model.add(LSTM(units, input_shape=(
        1, X_train_reshaped.shape[2]), activation='relu'))
    model.add(Dropout(dropout_rate))
    model.add(Dense(1))
    optimizer = Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='mse')
    return model

model_file = 'best_lstm_model'

# if not os.path.exists(model_file):
#     # Define hyperparameter grid
#     learning_rate = [0.001, 0.01, 0.1]
#     units = [30, 50, 70]
#     dropout_rate = [0.0, 0.2, 0.4]
#     batch_size = [10, 20, 30]
#     epochs = [50, 100]
#     param_dist = {
#         'learning_rate': learning_rate,
#         'units': units,
#         'dropout_rate': dropout_rate,
#         'batch_size': batch_size,
#         'epochs': epochs
#     }

#     # Wrap the Keras model with KerasRegressor
#     model = KerasRegressor(model=create_model, learning_rate=learning_rate, units=units,
#                         dropout_rate=dropout_rate, epochs=epochs, batch_size=batch_size, verbose=2)

#     # Use RandomizedSearchCV for hyperparameter tuning
#     random_search = RandomizedSearchCV(
#         estimator=model, param_distributions=param_dist, n_iter=10, cv=3)
#     random_search_result = random_search.fit(X_train_reshaped, y_train)


#     # Get the best parameters and best model from RandomizedSearchCV
#     best_params = random_search_result.best_params_
#     best_model = random_search_result.best_estimator_

#     # Save the best model in Keras format
#     best_model.model.save(model_file)

#     # Evaluate on test data
#     score = best_model.score(X_test_reshaped, y_test)
#     print(f'Test score: {score}')

best_params = {'units': 50, 'learning_rate': 0.01, 'epochs': 50, 'dropout_rate': 0.2, 'batch_size': 10}

if os.path.exists(model_file):
    best_model = load_model(model_file)
else:
    best_model = KerasRegressor(model=create_model, learning_rate=best_params['learning_rate'], units=best_params['units'],
                            dropout_rate=best_params['dropout_rate'], epochs=best_params['epochs'], batch_size=best_params['batch_size'], verbose=2)

log_resource_usage("Pre Training", False)
best_model.fit(X_train_reshaped, y_train)
log_resource_usage("Post Training", True)
best_model.model.save(model_file)



Pre Training: CPU Usage: 0.1%
Pre Training: RAM Usage: 11701.68 MB
Pre Training: GPU Usage: 16.0%
Pre Training: GPU RAM Usage: 15832.0 MB
Epoch 1/50
2593/2593 - 24s - loss: 62641.3555 - 24s/epoch - 9ms/step
Epoch 2/50


KeyboardInterrupt: 

### Test the model

In [None]:
new_data = pd.DataFrame({
    'Amount': [1800],
    'Transaction Category': ['transport'],
    'Year': [2019],
    'Month': [5],
    'Day': [15],
    'Age': [25],
    'Inflation Rate': [0.03],
    'Dependent Family Size': [3],
    'Last Month Budget': [197000],
    'Estimated Monthly Expenses': [200000],
    'Cumulative Monthly Spending': [56000],
    'Average Monthly Budget': [110000]
})


In [None]:

# Transform the new data point in the same way as the training data
new_data_transformed = preprocessor.transform(new_data)

# Perform prediction
predicted_budget = loaded_model.predict(new_data_transformed)

print(f'Predicted budget: {predicted_budget[0]}')
actual_prediction = (predicted_budget * std_budget) + mean_budget

# Display the actual predicted Budget value
print(f'The actual predicted Budget value is: {np.round(actual_prediction, 2)}')