In [23]:
import numpy as np
import pandas as pd
import os
import tensorflow as tf
from sklearn.model_selection import ParameterSampler
from sklearn.preprocessing import MinMaxScaler

data = pd.read_csv('../data/generated/generated_data_1.csv')

# Build the label lookup and the integer codes from the SAME categorical so
# that `categories[code]` always decodes back to the label that produced
# `code`. (Series.unique() returns labels in order of appearance, which does
# not have to match the ordering used by `.cat.codes`.)
category_series = data['Transaction Category'].astype('category')
# Kept as an (n_categories, 1) column array: later cells index it with
# `categories[int(code)]` and then take element [0].
categories = category_series.cat.categories.to_numpy().reshape(-1, 1)

# Columns the GAN is trained on and asked to generate.
columns = ['Amount', 'Day', 'Month', 'Year', 'Transaction Category']
data['Transaction Category'] = category_series.cat.codes

# Location where the best generator found by the random search is saved.
generator_path = 'best_generator_model'


def build_generator(input_dim, output_dim, lr):
    """Create and compile the generator network.

    Args:
        input_dim: Length of the noise vector fed to the network.
        output_dim: Number of values produced per generated sample.
        lr: Learning rate passed to the Adam optimizer.

    Returns:
        A compiled ``tf.keras.Sequential`` model with one 32-unit ReLU hidden
        layer followed by a linear output layer of ``output_dim`` units.
    """
    generator = tf.keras.Sequential()
    generator.add(tf.keras.layers.Dense(32, activation='relu', input_shape=(input_dim,)))
    generator.add(tf.keras.layers.Dense(output_dim, activation='linear'))
    generator.compile(
        loss='binary_crossentropy',
        optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
    )
    return generator

def build_discriminator(input_dim, lr):
    """Create and compile the discriminator network.

    Args:
        input_dim: Number of features in each sample to classify.
        lr: Learning rate passed to the Adam optimizer.

    Returns:
        A compiled ``tf.keras.Sequential`` model with one 32-unit ReLU hidden
        layer and a single sigmoid output unit, trained with binary
        cross-entropy.
    """
    discriminator = tf.keras.Sequential()
    discriminator.add(tf.keras.layers.Dense(32, activation='relu', input_shape=(input_dim,)))
    discriminator.add(tf.keras.layers.Dense(1, activation='sigmoid'))
    discriminator.compile(
        loss='binary_crossentropy',
        optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
    )
    return discriminator

def random_search(param_list):
    """Train one GAN per hyper-parameter set and keep the best generator on disk.

    For every entry of ``param_list`` a fresh generator/discriminator pair is
    built and trained for ``epochs`` steps of ``batch_size`` samples. Whenever
    a training step yields a generator loss lower than any seen so far (across
    all parameter sets), that generator is saved to ``generator_path``.

    Reads the module-level names ``columns``, ``scaled_data`` and
    ``generator_path``.

    Args:
        param_list: Iterable of dicts with the keys ``'learning_rate'``,
            ``'batch_size'`` and ``'epochs'``.

    Returns:
        The lowest generator loss observed (``np.inf`` if nothing was trained).
    """
    # Initialize variables
    input_dim = len(columns)  # Day, Month, Year, Amount, Transaction Category
    output_dim = len(columns)
    best_g_loss = np.inf

    # Random Search
    for params in param_list:
        lr, batch_size, epochs = params['learning_rate'], params['batch_size'], params['epochs']

        # Create models
        generator = build_generator(input_dim, output_dim, lr)
        discriminator = build_discriminator(input_dim, lr)

        # Create GAN model: the discriminator is marked non-trainable so that
        # updates through the combined model are meant to move only the generator.
        discriminator.trainable = False
        gan_input = tf.keras.Input(shape=(input_dim,))
        gan_output = discriminator(generator(gan_input))
        gan = tf.keras.Model(gan_input, gan_output)
        # The generator is only ever trained through `gan`, so the sampled
        # learning rate has to be applied here; the string 'adam' would
        # silently fall back to the optimizer's default rate.
        gan.compile(
            loss='binary_crossentropy',
            optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
        )

        # Training Loop
        for e in range(epochs):
            noise = np.random.normal(0, 1, (batch_size, input_dim))
            # verbose=0 keeps the notebook output free of one progress bar per step.
            generated_data = generator.predict(noise, verbose=0)
            real_data = scaled_data[np.random.randint(0, scaled_data.shape[0], batch_size)]
            labels_real = np.ones((batch_size, 1))
            labels_fake = np.zeros((batch_size, 1))

            # Train Discriminator
            d_loss_real = discriminator.train_on_batch(real_data, labels_real)
            d_loss_fake = discriminator.train_on_batch(generated_data, labels_fake)
            d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

            # Train Generator: fresh noise, labelled as "real" so the generator
            # is rewarded when the discriminator is fooled.
            noise = np.random.normal(0, 1, (batch_size, input_dim))
            labels_gan = np.ones((batch_size, 1))
            g_loss = gan.train_on_batch(noise, labels_gan)

            # Save Best Generator Model
            if g_loss < best_g_loss:
                best_g_loss = g_loss
                print('Epoch: {}, Discriminator Loss: {}, Generator Loss: {}'.format(e, d_loss, g_loss))
                generator.save(generator_path)

    return best_g_loss

# Fit a MinMax scaler on the modelled columns and scale the training data
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(data[columns])

# Run the random search only when no generator has been saved yet
if not os.path.exists(generator_path):
    # Hyperparameter grid to sample from
    param_grid = {'learning_rate': [0.001, 0.005], 'batch_size': [2, 5], 'epochs': [500, 1000]}
    param_list = list(ParameterSampler(param_grid, n_iter=5))
    random_search(param_list)

# Either way, the best generator is now on disk
best_generator = tf.keras.models.load_model(generator_path)

# Draw noise vectors and let the generator turn them into samples
num_samples = 100000
noise = np.random.normal(0, 1, (num_samples, len(columns)))

# Map the generator output back to the original scale of each column
generated_data = scaler.inverse_transform(best_generator.predict(noise))
generated_df = pd.DataFrame(generated_data, columns=columns)


print(generated_df)


            Amount        Day     Month         Year  Transaction Category
0       443.632385   4.031215  0.440219  2016.609009              1.365882
1       709.738708   8.056751  1.453709  2011.691650              3.742117
2      2125.222656  16.337690  8.041069  2014.680176              1.572266
3       -97.237236   7.197590  4.488111  2020.087280              2.097701
4       894.797668  13.067809  6.582041  2019.929199              3.511796
...            ...        ...       ...          ...                   ...
99995 -1515.889404  10.458283 -0.334390  2017.974243              1.542753
99996  1081.107422   3.948905  4.182172  2015.163696              1.977825
99997  2956.637695  19.968464  7.161098  2014.551025              3.009734
99998  2344.005371   1.288859  3.055319  2015.143921              0.609876
99999  1727.114136   6.232352  8.769530  2019.055542              2.498441

[100000 rows x 5 columns]


In [24]:
# Observed bounds of every modelled column in the original data
max_amount = data['Amount'].max()
min_amount = data['Amount'].min()
max_day = data['Day'].max()
min_day = data['Day'].min()
max_month = data['Month'].max()
min_month = data['Month'].min()
max_year = data['Year'].max()
min_year = data['Year'].min()
category_count = len(categories)

# Keep only generated rows whose values fall inside the original ranges.
# Category codes are truncated with int() further down and used as an index
# into `categories`, so the upper bound is exclusive (valid: 0 .. category_count - 1).
in_range = (
    generated_df['Amount'].between(min_amount, max_amount)
    & generated_df['Day'].between(min_day, max_day)
    & generated_df['Month'].between(min_month, max_month)
    & generated_df['Year'].between(min_year, max_year)
    & (generated_df['Transaction Category'] >= 0)
    & (generated_df['Transaction Category'] < category_count)
)
# .copy() so the column assignments below act on an independent frame
df_generated = generated_df[in_range].copy()

# Pick ONE original row and take every non-generated column from it, so the
# copied values are mutually consistent and the 'Number of Expenses a Month'
# column matches the value actually used when sampling rows per month.
profile_position = np.random.randint(len(data))
number_of_expenses_a_month = data['Number of Expenses a Month'].iloc[profile_position]

# Copy other columns from the original DataFrame to the generated DataFrame
# (list instead of set keeps the original column order deterministic)
other_columns = [col for col in data.columns if col not in df_generated.columns]

for col in other_columns:
    # Broadcast the chosen row's value to every generated row
    df_generated[col] = data[col].iloc[profile_position]

def round_to_multiple(number, multiple):
    """Return ``number`` rounded to the nearest multiple of ``multiple``.

    Rounding is delegated to the built-in ``round``.
    """
    steps = round(number / multiple)
    return multiple * steps

def get_total_spending_of_the_month_until_that_day(dataset, month, day):
    """Sum ``Amount`` over the rows of ``month`` dated on or before ``day``.

    Args:
        dataset: DataFrame with ``'Year-Month'``, ``'Day'`` and ``'Amount'`` columns.
        month: Value compared for equality against ``'Year-Month'``.
        day: Inclusive upper bound for ``'Day'``.

    Returns:
        The sum of ``'Amount'`` over the matching rows.
    """
    in_month = dataset['Year-Month'] == month
    on_or_before = dataset['Day'] <= day
    return dataset.loc[in_month & on_or_before, 'Amount'].sum()

def random_select(group, n=number_of_expenses_a_month):
    """Randomly pick at most ``n`` rows from ``group``.

    When ``group`` has fewer than ``n`` rows, all of them are returned
    (in sampled order).
    """
    sample_size = min(len(group), n)
    return group.sample(n=sample_size)


# The generator emits floats; truncate the date parts to whole numbers
df_generated['Day'] = df_generated['Day'].astype(int)
df_generated['Month'] = df_generated['Month'].astype(int)
df_generated['Year'] = df_generated['Year'].astype(int)

# Day and month are generated independently, so impossible combinations
# (e.g. 30 February) can occur. Coerce them to NaT and drop those rows
# instead of letting pd.to_datetime raise.
df_generated['Date'] = pd.to_datetime(df_generated[['Year', 'Month', 'Day']], errors='coerce')
df_generated = df_generated.dropna(subset=['Date'])
df_generated['Year-Month'] = df_generated['Year'].astype(str) + '-' + df_generated['Month'].astype(str).str.zfill(2)

# apply number_of_expenses_a_month to each month
df_generated = df_generated.groupby('Year-Month').apply(random_select)
df_generated.sort_values(by=['Year', 'Month', 'Day'], inplace=True)
df_generated.reset_index(drop=True, inplace=True)

# Decode the (float) category code back to its label; `categories` is an
# (n, 1) array, hence the trailing [0].
df_generated['Transaction Category'] = df_generated['Transaction Category'].apply(lambda x: categories[int(x)][0])
df_generated['Amount'] = df_generated['Amount'].apply(lambda x: float(round_to_multiple(x, 5)))
df_generated['Estimated Monthly Expenses'] = df_generated['Estimated Monthly Expenses'].astype(float)

# Total generated spending per month, used as that month's budget
monthly_budget = df_generated.groupby(['Year-Month'])['Amount'].sum().reset_index()
monthly_totals = monthly_budget.set_index('Year-Month')['Amount']

df_generated['Cumulative Monthly Spending'] = df_generated.apply(lambda x: get_total_spending_of_the_month_until_that_day(df_generated, x['Year-Month'], x['Day']), axis=1)
df_generated['Budget'] = df_generated['Year-Month'].map(monthly_totals)
# shift(1) pairs each month with the total of the preceding month present in the data
df_generated['Last Month Budget'] = df_generated['Year-Month'].map(monthly_totals.shift(1))
# use this month's budget if last month's budget is not available
df_generated['Last Month Budget'] = df_generated['Last Month Budget'].fillna(df_generated['Budget'])
df_generated['Average Monthly Budget'] = monthly_budget['Amount'].mean().round(2)

df_generated


# df_generated

Unnamed: 0,Amount,Day,Month,Year,Transaction Category,Inflation Rate,Year-Month,Credit/Debit,Age,Budget,Dependent Family Size,Most Frequent Expense Categories,Months with Higher Spending,Number of Expenses a Month,Payment Method,Estimated Monthly Expenses,Date,Cumulative Monthly Spending,Last Month Budget,Average Monthly Budget
0,765.0,2,1,2013,bills,1.4,2013-01,debit,31,122575.0,0,"['transport', 'food']","[1, 3, 12]",90,cash,100000.0,2013-01-02,8275.0,122575.0,156476.57
1,3055.0,2,1,2013,bills,1.4,2013-01,debit,31,122575.0,0,"['transport', 'food']","[1, 3, 12]",90,cash,100000.0,2013-01-02,8275.0,122575.0,156476.57
2,1175.0,2,1,2013,bills,1.4,2013-01,debit,31,122575.0,0,"['transport', 'food']","[1, 3, 12]",90,cash,100000.0,2013-01-02,8275.0,122575.0,156476.57
3,1035.0,2,1,2013,bills,1.4,2013-01,debit,31,122575.0,0,"['transport', 'food']","[1, 3, 12]",90,cash,100000.0,2013-01-02,8275.0,122575.0,156476.57
4,430.0,2,1,2013,bills,1.4,2013-01,debit,31,122575.0,0,"['transport', 'food']","[1, 3, 12]",90,cash,100000.0,2013-01-02,8275.0,122575.0,156476.57
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8828,3665.0,24,11,2021,transport,1.4,2021-11,debit,31,118335.0,0,"['transport', 'food']","[1, 3, 12]",90,cash,100000.0,2021-11-24,112065.0,122070.0,156476.57
8829,2185.0,25,11,2021,transport,1.4,2021-11,debit,31,118335.0,0,"['transport', 'food']","[1, 3, 12]",90,cash,100000.0,2021-11-25,116280.0,122070.0,156476.57
8830,2030.0,25,11,2021,transport,1.4,2021-11,debit,31,118335.0,0,"['transport', 'food']","[1, 3, 12]",90,cash,100000.0,2021-11-25,116280.0,122070.0,156476.57
8831,1605.0,26,11,2021,others,1.4,2021-11,debit,31,118335.0,0,"['transport', 'food']","[1, 3, 12]",90,cash,100000.0,2021-11-26,117885.0,122070.0,156476.57


In [25]:
import time

# Write the synthetic data to a new file named after the current Unix timestamp
output_path = f'../data/generated/synthetic_data_{time.time()}.csv'
df_generated.to_csv(output_path, index=False)
