In [None]:
import pandas as pd
import numpy as np
import os
import random
from tensorflow.keras.optimizers import Adam
import tensorflow as tf
from sklearn.metrics import *
from keras.models import Model
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Activation, BatchNormalization, Dropout
from tensorflow.keras.layers import LeakyReLU, ELU, PReLU, ReLU
from tensorflow.keras.metrics import * 
from sklearn.preprocessing import *
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import *
from keras.callbacks import EarlyStopping
from utils import *
import warnings 
warnings.filterwarnings('ignore')

# --- Reproducibility and runtime configuration --------------------------------
# One seed is shared by Python's `random`, NumPy and TensorFlow.
seed = 42

# NOTE: changing PYTHONHASHSEED at runtime does not re-seed string hashing in the
# interpreter that is already running; it only reaches processes started later.
os.environ["PYTHONHASHSEED"] = str(seed)

# Ask TensorFlow / cuDNN for deterministic kernels.
os.environ['TF_DETERMINISTIC_OPS'] = '1'
os.environ['TF_CUDNN_DETERMINISTIC'] = '1'
# The key previously carried a trailing space ('TF_CUDNN_USE_FRONTEND '), which
# defines a differently named variable that nothing reads.
os.environ['TF_CUDNN_USE_FRONTEND'] = '1'

# Restrict CUDA to the device with index 1 and silence TF C++ INFO/WARNING logs.
# NOTE(review): these are set after `import tensorflow`; confirm they are still
# picked up in the TensorFlow version in use.
os.environ["CUDA_VISIBLE_DEVICES"] = '1'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)

In [None]:
def MLP_regression(X_train, hidden_units=(256, 64, 256, 64, 16), dropout_rate=0.25, learning_rate=0.001):
    """Build and compile a fully connected regression network.

    Each hidden stage is Dense -> BatchNormalization -> LeakyReLU -> Dropout.
    The head is a single ReLU unit, so predictions are never negative.

    Parameters
    ----------
    X_train : pandas.DataFrame, 2-D array-like, int or float
        Either the training features (the input width is taken from the number
        of columns) or the number of input features itself.
    hidden_units : sequence of int, optional
        Width of each hidden Dense layer, in order. The default reproduces the
        original 256-64-256-64-16 stack.
    dropout_rate : float, optional
        Dropout rate applied after every hidden stage.
    learning_rate : float, optional
        Learning rate for the Adam optimizer.

    Returns
    -------
    Model
        Keras model compiled with MSE loss and an MAE metric.

    Raises
    ------
    TypeError
        If the number of input features cannot be derived from ``X_train``.
    ValueError
        If the derived number of features is not a positive whole number.
    """
    # Resolve the input width up front so that unsupported inputs fail with a
    # clear message instead of an UnboundLocalError further down.
    if isinstance(X_train, pd.DataFrame):
        num = len(X_train.columns)
    elif isinstance(X_train, (int, float, np.integer, np.floating)):
        if not float(X_train).is_integer():
            raise ValueError(f"Number of input features must be a whole number, got {X_train!r}.")
        num = int(X_train)  # Input(shape=...) needs an int, not a float
    elif len(getattr(X_train, 'shape', ())) == 2:
        num = int(X_train.shape[1])  # e.g. a 2-D NumPy array
    else:
        raise TypeError(
            "X_train must be a DataFrame, a 2-D array-like or a feature count, "
            f"got {type(X_train).__name__}."
        )

    if num < 1:
        raise ValueError(f"Number of input features must be positive, got {num}.")

    inputs = Input(shape=(num, ))

    # Hidden stack: layers are created in the same order as the original
    # hand-written blocks.
    model = inputs
    for units in hidden_units:
        model = Dense(units)(model)
        model = BatchNormalization()(model)
        model = LeakyReLU()(model)
        model = Dropout(dropout_rate)(model)

    output = Dense(1, activation='relu')(model)

    MLP_model = Model(inputs=[inputs], outputs=[output])

    optimizer = Adam(learning_rate=learning_rate)

    MLP_model.compile(optimizer=optimizer, loss='mean_squared_error', metrics=['mean_absolute_error'])

    return MLP_model


In [None]:
# Load the raw training table and drop the row identifier, which is not a feature.
# The previous `train, test = train.drop(...)` tried to unpack a DataFrame, which
# iterates its column labels and therefore cannot produce two frames.
train = pd.read_csv('train.csv')
train = train.drop('ID', axis=1)

In [None]:
# Hold-out split: train_size=0.8 with a fixed random_state of 42.
# `train` is overwritten by its own subset, so re-running this cell alone shrinks
# the training frame again; re-run from the loading cell instead.
# NOTE(review): `train_test_split` is not imported explicitly in this notebook;
# presumably it arrives through `from utils import *` -- confirm.
train, val = train_test_split(train, train_size=0.8, random_state=42)

In [None]:
# Categorical columns handed to One_hot_encoder.
# Labels are kept exactly as spelled here because they must match the CSV header.
categorical_columns = [
    'Gender',
    'Education_Status',
    'Employment_Status',
    'Industry_Status',
    'Occupation_Status',
    'Race',
    'Hispanic_Origin',
    'Martial_Status',
    'Household_Status',
    'Household_Summary',
    'Citizenship',
    'Birth_Country',
    'Birth_Country (Father)',
    'Birth_Country (Mother)',
    'Tax_Status',
    'Income_Status',
]

# Encode both splits in a single call; the helper returns the two encoded frames.
train, val = One_hot_encoder(train, val, variable=categorical_columns)

In [None]:
# Choose the feature subset used for indexing `train` / `val` in later cells.
# The second argument (100) is presumably the number of features to keep -- confirm in utils.
# NOTE(review): this runs before the column labels are cast to str two cells below;
# verify the returned labels still match the columns after that cast.
feature = Feature_selection(train, 100)

In [None]:
# Make every column label a str in both splits so that later label-based
# selection works against a uniform label type.
for split_frame in (train, val):
    split_frame.columns = split_frame.columns.astype(str)

In [None]:
# Separate the selected feature columns from the 'Income' target for each split.
X_train = train[feature]
y_train = train['Income']

X_val = val[feature]
y_val = val['Income']

In [None]:
# Pass both feature frames through the project helper Log_transformation and
# rebind the names to what it returns.
# NOTE(review): which columns are transformed is decided inside the helper -- see utils.
X_train, X_val = Log_transformation(X_train, X_val)

In [None]:
# Scale the training and validation features together with the project helper Scaling.
# NOTE(review): presumably the scaler is fitted on X_train only and then applied
# to X_val -- confirm in utils.
X_train, X_val = Scaling(X_train, X_val)

In [None]:
# Called only for its side effect (the return value is discarded): the `save`
# argument names the scaler 'income_scaler', the same name a later cell passes as
# `scaler=` with method='inverse'.
Scaling(train[['Income']], save = 'income_scaler')
# Rebuild `train` as features + target with both indexes reset so the rows line
# up positionally, then run the combined frame through Scaling.
# NOTE(review): X_train already went through Scaling in the previous cell, so the
# feature columns appear to be scaled a second time here -- confirm this is intended.
train = Scaling(pd.concat([X_train.reset_index(drop = True), y_train.reset_index(drop = True)], axis = 1))
# Bare expression: renders the resulting frame as the cell output.
train

In [None]:
# Third-party generative model used to synthesise extra training rows.
from ctgan import CTGAN

# Project helper; by its name it returns the labels of the continuous columns
# in `train` -- confirm in utils.
continuous_col = find_continuous_col(train)

# Every column not reported as continuous is handed to CTGAN as discrete.
discrete_columns = train.drop(continuous_col, axis = 1).columns

# Train the generator for 100 epochs on the combined feature + Income frame.
# NOTE(review): no CTGAN-specific seed is set in this notebook, so the sampled
# rows may differ between runs -- confirm.
ctgan = CTGAN(epochs=100)
ctgan.fit(train, discrete_columns)

# Draw 10000 synthetic rows; the next cell splits them into features and target.
synthetic_data = ctgan.sample(10000)

In [None]:
# Split the synthetic rows into features and target.
X_train_sampled = synthetic_data.drop('Income', axis=1)

# Map the synthetic target back through the saved 'income_scaler' so that it is
# on the same scale as the real `y_train`.
y_train_sampled = Scaling(synthetic_data[['Income']], method='inverse', scaler='income_scaler')
# Income cannot be negative: floor any non-positive generated value at 0.
y_train_sampled['Income'] = np.where(y_train_sampled['Income'] <= 0, 0, y_train_sampled['Income'])

# Append the synthetic rows to the real training data.
# `y_train` is a Series while `y_train_sampled` is a one-column DataFrame;
# concatenating them directly yields a two-column frame padded with NaN, so the
# 'Income' column is selected to keep the target one-dimensional.
# ignore_index=True gives features and target the same fresh positional index
# and avoids duplicate labels.
X_train = pd.concat([X_train, X_train_sampled], ignore_index=True)
y_train = pd.concat([y_train, y_train_sampled['Income']], ignore_index=True)

In [None]:
def Regression_model(model, X_train, y_train, X_val, y_val):
    """Train the requested regressor and report its validation RMSE.

    Parameters
    ----------
    model : str
        ``'lgbm'`` for a LightGBM gradient-boosted regressor, or ``'mlp'`` for
        the Keras network built by ``MLP_regression``.
    X_train, y_train
        Training features and target.
    X_val, y_val
        Validation features and target, used for early stopping (MLP only) and
        for the reported score.

    Returns
    -------
    float
        Validation RMSE, computed after flooring predictions at 0. The value is
        also printed, as before.

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported names.
    """
    # Fail early on an unknown name; otherwise the string itself would reach
    # `.predict` below and raise an unrelated AttributeError.
    if model not in ('lgbm', 'mlp'):
        raise ValueError(f"Unknown model {model!r}; expected 'lgbm' or 'mlp'.")

    if model == 'lgbm':
        model = LGBMRegressor(boosting_type='gbdt', learning_rate=0.01, n_estimators=500)
        model.fit(X_train, y_train)

    else:
        batch_size = 64

        train_dataset = (
            tf.data.Dataset.from_tensor_slices((X_train, y_train))
            .shuffle(len(X_train))
            .batch(batch_size)
            .prefetch(tf.data.AUTOTUNE)
        )
        # Validation needs no shuffling, and batching it (rather than feeding one
        # row per step) only speeds up the per-epoch evaluation.
        val_dataset = (
            tf.data.Dataset.from_tensor_slices((X_val, y_val))
            .batch(batch_size)
            .prefetch(tf.data.AUTOTUNE)
        )

        model = MLP_regression(X_train)

        # Stop once val_loss has not improved for 50 epochs and roll back to the
        # best weights seen.
        early_stopping = EarlyStopping(monitor='val_loss', patience=50, verbose=1, restore_best_weights=True)

        model.fit(train_dataset, validation_data=val_dataset, epochs=1000, callbacks=[early_stopping])

    y_pred = pd.DataFrame(model.predict(X_val), columns=['Prediction'])
    # The target cannot be negative, so floor the predictions at 0.
    y_pred['Prediction'] = np.where(y_pred['Prediction'] <= 0, 0, y_pred['Prediction'])

    mse = mean_squared_error(y_val, y_pred)
    rmse = np.sqrt(mse)
    print(rmse)

    # Returned as well as printed so callers can compare models programmatically.
    return rmse