consider front loading all the data augmentation -> leak in synthetic data in iterations of sgd<br>

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor

from sklearn.metrics import mean_squared_error
from sdv.tabular import CTGAN

from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning
from pandas.core.common import SettingWithCopyWarning

#find optimal number of synthetic samples that can be generated
#calculated by minimizing the avg mse of the syntheticDataSize evaluation functions 
def sgd(dataframe,target_name,discrete_columns,sampleSize, lr, num_iters):
    start_iter = 0
    prevScore = 0
    theta = sampleSize
    for iter in range(start_iter + 1, num_iters + 1):
        score, grad = getGradients(prevScore,theta)
        prevScore = score
        theta = theta - (lr * grad)
        
    return theta

def getGradients(prevScore,sampleSize):
    scores = []
    scores.append(prevScore)
    currentScore = compareSyntheticDataSize(dataframe,target_name,discrete_columns,sampleSize)
    scores.append(currentScore)
    gradients = np.gradient(scores)
    return currentScore, gradients[0]
    

def generate_data(dataframe, sampleSize):
    if sampleSize == 0 or pd.isna(sampleSize) :
        return pd.DataFrame()
    model = CTGAN()
    model.fit(dataframe)
    data = model.sample(int(sampleSize))
    return data

@ignore_warnings(category=ConvergenceWarning)
@ignore_warnings(category=SettingWithCopyWarning)
def compareSyntheticDataSize(data, target_name, discrete_columns,size,title = ""):
    
    #clean before generating data     
    # cleanedData,target = processData(data,target_name,discrete_columns)
    
    synthetic_data = generate_data(dataframe,size)
    leaky_data = pd.concat([dataframe,synthetic_data], axis = 1)
    
    cleanedMixedData,target = processData(leaky_data,target_name,discrete_columns)
    # return cleanedMixedData
    feat_train, feat_test, target_train, target_test = train_test_split(cleanedMixedData,target, test_size=0.10, random_state=(42))
    
    # establish baseline on models  
    lin_model = LinearRegression().fit(feat_train, target_train)
    # mlp_model = MLPRegressor().fit(feat_train, target_train)
    decision_model = DecisionTreeRegressor().fit(feat_train, target_train)
    
    # models = [lin_model, mlp_model, decision_model]
    
    models = [lin_model, decision_model]
    
    training_scores = [model.score(feat_train, target_train) for model in models]
    test_scores = [model.score(feat_test, target_test) for model in models]
    
    training_predictions = [model.predict(feat_train) for model in models]
    test_predictions = [model.predict(feat_test) for model in models]
    
    # training_mses = [mean_squared_error(target_train,prediction) for prediction in training_predictions]
    # test_mses = [mean_squared_error(target_test,prediction) for prediction in test_predictions]
    
    avg_training_score = sum(training_scores) / len(training_scores)
    avg_test_score = sum(test_scores) / len(test_scores)
    # avg_training_mse = sum(training_mses) / len(training_mses)
    # avg_test_mse = sum(test_mses) / len(test_mses)
    
    return avg_test_score

def processData(dataframe, target, categorical_vars = []):
    
    # drop na, null, etc     
    dataframe = dataframe.replace('%','', regex=True)
    dataframe = dataframe.replace('-','', regex=True)
    indices_to_keep = ~dataframe.isin([np.nan, np.inf, -np.inf]).any(1)
    dataframe = dataframe[indices_to_keep]
    
    dataframe = pd.get_dummies(dataframe,columns=categorical_vars, prefix='dmy')
    
    #Drop unencoded variables
    dataframe = dataframe.drop(categorical_vars, axis = 1, errors = 'ignore')
    
    dataframe = dataframe.apply(pd.to_numeric, errors = 'ignore')
    
    dataframe = dataframe.reset_index().dropna()
    dataframe = dataframe.drop('index', axis = 1)
    
    y = dataframe[target]
    y = y.drop(categorical_vars, errors = 'ignore')
    y = y.apply(pd.to_numeric, errors = 'ignore')
    
    return dataframe, y

def generateOptimalDataSamples(dataframe,optimalSamples):
    if not optimalSamples:
        optimalSamples = 0
    syntheticData = generate_data(dataframe,int(optimalSamples))
    syntheticData.to_csv('syntheticData.csv')

In [175]:
discrete_columns = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea', 'furnishingstatus']
path_to_data = 'data/housing.csv'
target_name = 'price'
dataframe = pd.read_csv(path_to_data)

for sampleSize in [0,100, 200, 500]:
    avg_test_score = compareSyntheticDataSize(dataframe,target_name,discrete_columns,sampleSize)
    print("Sample Size: ", sampleSize,"Score: ", avg_test_score)

Sample Size:  0 Score:  0.9977920478556637
Sample Size:  100 Score:  0.9329187006268795
Sample Size:  200 Score:  0.9534584018608305
Sample Size:  500 Score:  0.9646944076034658


In [184]:
discrete_columns = ['Final Letter Grade']
path_to_data = 'data/CPSC121data.csv'
target_name = 'Final Grade'
dataframe = pd.read_csv(path_to_data)

for sampleSize in [0,100, 200, 500]:
    avg_test_score = compareSyntheticDataSize(dataframe,target_name,discrete_columns,sampleSize)
    print("Sample Size: ", sampleSize,"Score: ", avg_test_score)

Sample Size:  0 Score:  0.9901092805449011


  self[col] = igetitem(value, i)


KeyboardInterrupt: 

In [5]:
discrete_columns = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea', 'furnishingstatus']
path_to_data = 'data/housing.csv'
target_name = 'price'
dataframe = pd.read_csv(path_to_data)

optimalSamples = sgd(dataframe,target_name,discrete_columns,200,0.01,5)
optimalSamples

199.99055816811043

In [8]:
discrete_columns = ['Final Letter Grade']
path_to_data = 'data/CPSC121data.csv'
target_name = 'Final Grade'
dataframe = pd.read_csv(path_to_data)

optimalSamples = sgd(dataframe,target_name,discrete_columns,200,0.01,5)
generateOptimalDataSamples(dataframe,optimalSamples)

  self[col] = igetitem(value, i)
  self[col] = igetitem(value, i)
  self[col] = igetitem(value, i)
  self[col] = igetitem(value, i)
  self[col] = igetitem(value, i)
  self[col] = igetitem(value, i)


In [3]:
discrete_columns = ['Final Letter Grade']
path_to_data = 'data/CPSC121data.csv'
target_name = 'Final Grade'
dataframe = pd.read_csv(path_to_data)

avg_test_score = compareSyntheticDataSize(dataframe,target_name,discrete_columns,0)
print(avg_test_score)

avg_test_score = compareSyntheticDataSize(dataframe,target_name,discrete_columns,len(dataframe))
print(avg_test_score)


0.9890680873995021


  self[col] = igetitem(value, i)


nan


