In [1]:
import warnings

import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.io as pio
from sdmetrics.reports.single_table import DiagnosticReport, QualityReport
from sdmetrics.visualization import get_column_plot
from sdv.metadata import SingleTableMetadata
from sdv.single_table import (
    CopulaGANSynthesizer,
    CTGANSynthesizer,
    GaussianCopulaSynthesizer,
    TVAESynthesizer,
)
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler

warnings.simplefilter("ignore", category=UserWarning)

In [2]:
def tune_and_train_RF(X_train, y_train, n_iter: int = 100):
    param_dist = {
        'n_estimators': [int(x) for x in range(5, 500, 100)],
        'max_depth': [int(x) for x in range(3, 25, 1)] + [None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
    }
    
    model = RandomForestRegressor()
    
    grid = RandomizedSearchCV(model, param_dist, cv=5, n_iter=n_iter, verbose=10, n_jobs=-1)
    grid.fit(X_train, y_train)
    
    return grid.best_estimator_

def evaluate_RF(model, X_test, y_test):
    
    y_pred = model.predict(X_test)
    print(f'R2 Score: {r2_score(y_test, y_pred)}')
    print(f'Mean Absolute Error: {mean_absolute_error(y_test, y_pred)}')
    print(f'Mean Squared Error: {mean_squared_error(y_test, y_pred)}')
    print(f'Root Mean Squared Error: {np.sqrt(mean_squared_error(y_test, y_pred))}')
    
    return pd.DataFrame({
        'y_true': y_test,
        'y_pred': y_pred,
        'residuals': y_test - y_pred
    })
    

In [3]:
class DataLoader:
    def __init__(self, data_path: str, target: str):
        self.data_path = data_path
        self.target = target
        self.df = None
        self.X = None
        self.y = None
        
        self.df_train = None
        self.df_test = None
        
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        
        self.scaler = None
        self.df_train_norm = None
        self.df_test_norm = None
        self.X_train_norm = None
        self.X_test_norm = None
        
    def _split_data(self):
        self.X = self.df.drop(self.target, axis=1)
        self.y = self.df[self.target]
        
    def load_data(self, index_col: int = None):
        self.df = pd.read_csv(self.data_path, index_col=index_col)
        self._split_data()

    def train_test_split(self, test_size: float = 0.2):
        if self.X is None or self.y is None:
            raise ValueError("X and y should not be None. Please load data first.")
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X, self.y, test_size=test_size)
        self.df_train = pd.concat([self.X_train, self.y_train], axis=1)
        self.df_test = pd.concat([self.X_test, self.y_test], axis=1)

    def normalization(self, scaling_method: str = 'MinMax'):
        if self.X_train is None or self.X_test is None:
            raise ValueError("X_train and X_test should not be None. Please split data first.")
        if scaling_method == 'MinMax':
            self.scaler = MinMaxScaler()
        elif scaling_method == 'Standard':
            self.scaler = StandardScaler()
        else:
            raise ValueError("scaling_method should be either 'MinMax' or 'Standard'.")
        
        self.X_train_norm = self.scaler.fit_transform(self.X_train)
        self.X_test_norm = self.scaler.transform(self.X_test)
        self.df_train_norm = pd.concat([pd.DataFrame(self.X_train_norm, columns=self.X_train.columns), self.y_train], axis=1)
        self.df_test_norm = pd.concat([pd.DataFrame(self.X_test_norm, columns=self.X_test.columns), self.y_test], axis=1)

In [4]:
class Modelling(DataLoader):
    def __init__(self, data_path: str, target: str):
        super().__init__(data_path, target)
        self.estimator = None
        
    def get_trained_model(self, model: object):
        self.estimator = model

In [5]:
class SyntheticData:
    def __init__(self, data: pd.DataFrame, generator: object, target: str, params: dict = None):
        self.data = data
        self.target = target
        self.generator = generator
        self.params = params or {}  # Add this line to ensure params is not None
        self.synthetic_data = None
        
        self.metadata = SingleTableMetadata()
        self.metadata.detect_from_dataframe(self.data)
        
        self.X_synth = None
        self.y_synth = None
        
        self.X_synth_norm = None        
        
        self.quality_report = None
        
    def generate_synthetic_data(self, num_rows: int, concat: bool = False):
        generator = self.generator(metadata=self.metadata, **self.params)
        generator.fit(self.data )
        self.synthetic_data = generator.sample(num_rows)
        self.X_synth = self.synthetic_data.drop(self.target, axis=1)
        self.y_synth = self.synthetic_data[self.target]
         
        if concat:
            self.synthetic_data = pd.concat([self.data, self.synthetic_data], ignore_index=True)
            
    def normalize(self, scaler: object):
        self.X_synth_norm = scaler.transform(self.X_synth)
        
    def get_quality_report(self):
        self.quality_report = QualityReport()
        self.quality_report.generate(self.data, self.synthetic_data, self.metadata.to_dict())
        
    def get_column_plot(self, column_name: str):
        fig = get_column_plot(self.data, self.synthetic_data, column_name)
        pio.renderers.default = "vscode"
        fig.show()
    
    def get_column_pair_trends(self):
        fig = self.quality_report.get_visualization('Column Pair Trends')    
        pio.renderers.default = "vscode"
        fig.show()

In [6]:
FILENAME = './data/carotenoid_production.csv'

m = Modelling(FILENAME, target='prod')
m.load_data()
m.train_test_split(test_size=0.2)
m.normalization(scaling_method='MinMax')

In [7]:
rf = tune_and_train_RF(m.X_train_norm, m.y_train, n_iter=250)

Fitting 5 folds for each of 250 candidates, totalling 1250 fits
[CV 2/5; 1/250] START max_depth=9, min_samples_leaf=4, min_samples_split=10, n_estimators=305
[CV 3/5; 3/250] START max_depth=9, min_samples_leaf=4, min_samples_split=2, n_estimators=305
[CV 2/5; 2/250] START max_depth=23, min_samples_leaf=2, min_samples_split=5, n_estimators=105
[CV 3/5; 4/250] START max_depth=21, min_samples_leaf=2, min_samples_split=5, n_estimators=205
[CV 1/5; 4/250] START max_depth=21, min_samples_leaf=2, min_samples_split=5, n_estimators=205
[CV 2/5; 2/250] END max_depth=23, min_samples_leaf=2, min_samples_split=5, n_estimators=105;, score=0.019 total time=   0.1s
[CV 5/5; 13/250] START max_depth=24, min_samples_leaf=4, min_samples_split=10, n_estimators=5
[CV 5/5; 13/250] END max_depth=24, min_samples_leaf=4, min_samples_split=10, n_estimators=5;, score=0.417 total time=   0.0s
[CV 1/5; 14/250] START max_depth=24, min_samples_leaf=4, min_samples_split=5, n_estimators=105
[CV 2/5; 1/250] END max_dept

In [8]:
m.get_trained_model(rf)
results_rf = evaluate_RF(m.estimator, m.X_test_norm, m.y_test)

R2 Score: 0.3278479503979985
Mean Absolute Error: 1.5031712319102994
Mean Squared Error: 3.242698550664129
Root Mean Squared Error: 1.80074944138937


__Synthetic Data Generation__

In [9]:
print(f'Training dataset (df_train) size: {len(m.df_train)}')

Training dataset (df_train) size: 134


In [61]:
# CopulaGANSynthetizer
params = {
    'embedding_dim': 6,
    'generator_dim': (32,),
    'discriminator_dim': (32,),
    'generator_lr': 0.0001,
    'discriminator_lr': 0.0001,
    'batch_size': 1000,
    'epochs': 1000
}

synth = SyntheticData(data=m.df_train, generator=CTGANSynthesizer, target='prod', params=params)
synth.generate_synthetic_data(num_rows=int(134/1), concat=True)
synth.normalize(scaler=m.scaler)

In [62]:
synth.get_quality_report()
synth.get_column_pair_trends()

Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 9/9 [00:00<00:00, 1073.78it/s]|
Column Shapes Score: 90.05%

(2/2) Evaluating Column Pair Trends: |██████████| 36/36 [00:00<00:00, 196.32it/s]|
Column Pair Trends Score: 95.08%

Overall Score (Average): 92.57%



In [63]:
synth.get_column_plot('prod')

In [64]:
rf_synth = tune_and_train_RF(synth.X_synth_norm, synth.y_synth, n_iter=250)

Fitting 5 folds for each of 250 candidates, totalling 1250 fits
[CV 1/5; 1/250] START max_depth=4, min_samples_leaf=2, min_samples_split=5, n_estimators=5
[CV 2/5; 1/250] START max_depth=4, min_samples_leaf=2, min_samples_split=5, n_estimators=5
[CV 3/5; 1/250] START max_depth=4, min_samples_leaf=2, min_samples_split=5, n_estimators=5
[CV 4/5; 1/250] START max_depth=4, min_samples_leaf=2, min_samples_split=5, n_estimators=5
[CV 5/5; 1/250] START max_depth=4, min_samples_leaf=2, min_samples_split=5, n_estimators=5
[CV 1/5; 2/250] START max_depth=10, min_samples_leaf=1, min_samples_split=10, n_estimators=405
[CV 3/5; 2/250] START max_depth=10, min_samples_leaf=1, min_samples_split=10, n_estimators=405
[CV 4/5; 2/250] START max_depth=10, min_samples_leaf=1, min_samples_split=10, n_estimators=405
[CV 5/5; 2/250] START max_depth=10, min_samples_leaf=1, min_samples_split=10, n_estimators=405
[CV 2/5; 2/250] START max_depth=10, min_samples_leaf=1, min_samples_split=10, n_estimators=405
[CV 1/

In [65]:
results_rf_synth = evaluate_RF(rf_synth, m.X_test_norm, m.y_test)

R2 Score: 0.23315235067848672
Mean Absolute Error: 1.4945056520962474
Mean Squared Error: 3.699543522194834
Root Mean Squared Error: 1.9234197467518197
