In [9]:
# Colab-only setup: mount Google Drive at /content/drive; the data folder used
# further down (Syn_folder) lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
import os
import pandas as pd
import pickle

### Preprocessing

In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.preprocessing import StandardScaler,LabelEncoder,OneHotEncoder,MinMaxScaler

def encode_df(df_raw,c_col):
    """Integer-encode the given columns on a copy of ``df_raw``.

    Args:
        df_raw: input DataFrame; it is not modified.
        c_col: iterable of column names to encode.

    Returns:
        (encoded, mapping): ``encoded`` is the copy with every listed column
        replaced by its pandas category codes cast to ``int``; ``mapping``
        maps each column name to a ``{code: original value}`` dict.
    """
    encoded = df_raw.copy()
    mapping = {}
    for name in c_col:
        as_category = encoded[name].astype('category')
        mapping[name] = {code: value for code, value in enumerate(as_category.cat.categories)}
        encoded[name] = as_category.cat.codes.astype('int')
    return encoded, mapping

def real_process(reald,cat_cols,y_map=None,weight=False):
    """Fit the preprocessing on the real data and return the processed frame.

    The last column of ``reald`` is treated as the target. Feature columns not
    listed in ``cat_cols`` are min-max scaled; the listed ones are one-hot
    encoded (unknown categories are ignored at transform time).

    Args:
        reald: DataFrame whose last column is the target.
        cat_cols: names of the categorical feature columns.
        y_map: optional dict applied to the target with ``Series.map``.
        weight: if True, every one-hot block is divided by the number of
            categories of its source column.

    Returns:
        (real_processed, ss, ohe): the processed DataFrame laid out as
        [scaled numeric | one-hot | target], the fitted ``MinMaxScaler`` and
        the fitted ``OneHotEncoder``.
    """
    real = reald.copy().reset_index(drop=True)

    X, y = real.iloc[:, :-1], real.iloc[:, -1]
    col_names = X.columns.to_list()
    n_col = [s for s in col_names if s not in cat_cols]
    # Keep the categorical columns in frame order, not in the caller's order.
    cat_cols = [s for s in col_names if s in cat_cols]

    ss = MinMaxScaler()
    real_scaled = pd.DataFrame(ss.fit_transform(X[n_col]), index=None, columns=n_col)

    ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
    real_encoded = pd.DataFrame(ohe.fit_transform(X[cat_cols]), index=None,
                                columns=ohe.get_feature_names_out())

    if weight:
        # Walk the one-hot blocks using the encoder's own category lists.
        # Series.nunique() skips NaN while the encoder emits a column for it,
        # so counting with nunique() can shift every following block.
        col_idx = 0
        for categories in ohe.categories_:
            n_cat = len(categories)
            block = slice(col_idx, col_idx + n_cat)
            real_encoded.iloc[:, block] = real_encoded.iloc[:, block] / n_cat
            col_idx += n_cat

    target = y if y_map is None else y.map(y_map)
    real_processed = pd.concat([real_scaled, real_encoded, target], axis=1)

    return real_processed, ss, ohe

def fake_process(faked,cat_cols,ss,ohe,y_map=None,weight=False):
    """Apply already-fitted preprocessing to another frame (synthetic or test).

    The last column of ``faked`` is treated as the target. Nothing is refit:
    ``ss`` and ``ohe`` are only used through ``transform``.

    Args:
        faked: DataFrame whose last column is the target.
        cat_cols: names of the categorical feature columns.
        ss: fitted scaler, as returned by ``real_process``.
        ohe: fitted one-hot encoder, as returned by ``real_process``.
        y_map: optional dict applied to the target with ``Series.map``.
        weight: if True, every one-hot block is divided by the number of
            categories the encoder learned for its source column.

    Returns:
        The processed DataFrame laid out as [scaled numeric | one-hot | target].
    """
    fake = faked.copy().reset_index(drop=True)
    X, y = fake.iloc[:, :-1], fake.iloc[:, -1]

    col_names = X.columns.to_list()
    n_col = [s for s in col_names if s not in cat_cols]
    # Keep the categorical columns in frame order, not in the caller's order.
    cat_cols = [s for s in col_names if s in cat_cols]

    fake_scaled = pd.DataFrame(ss.transform(X[n_col]), index=None, columns=n_col)
    fake_encoded = pd.DataFrame(ohe.transform(X[cat_cols]), index=None,
                                columns=ohe.get_feature_names_out())

    if weight:
        # Block widths come from the fitted encoder. The previous code read
        # `real[i].nunique()`, but no `real` exists in this function, so
        # weight=True raised NameError.
        col_idx = 0
        for categories in ohe.categories_:
            n_cat = len(categories)
            block = slice(col_idx, col_idx + n_cat)
            fake_encoded.iloc[:, block] = fake_encoded.iloc[:, block] / n_cat
            col_idx += n_cat

    target = y if y_map is None else y.map(y_map)
    fake_processed = pd.concat([fake_scaled, fake_encoded, target], axis=1)

    return fake_processed

### Define EKCD

In [12]:
import numpy as np ##numpy for sorting distance
import scipy##for distance metrics
import numbers

from imblearn.base import BaseSampler
from sklearn.utils import _safe_indexing

class EKCD(BaseSampler):
    """Sample-cleaning resampler based on neighbour votes and centroid displacement.

    For every sample the ``n_neighbors`` nearest reference samples are found.
    A sample whose ``n_vote`` nearest neighbours all share one label is kept
    outright. The remaining ("hard") samples are handled by ``kind_sel``:

    * ``"cd"``: for each class present among the neighbours, measure how far
      that class centroid moves when the sample is added to it; the class with
      the smallest displacement is the predicted label, and the sample is kept
      only if the prediction equals its own label.
    * ``"all"``: a sample is kept when all ``n_neighbors`` neighbours share
      one label.
    * anything else: no hard sample is kept.

    ``sampling_strategy`` then decides which classes are exempt from cleaning
    (all of their samples are added back):
    ``"auto"``/``"not minority"`` exempt the minority class, ``"not majority"``
    the majority class, ``"majority"`` every class but the majority,
    ``"minority"`` every class but the minority, and ``"all"`` exempts none.

    Parameters
    ----------
    sampling_strategy : str, default="auto"
        One of the names above. Stored as ``_sampling_strategy``; the base
        class attribute ``sampling_strategy`` keeps its own default because
        ``super().__init__()`` is called without arguments.
    n_neighbors : int, default=15
    n_vote : int, default=3
    kind_sel : str, default="cd"
    n_jobs : int or None, default=None
        Stored but not used by the resampling code in this class.
    sampling_type : str, default="under-sampling"

    Attributes
    ----------
    X_, y_ : reference samples/labels used for the neighbour search.
    sample_indices_ : sorted indices (into the input) of the kept samples.
    """

    _parameter_constraints: dict = {
        "n_jobs": [numbers.Integral, None]
    }

    def __init__(self,*,sampling_strategy="auto", n_neighbors=15,n_vote=3,kind_sel="cd",n_jobs=None,sampling_type="under-sampling"):
        super().__init__()
        self.n_neighbors = n_neighbors
        self.kind_sel = kind_sel
        self.n_jobs = n_jobs
        self._sampling_strategy=sampling_strategy
        self.sampling_type=sampling_type
        self._sampling_type=sampling_type
        self.n_vote=n_vote

        SAMPLING_TARGET_KIND=["minority","majority", "not minority","not majority","all","auto"]

        # _fit_resample only understands these names, so anything else is
        # rejected here. (The guard used to test `sampling_type`, which is
        # unrelated to the value being validated.)
        if not isinstance(self._sampling_strategy, str) or self._sampling_strategy not in SAMPLING_TARGET_KIND:
            raise ValueError(
                f"When 'sampling_strategy' is a string, it needs"
                f" to be one of {SAMPLING_TARGET_KIND}. Got '{self._sampling_strategy}' "
                f"instead.")

    @staticmethod
    def _centroid_displacement_label(x, neighbours, neighbour_labels):
        """Return the neighbour class whose centroid moves least when ``x`` joins it."""
        theta = {}
        # dict.fromkeys keeps first-appearance (nearest-first) order, so ties
        # resolve towards the class of the nearer neighbour.
        for label in dict.fromkeys(neighbour_labels.tolist()):
            members = neighbours[neighbour_labels == label]
            centroid = members.mean(axis=0)
            centroid_with_x = (members.sum(axis=0) + x) / (len(members) + 1)
            theta[label] = float(np.linalg.norm(centroid - centroid_with_x))
        return min(theta, key=theta.get)

    def _fit_resample(self, X, y, X_real=None, y_real=None):
        """Select the samples of ``(X, y)`` to keep.

        Args:
            X, y: samples and integer-like labels to clean (array-like that
                supports NumPy fancy indexing).
            X_real, y_real: optional external reference set. When given,
                neighbours are searched there instead of in ``X`` itself.

        Returns:
            ``(X_kept, y_kept)``; the kept indices are stored in
            ``sample_indices_``.
        """
        # Imported explicitly: a bare `import scipy` does not guarantee that
        # the `scipy.spatial` submodule has been loaded.
        from scipy.spatial.distance import cdist

        external_reference = X_real is not None
        if external_reference:
            self.X_ = np.asarray(X_real)
            self.y_ = np.asarray(y_real)
        else:
            self.X_ = X
            self.y_ = y
        ref_X, ref_y = self.X_, self.y_

        d = cdist(X, ref_X)
        # When the samples are their own reference set, column 0 of the sorted
        # distances is (barring exact duplicates) the sample itself and is
        # skipped. With an external reference set there is no self-match.
        first = 0 if external_reference else 1
        indexes_all = np.argsort(d)[:, first:first + self.n_neighbors]

        # "Easy" samples: the n_vote nearest neighbours are unanimous.
        vote_labels = ref_y[indexes_all[:, :self.n_vote]]
        single_key = vote_labels.max(axis=1) == vote_labels.min(axis=1)
        easy_idx = np.flatnonzero(single_key)
        hard_idx = np.flatnonzero(~single_key)

        if self.kind_sel == "all":
            # Indices here are already positions in X, so they are merged
            # directly instead of being re-indexed through the hard subset.
            neighbour_labels = ref_y[indexes_all]
            unanimous = np.flatnonzero(neighbour_labels.max(axis=1) == neighbour_labels.min(axis=1))
            idx_under = np.union1d(easy_idx, unanimous)
        elif self.kind_sel == "cd":
            # `row` is the sample's true position in X. The earlier loop
            # counter indexed the hard subset, so the wrong sample was added
            # to the centroids.
            y_pred = [
                self._centroid_displacement_label(X[row], ref_X[nbrs], ref_y[nbrs])
                for row, nbrs in zip(hard_idx, indexes_all[hard_idx])
            ]
            if y_pred:
                agrees = np.asarray(y_pred) == y[hard_idx]
            else:
                agrees = np.zeros(0, dtype=bool)
            idx_under = np.union1d(easy_idx, hard_idx[agrees])
        else:
            idx_under = easy_idx

        # Class sizes are taken from the labels that actually occur; bincount
        # would also count absent integers below the maximum label as size 0.
        all_class, class_counts = np.unique(y, return_counts=True)
        minority_class = all_class[np.argmin(class_counts)]
        majority_class = all_class[np.argmax(class_counts)]

        strategy = self._sampling_strategy
        if strategy in ('not minority', 'auto'):
            exempt = [minority_class]
        elif strategy == 'not majority':
            exempt = [majority_class]
        elif strategy == 'majority':
            exempt = np.setdiff1d(all_class, majority_class)
        elif strategy == 'minority':
            exempt = np.setdiff1d(all_class, minority_class)
        else:  # 'all': every class is cleaned
            exempt = []

        # Samples of exempt classes are always kept.
        idx_under = np.union1d(idx_under, np.flatnonzero(np.isin(y, exempt)))

        self.sample_indices_ = idx_under

        return _safe_indexing(X, idx_under), _safe_indexing(y, idx_under)

    def _more_tags(self):
        return {"sample_indices": True}

### Calculate Density Score

In [13]:
from joblib import Parallel, delayed
from typing import Union, List, Optional,Tuple, Dict, Callable,Any
from sklearn.metrics import pairwise_distances

def compute_pairwise_distance(data_x: np.ndarray, data_y: Optional[np.ndarray] = None) -> np.ndarray:
    """Pairwise distances between the rows of two feature matrices.

    Args:
        data_x: array of shape [N_x, feature_dim].
        data_y: array of shape [N_y, feature_dim]; when omitted, ``data_x`` is
            compared with itself.
    Returns:
        Array of shape [N_x, N_y] as produced by
        ``sklearn.metrics.pairwise_distances`` with its default metric.
    """
    other = data_x if data_y is None else data_y
    return pairwise_distances(data_x, other)

def get_kth_value(unsorted: np.ndarray, k: int, axis: int = -1) -> np.ndarray:
    """Return the k-th smallest value (1-based) along ``axis``.

    Args:
        unsorted: numpy.ndarray of any dimensionality (at least 1-D).
        k: rank of the value to extract, ``1 <= k <= unsorted.shape[axis]``.
        axis: axis along which to rank; defaults to the last one.
    Returns:
        Array with ``axis`` removed, holding the k-th smallest value of every
        1-D slice along that axis.
    Raises:
        ValueError: if ``k`` is outside ``[1, unsorted.shape[axis]]``.
    """
    values = np.asarray(unsorted)
    size = values.shape[axis]
    if not 1 <= k <= size:
        raise ValueError(f"k must be in [1, {size}] for axis {axis}, got {k}")
    # Partitioning at position k-1 places the k-th smallest element exactly
    # there. Unlike slicing with `[..., :k]`, this honours `axis`, and it also
    # accepts k == size.
    return np.take(np.partition(values, k - 1, axis=axis), k - 1, axis=axis)

def compute_nearest_neighbour_distances(input_features: np.ndarray, nearest_k: int) -> np.ndarray:
    """Distance from every row to its ``nearest_k``-th nearest other row.

    Args:
        input_features: numpy.ndarray of feature rows.
        nearest_k: int, neighbour rank.
    Returns:
        One radius per input row.
    """
    # Rank nearest_k + 1 is requested because each row's distance to itself
    # is part of the self-distance matrix.
    return get_kth_value(
        compute_pairwise_distance(input_features),
        k=nearest_k + 1,
        axis=-1,
    )

def compute_density(real: np.ndarray, fake: np.ndarray,nearest_k: int) -> float:
    """Density score of ``fake`` with respect to ``real``.

    Each real row gets a radius equal to the distance to its ``nearest_k``-th
    nearest real neighbour. For every fake row the number of real rows whose
    radius contains it is counted; the score is the mean of those counts
    divided by ``nearest_k``.

    Args:
        real: array of shape [N_real, feature_dim].
        fake: array of shape [N_fake, feature_dim].
        nearest_k: neighbour rank used for the radii.
    Returns:
        The density score as a scalar (not a dict, as the annotation
        previously claimed).
    """
    real_radii = compute_nearest_neighbour_distances(real, nearest_k)
    distance_real_fake = compute_pairwise_distance(real, fake)

    # Row i / column j is True when fake j lies inside the ball of real i.
    inside_ball = distance_real_fake < np.expand_dims(real_radii, axis=1)
    density = (1.0 / float(nearest_k)) * inside_ball.sum(axis=0).mean()
    return density

### Experiment

In [14]:
# Folder on the mounted Drive that holds the real, test and synthetic CSV files.
Syn_folder = '/content/drive/MyDrive/TDS/synthetic/adult'

# One CSV file per generative model, all located in Syn_folder.
GMs = [
    'tvae.csv',
    'ctgan.csv',
    'copulagan.csv',
    'tabddpm.csv',
    'stasy.csv',
    'tabsyn.csv',
]

# Feature columns that get one-hot encoded ("education.num" is deliberately
# listed here, so it is treated as categorical rather than scaled).
cat_cols = [
    "workclass",
    "education",
    "education.num",
    "marital.status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native.country",
]

# Target encoding; note that the keys carry a leading space.
y_map = {
    ' <=50K': 0,
    ' >50K': 1,
}

In [15]:
# Load the real training split and the held-out test split.
real_train=pd.read_csv(os.path.join(Syn_folder, 'real.csv'))
real_test=pd.read_csv(os.path.join(Syn_folder, 'test.csv'))

# Fit the scaler/encoder on the real training data only, then reuse the fitted
# objects (ss, ohe) for the test split so both share one feature layout.
real_processed,ss,ohe=real_process(real_train,cat_cols,y_map)
test_processed=fake_process(real_test,cat_cols,ss,ohe,y_map)
# The last column of the processed frame is the mapped target.
X_test=test_processed.iloc[:,:-1]
y_test=test_processed.iloc[:,-1]

In [19]:
# For every generative model: score its synthetic data against the real data,
# then filter it with EKCD and write the filtered copy next to the original.
scores={}
for i in GMs:
  print(i)
  syn_data=pd.read_csv(os.path.join(Syn_folder, i))
  fake_processed=fake_process(syn_data,cat_cols,ss,ohe,y_map)

  # Density with nearest_k=5. Both frames are passed whole, i.e. the mapped
  # target column is part of the feature space used for the distances.
  d_score=compute_density(real_processed, fake_processed,5)
  scores[i]=d_score

  # From here on `syn_data` is the processed frame, not the raw CSV content.
  syn_data=fake_processed
  Refiner=EKCD(sampling_strategy='all',n_neighbors=21,n_vote=5)
  xx,yy=Refiner.fit_resample(syn_data.iloc[:,:-1],syn_data.iloc[:,-1].astype(int))
  fake_filtered=pd.concat([xx,yy],axis=1)

  # Export to CSV. `i` already ends in ".csv", so the file is written as
  # "<model>.csv_filtered.csv"; later cells read it back under that name.
  # The exported frame is the processed (scaled / one-hot) representation.
  fake_filtered.to_csv(os.path.join(Syn_folder,f"{i}_filtered.csv"), index=False)

scores

{'tvae.csv': 1.18700001,
 'ctgan.csv': 0.61200034,
 'copulagan.csv': 0.53840001,
 'tabddpm.csv': 1.30625001,
 'stasy.csv': 1.06860001,
 'tabsyn.csv': 1.53220001}

In [20]:
# Build two ensembles by stacking the three generators with the highest
# density score in the `scores` output above: one from the raw synthetic
# files and one from their EKCD-filtered counterparts.
ensembles = []
ensembles_filter=[]
# Loop through the top 3 filenames
for i in ['tvae.csv', 'tabddpm.csv', 'tabsyn.csv']:
    filename = os.path.join(Syn_folder,i)
    syn_data = pd.read_csv(filename)
    ensembles.append(syn_data)

    # Filtered files were written as "<model>.csv_filtered.csv".
    filename = os.path.join(Syn_folder, f"{i}_filtered.csv")
    syn_data = pd.read_csv(filename)
    ensembles_filter.append(syn_data)

# Combine all dataframes into one
ensembles_df = pd.concat(ensembles, ignore_index=True)
ensembles_filter_df = pd.concat(ensembles_filter, ignore_index=True)
# Export the combined dataframes to 'ensemble.csv' and 'ensemble_filtered.csv'
ensembles_df.to_csv(os.path.join(Syn_folder, 'ensemble.csv'), index=False)
ensembles_filter_df.to_csv(os.path.join(Syn_folder, 'ensemble_filtered.csv'), index=False)

### Run ML experiment

In [21]:
import math
import numpy as np

from sklearn.neighbors import NearestNeighbors,KNeighborsClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier,IsolationForest,ExtraTreesClassifier,ExtraTreesRegressor
from sklearn.linear_model import Lasso, Ridge, ElasticNet, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.decomposition import PCA,KernelPCA
from sklearn.manifold import TSNE
from sklearn.pipeline import make_pipeline
from sklearn import cluster

from sklearn.metrics import roc_curve, precision_recall_curve, auc, matthews_corrcoef,confusion_matrix, average_precision_score, roc_auc_score, accuracy_score

from sklearn.model_selection import cross_val_score

import lightgbm as lgb
from sklearn.metrics import roc_auc_score,average_precision_score,accuracy_score,precision_score,recall_score,silhouette_score,f1_score,jaccard_score,pairwise

def printPerformance(labels, probs,all=True):
    """Compute binary-classification metrics from true labels and scores.

    Hard predictions are obtained with ``np.round(probs)``.

    Args:
        labels: true binary labels (0/1).
        probs: predicted probability of the positive class.
        all: if True (default) return the full metric list; if False return
            only the two threshold-free scores. (Passing False used to raise
            UnboundLocalError because no result was assigned.)

    Returns:
        ``[roc_auc, pr_auc, acc, sen, spe, ppv, npv, f1, dor, mcc]`` when
        ``all`` is True, otherwise ``[roc_auc, pr_auc]``. Ratios with an empty
        denominator come out as inf/nan (NumPy float division).
    """
    predicted_labels = np.round(probs)
    # labels=[0, 1] pins the matrix to 2x2 even if a class is missing from the
    # predictions. The float cast keeps the four-factor product in the MCC
    # denominator from overflowing int64 on large test sets.
    tn, fp, fn, tp = confusion_matrix(labels, predicted_labels, labels=[0, 1]).ravel().astype(float)
    acc = (tp + tn) / (tn + tp + fn + fp)
    sen = tp / (tp + fn)             # sensitivity, recall, hit rate, or true positive rate (TPR)
    spe = tn / (tn + fp)             # specificity, selectivity or true negative rate (TNR)
    ppv = tp / (tp + fp)             # precision or positive predictive value (PPV)
    npv = tn / (tn + fn)             # negative predictive value (NPV)
    f1 = (2*tp) / (2*tp + fp + fn)
    dor = (tp * tn) / (fp * fn)      # diagnostic odds ratio
    mcc = (tp*tn - fp*fn) / math.sqrt((tp+fp) * (tp+fn) * (tn+fp) * (tn+fn))
    roc_auc = roc_auc_score(labels, probs)
    precision, recall, _ = precision_recall_curve(labels, probs)
    pr_auc = auc(recall, precision)

    if all:
        return [roc_auc,pr_auc,acc,sen,spe,ppv,npv,f1,dor,mcc]
    return [roc_auc,pr_auc]


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [22]:
# NOTE(review): this cell repeats the configuration and the load/preprocess
# steps of the "Experiment" section verbatim, presumably so the ML experiment
# can be run on its own; keep the two copies in sync if either is edited.
Syn_folder='/content/drive/MyDrive/TDS/synthetic/adult'
GMs=['tvae.csv','ctgan.csv','copulagan.csv','tabddpm.csv','stasy.csv','tabsyn.csv']
cat_cols=["workclass","education","education.num","marital.status","occupation","relationship", "race", "sex", "native.country"]
# Target encoding; the keys carry a leading space.
y_map={' <=50K':0,' >50K':1}

real_train=pd.read_csv(os.path.join(Syn_folder, 'real.csv'))
real_test=pd.read_csv(os.path.join(Syn_folder, 'test.csv'))

# Scaler/encoder are fitted on the real training split and reused for the test split.
real_processed,ss,ohe=real_process(real_train,cat_cols,y_map)
test_processed=fake_process(real_test,cat_cols,ss,ohe,y_map)
# The last column of the processed frame is the mapped target.
X_test=test_processed.iloc[:,:-1]
y_test=test_processed.iloc[:,-1]

In [23]:
# Collect every training candidate in one dict: name -> DataFrame read from disk.
dfs={}
ensemble=pd.read_csv(os.path.join(Syn_folder, 'ensemble.csv'))
dfs['ensemble.csv']=ensemble
# `ensemble` is rebound here; the unfiltered frame stays reachable via dfs.
ensemble=pd.read_csv(os.path.join(Syn_folder, 'ensemble_filtered.csv'))
dfs['ensemble_filtered.csv']=ensemble
for i in GMs:
  print(i)
  syn_data=pd.read_csv(os.path.join(Syn_folder, i))
  dfs[i]=syn_data
  # File name is "<model>.csv_filtered.csv", while the dict key is
  # "<model>.csv_filtered" (no trailing ".csv", unlike the ensemble key above).
  syn_data_filtered=pd.read_csv(os.path.join(Syn_folder,f"{i}_filtered.csv"))
  dfs[f"{i}_filtered"]=syn_data_filtered

tvae.csv
ctgan.csv
copulagan.csv
tabddpm.csv
stasy.csv
tabsyn.csv


In [None]:
# Train each classifier on the real data and on every dataset in `dfs`,
# always evaluating on the same real test split (X_test, y_test).
estimators = [
          KNeighborsClassifier(),
          lgb.LGBMClassifier(random_state=1,verbose=-1 ),
          RandomForestClassifier( random_state=1),
          # `multi_class` is no longer passed: 'auto' was already the default
          # and the parameter is deprecated in recent scikit-learn releases.
          LogisticRegression(solver='lbfgs', max_iter=500, random_state=1),
          DecisionTreeClassifier(random_state=1),
          MLPClassifier(solver='adam', activation='relu', learning_rate='adaptive', random_state=1)
      ]
estimators_names = ['KNN','LGBM','RF','LR','DT','MLP']

# One entry per (estimator, training set): [estimator name, dataset name, metric list].
r=[]

zipped_estimators= zip(estimators_names,estimators)
for est_name,est in zipped_estimators:
  print(est_name)
  print("-"*50)
  print("real training data")
  # Baseline: fit on the real training data.
  est.fit(real_processed.iloc[:,:-1],real_processed.iloc[:,-1])
  y_proba=est.predict_proba(X_test)[:, 1]
  r.append([est_name,'real',printPerformance(y_test, y_proba)])

  for syn_name,fake in dfs.items():
    print(syn_name)
    print("-"*30)
    # The same estimator object is refitted on each dataset.
    fake_processed=fake_process(fake,cat_cols,ss,ohe,y_map)
    est.fit(fake_processed.iloc[:,:-1],fake_processed.iloc[:,-1].astype(int))
    y_proba=est.predict_proba(X_test)[:, 1]
    r.append([est_name,syn_name,printPerformance(y_test, y_proba)])

### Table Plot

In [None]:
from sklearn.decomposition import PCA,KernelPCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns

def table_plot(reals,fakes,dimentionality_reduction='PCA',filename=''):
    """Plot real and synthetic rows together in one shared 2-D embedding.

    Args:
        reals: DataFrame of processed real rows.
        fakes: DataFrame of processed synthetic rows with the same columns.
        dimentionality_reduction: ``'PCA'`` (an RBF ``KernelPCA`` fitted on the
            real rows and applied to the fake rows) or ``'TSNE'`` (one t-SNE
            embedding computed on both sets stacked together; slow on large
            data). The parameter name keeps its original spelling so existing
            keyword calls continue to work.
        filename: figure title; when non-empty the figure is also saved as
            ``filename + '.pdf'``.

    Raises:
        ValueError: if ``dimentionality_reduction`` is not a supported name.
    """
    max_rows = 10000
    # Cap each frame on its own: a filtered synthetic set can be smaller than
    # the cap, in which case sampling max_rows from it would raise.
    if reals.shape[0] > max_rows:
        reals = reals.sample(max_rows, random_state=1)
    if fakes.shape[0] > max_rows:
        fakes = fakes.sample(max_rows, random_state=1)

    # Both sets must go through the SAME projection; two independently fitted
    # models produce coordinates that cannot be compared on one plot.
    if dimentionality_reduction=='PCA':
        model=KernelPCA(n_components=2,kernel="rbf")
        realt=model.fit_transform(reals)
        faket=model.transform(fakes)
    elif dimentionality_reduction=='TSNE':
        # TSNE has no transform(), so both sets are embedded in one call.
        model=TSNE(n_components = 2, perplexity=10,random_state = 1)
        stacked=model.fit_transform(pd.concat([reals,fakes],ignore_index=True))
        realt,faket=stacked[:len(reals)],stacked[len(reals):]
    else:
        raise ValueError(
            f"dimentionality_reduction must be 'PCA' or 'TSNE', got {dimentionality_reduction!r}")

    print(f'real shape {realt.shape}')
    print(f'fake shape {faket.shape}')

    real_pca = pd.DataFrame(data = realt ,columns = ["Component 1","Component 2"])
    fake_pca = pd.DataFrame(data = faket ,columns = ["Component 1","Component 2"])

    real_pca['dataset']='real'
    fake_pca['dataset']='fake'

    # ignore_index avoids duplicate row labels in the frame handed to seaborn.
    g = sns.jointplot(data = pd.concat([real_pca,fake_pca],ignore_index=True),
                      x = "Component 1",
                      y = "Component 2",
                      palette=["#2171B5","#6BAED6"],
                      joint_kws={'alpha': 0.8},
                      hue="dataset")
    g.fig.subplots_adjust(top=0.95)  # Adjust the top margin for the title
    g.fig.suptitle(filename, y=0.99)  # Move title above the figure
    g.fig.set_size_inches((5, 5))
    if filename!='':
        g.savefig(filename+'.pdf')

In [None]:
# Draw one real-vs-synthetic projection per dataset in `dfs`; each figure is
# titled with the dataset name and saved as "<dataset name>.pdf".
for syn_name,fake in dfs.items():
  print(syn_name)
  print("-"*30)
  fake_processed=fake_process(fake,cat_cols,ss,ohe,y_map)
  table_plot(real_processed,fake_processed,filename=syn_name)

### Optuna for LGBM

In [24]:
!pip install optuna --quiet
import optuna
from optuna.samplers import TPESampler

In [29]:
def objective_lgbm(trial, X, y):
    """Optuna objective: mean cross-validated average precision of an LGBM model.

    Args:
        trial: Optuna trial used to draw the hyper-parameters.
        X: training features.
        y: binary training labels.

    Returns:
        Mean ``average_precision`` over a 3-fold stratified, shuffled CV
        (seed 0) for the sampled configuration.
    """
    # Hyper-parameters are drawn in a fixed order: learning_rate,
    # n_estimators, max_depth, num_leaves.
    sampled_params = {
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
        "n_estimators": trial.suggest_int("n_estimators", 50, 300),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "num_leaves": trial.suggest_int("num_leaves", 2, 50),
    }
    splitter = StratifiedKFold(n_splits=3, random_state=0, shuffle=True)
    classifier = lgb.LGBMClassifier(
        objective="binary",
        random_state=0,
        verbosity=-1,
        **sampled_params,
    )
    fold_scores = cross_val_score(
        classifier, X, y,
        cv=splitter,
        scoring='average_precision',
        n_jobs=-1,
    )
    return fold_scores.mean()

In [None]:
# Tune LightGBM on every dataset in `dfs`, refit the best configuration on the
# full training set and evaluate it on the real test split. Results are
# appended to the shared result list `r` under the estimator name 'Optuna'.
for syn_name,fake in dfs.items():
  print(syn_name)
  print("-"*30)
  fake_processed=fake_process(fake,cat_cols,ss,ohe,y_map)
  X_train, y_train=fake_processed.iloc[:,:-1],fake_processed.iloc[:,-1].astype(int)

  # Fixed sampler seed so the search is repeatable.
  lgbm_study = optuna.create_study(direction="maximize", sampler=TPESampler(seed=0))
  func = lambda trial: objective_lgbm(trial, X_train, y_train)
  lgbm_study.optimize(func, n_trials=100)

  # Report per dataset. The previous message formatted `i+1`, where `i` was a
  # file-name string left over from an earlier cell's loop, which raised
  # TypeError.
  print('Dataset: {} - Best trial: val-score {}, params {}'.format(syn_name,
                                                            lgbm_study.best_trial.value,
                                                            lgbm_study.best_trial.params))

  # Same fixed settings as in objective_lgbm, including the silenced logging.
  best_model = lgb.LGBMClassifier(objective="binary", random_state=0, verbosity=-1,
                                                        **lgbm_study.best_trial.params)


  best_model.fit(X_train, y_train)
  y_proba=best_model.predict_proba(X_test)[:, 1]
  r.append(['Optuna',syn_name,printPerformance(y_test, y_proba)])