In [2]:
from pathlib import Path
import colorcet as cc
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import seaborn as sns

from datasets import load_dataset, Dataset, load_from_disk

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report


# Notebook-wide defaults.
# All warnings are suppressed so library chatter does not clutter cell output.
warnings.filterwarnings("ignore")

# Numeric display: NumPy arrays print with 4 fixed decimals, pandas with 3.
np.set_printoptions(floatmode='fixed', precision=4)
pd.set_option('display.precision', 3)

# Matplotlib text defaults: base font size and font family in a single update.
plt.rcParams.update({
    'font.size': 12,
    'font.family': 'Roboto',
})

# Keyword arguments bundled for heatmap plots.
# NOTE(review): presumably unpacked into seaborn heatmap calls; not used in
# the cells visible here -- confirm against later cells.
heatmap_parms = {
    # 'linewidths' : 0.5,
    'linecolor': 'white',
    'cmap': cc.cm.bkr,
}


In [3]:
# Load IPython's autoreload extension and switch it to mode 2 so that edits to
# imported modules (e.g. project_modules) are picked up without restarting
# the kernel.
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import os

from project_modules.utils import get_logger, read_parameters

# File-backed logger for this notebook.
logger = get_logger("log-synthetic-data.log")

# Read the project parameter file.
# The location can be overridden with the LC_PROJECT_PARAMS environment
# variable so the notebook is not tied to one machine's directory layout;
# when the variable is unset the original path is used, so existing
# behaviour is unchanged.
PARAMS_FILE = os.environ.get(
    "LC_PROJECT_PARAMS",
    "/Users/david/projects/lc-project-data/project.yaml",
)
parms = read_parameters(PARAMS_FILE)

[2024-04-05 14:42:08] [get_logger] Logger initialized.
[2024-04-05 14:42:08] [get_logger] Logging to file: log-synthetic-data.log
[2024-04-05 14:42:08] [read_parameters] Reading parameters.
[2024-04-05 14:42:08] [read_parameters] ... reading /Users/david/projects/lc-project-data/project.yaml




In [4]:
from pathlib import Path
# Resolve the raw-data (input) and processed-data (output) directories from
# the project parameters held in `parms`.
input_dir, output_dir = (
    Path(parms[key]) for key in ("dir_raw_data", "dir_proc_data")
)

In [5]:
# import blobs from sklearn
from sklearn.datasets import make_blobs, make_classification


In [6]:
# Size of the synthetic dataset: number of rows, number of feature columns,
# and the intended number of label classes.
N_SAMPLES, N_FEATURES, N_CLASSES = 1_000, 25, 3

In [7]:
# Names of data categories.
# NOTE(review): `types` is not referenced by any cell visible here -- confirm
# whether later cells rely on it.
types = [
    "clinical",
    "hx",
    "labs",
    "demographics",
]

# Empty placeholder frames for the labels and the features; both names are
# reassigned once the synthetic data has been generated.
dfy = pd.DataFrame()
dfX = pd.DataFrame()

In [8]:

from sklearn.preprocessing import MinMaxScaler
from datasets import Dataset

# Generate the synthetic feature matrix X and integer cluster labels y.
# `centers` is passed explicitly so that N_CLASSES really controls the number
# of clusters. Previously it was omitted and make_blobs fell back to its
# default of 3 centers, which matched N_CLASSES only by coincidence; for the
# current N_CLASSES = 3 the generated data is unchanged.
X, y = make_blobs(
    n_samples=N_SAMPLES,
    n_features=N_FEATURES,
    centers=N_CLASSES,
    cluster_std=3.6,
    random_state=50,  # fixed seed so the dataset is reproducible
)

# Alternative generator, kept for reference:
# X, y = make_classification(n_samples=N_SAMPLES,
#                             n_features=N_FEATURES,
#                             n_classes=N_CLASSES,
#                             n_clusters_per_class=1,
#                             n_informative=10,
#                             n_redundant=0,
#                             n_repeated=0,
#                             random_state=50
#                 )

# Rescale every feature column with a min-max scaler (default range [0, 1]).
# The fitted scaler is kept in `scaler` for any later use.
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

# Sanity checks: fail fast if the generated data does not have the
# configured size.
assert X.shape == (N_SAMPLES, N_FEATURES), X.shape
assert len(y) == N_SAMPLES, len(y)

# Wrap the arrays in DataFrames. dfX keeps pandas' default integer column
# labels; the labels frame has a single TRUE_LABEL column.
dfX = pd.DataFrame(X)
dfy = pd.DataFrame(y, columns=['TRUE_LABEL'])

# Convert the features to a Hugging Face Dataset (the integer column labels
# become the string feature names '0', '1', ...) and save it under the
# processed-data directory.
features_dataset = Dataset.from_pandas(dfX)
features_dataset.save_to_disk(Path(output_dir, "SYNTHETIC_features"))

In [9]:
# Display the scaled feature frame as the cell's output for a visual check.
dfX

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,0.303,0.521,0.191,0.467,0.627,0.678,0.753,0.508,0.544,0.826,...,0.364,0.799,0.602,0.439,0.769,0.548,0.625,0.363,0.436,0.363
1,0.258,0.464,0.330,0.533,0.729,0.771,0.888,0.677,0.376,0.677,...,0.733,0.549,0.600,0.455,0.595,0.490,0.288,0.279,0.652,0.190
2,0.394,0.801,0.785,0.539,0.651,0.194,0.311,0.408,0.259,0.251,...,0.820,0.399,0.339,0.666,0.308,0.565,0.525,0.306,0.437,0.642
3,0.622,0.509,0.552,0.662,0.295,0.678,0.496,0.758,0.497,0.261,...,0.427,0.176,0.416,0.572,0.293,0.164,0.216,0.630,0.576,0.477
4,0.729,0.602,0.371,0.464,0.317,0.730,0.390,0.618,0.784,0.487,...,0.803,0.225,0.536,0.689,0.425,0.598,0.428,0.757,0.573,0.749
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.493,0.641,0.484,0.196,0.568,0.300,0.568,0.274,0.405,0.446,...,0.497,0.433,0.410,0.763,0.321,0.334,0.570,0.263,0.371,0.628
996,0.298,0.631,0.459,0.363,0.651,0.665,0.720,0.346,0.284,0.756,...,0.618,0.723,0.454,0.610,0.463,0.710,0.361,0.168,0.781,0.459
997,0.498,0.636,0.226,0.404,0.665,0.489,0.693,0.601,0.446,0.837,...,0.752,0.641,0.503,0.388,0.828,0.510,0.626,0.285,0.642,0.035
998,0.251,0.467,0.635,0.167,0.666,0.188,0.210,0.736,0.000,0.223,...,0.317,0.353,0.371,0.622,0.204,0.470,0.427,0.401,0.565,0.655


In [10]:
# Show the Dataset summary (feature names and row count) as the cell's output.
features_dataset

Dataset({
    features: ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24'],
    num_rows: 1000
})

In [11]:
# Make the labels dataset from the label frame `dfy` and save it to disk
# under the processed-data directory.
labels_dataset = Dataset.from_pandas(dfy)
labels_dataset.save_to_disk(output_dir / "SYNTHETIC_labels")


In [12]:
# Bundle the features and labels datasets into a single DatasetDict, keyed by
# "features" and "labels", and save the bundle to the processed-data directory.
from datasets import DatasetDict

named_datasets = {
    "features": features_dataset,
    "labels": labels_dataset,
}
dataset_dict = DatasetDict(named_datasets)

dataset_dict.save_to_disk(output_dir / "SYNTHETIC_dataset_dict")