In [12]:
from pathlib import Path
import colorcet as cc
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import seaborn as sns

from datasets import load_dataset, Dataset, load_from_disk

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report


# Notebook-wide defaults.
# NOTE: this silences *every* warning for the rest of the session.
warnings.filterwarnings("ignore")

# Numeric display: fixed 4-decimal numpy arrays, 3-digit pandas precision.
np.set_printoptions(precision=4, floatmode='fixed')
pd.set_option('display.precision', 3)

# Matplotlib text defaults: font size and font family in a single update.
plt.rcParams.update({'font.size': 12, 'font.family': 'Roboto'})

# Shared keyword arguments for heatmap plots.
# ('linewidths': 0.5 is intentionally left disabled.)
heatmap_parms = dict(
    linecolor='white',
    cmap=cc.cm.bkr,
)


In [13]:
# reload magics
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from project_modules.utils import get_logger
logger = get_logger("log-synthetic-data.log")
# read the parameter file

from project_modules.utils import read_parameters
parms = read_parameters("/Users/david/projects/lc-project-data/project.yaml")

[2024-04-05 08:14:18] [get_logger] Logger initialized.
[2024-04-05 08:14:18] [get_logger] Logging to file: log-synthetic-data.log
[2024-04-05 08:14:18] [read_parameters] Reading parameters.
[2024-04-05 08:14:18] [read_parameters] ... reading /Users/david/projects/lc-project-data/project.yaml


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [14]:
from pathlib import Path

# Raw-data and processed-data directories, both taken from the project parameters.
input_dir, output_dir = (
    Path(parms[key]) for key in ("dir_raw_data", "dir_proc_data")
)

In [15]:
# import blobs from sklearn
from sklearn.datasets import make_blobs, make_classification


In [16]:
# Size of the synthetic classification problem.
N_SAMPLES = 1_000   # rows to generate
N_FEATURES = 25     # feature columns per row
N_CLASSES = 3       # distinct target labels

In [17]:
# Names of the data categories.
types = [
    "clinical",
    "hx",
    "labs",
    "demographics",
]

# Empty placeholder frames for the features (dfX) and the labels (dfy).
dfX, dfy = pd.DataFrame(), pd.DataFrame()

In [18]:

from sklearn.preprocessing import MinMaxScaler
from datasets import Dataset

# Generate the synthetic classification problem: one cluster per class,
# 10 informative features, no redundant or repeated features, fixed seed.
# (An earlier variant used make_blobs with cluster_std=3.6 and random_state=50.)
X, y = make_classification(
    n_samples=N_SAMPLES,
    n_features=N_FEATURES,
    n_classes=N_CLASSES,
    n_clusters_per_class=1,
    n_informative=10,
    n_redundant=0,
    n_repeated=0,
    random_state=50,
)

# Rescale each feature column with a min-max scaler (fit on the full matrix).
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

# Wrap the scaled features and the labels in DataFrames.
dfX = pd.DataFrame(X)
dfy = pd.DataFrame(y, columns=['TRUE_LABEL'])

# Convert the features to the datasets format and persist them.
features_dataset = Dataset.from_pandas(dfX)
features_dataset.save_to_disk(output_dir / "SYNTHETIC_features")

In [19]:
# Display the scaled feature frame (one column per generated feature).
dfX

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,0.344,0.335,0.660,0.434,0.559,0.536,0.609,0.312,0.432,0.572,...,0.657,0.527,0.746,0.362,0.505,0.498,0.272,0.651,0.573,0.233
1,0.312,0.473,0.260,0.782,0.210,0.396,0.659,0.579,0.360,0.726,...,0.868,0.507,0.518,0.399,0.679,0.401,0.581,0.505,0.488,0.643
2,0.280,0.514,0.437,0.242,0.414,0.837,0.413,0.674,0.716,0.579,...,0.442,0.634,0.557,0.681,0.576,0.590,0.487,0.627,0.274,0.091
3,0.741,0.303,0.171,0.400,0.270,0.665,0.515,0.583,0.388,0.634,...,0.579,0.596,0.372,0.784,0.525,0.500,0.500,0.321,0.516,0.556
4,0.552,0.143,0.230,0.200,0.417,0.742,0.864,0.272,0.534,0.149,...,0.506,0.705,0.241,0.512,0.393,0.214,0.171,0.711,0.388,0.335
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.374,0.597,0.578,0.537,0.628,0.405,0.596,0.531,0.558,0.263,...,0.455,0.082,0.596,0.596,0.661,0.479,0.698,0.495,0.587,0.426
996,0.595,0.411,0.724,0.835,0.555,0.600,0.149,0.480,0.526,0.534,...,0.399,0.609,0.497,0.442,0.556,0.376,0.259,0.380,0.754,0.361
997,0.311,0.468,0.249,0.530,0.361,0.754,0.712,0.273,0.742,0.705,...,0.285,0.525,0.640,0.409,0.512,0.436,0.567,0.338,0.641,0.484
998,0.689,0.182,0.477,0.369,0.480,0.637,0.671,0.507,0.180,0.545,...,0.700,0.327,0.566,0.540,0.153,0.460,0.351,0.439,0.650,0.172


In [20]:
# Show the Dataset summary: its feature (column) names and row count.
features_dataset

Dataset({
    features: ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24'],
    num_rows: 1000
})

In [21]:
# Make the labels dataset from dfy (single 'TRUE_LABEL' column) and save it.
# (A previous version rebuilt the frame with a 'label' column instead.)
labels_dataset = Dataset.from_pandas(dfy)

labels_path = output_dir / "SYNTHETIC_labels"
labels_dataset.save_to_disk(labels_path)


In [22]:
from datasets import DatasetDict

# Bundle the features and labels datasets under one DatasetDict,
# keyed "features" and "labels", and write the bundle to disk.
dataset_dict = DatasetDict({"features": features_dataset, "labels": labels_dataset})
dataset_dict.save_to_disk(output_dir / "SYNTHETIC_dataset_dict")