In [2]:
import csv
import pandas as pd

# install libraries the first time you are using this notebook, 
# you can skip the installation process later by commenting the following three lines:
!pip install sdv==0.18.0
!pip install sdmetric==v0.9.1

from sdv.metrics.tabular import KSComplement
from sdv.lite import TabularPreset
from sdv.evaluation import evaluate
from dataclasses import dataclass

In [13]:
# This notebook reads a config file and uses it to create interpolated data
# from an input data set. Every step is driven by the config params;
# see readme.md for more details.

config = pd.read_json("config.json")  # config file contents, addressed by config key

# Where to read the input from and where to write the generated data to.
data_file = config["Input File"][0]
target_file = config["Output File"][0]

# Scalar settings taken from the config.
allowed_percentage = config["Percentage"][0]
number_of_samples = int(config["N_samples"][0])

# Per-feature settings; the keys are the names of the wanted headers.
feature_dict = config["Features"]
features = list(feature_dict.keys())

# Load the input table using the configured column separator.
data = pd.read_csv(str(data_file), sep=config["Input Column Separator"][0])
headers = data.columns

print(features)

print("Initialization:\nconfig file read, input is taken from file with name '",data_file, "'")

In [6]:
# Remove columns that are not listed as features in the config.
# Reads `data.columns` directly so the cell does not depend on a `headers`
# value left over from an earlier cell.
data = data.drop(columns=[col for col in data.columns if col not in features])

# Remove columns with too many NaNs.
# The NaN share is a fraction of the rows and is compared against `allowed_percentage`.
too_sparse = []
for header in data.columns:
    nan_percentage = data[header].isna().mean()
    if nan_percentage > allowed_percentage:
        too_sparse.append(header)
        print("Deleting column", header, "because of" ,"{:.2%}".format(nan_percentage), "NaNs.")
data = data.drop(columns=too_sparse)
headers = data.columns  # columns that survived both filters

# Columns whose configured type (first entry of their feature spec) is "numerical".
numericals = [col for col in headers if feature_dict[col][0] == "numerical"]

print("configured columns:",len(features))
print("after removing columns with too many NaNs:", len(headers))
print("how many columns have numericals:", len(numericals))
if len(numericals)<1:
    # Nothing is converted or filled in this case; the notebook only reports the problem.
    print("---------------------------------------------------------------------\n", 
          "Please specify at least one numerical column for input in config.json!")
else:
    # Values that cannot be parsed as numbers become NaN ...
    for numerical in numericals:
        data[numerical] = pd.to_numeric(data[numerical], errors='coerce')

    # ... and every remaining NaN is replaced by the last valid value above it.
    # `ffill()` is the non-deprecated spelling of `interpolate(method='pad')`.
    data = data.ffill()

configured columns: 8
after removing columns with too many NaNs: 2
how many columns have numericals: 0
---------------------------------------------------------------------
 Please specify at least one numerical column for input in config.json!


In [8]:
# Create the validation set: a random 30% sample of the cleaned rows.
# `reset_index` returns a new frame, so its result has to be assigned;
# `drop=True` discards the old row labels instead of adding them as a column.
# NOTE(review): the sampled rows stay in `data`, which is also what the model
# is fitted on - confirm that this overlap is intended.
validation_set = data.sample(frac=0.3).reset_index(drop=True)

print("rows in original data:", len(data))
print("rows for validation:", len(validation_set))

rows in original data: 66
rows for validation: 20


In [9]:
# Build the metadata dictionary for SDV: one field entry per remaining column.
field_specs = {}
for column in headers:
    spec = feature_dict[column]
    entry = {"type": spec[0]}
    # Only numerical fields carry a subtype (second entry of the feature spec).
    if spec[0] == "numerical":
        entry["subtype"] = spec[1]
    field_specs[column] = entry

metadata = {"fields": field_specs, "constraints": []}

print(metadata)

{'fields': {'chemical_formula': {'type': 'categorical'}, 'metabolite_identification': {'type': 'categorical'}}, 'constraints': []}


In [10]:
# Fit a model to the data and generate new data with SDV's TabularPreset
print("model fit preparation ...")
def makeData(x):
    """Fit a fresh model, sample synthetic rows, save them and score them.

    Uses the notebook-level `metadata`, `data`, `number_of_samples`,
    `target_file` and `validation_set`. The sampled rows are rounded to two
    decimals and written to `target_file`; every call writes to that same path.

    Parameters
    ----------
    x : int
        Zero-based iteration counter; only used (as ``x+1``) in the printed message.

    Returns
    -------
    str
        The KSComplement evaluation of the generated data against
        `validation_set`, formatted as a percentage with two decimals.
    """
    model = TabularPreset(name='FAST_ML', metadata=metadata) #SDV's FAST_ML preset uses ML to model your data
    model.fit(data)

    new_data = model.sample(num_rows=number_of_samples)
    new_data = new_data.round(decimals = 2)
    new_data.to_csv(target_file, index=False)

    # Evaluate once and reuse the formatted score for both the message and the return value.
    accuracy = "{:.2%}".format(evaluate(new_data, validation_set, metrics=['KSComplement']))
    print(x+1, ".: Writing to", str(target_file), "The generated data is", 
          accuracy, 
          "accurate to the original data.",
          "(if unsatisfactory try another config)")# using less or diff. features or more data or removing more NaNs.")
    
    return accuracy
print("done.")

model fit preparation ...
done.


In [11]:
# Create the new interpolated data; the name of the statistics file depends on config params.
# int() works for plain Python floats as well as numpy scalars and truncates like astype(int).
nanInPercent = int(allowed_percentage*100)
nrOfFeatures = len(features)
nrOfRows = len(data)
filename = str(nanInPercent)+"%NaNs_Interpolated_"+str(nrOfFeatures)+"Features_"+str(nrOfRows)+"InputLines.csv"

# Run the generation several times and collect the accuracy string of every run.
print("Making data in iterations:\n")
dataframe = []
for x in range(0,10):
    dataframe.append(makeData(x))

# Write one accuracy per row. The context manager closes the file even if a write fails.
with open(filename, "w", newline="") as file:
    writer = csv.writer(file)
    for val in dataframe:
        writer.writerow([val])

# Report success only after the statistics file has actually been written.
print("\nAccuracies from", x+1, "generated results written to", filename, "for statistical analysis.")
print("\nThe actual newly generated output with synthetically generated data has been successfully written to",target_file,".")
print("Use it to hand it over to someone else or tweak your synthetic data by achanging the config.")

1 .: Writing to output_NMR.csv The generated data is nan% accurate to the original data. (if unsatisfactory try another config)
2 .: Writing to output_NMR.csv The generated data is nan% accurate to the original data. (if unsatisfactory try another config)
3 .: Writing to output_NMR.csv The generated data is nan% accurate to the original data. (if unsatisfactory try another config)
4 .: Writing to output_NMR.csv The generated data is nan% accurate to the original data. (if unsatisfactory try another config)
5 .: Writing to output_NMR.csv The generated data is nan% accurate to the original data. (if unsatisfactory try another config)
6 .: Writing to output_NMR.csv The generated data is nan% accurate to the original data. (if unsatisfactory try another config)
7 .: Writing to output_NMR.csv The generated data is nan% accurate to the original data. (if unsatisfactory try another config)
8 .: Writing to output_NMR.csv The generated data is nan% accurate to the original data. (if unsatisfact