In [11]:
import csv
import pandas as pd
from sdv.lite import TabularPreset
from sdv.evaluation import evaluate
from dataclasses import dataclass

In [14]:
# This notebook reads its settings from a config file and uses them to create
# interpolated data from an input data set.
# The interpolation is customizable by config params. See readme.md for more details.

config = pd.read_json("config.json")  # configuration, loaded into a DataFrame

# Input and output locations are taken from the first entry of their config columns.
data_file = config["Input File"][0]
target_file = config["Output File"][0]

data = pd.read_csv(str(data_file))
headers = data.columns

# The keys of the "Features" entry are the names of the wanted columns; the
# values are looked up later to decide how each column is treated.
feature_dict = config["Features"]
features = list(feature_dict.keys())  # names of relevant headers

allowed_percentage = config["Percentage"][0]
number_of_samples = int(config["N_samples"][0])

print("initialization: config file read, input is", data_file)

initialization: config file read, input is combined.csv


In [15]:
# Remove columns that are not listed in the configuration.
unwanted = [header for header in headers if header not in features]
data = data.drop(columns=unwanted)

# Remove columns with too many NaNs.
# NOTE: this check runs before the numeric coercion below, so values that only
# become NaN through `errors='coerce'` are not counted against the threshold.
headers = data.columns  # columns that survived the configuration filter
for header in headers:
    nan_percentage = data[header].isna().sum() / len(data[header].index)
    if nan_percentage > allowed_percentage:
        data = data.drop(columns=header)
        # Rebinding `headers` is safe here: the loop keeps iterating over the
        # original Index object, while the name tracks the remaining columns.
        headers = headers.drop(header)
        print("Deleting column", header, "because of" ,"{:.2%}".format(nan_percentage), "NaNs.")

# Coerce the columns configured as "numerical" to numbers; unparsable entries
# become NaN.
numericals = [x for x in headers if feature_dict[x][0] == "numerical"]
for numerical in numericals:
    data[numerical] = pd.to_numeric(data[numerical], errors='coerce')

# Fill remaining NaNs with the last valid value above them (forward fill).
# `ffill()` replaces the deprecated `interpolate(method='pad')`.
# NaNs at the very top of a column have no predecessor and stay NaN.
data = data.ffill()


Deleting column Neoplasm Histologic Grade because of 76.59% NaNs.
Deleting column Treatment Details before PDX because of 94.68% NaNs.


In [16]:
# Create the validation set: a random 30% sample of the cleaned data.
# NOTE: the sampled rows are NOT removed from `data`, so the model that is
# later fit on `data` has also seen every validation row.
# `reset_index` returns a new frame, so its result must be assigned;
# `drop=True` avoids adding the old index as an extra column.
validation_set = data.sample(frac=0.3).reset_index(drop=True)

print("rows in original data:", len(data))
print("rows for validation:", len(validation_set))


rows in original data: 1542
rows for validation: 463


In [17]:
# Generate the metadata dictionary for SDV: one entry per remaining column with
# its configured type and, for numerical columns, its configured subtype.
fields = {}
for column in headers:
    column_type = feature_dict[column][0]
    entry = {"type": column_type}
    if column_type == "numerical":
        entry["subtype"] = feature_dict[column][1]
    fields[column] = entry

metadata = {"fields": fields, "constraints": []}

In [18]:
# Fit a model to the data and generate new data with SDV's TabularPreset.
def makeData(verbose=False):
    """Fit a model on `data`, sample new rows, save them and score them.

    A fresh `TabularPreset` (FAST_ML preset) is fit on the module-level `data`
    using `metadata`. `number_of_samples` rows are sampled, rounded to two
    decimals and written to `target_file` (overwriting it on every call).
    The sample is then scored against `validation_set` with the KSTest metric.

    Args:
        verbose: When True, also print a human-readable summary of the score.
            Defaults to False, which matches the previous behavior (the
            summary was unreachable code after the `return`).

    Returns:
        The evaluation score formatted as a percentage string with two
        decimals (format spec ``{:.2%}``).
    """
    model = TabularPreset(name='FAST_ML', metadata=metadata)
    model.fit(data)

    new_data = model.sample(num_rows=number_of_samples)
    new_data = new_data.round(decimals=2)
    new_data.to_csv(target_file, index=False)

    # Evaluate once and reuse the result for both the message and the return value.
    score = "{:.2%}".format(evaluate(new_data, validation_set, metrics=['KSTest']))

    if verbose:
        print("The generated data is",
              score,
              "accurate to the original data. ",
              "If this is unsatisfactory try using less or different features, more data or removing more NaNs.")

    return score

In [63]:
# Generate data repeatedly and store the evaluation score of every run in a CSV
# file whose name is derived from the config params.
# (The generated data itself is written to `target_file` inside makeData.)

# Round before converting: plain truncation would turn e.g. 0.29 * 100
# (28.999999999999996 in floating point) into 28.
nanInPercent = int(round(allowed_percentage * 100))
nrOfFeatures = len(features)
nrOfRows = len(data)
filename = str(nanInPercent)+"%NaNs_Interpolated_"+str(nrOfFeatures)+"Features_"+str(nrOfRows)+"InputLines.csv"

n_runs = 1000  # number of fit/sample/evaluate repetitions

print("making data")
# Despite its name, `dataframe` is a plain list of formatted score strings.
dataframe = [makeData() for _ in range(n_runs)]

print("writing to ", filename)

# The context manager closes the file even if writing fails part-way.
with open(filename, "w", newline="") as file:
    writer = csv.writer(file)
    # One score per row.
    writer.writerows([val] for val in dataframe)

making data
writing to  10%NaNs_Interpolated_18Features_1542InputLines.csv


In [67]:
# Requires the third-party `session_info` package; if it is missing, install it
# first (e.g. `pip install session_info`).
# Show the Jupyter session (package versions) used for the output above.
import session_info
session_info.show()