In [1]:
import csv
import pandas as pd

# install libraries the first time you are using this notebook by uncommenting the following:
#!pip install --upgrade sdv #(v0.18.0)
#!pip install sdmetrics #(v0.9.1)
#!pip install session_info

from sdv.metrics.tabular import KSComplement
from sdv.lite import TabularPreset
from sdv.evaluation import evaluate
from dataclasses import dataclass

In [11]:
# This notebook is driven by a config file: it names the input data set, the
# output file and the parameters that customise the interpolation.
# See readme.md for more details.

# --- configuration -----------------------------------------------------------
config = pd.read_json("config.json")  # config file, loaded as a DataFrame

feature_dict = config["Features"]
features = list(feature_dict.keys())  # names of the wanted headers

data_file = config["Input File"][0]
target_file = config["Output File"][0]
allowed_percentage = config["Percentage"][0]
number_of_samples = int(config["N_samples"][0])

# --- input data --------------------------------------------------------------
data = pd.read_csv(str(data_file))
headers = data.columns

print("Initialization:\nconfig file read, input is", data_file)

Initialization:
config file read, input is combined.csv


In [12]:
# Remove columns that are not listed in the configuration
for header in headers:
    if header not in features:
        data.pop(header)

# Remove columns with too many NaNs
headers = data.columns  # columns that survived the configuration filter
for header in headers:
    # Share of missing values in this column (NaN count / row count)
    nan_percentage = data[header].isna().mean()
    if nan_percentage > allowed_percentage:
        data.pop(header)
        # Index.drop returns a new Index; the loop keeps iterating over the
        # original one, so rebinding `headers` here is safe.
        headers = headers.drop(header)
        print("Deleting column", header, "because of" ,"{:.2%}".format(nan_percentage), "NaNs.")

# Collect the remaining columns that the configuration declares as numerical
numericals = [x for x in headers if feature_dict[x][0] == "numerical"]

print("configured columns:",len(features))
print("after remowing columns with too many NaNs:", len(headers))
print("how many columns have numericals:", len(numericals))

# Values that cannot be parsed as numbers become NaN (errors='coerce')
for numerical in numericals:
    data[numerical] = pd.to_numeric(data[numerical], errors='coerce')

# Fill each remaining NaN with the last valid value above it in the same column.
# `ffill()` is the direct replacement for the deprecated
# `interpolate(method='pad')`; NaNs before the first valid value stay NaN.
data = data.ffill()


Deleting column Neoplasm Histologic Grade because of 76.59% NaNs.
Deleting column Treatment Details before PDX because of 94.68% NaNs.
configured columns: 18
after remowing columns with too many NaNs: 16
how many columns have numericals: 3


In [13]:
# Create the validation set: a random 30% sample of the cleaned data.
# NOTE(review): the sampled rows are NOT removed from `data`, so they stay part
# of anything that is fitted on `data` afterwards -- confirm this is intended.

VALIDATION_SEED = 42  # fixed seed so the same validation rows are drawn on every run

# reset_index returns a new frame, so its result has to be assigned;
# drop=True discards the old row labels instead of adding them as an "index" column.
validation_set = (
    data.sample(frac=0.3, random_state=VALIDATION_SEED)
    .reset_index(drop=True)
)

print("rows in original data:", len(data))
print("rows for validation:", len(validation_set))


rows in original data: 1542
rows for validation: 463


In [26]:
# Build the metadata dictionary handed to SDV: one entry per remaining column
field_specs = {}
for column in headers:
    column_type = feature_dict[column][0]
    spec = {"type": column_type}
    # Only numerical columns carry a subtype (second entry of the config value)
    if column_type == "numerical":
        spec["subtype"] = feature_dict[column][1]
    field_specs[column] = spec

metadata = {"fields": field_specs, "constraints": []}

print(metadata)

{'fields': {'Cancer Type Detailed': {'type': 'categorical'}, 'Mutation Count': {'type': 'numerical', 'subtype': 'integer'}, 'Oncotree Code': {'type': 'categorical'}, 'Overall Survival (Months)': {'type': 'numerical', 'subtype': 'float'}, 'Sample Type': {'type': 'categorical'}, 'Diagnosis Age': {'type': 'numerical', 'subtype': 'integer'}, 'TERT': {'type': 'categorical'}, 'IDH1': {'type': 'categorical'}, 'PABPC3': {'type': 'categorical'}, 'PTEN': {'type': 'categorical'}, 'SPRY3': {'type': 'categorical'}, 'MUC12': {'type': 'categorical'}, 'GXYLT1': {'type': 'categorical'}, 'SMARCA4': {'type': 'categorical'}, 'FAT1': {'type': 'categorical'}, 'MUC5B': {'type': 'categorical'}}, 'constraints': []}


In [27]:
# Fit a model to the data and generate new data with SDV's TabularPreset
def makeData(x):
    """Fit a FAST_ML TabularPreset on ``data``, sample new rows and score them.

    Uses the notebook globals ``metadata``, ``data``, ``number_of_samples``,
    ``target_file`` and ``validation_set``. The sampled rows are rounded to two
    decimals and written to ``target_file``; every call writes to that same path.

    Parameters
    ----------
    x : int
        Zero-based iteration number; only used to number the progress message.

    Returns
    -------
    str
        Result of ``evaluate`` with the ``KSComplement`` metric for the sampled
        rows against ``validation_set``, formatted as a percentage with two
        decimals (e.g. ``"77.67%"``).
    """
    model = TabularPreset(name='FAST_ML', metadata=metadata)
    model.fit(data)

    new_data = model.sample(num_rows=number_of_samples)
    new_data = new_data.round(decimals = 2)
    new_data.to_csv(target_file, index=False)

    # Evaluate once and reuse the formatted score for both the message and the
    # return value, so the two can never disagree and the work is not repeated.
    accuracy = "{:.2%}".format(evaluate(new_data, validation_set, metrics=['KSComplement']))

    # Hint for the reader of the message: using fewer or different features,
    # more data, or removing more NaNs are the config changes to try.
    print(x+1, ".: Writing to", str(target_file), "The generated data is",
          accuracy,
          "accurate to the original data.",
          "(if unsatisfactory try another config")

    return accuracy

In [33]:
# Create the new interpolated data several times and store the accuracy of
# every run in a file whose name is derived from the config params.
N_ITERATIONS = 10  # how many times the data is generated and evaluated

# int() truncates exactly like .astype(int) but also works when the
# percentage is a plain Python float rather than a numpy scalar.
nanInPercent = int(allowed_percentage * 100)
nrOfFeatures = len(features)
nrOfRows = len(data)
filename = str(nanInPercent)+"%NaNs_Interpolated_"+str(nrOfFeatures)+"Features_"+str(nrOfRows)+"InputLines.csv"

print("Making data in iterations:\n")
dataframe = []  # formatted accuracy of each iteration, in order
for x in range(N_ITERATIONS):
    dataframe.append(makeData(x))

# One accuracy per row; the context manager closes the file even if writing fails.
with open(filename, "w", newline="") as file:
    writer = csv.writer(file)
    for val in dataframe:
        writer.writerow([val])

# Report the number of results (not the last loop index) once the file is written.
print("\nAccuracies from", len(dataframe), "generated results written to", filename, "for statistical analysis.")

Making data in iterations:

1 .: Writing to output.csv The generated data is 77.67% accurate to the original data. (if unsatisfactory try another config
2 .: Writing to output.csv The generated data is 76.58% accurate to the original data. (if unsatisfactory try another config
3 .: Writing to output.csv The generated data is 76.28% accurate to the original data. (if unsatisfactory try another config
4 .: Writing to output.csv The generated data is 77.09% accurate to the original data. (if unsatisfactory try another config
5 .: Writing to output.csv The generated data is 76.94% accurate to the original data. (if unsatisfactory try another config
6 .: Writing to output.csv The generated data is 76.99% accurate to the original data. (if unsatisfactory try another config
7 .: Writing to output.csv The generated data is 76.79% accurate to the original data. (if unsatisfactory try another config
8 .: Writing to output.csv The generated data is 75.79% accurate to the original data. (if unsati

In [29]:
#for session info uncomment the lines below
#!pip install session_info
#print jupyter session used for the output above
#import session_info
#session_info.show()