In [629]:
import pandas as pd
from sdv.lite import TabularPreset
from sdv.evaluation import evaluate
import numpy as np
import math

In [630]:
# Load the run configuration from CSV.
config_file = "config.csv"
config = pd.read_csv(config_file)

# Run-wide settings are read from the row labelled 0 of the configuration.
data_file = config.loc[0, "Input File"]
target_file = config.loc[0, "Output File"]
allowed_percentage = config.loc[0, "Percentage"]
number_of_samples = int(config.loc[0, "Number of Samples"])

# Per-feature settings: these three lists are read from whole columns and
# are indexed in parallel by later cells.
features = config["Features"].tolist()
types = config["Type"].tolist()
sub_types = config["Subtype"].tolist()

# Load the input table named by the configuration.
data = pd.read_csv(data_file)
headers = data.columns

In [631]:

# Clean the input table: keep configured columns, drop sparse columns, and
# make sure every numerical column is numeric and free of missing values.

# Remove unspecified columns (anything not listed under "Features").
data = data.drop(columns=[header for header in data.columns if header not in features])

# Remove columns with too many NaNs.
sparse_columns = []
for header, nan_percentage in data.isna().mean().items():
    if nan_percentage > allowed_percentage:
        sparse_columns.append(header)
        print("Deleting column", header, "because of" ,"{:.2%}".format(nan_percentage), "NaNs.")
data = data.drop(columns=sparse_columns)
headers = data.columns

# Surviving features whose configured type is "numerical".
numericals = [
    feature
    for feature, feature_type in zip(features, types)
    if feature in headers and feature_type == "numerical"
]

# Coerce BEFORE dropping rows: errors='coerce' turns unparseable values into
# NaN, so the dropna below has to run afterwards to remove those rows as well.
for numerical in numericals:
    data[numerical] = pd.to_numeric(data[numerical], errors='coerce')

data = data.dropna(axis=0, how='any', subset=numericals)


Deleting column Neoplasm Histologic Grade because of 91.28% NaNs.
Deleting column Treatment Details before PDX because of 90.58% NaNs.


In [632]:
# Create the validation set: a random 30% sample of the rows in `data`.
# A fixed seed keeps the split (and any score computed from it) reproducible
# across "Restart Kernel -> Run All".
VALIDATION_SEED = 42
validation_set = data.sample(frac=0.3, random_state=VALIDATION_SEED)

# Display only: the result is not assigned, so validation_set keeps the row
# labels it had in `data`.
# NOTE(review): sampling does not remove these rows from `data`.
validation_set.reset_index()


Unnamed: 0,index,Cancer Type Detailed,Mutation Count,Oncotree Code,Overall Survival (Months),Sample Type,Diagnosis Age,TERT,IDH1,PABPC3,PTEN
0,629,Glioblastoma Multiforme,5.0,GBM,10.60,Primary,74.0,Promoter,WT,NS,WT
1,625,Oligodendroglioma,25.0,ODG,95.30,Recurrence,38.0,Promoter,R132H,NS,WT
2,126,Oligodendroglioma,55.0,ODG,132.00,Recurrence,47.0,Promoter,R132H,NS,WT
3,824,Astrocytoma,30.0,ASTR,38.00,Recurrence,35.0,WT,R132H,WT,WT
4,732,Glioblastoma Multiforme,5.0,GBM,6.02,Primary,71.0,Promoter,WT,NS,WT
...,...,...,...,...,...,...,...,...,...,...,...
252,229,Glioblastoma Multiforme,3.0,GBM,21.00,Primary,54.0,WT,WT,NS,WT
253,857,Anaplastic Oligodendroglioma,7.0,AODG,93.90,,45.0,Promoter,R132H,NS,WT
254,479,Glioblastoma Multiforme,4.0,GBM,12.50,Recurrence,53.0,Promoter,WT,NS,H185Ifs*14
255,141,Glioblastoma Multiforme,8.0,GBM,12.40,Primary,67.0,Promoter,WT,NS,WT


In [633]:
# Build the metadata dictionary: one entry per configured feature that is
# still present in `headers`, plus an empty constraint list.
field_specs = {}
for feature, field_type, field_subtype in zip(features, types, sub_types):
    if feature not in headers:
        continue

    spec = {"type": field_type}
    # NaN is the only value that is not equal to itself, so this comparison
    # skips a subtype that was left blank in the configuration.
    if field_subtype == field_subtype:
        spec["subtype"] = field_subtype
    field_specs[feature] = spec

metadata = {"fields": field_specs, "constraints": []}



In [634]:
# Fit the preset model, sample synthetic rows, write them to the target file,
# and report how closely they match the validation set.
# NOTE(review): the model is fit on all of `data`; confirm whether the rows in
# validation_set are meant to be held out from training.
model = TabularPreset(name='FAST_ML', metadata=metadata)
model.fit(data)

# Sample the configured number of rows and round them to two decimals.
new_data = model.sample(num_rows=number_of_samples).round(decimals = 2)
new_data.to_csv(target_file, index=False)

# Score the synthetic rows against the validation set with CSTest and KSTest.
similarity_score = evaluate(new_data, validation_set, metrics=['CSTest', 'KSTest'])
print(
    "The generated data is",
    "{:.2%}".format(similarity_score),
    "accurate to the original data. If this is unsatisfactory try using less or different features, more data or removing more NaNs.",
)

The generated data is 38.09% accurate to the original data. If this is unsatisfactory try using less or different features, more data or removing more NaNs.


