In [1]:
import numpy as np
import pandas as pd
# Load the small imputation fixture and display it; the B, C and D columns
# contain missing values.
df = pd.read_csv(filepath_or_buffer='./impute_test.csv')
df

Unnamed: 0,A,B,C,D
0,1,2.0,3.0,'Good'
1,4,,6.0,'Good'
2,7,,9.0,'Excellent'
3,10,11.0,12.0,
4,13,14.0,15.0,'Excellent'
5,16,17.0,,'Fair'
6,19,12.0,12.0,'Excellent'
7,20,11.0,23.0,'Fair'


In [2]:
from sklearn.impute import SimpleImputer
# Replace NaNs in the numeric columns B and C with each column's mean.
# fit_transform learns the means and applies them in one call; `imputer`
# is left fitted on those two columns.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
df[['B', 'C']] = imputer.fit_transform(df[['B', 'C']])
df

Unnamed: 0,A,B,C,D
0,1,2.0,3.0,'Good'
1,4,11.166667,6.0,'Good'
2,7,11.166667,9.0,'Excellent'
3,10,11.0,12.0,
4,13,14.0,15.0,'Excellent'
5,16,17.0,11.428571,'Fair'
6,19,12.0,12.0,'Excellent'
7,20,11.0,23.0,'Fair'


In [3]:
from classifier_oop import *

In [4]:
# Load the processed dataset, using the first CSV column as the row index.
processed_data = pd.read_csv(
    './processed_dataset/recent_propagated_dataset.csv',
    index_col=0,
)

In [5]:
# Prediction horizon in years; it selects which label column is the target.
year = 1
label = f'cancer_in_next_{year}_years'

# Presumably drops every column starting with 'cancer_' except `label`
# (confirm against remove_featues_startswith in classifier_oop), then removes
# exact duplicate rows.
source_df = remove_featues_startswith(
    processed_data,
    ['cancer_'],
    [label],
    show_removed=False,
).drop_duplicates()

# Keep only the rows whose label is known.
source_df = source_df.loc[source_df[label].notna()]

In [6]:
# NOTE(review): this overwrites `source_df` with a resampled version of itself,
# so re-running the cell resamples data that was already resampled.
# Presumably it resamples class 0 of `label` to 20000 rows; confirm against
# resample_class in classifier_oop, and check whether it takes a seed so the
# result is reproducible.
source_df = resample_class(source_df, label, 0, 20000)

In [7]:
# Column name -> constant passed to ImputerUtil as the constant-imputation map
# (presumably the fill value for missing entries; confirm in classifier_oop).
# Every column is filled with 0 except 'tvu_result', which is filled with 1.
#
# The original literal listed 'numcyst' twice; the second entry silently
# overwrote the first with the same value, so it has been dropped.
# NOTE(review): the left/right group below has 'ovcyst_diaml'/'ovcyst_diamr'
# but the first group has no 'ovcyst_diam'. The duplicated key may have been
# meant to be that column; confirm against the dataset schema.
impute_const_dict = {
    'numcyst': 0,
    'ovcyst_morph': 0,
    'ovcyst_outline': 0,
    'ovcyst_solid': 0,
    'ovcyst_sum': 0,
    'ovcyst_vol': 0,
    'tvu_result': 1,
    # Columns with an 'l' / 'r' suffix.
    'numcystl': 0,
    'numcystr': 0,
    'ovcyst_diaml': 0,
    'ovcyst_diamr': 0,
    'ovcyst_morphl': 0,
    'ovcyst_morphr': 0,
    'ovcyst_outlinel': 0,
    'ovcyst_outliner': 0,
    'ovcyst_solidl': 0,
    'ovcyst_solidr': 0,
    'ovcyst_suml': 0,
    'ovcyst_sumr': 0,
    'ovcyst_voll': 0,
    'ovcyst_volr': 0,
    # 'vis*' columns.
    'visboth': 0,
    'viseith': 0,
    'visl': 0,
    'visr': 0,
}
# Every numeric column that has no explicit constant fill is mean-imputed.
# Filter in order instead of using a set difference: iteration order of a set
# of strings changes between interpreter runs (hash randomisation), which made
# the column order handed to ImputerUtil non-reproducible. dict.fromkeys keeps
# the de-duplication the set provided while preserving first-seen order.
numeric_columns = [
    column
    for column in dict.fromkeys(select_numeric_columns(source_df))
    if column not in impute_const_dict
]
imputer_util = ImputerUtil(impute_const_dict, impute_mean_cols=numeric_columns, impute_median_cols=[])

In [8]:
data_util = ClassifierDataUtil(label, imputer_util)

# Split on ids rather than rows: keep one row per plco_id, split that frame
# 80/20, then hand the train and test id sets to the data util (presumably so
# all rows of one plco_id land on the same side; confirm in
# process_train_test_split).
unique_id_df = source_df[['plco_id', label]].drop_duplicates(subset='plco_id')
X_train_unique, X_test_unique, y_train, y_test = train_test_split(
    unique_id_df,
    unique_id_df[label],
    test_size=0.2,
    # Fixed seed so a kernel restart reproduces the same split. This assumes
    # the star-imported train_test_split is scikit-learn's, whose signature
    # this call matches.
    random_state=42,
)
data_util = data_util.process_train_test_split(source_df, X_train_unique['plco_id'], X_test_unique['plco_id'])

In [9]:
# Fresh ClassifierDataUtil for the k-fold run; this rebinds `data_util`,
# discarding the one produced by the single train/test split cell.
data_util = ClassifierDataUtil(label, imputer_util)
# NOTE(review): the meaning of the positional `False` is not visible from this
# notebook; confirm against TrainTestSplitUtil in classifier_oop.
train_test_split_util = TrainTestSplitUtil(source_df, data_util, False)
# Request 10 folds. The result is indexed 0..9 and each entry is called with
# no arguments by the cell that builds `data_utils`.
k_fold_lambdas = train_test_split_util.split_kfold(10)

In [10]:
# Invoke each of the 10 fold factories once, in fold order, and keep the
# objects they return.
data_utils = [k_fold_lambdas[fold_index]() for fold_index in range(10)]

In [23]:
# Show the `test_df` held by the first fold (index 0).
# NOTE(review): this renders the whole frame, including plco_id values;
# consider .head() and clearing the output before sharing.
data_utils[0].test_df

Unnamed: 0,plco_id,age,agelevel,arthrit_f,asp,asppd,bbd,bcontr_f,bcontra,bcontrt,...,uterine_fib,visboth,viseith,visl,visr,volum,was_screened,weight20_f,weight50_f,weight_f
181135,I-066697-7,55.0,0.0,1.0,0.0,0.0,0.0,0.0,1.483398,0.0,...,0.0,0,0,0,0,1.799805,1.0,150.0,180.0,192.0
371445,S-082341-0,65.0,1.0,0.0,0.0,0.0,0.0,1.0,1.000000,5.0,...,0.0,0,0,0,0,14.007812,1.0,120.0,140.0,160.0
535806,W-135580-0,55.0,0.0,1.0,0.0,0.0,0.0,1.0,1.000000,4.0,...,1.0,0,1,1,0,14.007812,1.0,118.0,122.0,122.0
164146,G-146957-8,55.0,0.0,1.0,0.0,0.0,0.0,1.0,1.000000,2.0,...,0.0,0,0,0,0,14.007812,1.0,110.0,105.0,125.0
6591,A-044378-9,60.0,1.0,0.0,1.0,1.0,0.0,1.0,2.000000,5.0,...,0.0,0,0,0,0,14.007812,1.0,215.0,235.0,218.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
653051,Z-139804-0,60.0,1.0,1.0,1.0,1.0,0.0,1.0,2.000000,5.0,...,1.0,0,1,1,0,4.199219,1.0,170.0,220.0,250.0
654404,Z-146095-6,75.0,2.0,0.0,1.0,5.0,0.0,0.0,1.483398,0.0,...,0.0,0,1,0,1,8.203125,1.0,110.0,135.0,185.0
655038,Z-148291-8,70.0,3.0,0.0,0.0,0.0,0.0,0.0,1.483398,0.0,...,0.0,0,0,0,0,14.007812,1.0,135.0,135.0,140.0
657068,Z-155932-9,65.0,2.0,1.0,0.0,0.0,0.0,1.0,1.000000,1.0,...,0.0,1,1,1,1,0.700195,1.0,118.0,130.0,170.0


In [22]:
# Show the `test_df` held by the last fold (index 9).
# NOTE(review): the rows rendered below are identical to the rows rendered for
# data_utils[0].test_df, although distinct folds would be expected to hold
# different test rows. Check split_kfold, e.g. for fold lambdas that all
# capture the same loop variable.
data_utils[9].test_df

Unnamed: 0,plco_id,age,agelevel,arthrit_f,asp,asppd,bbd,bcontr_f,bcontra,bcontrt,...,uterine_fib,visboth,viseith,visl,visr,volum,was_screened,weight20_f,weight50_f,weight_f
181135,I-066697-7,55.0,0.0,1.0,0.0,0.0,0.0,0.0,1.483398,0.0,...,0.0,0,0,0,0,1.799805,1.0,150.0,180.0,192.0
371445,S-082341-0,65.0,1.0,0.0,0.0,0.0,0.0,1.0,1.000000,5.0,...,0.0,0,0,0,0,14.007812,1.0,120.0,140.0,160.0
535806,W-135580-0,55.0,0.0,1.0,0.0,0.0,0.0,1.0,1.000000,4.0,...,1.0,0,1,1,0,14.007812,1.0,118.0,122.0,122.0
164146,G-146957-8,55.0,0.0,1.0,0.0,0.0,0.0,1.0,1.000000,2.0,...,0.0,0,0,0,0,14.007812,1.0,110.0,105.0,125.0
6591,A-044378-9,60.0,1.0,0.0,1.0,1.0,0.0,1.0,2.000000,5.0,...,0.0,0,0,0,0,14.007812,1.0,215.0,235.0,218.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
653051,Z-139804-0,60.0,1.0,1.0,1.0,1.0,0.0,1.0,2.000000,5.0,...,1.0,0,1,1,0,4.199219,1.0,170.0,220.0,250.0
654404,Z-146095-6,75.0,2.0,0.0,1.0,5.0,0.0,0.0,1.483398,0.0,...,0.0,0,1,0,1,8.203125,1.0,110.0,135.0,185.0
655038,Z-148291-8,70.0,3.0,0.0,0.0,0.0,0.0,0.0,1.483398,0.0,...,0.0,0,0,0,0,14.007812,1.0,135.0,135.0,140.0
657068,Z-155932-9,65.0,2.0,1.0,0.0,0.0,0.0,1.0,1.000000,1.0,...,0.0,1,1,1,1,0.700195,1.0,118.0,130.0,170.0


In [14]:
# Build a throwaway ClassifierDataUtil and copy it; the next cells display
# both objects so their reprs can be compared.
# NOTE(review): as far as this notebook shows, `imputer` is the fitted sklearn
# SimpleImputer from the mean-imputation cell, whereas the other cells pass
# `imputer_util`; confirm that is intended.
# NOTE(review): the name `copy` shadows the stdlib `copy` module if that module
# is present in this namespace.
original = ClassifierDataUtil(label='test', imputer=imputer, train_size=5000)
copy = original.copy()


In [15]:
# Display the original instance; the repr shows the class and memory address.
original

<classifier_oop.ClassifierDataUtil at 0x166d72b19f0>

In [16]:
# Display the copy; a different address from `original` means copy() returned
# a distinct object.
copy

<classifier_oop.ClassifierDataUtil at 0x166d72b3760>