In [1]:
from typing import List, Dict

from oop_functions.experiment_helper import *
from oop_functions.experiment_runner import ExperimentRunner
from oop_functions.util_functions import *
from oop_functions.visualization_util import *
from oop_functions.analytics_cv_util import *
from oop_functions.analytics_utils import *

%matplotlib inline
import warnings

warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

import itertools

from dcurves import dca, plot_graphs


In [2]:
# Suffix shared by the saved artefacts of one experiment run; the cells below
# use it to build the feature-importance CSV path and to load the CV analytics.
filesuffix = '_for_experiment_participants_screened_single_first_5_RandomForestClassifier_cancer_in_next_1_years__15_trials'

In [3]:
# Read the feature-importance table saved for the experiment named by `filesuffix`.
filename = f'./feature_importance/feature_importance_mean_{filesuffix}.csv'
feature_list = pd.read_csv(filename)

In [4]:
# Load the saved cross-validation analytics for this experiment via the
# project's CvAnalyticsUtil loader (keyed by the same `filesuffix`).
cv_analytics_util = CvAnalyticsUtil.load_cv_analytics_utils(filesuffix)

In [5]:
# Constant-imputation settings taken from the imputer of the first analytics util.
# Later cells call .keys()/.values() on it, so it is expected to be dict-like
# (presumably column name -> fill value; confirm against the imputer class).
const_impute_values = cv_analytics_util.analytics_utils[0].data_util.imputer.impute_const_dict

In [6]:
# Dataset returned by the CV analytics helper (with predictions attached, per
# the method name); used below to derive per-column min/max ranges.
full_dataset = cv_analytics_util.get_dataset_with_predictions()

In [25]:
# Per-column value ranges of `full_dataset`:
#   ranges         : column name -> "<min> to <max>" display string
#   ranges_numeric : column name -> (min, max) tuple
ranges = {}
ranges_numeric = {}
for column in full_dataset.columns:
    try:
        min_val = full_dataset[column].min()
        max_val = full_dataset[column].max()
        range_str = f'{min_val:.0f} to {max_val:.0f}'
    except (TypeError, ValueError):
        # Best effort: columns whose min/max cannot be computed or formatted
        # as a number (e.g. text columns) are left out of both dicts.
        # Only these two error types are swallowed, so interrupts and
        # genuine bugs still surface instead of being hidden.
        continue
    ranges[column] = range_str
    ranges_numeric[column] = (min_val, max_val)

In [8]:
# Turn the {column: "<min> to <max>"} dict into a two-column frame so it can
# be merged on "column_name". Note: this rebinds `ranges` from a dict to a
# DataFrame, so the cell cannot be re-run without rebuilding the dict first.
ranges = pd.DataFrame({
    "column_name": list(ranges.keys()),
    "Values Range": list(ranges.values()),
})

In [9]:
# Turn the constant-imputation mapping into a two-column frame keyed by
# "column_name". Note: this rebinds `const_impute_values` from the dict-like
# object to a DataFrame, so the cell cannot be re-run as-is.
const_impute_values = pd.DataFrame({
    "column_name": list(const_impute_values.keys()),
    "imputed_values": list(const_impute_values.values()),
})

In [10]:
# One row per mean-imputed column; the imputed value is shown as the literal
# label 'Mean' rather than the fitted number.
mean_imputed_cols = cv_analytics_util.analytics_utils[0].data_util.imputer.impute_mean_cols
mean_impute_values = pd.DataFrame({
    "column_name": mean_imputed_cols,
    "imputed_values": ['Mean'] * len(mean_imputed_cols),
})

In [11]:
# Stack constant- and mean-imputed columns into one lookup table.
# The original row indices are kept, so index labels may repeat.
impute_values = pd.concat([const_impute_values, mean_impute_values])

In [12]:
# Attach the imputation info to each feature; the left join keeps every
# feature row even when no imputation entry exists for it.
feature_list = feature_list.merge(impute_values, on='column_name', how='left')

In [13]:
# Attach the "Values Range" string to each feature; the left join keeps
# features for which no range was computed.
feature_list = feature_list.merge(ranges, on='column_name', how='left')

In [14]:
# Column-name groups used to tell screening features apart from the rest.
# The group names suggest CA-125, ultrasound and abnormality screening data
# (inferred from the variable names; confirm against the data dictionary).
screen_ca125_data_cols = ['ca125ii_level', 'ca125_result', 'ca125ii_level_binary']

# TODO: these have changed
# NOTE(review): per the TODO above, this list may be out of date.
screen_ultra_data_cols = ['detl_p', 'detr_p', 'lvol_p', 'rvol_p', 'lvol_q', 'rvol_q',
       'lantero_p', 'lantero_q', 'llong_p', 'llong_q', 'ltran_p', 'ltran_q',
       'rantero_p', 'rantero_q', 'rlong_p', 'rlong_q', 'rtran_p', 'rtran_q',
       'tvu_ref', 'phycons', 'tvu_result', 'ovar_result',
       'ovcyst_solidr', 'ovcyst_outliner', 'ovcyst_solidl', 'ovcyst_outlinel',
       'ovcyst_solid', 'ovcyst_outline', 'ovcyst_diamr', 'ovcyst_diaml',
       'ovcyst_diam', 'ovcyst_volr', 'ovcyst_voll', 'ovcyst_vol',
       'ovcyst_morphr', 'ovcyst_morphl', 'ovcyst_morph', 'ovcyst_sumr',
       'ovcyst_suml', 'ovcyst_sum', 'ovary_diam', 'ovary_diamr', 'ovary_diaml',
       'ovary_volr', 'ovary_voll', 'ovary_vol', 'visl', 'visr', 'visboth',
       'viseith', 'numcystl', 'numcystr', 'numcyst', 'ovar_days']

screen_abnorm_data_cols = ['solid', 'sepst', 'cyst', 'cystw', 'echo', 'maxdi', 'volum']

# Union of all groups plus 'study_yr' and 'plco_id'; membership in this list
# is what marks a feature as 'Screening' further down.
screened_cols = ['study_yr', 'plco_id'] + screen_ultra_data_cols + screen_abnorm_data_cols + screen_ca125_data_cols

In [15]:
# Add two columns in place: an empty "Description" and a category column
# that starts out as "Demographic" for every feature.
feature_list["Description"] = ""
feature_list["Screening vs Demographic"] = "Demographic"

In [16]:
# Label every feature whose name appears in `screened_cols` as 'Screening';
# all other rows keep the label they already have.
# One boolean-mask assignment replaces the former row-by-row iterrows() loop.
is_screening_feature = feature_list['column_name'].isin(screened_cols)
feature_list.loc[is_screening_feature, 'Screening vs Demographic'] = 'Screening'

In [17]:
# Remove the statistics columns that are not part of the final table.
# Re-running this cell raises KeyError because the columns are already gone.
feature_list = feature_list.drop(
    columns=['count', 'mean', 'percent_missing_after_propagation'],
)

In [18]:
# Show the merged feature table (pandas truncates long frames in the display).
feature_list

Unnamed: 0.1,Unnamed: 0,column_name,percent_missing_before_propagation,imputed_values,Values Range,Description,Screening vs Demographic
0,0,ca125ii_level,0.271959,Mean,1 to 4980,,Screening
1,1,ca125ii_level_binary,0.267316,Mean,1 to 2,,Screening
2,2,ca125_result,0.000000,9,1 to 9,,Screening
3,3,ovar_result,0.000000,Mean,1 to 9,,Screening
4,4,tvu_ref,29.414691,Mean,1 to 4,,Screening
...,...,...,...,...,...,...,...
136,136,hispanic_f,2.891389,Mean,0 to 1,,Demographic
137,137,tubal,0.926651,Mean,0 to 2,,Demographic
138,138,pipe,1.524961,Mean,0 to 2,,Demographic
139,139,cigar,1.702066,Mean,0 to 2,,Demographic


In [19]:
# Positional rename: assigning to .columns applies these seven names in the
# frame's current column order, so the frame must have exactly seven columns
# in the matching order at this point. The first column becomes "Rank"
# (presumably the ranking index stored in the CSV; confirm).
feature_list.columns = ["Rank", "Column Name", "Missing %", "Imputed Value", "Values Range", "Description", "Screening vs Demographic"]

In [20]:
# Shift every rank up by one (presumably 0-based -> 1-based; confirm).
# Not idempotent: each re-run of this cell adds another 1.
feature_list["Rank"] += 1

In [21]:
# Final column order of the exported feature table.
ordered_columns = [
    "Rank",
    "Column Name",
    "Description",
    "Screening vs Demographic",
    "Values Range",
    "Imputed Value",
    "Missing %",
]
feature_list = feature_list[ordered_columns]

In [23]:
# Load the feature table that carries the "Description" text; it is a
# separate file from the kept_features_df.csv written by this notebook.
descriptions = pd.read_csv("./paper_outputs/kept_features_df_description.csv")

In [22]:
# Export the feature table without the DataFrame index; the target directory
# must already exist.
feature_list.to_csv("./paper_outputs/kept_features_df.csv", index=False)

In [26]:
# Keep only the first 35 rows for the survey.
# NOTE(review): 35 is a hard-coded cut-off with no stated rationale here.
descriptions = descriptions[:35]

In [58]:
# Build a plain {column name: imputed value} lookup that uses the imputer's
# `statistics_` values for mean-imputed columns instead of the 'Mean' label
# (presumably the fitted per-column means; confirm against the imputer).
# Expects `const_impute_values` to already be the two-column DataFrame.
first_imputer = cv_analytics_util.analytics_utils[0].data_util.imputer
mean_impute_values = pd.DataFrame({
    "column_name": first_imputer.impute_mean_cols,
    "imputed_values": first_imputer.imputer_mean.statistics_,
})
# reset_index() gives the stacked frame a unique index, which
# to_dict(orient='index') requires.
impute_values = pd.concat([const_impute_values, mean_impute_values]).reset_index()
# If a column name occurs more than once, the later row wins.
impute_values = {
    record['column_name']: record['imputed_values']
    for record in impute_values.to_dict(orient='index').values()
}

In [34]:
def true_false_question_template(column_name, question, default_val):
    """Build a radio-group question dict with "True"/"False" choices.

    Args:
        column_name: Stored under the "name" key.
        question: Stored under the "title" key.
        default_val: Number compared against 0.5; values below 0.5 select
            "False", everything else selects "True".

    Returns:
        A dict with the keys "type", "name", "title", "choices" and
        "defaultValue". "defaultValue" is a one-element list holding the
        selected choice.

    NOTE(review): the key names look like a SurveyJS question definition;
    confirm that the consumer expects "defaultValue" as a list, not a scalar.
    """
    selected_choice = "False" if default_val < 0.5 else "True"
    return dict(
        type="radiogroup",
        name=column_name,
        title=question,
        choices=["True", "False"],
        defaultValue=[selected_choice],
    )

In [35]:
def numeric_question_template(column_name, question, min_val, max_val, default_val):
    """Build a numeric text-input question dict with a min/max validator.

    Args:
        column_name: Stored under the "name" key.
        question: Stored under the "title" key.
        min_val: Lower bound, stored as the validator's "minValue".
        max_val: Upper bound, stored as the validator's "maxValue".
        default_val: Wrapped in a one-element list under "defaultValue".

    Returns:
        A dict with the keys "type", "name", "title", "inputType",
        "validators" and "defaultValue".

    NOTE(review): the key names look like a SurveyJS question definition;
    confirm that the consumer expects "defaultValue" as a list, not a scalar.
    """
    range_validator = {
        "type": "numeric",
        "minValue": min_val,
        "maxValue": max_val,
    }
    template = {
        "type": "text",
        "name": column_name,
        "title": question,
        "inputType": "number",
    }
    template["validators"] = [range_validator]
    template["defaultValue"] = [default_val]
    return template

In [62]:
# Build one survey question per row of `descriptions`.
# Needs `ranges_numeric` (column -> (min, max)) and `impute_values` as the
# plain {column: numeric value} dict, plus the two template functions.
survey = []
for _, row in descriptions.iterrows():
    column_name = row["Column Name"]
    question = f'{row["Description"]}. Values range from {row["Values Range"]}.'
    val_range = ranges_numeric[column_name]
    min_val = int(val_range[0])
    max_val = int(val_range[1])
    raw_default = impute_values[column_name]
    if row["Values Range"] in ["0 to 1", "1 to 2"]:
        # Pass the un-truncated value: the template compares it with 0.5, and
        # truncating first (int(0.7) == 0) would turn any mean in [0.5, 1)
        # into a "False" default.
        # NOTE(review): for "1 to 2" columns a value inside the range is
        # always >= 1, so the default is always "True"; confirm the intended
        # encoding of those columns.
        survey.append(true_false_question_template(column_name, question, float(raw_default)))
    else:
        # Numeric inputs keep an integer default (fraction truncated).
        survey.append(numeric_question_template(column_name, question, min_val, max_val, int(raw_default)))

In [65]:
import json

In [67]:
# Preview the survey definition as a JSON string.
json.dumps(survey)

'[{"type": "text", "name": "ca125ii_level", "title": "CA-125 level recorded from a valid screen.", "inputType": "number", "validators": [{"type": "numeric", "minValue": 1, "maxValue": 4980}], "defaultValue": [12]}, {"type": "radiogroup", "name": "ca125ii_level_binary", "title": "Is CA-125 level normal?", "choices": ["True", "False"], "defaultValue": ["True"]}, {"type": "text", "name": "ca125_result", "title": "CA-125 screen result.", "inputType": "number", "validators": [{"type": "numeric", "minValue": 1, "maxValue": 9}], "defaultValue": [9]}, {"type": "text", "name": "ovar_result", "title": "Combined ovarian screening result.", "inputType": "number", "validators": [{"type": "numeric", "minValue": 1, "maxValue": 9}], "defaultValue": [1]}, {"type": "text", "name": "tvu_ref", "title": "The level of referral of TVU exam result.", "inputType": "number", "validators": [{"type": "numeric", "minValue": 1, "maxValue": 4}], "defaultValue": [3]}, {"type": "text", "name": "ovcyst_vol", "title": "

In [68]:
# Write the survey definition as JSON; the target directory must already exist.
file_path = "./paper_outputs/survey.json"
with open(file_path, mode='w') as survey_file:
    json.dump(survey, fp=survey_file)


In [33]:
# Show the description table that the survey questions are built from.
descriptions

Unnamed: 0,Rank,Column Name,Description,Screening vs Demographic,Values Range,Imputed Value,Missing %
0,1,ca125ii_level,CA-125 level recorded from a valid screen.,Screening,1 to 4980,Mean,0.3
1,2,ca125ii_level_binary,Is CA-125 level normal?,Screening,1 to 2,Mean,0.3
2,3,ca125_result,CA-125 screen result.,Screening,1 to 9,9,0.0
3,4,ovar_result,Combined ovarian screening result.,Screening,1 to 9,Mean,0.0
4,5,tvu_ref,The level of referral of TVU exam result.,Screening,1 to 4,Mean,29.4
5,6,ovcyst_vol,The worst volume on any cyst found on either o...,Screening,0 to 5644,0,61.8
6,7,ovary_vol,The largest ovary volume of the left and right...,Screening,0 to 977,Mean,62.3
7,8,ovcyst_sum,The worst summary of any cyst found on the lef...,Screening,0 to 9,0,0.0
8,9,detl_p,Sonographically Dectectable Left Side?,Screening,0 to 1,Mean,29.7
9,10,ovcyst_diam,The worst diameter on any cyst found on either...,Screening,0 to 22,Mean,61.8
