In [2]:
from oop_functions.merge_dataset_functions import merge_data_over_years
from oop_functions.missing_values_functions import print_records_vs_unique
from oop_functions.util_functions import get_cols_missing_percentage, print_df, convert_numeric_to_float16, remove_featues_endswith, select_features_startswith, select_features_endswith, remove_featues, convert_columns_to_integers
import pandas as pd
import numpy as np
%matplotlib inline
import warnings

warnings.filterwarnings("ignore")


In [3]:
# Load the three raw CSV exports, in order: personal (participant-level) data,
# screening data, and screening-abnormalities data.
personal_data, screen_data, screen_abnorm_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        './dataset/Ovarian/ovar_data_mar22_d032222.csv',
        './dataset/Ovarian/Screening/ovar_screen_data_mar22_d032222.csv',
        './dataset/Ovarian/Screening Abnormalities/ovar_scrsub_data_mar22_d032222.csv',
    )
)

## Preprocessing
For some of the datasets we determined that some of the features are not useful for analysis. In every dataset that includes `build` features, those features are removed.

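For the personal dataset we removed all `ovar_` features, since they are a direct result of the cancer diagnosis and should not be used to predict cancer. The only exceptions are the outcome columns that are kept for analysis: `ovar_cancer`, `ovar_cancer_diagdays`, `ovar_histtype` and `ovar_behavior`.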

For the screen dataset it was determined that the `examinerid_` features do not add value to the dataset, since they represent 'Examiner ID made by concatentating center and exam ID for overall screen result.' and cannot be the cause of the cancer. We also drop all of the remaining `inad_`, `build` and `tvures_` features, as well as every feature whose name ends in `_q`.

In [4]:
# --- Personal dataset: which columns to drop ---------------------------------

# Outcome columns that must survive the removal even though they match the
# 'ovar_' prefix below.
personal_data_keep_for_analysis = ['ovar_cancer', 'ovar_cancer_diagdays', 'ovar_histtype', 'ovar_behavior']

# Every personal-dataset column whose name starts with one of these prefixes
# is added to the removal list.
personal_data_feature_prefixes = ['ovar_', 'mortality', 'reconsent', 'is_dead', 'build', 'biopolink', 'ca125_', 'tvu', 'entrydays', 'entryage']

# Additional fields to be removed as per Graham 06/28/2023
personal_data_questionnaire_features = ['ph_any_sqx', 'ph_any_muq', 'ph_any_dqx', 'ph_any_dhq', 'ph_ovar_trial', 'ph_any_trial']

# Explicitly named columns to drop (exit/death outcome fields, personal-history
# flags, administrative fields, redundant or irrelevant columns).
personal_data_features_remove = [
    # Days from trial entry (randomization) to first cancer diagnosis for participants with cancer,
    # or to trial exit otherwise. Only cancers occurring during the trial are used to determine exit.
    'fstcan_exitdays',
    # Status of the participant at exit for first cancer incidence.
    # Only cancers occurring during the trial are used to determine exit.
    'fstcan_exitstat',
    'fstcan_exitage',
    # Is ovarian cancer the underlying cause of death? This conclusion is based on the information
    # from the death certificate and death review.
    'f_dthovar',
    'f_codeath_cat',  # Categorized underlying cause of death.
    'f_cancersite',  # Underlying cause of death from cancer.
    'f_seer_death',  # Underlying cause of death.
    'd_codeath_cat',  # Categorized underlying cause of death.
    'd_dthovar',  # Is ovarian cancer the underlying cause of death?
    'd_cancersite',
    'd_seer_death',  # Underlying cause of death.
    # Did the participant have a personal history of ovarian/ft/pt cancer prior to MUQ analysis entry?
    'ph_ovar_muq',
    'ph_ovar_sqx',
    'ph_ovar_dhq',
    'ph_ovar_dqx',
    'entrydays_muq',
    'dth_days',
    'bq_compdays',
    'bq_adminm',
    'bq_returned',
    'ssmokea_f',
    'orem_fyro',
    'arm',
    'sex',
    'rndyear',
    'bq_age',  # Redundant with age
    'center',  # Irrelevant
    'in_TGWAS_population',  # Irrelevant
    'dual',  # Irrelevant
]

# Append the questionnaire fields first, then the prefix matches, so the final
# list order is: explicit names, questionnaire fields, prefix-selected columns.
personal_data_features_remove.extend(personal_data_questionnaire_features)
personal_data_features_remove.extend(select_features_startswith(personal_data, personal_data_feature_prefixes))

# --- Screen dataset: which columns to drop -----------------------------------

# Columns are also selected by name pattern: anything starting with one of the
# prefixes (inad_, examinerid, build, tvures_) or ending with one of the suffixes (_q).
screen_data_feature_prefixes = ['inad_', 'examinerid', 'build', 'tvures_']
screen_data_feature_suffixes = ['_q']

# Explicitly named columns to drop.
# tvures_qvis1-3 and tvures_pvis3 are intentionally not listed one by one
# (tvures_pvis3 is completely missing for non-cancer patients); the 'tvures_'
# prefix is part of screen_data_feature_prefixes.
screen_data_features_remove = [
    'QAMETHOD',
    'tvudays_pvis1',
    'tvudays_pvis2',
    'tvudays_pvis3',
    'tvu_days',
    'tvu_assess_days_q',
    'medcomp',
    'physid',
    'phycons',
    'detl_q',
    'detr_q',
    'ca125_src',
    'ca125_days',
    'ca125ii_src',
    'ca125_level',
    'ca125i_assess_days',
    'ca125ii_assess_days',
    # lvol_p and rvol_p are redundant to ovary_voll and ovary_volr
    'lvol_p',
    'rvol_p',
]

# Prefix matches are appended before suffix matches, keeping the list order stable.
screen_data_features_remove.extend(select_features_startswith(screen_data, screen_data_feature_prefixes))
screen_data_features_remove.extend(select_features_endswith(screen_data, screen_data_feature_suffixes))

# --- Abnormalities dataset: which columns to drop ----------------------------
# Explicit column names plus every column whose name starts with 'build'.
abnorm_data_feature_prefixes = ['build']
abnorm_data_features_remove = ['VISIT', 'side', 'sbcd', 'source']
abnorm_data_features_remove.extend(select_features_startswith(screen_abnorm_data, abnorm_data_feature_prefixes))

# --- Apply the removal lists -------------------------------------------------
# Each frame is rebound to the result of remove_featues. For the personal
# dataset the keep-for-analysis list is passed as a third argument
# (presumably it protects those columns from removal; ovar_cancer is still
# read from personal_data later in the notebook).
personal_data = remove_featues(personal_data, personal_data_features_remove, personal_data_keep_for_analysis)
screen_data = remove_featues(screen_data, screen_data_features_remove)
screen_abnorm_data = remove_featues(screen_abnorm_data, abnorm_data_features_remove)


In [5]:
# Build the "removed features" table exported for the paper: one row per
# (dataset, feature) with an empty 'Reason for Removal' column to fill in.
#
# The raw removal lists are cleaned before being reported:
#   * Features in personal_data_keep_for_analysis are excluded. They appear in
#     personal_data_features_remove (they match the 'ovar_' prefix) but are not
#     actually removed -- ovar_cancer and ovar_cancer_diagdays are still read
#     from personal_data later on -- so listing them as removed would be wrong.
#   * Duplicates inside one dataset's list (a column named explicitly and also
#     matched by a prefix/suffix rule) are collapsed. dict.fromkeys keeps the
#     first occurrence, so the original ordering is preserved.
personal_kept = set(personal_data_keep_for_analysis)
removed_by_dataset = {
    'Primary Dataset': [
        feature
        for feature in dict.fromkeys(personal_data_features_remove)
        if feature not in personal_kept
    ],
    'Screen Dataset': list(dict.fromkeys(screen_data_features_remove)),
    'Abnormalities Dataset': list(dict.fromkeys(abnorm_data_features_remove)),
}

# Flatten the mapping into parallel columns (dataset label repeated per feature).
all_dataset_names = [
    dataset_name
    for dataset_name, features in removed_by_dataset.items()
    for _ in features
]
all_featues = [feature for features in removed_by_dataset.values() for feature in features]
reasons_for_removal = [''] * len(all_featues)
features_df = pd.DataFrame({
    'Dataset Name': all_dataset_names,
    'Feature': all_featues,
    'Reason for Removal': reasons_for_removal
})

In [6]:
# Export the removed-features table for the paper. to_csv is called with its
# defaults, so the DataFrame index is written as the first (unnamed) column.
features_df.to_csv('./paper_outputs/removed_features_df.csv')

In [7]:
# Report row count vs. unique plco_id for the personal dataset.
print_records_vs_unique(personal_data, 'plco_id', 'personal')

# Participants flagged with ovarian cancer (ovar_cancer == 1)
cancer_mask = personal_data['ovar_cancer'].eq(1)
personal_data_cancer = personal_data.loc[cancer_mask]
print(f"Num of patients with cancer: {len(personal_data_cancer)}")

# Participants flagged without ovarian cancer (ovar_cancer == 0)
no_cancer_mask = personal_data['ovar_cancer'].eq(0)
personal_data_no_cancer = personal_data.loc[no_cancer_mask]
print(f"Num of patients without cancer: {len(personal_data_no_cancer)}")

Num of records in personal dataset: 78209
Num of unique plco_id in personal dataset: 78209
Num of patients with cancer: 613
Num of patients without cancer: 77596


In [8]:
# Down-cast numeric columns of all three frames with the project helper to
# reduce memory use. The frames are processed in the order personal, screen,
# abnormalities and each name is rebound to the helper's return value.
personal_data, screen_data, screen_abnorm_data = (
    convert_numeric_to_float16(frame)
    for frame in (personal_data, screen_data, screen_abnorm_data)
)

In [9]:
# study_yr is stored as an 8-bit integer in both screening-derived frames.
column = 'study_yr'
for frame in (screen_data, screen_abnorm_data):
    frame[column] = frame[column].astype(np.int8)


In [10]:
# personal_data = convert_columns_to_integers(personal_data)
# screen_data = convert_columns_to_integers(screen_data)
# screen_abnorm_data = convert_columns_to_integers(screen_abnorm_data)

In [11]:
# Ratio of distinct plco_id values in the screening dataset to distinct plco_id values in the personal dataset.
screen_data['plco_id'].nunique() / personal_data['plco_id'].nunique()

0.3944430947844877

In [12]:
# Ratio of distinct plco_id values in the abnormalities dataset to distinct plco_id values in the personal dataset.
screen_abnorm_data['plco_id'].nunique() / personal_data['plco_id'].nunique()

0.08940147553350637

## Merging data into features and mean imputing

Deciding which patients will be in training vs test set to learn the mean imputation parameters.

Mean imputation: https://www.kaggle.com/general/226554

To predict whether a person will get cancer in the next 1, 3, 5 or 10 years, we need to determine which feature describes when the person got cancer. In our case that feature is `ovar_cancer_diagdays`. We are going to use it in the following manner: we use a sliding window to determine whether the person gets cancer within the period of that window. Then we slide that window by one year and determine a new batch of people who are going to get cancer in that window, and so on. For each of the features that are measured across multiple years, we are only going to use the record that is at the beginning of the interval.

Since `ovar_cancer_diagdays` is expressed in days and we are concerned with the year in which a person gets cancer, we need to convert this feature into years. Since people got cancer from year 1 to year 19, we are going to bucket it into 19 buckets, each corresponding to the year in which the person got cancer.

In [13]:
# Largest observed time-to-diagnosis, converted from days to years (365-day years).
latest_diag_day = personal_data['ovar_cancer_diagdays'].max()
ovar_cancer_diagdays_range = latest_diag_day / 365
print(f'Max value of ovar_cancer_diagdays in years is {ovar_cancer_diagdays_range}')


Max value of ovar_cancer_diagdays in years is 18.893150684931506


In [14]:
# personal_data['ovar_cancer_years'] = pd.cut(personal_data['ovar_cancer_diagdays'], bins=19, labels=list(range(0, 19)), include_lowest=True)
# personal_data['ovar_cancer_years'] = pd.to_numeric(personal_data['ovar_cancer_years'])

In [15]:
# Convert days-to-diagnosis into whole years since trial entry.
#   * Rows with no diagnosis date (NaN) receive a sentinel year far outside the
#     observed range so they are easy to filter out afterwards.
#   * The fractional part is truncated (e.g. 18.89 -> 18), as before.
# The whole conversion is one vectorised chain: `.astype(np.int32)` guarantees
# the int32 dtype, whereas the previous element-wise `.apply(np.int32)` ran a
# Python call per row and left the resulting dtype to pandas' inference.
DAYS_PER_YEAR = 365
NO_CANCER_YEARS_SENTINEL = 100

personal_data['ovar_cancer_years'] = (
    personal_data['ovar_cancer_diagdays']
    .div(DAYS_PER_YEAR)
    .fillna(NO_CANCER_YEARS_SENTINEL)
    .astype(np.int32)
)

Every healthy person's `ovar_cancer_years` is set to 100 (see the `fillna(100)` in the cell above) so that it is easier to filter them out.

In [16]:
# personal_data['ovar_cancer_years'] = personal_data['ovar_cancer_years'].fillna(100)

When merging the abnormalities dataset, we keep each record for each `study_yr`, select the latest `VISIT` if there are multiple, and take the largest value of each column over all of the remaining records.

See below what I found about CA125 vs CA125ii: it sounds like we should actually just be using CA125ii. When they switched tests, they used the new test (CA125ii) to re-test the frozen samples from all of the patients who had the original test. So CA125ii should be present for all patients who had testing, and CA125 is only left in as a variable to give a perspective on what patients were initially told at that time.


So I think that the only variable we need to use for this tumor marker is ca125ii_level0-5, which is from the ovary person dataset, and we can disregard values for ca125_level0-5.


CA-125 ASSAYS, VERSIONS 1 AND 2 

When the trial began, PLCO used the first version of the assay for all CA-125 exams. On October 1, 1995, CA-125II became available and the protocol was switched to use this for all subsequent screens. A few years after this transition, all of the original samples were re-assayed using version two. So, use CA125_LEVEL0-5 to get the result of a screen from a clinical perspective of what the participant was told following their screening visit. Use CA125II_LEVEL0-5 to get the result of a screen from an epidemiologic perspective, with all values coming from the same assay. 


Since ca125ii is only present in the personal dataset, when merging the data I only kept the feature that describes the ca125ii level for the baseline year of each person (the start of the window in which they got cancer).

In [17]:
# Cohort selection. NOTE: no rows are dropped here -- the filter is currently
# disabled, so `personal_filtered` is the full personal dataset (same object,
# not a copy).
personal_filtered = personal_data

# Mask of participants that have at least one row in the screening dataset.
# It is kept for reference only; to restrict the cohort, apply
# `personal_filtered[condition]`. (An earlier variant that additionally kept
# every cancer case, `... | (personal_data['ovar_cancer'] == 1)`, was assigned
# and immediately overwritten by this mask, so that dead assignment was removed.)
condition = personal_data['plco_id'].isin(screen_data['plco_id'])

In [18]:
# Summary statistics of diagnosis year for diagnosed participants only:
# the < 50 cut-off excludes rows carrying the no-diagnosis fill value of 100.
diagnosed_rows = personal_filtered['ovar_cancer_years'] < 50
personal_filtered.loc[diagnosed_rows, ['ovar_cancer_years']].describe()

Unnamed: 0,ovar_cancer_years
count,613.0
mean,7.052202
std,4.768896
min,0.0
25%,3.0
50%,7.0
75%,11.0
max,18.0


In [19]:
# The raw days-to-diagnosis column is dropped before merging (ovar_cancer_years
# carries the year-level information), then the three datasets are combined
# by the project helper using left joins for both screening tables.
personal_filtered = personal_filtered.drop(columns=['ovar_cancer_diagdays'])
merged_df = merge_data_over_years(
    personal_filtered,
    screen_data,
    screen_abnorm_data,
    screen_join='left',
    abrorm_join='left',
)


In [20]:
# Print a summary of the merged frame (index, column count, dtypes, memory usage).
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 658784 entries, 0 to 78208
Columns: 156 entries, ovar_cancer to was_screened
dtypes: float16(144), float64(7), int32(1), int64(3), object(1)
memory usage: 243.8+ MB


In [21]:
# Convert discrete float columns to floats with lower number of bytes
# Maybe discretise some
# merged_summary = summarize_features(merged_df)
# cols_unique_under_90 = merged_summary[merged_summary['unique count'] < 90].index
# merged_df[cols_unique_under_90] = merged_df[cols_unique_under_90].astype(np.float16)

In [22]:
# Drop features that are 100% missing
# merged_df = drop_cols_missing_percentage(100, merged_df, 'merged_df')

In [23]:
# get_cols_missing_percentage(50, merged_df, 'imputed_df', show_missing=True)

In [24]:
# TODO: why not have ph_ovar for trial instead of bq?
# Derive `ph_any_not_ovar_bq`: a copy of `ph_any_bq` that is forced to 0 on
# rows where `ph_ovar_bq == 1`, then drop the two source columns.
#
# Rows are selected with a boolean mask, not with the `.index` labels of a
# filtered frame: merged_df's index labels repeat at this point (hundreds of
# thousands of rows share labels in the 0..78208 range), so a label-based
# `.loc[labels, ...]` would overwrite every row sharing a label with a matching
# row, not only the rows where `ph_ovar_bq == 1`.
merged_df['ph_any_not_ovar_bq'] = merged_df['ph_any_bq']
merged_df.loc[merged_df['ph_ovar_bq'] == 1, 'ph_any_not_ovar_bq'] = 0
merged_df = merged_df.drop(['ph_any_bq', 'ph_ovar_bq'], axis=1)

In [25]:
# Replace the existing index with a fresh 0..n-1 index and expose it as an
# explicit `index` column (appended as the last column), then export.
#
# reset_index(drop=True) discards the old index directly. The previous
# reset_index(inplace=True) + drop(['index']) sequence raised on a second run
# of this cell (the `index` column already exists, so it cannot be inserted
# again) and would also fail if the old index had a name other than None.
merged_df = merged_df.reset_index(drop=True)
merged_df['index'] = merged_df.index
merged_df.to_csv('./processed_dataset/recent_propagated_dataset.csv')