# Summary Statistics

***In order to evaluate the viability of our study, in the following, I will:***<br /> 
*(the concept of `2~6 years before index date as study time horizon` will also be considered)*<br /> 
 (1) count total patient numbers<br /> 
 (2) count total patient numbers by LOINC code; this gives a sense of the size of the patient cohort for the study<br /> 
 (3) count the total number of lab results; this gives a sense of the average number of lab results per patient. The higher this is, the more viable it is to build a prediction model for a given lab type.<br /> 
 ***The above results were summarized in the spreadsheet in the dropbox:<br /> ~/Dropbox (UFL)/2022-PersonalizedLabTest/04. Codes***

In [2]:
import sklearn
import pandas as pd
import numpy as np
import numba as nb
import pickle 
import os
import functools as ft
import swifter

In [3]:
# Directory that holds the pre-processed pickles used by this notebook.
DATA_DIR = '/data/datasets/changyuyin/1_Personalized_Lab_Test'


def _load_pickle(filename):
    """Load one pickled object from DATA_DIR and close the file afterwards.

    NOTE(review): `pickle.load` can execute arbitrary code; only use it on
    files produced by this project's own preprocessing.
    """
    with open(os.path.join(DATA_DIR, filename), 'rb') as handle:
        return pickle.load(handle)


# Lab results table plus the three cancer-cohort patient tables.
lab_result_cm_paired = _load_pickle('lab_result_cm_processed.pkl')
lung_patient = _load_pickle('lung_patient.pkl')
colorectal_patient = _load_pickle('colorectal_patient.pkl')
breast_patient = _load_pickle('breast_patient.pkl')

## patient count

In [9]:
# Number of distinct patients (unique `ID` values) in the `lab` table
lab_result_cm_paired['ID'].drop_duplicates().shape[0]

151784

In [10]:
# Distinct patients with at least one lab order dated within
# [study_range_min, study_range_max], i.e. 2~6 years before the index diagnosis date
(
    lab_result_cm_paired
    .loc[
        lab_result_cm_paired['LAB_ORDER_DATE'].between(
            lab_result_cm_paired['study_range_min'],
            lab_result_cm_paired['study_range_max'],
        ),
        'ID',
    ]
    .drop_duplicates()
    .shape[0]
)

30406

In [11]:
# Distinct patients with at least one lab order dated within
# [study_range_min, study_range_max_1], i.e. 1~6 years before the index diagnosis date
(
    lab_result_cm_paired
    .loc[
        lab_result_cm_paired['LAB_ORDER_DATE'].between(
            lab_result_cm_paired['study_range_min'],
            lab_result_cm_paired['study_range_max_1'],
        ),
        'ID',
    ]
    .drop_duplicates()
    .shape[0]
)

41042

In [12]:
# Distinct patients with at least one lab order dated on or before
# study_range_max_1 (1 year before the index diagnosis date); no lower bound
(
    lab_result_cm_paired
    .loc[
        lab_result_cm_paired['study_range_max_1'] >= lab_result_cm_paired['LAB_ORDER_DATE'],
        'ID',
    ]
    .drop_duplicates()
    .shape[0]
)

43018

In [14]:
# Distinct patients with a lab order inside [study_range_min, study_range_max]
# (repeats the 2~6 year window count computed earlier in this notebook)
(
    lab_result_cm_paired
    .loc[
        lab_result_cm_paired['LAB_ORDER_DATE'].between(
            lab_result_cm_paired['study_range_min'],
            lab_result_cm_paired['study_range_max'],
        ),
        'ID',
    ]
    .drop_duplicates()
    .shape[0]
)

30406

In [15]:
# Distinct patients with a lab order inside [study_range_min, study_range_max_1]
# (repeats the 1~6 year window count computed earlier in this notebook)
(
    lab_result_cm_paired
    .loc[
        lab_result_cm_paired['LAB_ORDER_DATE'].between(
            lab_result_cm_paired['study_range_min'],
            lab_result_cm_paired['study_range_max_1'],
        ),
        'ID',
    ]
    .drop_duplicates()
    .shape[0]
)

41042

In [None]:
# Distinct patients with a lab order on or before study_range_max_1
# (no lower bound on the order date)
(
    lab_result_cm_paired
    .loc[
        lab_result_cm_paired['study_range_max_1'] >= lab_result_cm_paired['LAB_ORDER_DATE'],
        'ID',
    ]
    .drop_duplicates()
    .shape[0]
)

In [16]:
# Distinct patients with a lab order on or before study_range_max_05
# (presumably 0.5 year before the index date -- TODO confirm); no lower bound
(
    lab_result_cm_paired
    .loc[
        lab_result_cm_paired['study_range_max_05'] >= lab_result_cm_paired['LAB_ORDER_DATE'],
        'ID',
    ]
    .drop_duplicates()
    .shape[0]
)

50448

In [17]:
# Distinct patients with a lab order inside [study_range_min, study_range_max_05]
# (presumably 0.5~6 years before the index date -- TODO confirm)
(
    lab_result_cm_paired
    .loc[
        lab_result_cm_paired['LAB_ORDER_DATE'].between(
            lab_result_cm_paired['study_range_min'],
            lab_result_cm_paired['study_range_max_05'],
        ),
        'ID',
    ]
    .drop_duplicates()
    .shape[0]
)

48648

In [19]:
# Distinct patients with at least one lab order dated on or after their index_date
(
    lab_result_cm_paired
    .loc[
        lab_result_cm_paired['index_date'] <= lab_result_cm_paired['LAB_ORDER_DATE'],
        'ID',
    ]
    .drop_duplicates()
    .shape[0]
)

140664

In [21]:
# Lab rows ordered within [study_range_min, study_range_max] (both ends inclusive),
# i.e. the 2~6 years-before-index window
lab_result_cm_paired_inrange = lab_result_cm_paired.loc[
    lab_result_cm_paired['LAB_ORDER_DATE'].between(
        lab_result_cm_paired['study_range_min'],
        lab_result_cm_paired['study_range_max'],
    )
]

In [22]:
# Lab rows ordered within [study_range_min, study_range_max_1] (both ends inclusive),
# i.e. the 1~6 years-before-index window
lab_result_cm_paired_inrange_1 = lab_result_cm_paired.loc[
    lab_result_cm_paired['LAB_ORDER_DATE'].between(
        lab_result_cm_paired['study_range_min'],
        lab_result_cm_paired['study_range_max_1'],
    )
]

In [23]:
# Lab rows ordered within [study_range_min, study_range_max_05] (both ends inclusive)
# (presumably the 0.5~6 years-before-index window -- TODO confirm)
lab_result_cm_paired_inrange_05 = lab_result_cm_paired.loc[
    lab_result_cm_paired['LAB_ORDER_DATE'].between(
        lab_result_cm_paired['study_range_min'],
        lab_result_cm_paired['study_range_max_05'],
    )
]

In [24]:
# Lab rows ordered on or before study_range_max_1; unlike the "inrange" tables
# this one has no lower bound on the order date
lab_result_cm_paired_inrange_less1 = lab_result_cm_paired.loc[
    lab_result_cm_paired['study_range_max_1'] >= lab_result_cm_paired['LAB_ORDER_DATE']
]

In [25]:
# Lab rows ordered on or before study_range_max_05; unlike the "inrange" tables
# this one has no lower bound on the order date
lab_result_cm_paired_inrange_less05 = lab_result_cm_paired.loc[
    lab_result_cm_paired['study_range_max_05'] >= lab_result_cm_paired['LAB_ORDER_DATE']
]

## count by LOINC 

In [26]:
# Patient counts by LOINC code over the whole lab table:
# one row per LOINC code, most patients first.
loinc_patient_count = (
    lab_result_cm_paired[['LAB_LOINC', 'ID']]
    .drop_duplicates()
    .groupby('LAB_LOINC')
    .count()
    .reset_index()
    .sort_values(by='ID', ascending=False)
    .reset_index(drop=True)
    .rename(columns={'ID': 'Num_Patient_Total'})
)

In [27]:
# Lab-result counts by LOINC code over the whole lab table:
# one row per LOINC code, most lab results first.
loinc_lab_count = (
    lab_result_cm_paired[['LAB_LOINC', 'LAB_RESULT_CM_ID']]
    .drop_duplicates()
    .groupby('LAB_LOINC')
    .count()
    .reset_index()
    .sort_values(by='LAB_RESULT_CM_ID', ascending=False)
    .reset_index(drop=True)
    .rename(columns={'LAB_RESULT_CM_ID': 'Num_LabResult_Total'})
)

In [28]:
# Patient counts by LOINC code, restricted to lab orders 2~6 years before the index date
loinc_patient_count_inrange = (
    lab_result_cm_paired_inrange[['LAB_LOINC', 'ID']]
    .drop_duplicates()
    .groupby('LAB_LOINC')
    .count()
    .reset_index()
    .sort_values(by='ID', ascending=False)
    .reset_index(drop=True)
    .rename(columns={'ID': 'Num_Patient_Total_inrange_6_2'})
)

In [29]:
# Lab-result counts by LOINC code, restricted to lab orders 2~6 years before the index date
loinc_lab_count_inrange = (
    lab_result_cm_paired_inrange[['LAB_LOINC', 'LAB_RESULT_CM_ID']]
    .drop_duplicates()
    .groupby('LAB_LOINC')
    .count()
    .reset_index()
    .sort_values(by='LAB_RESULT_CM_ID', ascending=False)
    .reset_index(drop=True)
    .rename(columns={'LAB_RESULT_CM_ID': 'Num_LabResult_Total_inrange_6_2'})
)

In [30]:
# Patient counts by LOINC code, restricted to lab orders 1~6 years before the index date
loinc_patient_count_inrange_1 = (
    lab_result_cm_paired_inrange_1[['LAB_LOINC', 'ID']]
    .drop_duplicates()
    .groupby('LAB_LOINC')
    .count()
    .reset_index()
    .sort_values(by='ID', ascending=False)
    .reset_index(drop=True)
    .rename(columns={'ID': 'Num_Patient_Total_inrange_6_1'})
)

In [31]:
# Lab-result counts by LOINC code, restricted to lab orders 1~6 years before the index date
loinc_lab_count_inrange_1 = (
    lab_result_cm_paired_inrange_1[['LAB_LOINC', 'LAB_RESULT_CM_ID']]
    .drop_duplicates()
    .groupby('LAB_LOINC')
    .count()
    .reset_index()
    .sort_values(by='LAB_RESULT_CM_ID', ascending=False)
    .reset_index(drop=True)
    .rename(columns={'LAB_RESULT_CM_ID': 'Num_LabResult_Total_inrange_6_1'})
)

In [32]:
# Patient counts by LOINC code, using every lab order dated on or before
# study_range_max_1 (1 year before the index date; no lower bound)
loinc_patient_count_inrange_less1 = (
    lab_result_cm_paired_inrange_less1[['LAB_LOINC', 'ID']]
    .drop_duplicates()
    .groupby('LAB_LOINC')
    .count()
    .reset_index()
    .sort_values(by='ID', ascending=False)
    .reset_index(drop=True)
    .rename(columns={'ID': 'Num_Patient_Total_inrange_less1'})
)

In [33]:
# Lab-result counts by LOINC code, using every lab order dated on or before
# study_range_max_1 (1 year before the index date; no lower bound)
loinc_lab_count_inrange_less1 = (
    lab_result_cm_paired_inrange_less1[['LAB_LOINC', 'LAB_RESULT_CM_ID']]
    .drop_duplicates()
    .groupby('LAB_LOINC')
    .count()
    .reset_index()
    .sort_values(by='LAB_RESULT_CM_ID', ascending=False)
    .reset_index(drop=True)
    .rename(columns={'LAB_RESULT_CM_ID': 'Num_LabResult_Total_inrange_less1'})
)

In [34]:
# The four tables below use the study_range_max_05 cut-off
# (presumably 0.5 year before the index date -- TODO confirm).

# Patient counts by LOINC code, lab orders within [study_range_min, study_range_max_05]
loinc_patient_count_inrange_05 = (
    lab_result_cm_paired_inrange_05[['LAB_LOINC', 'ID']]
    .drop_duplicates()
    .groupby('LAB_LOINC')
    .count()
    .reset_index()
    .sort_values(by='ID', ascending=False)
    .reset_index(drop=True)
    .rename(columns={'ID': 'Num_Patient_Total_inrange_6_05'})
)

# Lab-result counts by LOINC code, lab orders within [study_range_min, study_range_max_05]
loinc_lab_count_inrange_05 = (
    lab_result_cm_paired_inrange_05[['LAB_LOINC', 'LAB_RESULT_CM_ID']]
    .drop_duplicates()
    .groupby('LAB_LOINC')
    .count()
    .reset_index()
    .sort_values(by='LAB_RESULT_CM_ID', ascending=False)
    .reset_index(drop=True)
    .rename(columns={'LAB_RESULT_CM_ID': 'Num_LabResult_Total_inrange_6_05'})
)

# Patient counts by LOINC code, lab orders on or before study_range_max_05 (no lower bound)
loinc_patient_count_inrange_less05 = (
    lab_result_cm_paired_inrange_less05[['LAB_LOINC', 'ID']]
    .drop_duplicates()
    .groupby('LAB_LOINC')
    .count()
    .reset_index()
    .sort_values(by='ID', ascending=False)
    .reset_index(drop=True)
    .rename(columns={'ID': 'Num_Patient_Total_inrange_less05'})
)

# Lab-result counts by LOINC code, lab orders on or before study_range_max_05 (no lower bound)
loinc_lab_count_inrange_less05 = (
    lab_result_cm_paired_inrange_less05[['LAB_LOINC', 'LAB_RESULT_CM_ID']]
    .drop_duplicates()
    .groupby('LAB_LOINC')
    .count()
    .reset_index()
    .sort_values(by='LAB_RESULT_CM_ID', ascending=False)
    .reset_index(drop=True)
    .rename(columns={'LAB_RESULT_CM_ID': 'Num_LabResult_Total_inrange_less05'})
)

***Repeat for subcategories***

### lung cancer

In [36]:
# IDs of the lung-cancer cohort; `p_id` is reassigned for each cohort further down
p_id = lung_patient.loc[:, 'ID'].values

In [37]:
def _cohort_loinc_counts(frame, patient_ids, value_col, out_col):
    """Count distinct `value_col` values per LOINC code for one patient cohort.

    Parameters
    ----------
    frame : DataFrame with 'LAB_LOINC', 'ID' and `value_col` columns.
    patient_ids : array-like of `ID` values that define the cohort.
    value_col : 'ID' for patient counts, 'LAB_RESULT_CM_ID' for lab-result counts.
    out_col : name given to the resulting count column.

    Returns
    -------
    DataFrame with columns ['LAB_LOINC', out_col], sorted by count
    (descending) with a fresh 0..n-1 index.
    """
    # Build the cohort mask from `frame` itself so the mask always shares the
    # frame's index, instead of relying on aligning a mask from another table.
    cohort_rows = frame.loc[frame['ID'].isin(patient_ids)]
    return (
        cohort_rows[['LAB_LOINC', value_col]]
        .drop_duplicates()
        .groupby('LAB_LOINC')
        .count()
        .reset_index()
        .sort_values(by=value_col, ascending=False)
        .reset_index(drop=True)
        .rename(columns={value_col: out_col})
    )


# Read the cohort IDs from `lung_patient` directly rather than from the shared
# `p_id` variable (which is reassigned per cohort), so this cell gives the
# lung results regardless of which `p_id` cell was run last.
lung_ids = lung_patient['ID'].values

# Whole lab table
loinc_patient_count_lung = _cohort_loinc_counts(
    lab_result_cm_paired, lung_ids, 'ID', 'Num_Patient_Total_lung')
loinc_lab_count_lung = _cohort_loinc_counts(
    lab_result_cm_paired, lung_ids, 'LAB_RESULT_CM_ID', 'Num_LabResult_Total_lung')

# Lab orders 2~6 years before the index date
loinc_patient_count_inrange_lung = _cohort_loinc_counts(
    lab_result_cm_paired_inrange, lung_ids, 'ID', 'Num_Patient_Total_inrange_lung_6_2')
loinc_lab_count_inrange_lung = _cohort_loinc_counts(
    lab_result_cm_paired_inrange, lung_ids, 'LAB_RESULT_CM_ID', 'Num_LabResult_Total_inrange_lung_6_2')

# Lab orders 1~6 years before the index date
loinc_patient_count_inrange_lung_1 = _cohort_loinc_counts(
    lab_result_cm_paired_inrange_1, lung_ids, 'ID', 'Num_Patient_Total_inrange_lung_6_1')
loinc_lab_count_inrange_lung_1 = _cohort_loinc_counts(
    lab_result_cm_paired_inrange_1, lung_ids, 'LAB_RESULT_CM_ID', 'Num_LabResult_Total_inrange_lung_6_1')

# Lab orders on or before study_range_max_1 (no lower bound)
loinc_patient_count_inrange_lung_less1 = _cohort_loinc_counts(
    lab_result_cm_paired_inrange_less1, lung_ids, 'ID', 'Num_Patient_Total_inrange_lung_less1')
loinc_lab_count_inrange_lung_less1 = _cohort_loinc_counts(
    lab_result_cm_paired_inrange_less1, lung_ids, 'LAB_RESULT_CM_ID', 'Num_LabResult_Total_inrange_lung_less1')

# Lab orders within [study_range_min, study_range_max_05]
loinc_patient_count_inrange_lung_05 = _cohort_loinc_counts(
    lab_result_cm_paired_inrange_05, lung_ids, 'ID', 'Num_Patient_Total_inrange_lung_6_05')
loinc_lab_count_inrange_lung_05 = _cohort_loinc_counts(
    lab_result_cm_paired_inrange_05, lung_ids, 'LAB_RESULT_CM_ID', 'Num_LabResult_Total_inrange_lung_6_05')

# Lab orders on or before study_range_max_05 (no lower bound)
loinc_patient_count_inrange_lung_less05 = _cohort_loinc_counts(
    lab_result_cm_paired_inrange_less05, lung_ids, 'ID', 'Num_Patient_Total_inrange_lung_less05')
loinc_lab_count_inrange_lung_less05 = _cohort_loinc_counts(
    lab_result_cm_paired_inrange_less05, lung_ids, 'LAB_RESULT_CM_ID', 'Num_LabResult_Total_inrange_lung_less05')



### colorectal cancer

In [38]:
# IDs of the colorectal-cancer cohort; overwrites the previous cohort's `p_id`
p_id = colorectal_patient.loc[:, 'ID'].values

In [39]:
def _cohort_loinc_counts(frame, patient_ids, value_col, out_col):
    """Count distinct `value_col` values per LOINC code for one patient cohort.

    Parameters
    ----------
    frame : DataFrame with 'LAB_LOINC', 'ID' and `value_col` columns.
    patient_ids : array-like of `ID` values that define the cohort.
    value_col : 'ID' for patient counts, 'LAB_RESULT_CM_ID' for lab-result counts.
    out_col : name given to the resulting count column.

    Returns
    -------
    DataFrame with columns ['LAB_LOINC', out_col], sorted by count
    (descending) with a fresh 0..n-1 index.
    """
    # Build the cohort mask from `frame` itself so the mask always shares the
    # frame's index, instead of relying on aligning a mask from another table.
    cohort_rows = frame.loc[frame['ID'].isin(patient_ids)]
    return (
        cohort_rows[['LAB_LOINC', value_col]]
        .drop_duplicates()
        .groupby('LAB_LOINC')
        .count()
        .reset_index()
        .sort_values(by=value_col, ascending=False)
        .reset_index(drop=True)
        .rename(columns={value_col: out_col})
    )


# Read the cohort IDs from `colorectal_patient` directly rather than from the
# shared `p_id` variable (which is reassigned per cohort), so this cell gives
# the colorectal results regardless of which `p_id` cell was run last.
colorectal_ids = colorectal_patient['ID'].values

# Whole lab table
loinc_patient_count_colorectal = _cohort_loinc_counts(
    lab_result_cm_paired, colorectal_ids, 'ID', 'Num_Patient_Total_colorectal')
loinc_lab_count_colorectal = _cohort_loinc_counts(
    lab_result_cm_paired, colorectal_ids, 'LAB_RESULT_CM_ID', 'Num_LabResult_Total_colorectal')

# Lab orders 2~6 years before the index date
loinc_patient_count_inrange_colorectal = _cohort_loinc_counts(
    lab_result_cm_paired_inrange, colorectal_ids, 'ID', 'Num_Patient_Total_inrange_colorectal_6_2')
loinc_lab_count_inrange_colorectal = _cohort_loinc_counts(
    lab_result_cm_paired_inrange, colorectal_ids, 'LAB_RESULT_CM_ID', 'Num_LabResult_Total_inrange_colorectal_6_2')

# Lab orders 1~6 years before the index date
loinc_patient_count_inrange_colorectal_1 = _cohort_loinc_counts(
    lab_result_cm_paired_inrange_1, colorectal_ids, 'ID', 'Num_Patient_Total_inrange_colorectal_6_1')
loinc_lab_count_inrange_colorectal_1 = _cohort_loinc_counts(
    lab_result_cm_paired_inrange_1, colorectal_ids, 'LAB_RESULT_CM_ID', 'Num_LabResult_Total_inrange_colorectal_6_1')

# Lab orders on or before study_range_max_1 (no lower bound)
loinc_patient_count_inrange_colorectal_less1 = _cohort_loinc_counts(
    lab_result_cm_paired_inrange_less1, colorectal_ids, 'ID', 'Num_Patient_Total_inrange_colorectal_less1')
loinc_lab_count_inrange_colorectal_less1 = _cohort_loinc_counts(
    lab_result_cm_paired_inrange_less1, colorectal_ids, 'LAB_RESULT_CM_ID', 'Num_LabResult_Total_inrange_colorectal_less1')

# Lab orders within [study_range_min, study_range_max_05]
loinc_patient_count_inrange_colorectal_05 = _cohort_loinc_counts(
    lab_result_cm_paired_inrange_05, colorectal_ids, 'ID', 'Num_Patient_Total_inrange_colorectal_6_05')
loinc_lab_count_inrange_colorectal_05 = _cohort_loinc_counts(
    lab_result_cm_paired_inrange_05, colorectal_ids, 'LAB_RESULT_CM_ID', 'Num_LabResult_Total_inrange_colorectal_6_05')

# Lab orders on or before study_range_max_05 (no lower bound)
loinc_patient_count_inrange_colorectal_less05 = _cohort_loinc_counts(
    lab_result_cm_paired_inrange_less05, colorectal_ids, 'ID', 'Num_Patient_Total_inrange_colorectal_less05')
loinc_lab_count_inrange_colorectal_less05 = _cohort_loinc_counts(
    lab_result_cm_paired_inrange_less05, colorectal_ids, 'LAB_RESULT_CM_ID', 'Num_LabResult_Total_inrange_colorectal_less05')

### breast cancer

In [40]:
# IDs of the breast-cancer cohort; overwrites the previous cohort's `p_id`
p_id = breast_patient.loc[:, 'ID'].values

In [41]:
#3.21 total patient cocunts by LOINC code
loinc_patient_count_breast = lab_result_cm_paired.loc[lab_result_cm_paired['ID'].isin(p_id)][['LAB_LOINC','ID']].drop_duplicates().\
                groupby('LAB_LOINC').count().\
                reset_index().\
                sort_values('ID',ascending=False).\
                reset_index(drop=True).\
                rename(columns={"ID": "Num_Patient_Total_breast"})
#3.22 total lab result cocunts by LOINC code
loinc_lab_count_breast = lab_result_cm_paired.loc[lab_result_cm_paired['ID'].isin(p_id)][['LAB_LOINC','LAB_RESULT_CM_ID']].drop_duplicates().\
                groupby('LAB_LOINC').count().\
                reset_index().\
                sort_values('LAB_RESULT_CM_ID',ascending=False).\
                reset_index(drop=True).\
                rename(columns={"LAB_RESULT_CM_ID": "Num_LabResult_Total_breast"})
#3.23 total patient cocunts by LOINC code, only within 2~6 year before index date
loinc_patient_count_inrange_breast = lab_result_cm_paired_inrange.loc[lab_result_cm_paired['ID'].isin(p_id)][['LAB_LOINC','ID']].drop_duplicates().\
                groupby('LAB_LOINC').count().\
                reset_index().\
                sort_values('ID',ascending=False).\
                reset_index(drop=True).\
                rename(columns={"ID": "Num_Patient_Total_inrange_breast_6_2"})
#3.24 total lab result cocunts by LOINC code
loinc_lab_count_inrange_breast = lab_result_cm_paired_inrange.loc[lab_result_cm_paired['ID'].isin(p_id)][['LAB_LOINC','LAB_RESULT_CM_ID']].drop_duplicates().\
                groupby('LAB_LOINC').count().\
                reset_index().\
                sort_values('LAB_RESULT_CM_ID',ascending=False).\
                reset_index(drop=True).\
                rename(columns={"LAB_RESULT_CM_ID": "Num_LabResult_Total_inrange_breast_6_2"})


#3.23 total patient cocunts by LOINC code, only within 2~6 year before index date
loinc_patient_count_inrange_breast_1 = lab_result_cm_paired_inrange_1.loc[lab_result_cm_paired['ID'].isin(p_id)][['LAB_LOINC','ID']].drop_duplicates().\
                groupby('LAB_LOINC').count().\
                reset_index().\
                sort_values('ID',ascending=False).\
                reset_index(drop=True).\
                rename(columns={"ID": "Num_Patient_Total_inrange_breast_6_1"})
#3.24 total lab result cocunts by LOINC code
loinc_lab_count_inrange_breast_1 = lab_result_cm_paired_inrange_1.loc[lab_result_cm_paired['ID'].isin(p_id)][['LAB_LOINC','LAB_RESULT_CM_ID']].drop_duplicates().\
                groupby('LAB_LOINC').count().\
                reset_index().\
                sort_values('LAB_RESULT_CM_ID',ascending=False).\
                reset_index(drop=True).\
                rename(columns={"LAB_RESULT_CM_ID": "Num_LabResult_Total_inrange_breast_6_1"})

#3.23 total patient cocunts by LOINC code, only within 2~6 year before index date
loinc_patient_count_inrange_breast_less1 = lab_result_cm_paired_inrange_less1.loc[lab_result_cm_paired['ID'].isin(p_id)][['LAB_LOINC','ID']].drop_duplicates().\
                groupby('LAB_LOINC').count().\
                reset_index().\
                sort_values('ID',ascending=False).\
                reset_index(drop=True).\
                rename(columns={"ID": "Num_Patient_Total_inrange_breast_less1"})
#3.24 total lab result cocunts by LOINC code
loinc_lab_count_inrange_breast_less1 = lab_result_cm_paired_inrange_less1.loc[lab_result_cm_paired['ID'].isin(p_id)][['LAB_LOINC','LAB_RESULT_CM_ID']].drop_duplicates().\
                groupby('LAB_LOINC').count().\
                reset_index().\
                sort_values('LAB_RESULT_CM_ID',ascending=False).\
                reset_index(drop=True).\
                rename(columns={"LAB_RESULT_CM_ID": "Num_LabResult_Total_inrange_breast_less1"})



#3.23 total patient cocunts by LOINC code, only within 2~6 year before index date
loinc_patient_count_inrange_breast_05 = lab_result_cm_paired_inrange_05.loc[lab_result_cm_paired['ID'].isin(p_id)][['LAB_LOINC','ID']].drop_duplicates().\
                groupby('LAB_LOINC').count().\
                reset_index().\
                sort_values('ID',ascending=False).\
                reset_index(drop=True).\
                rename(columns={"ID": "Num_Patient_Total_inrange_breast_6_05"})
#3.24 total lab result cocunts by LOINC code
loinc_lab_count_inrange_breast_05 = lab_result_cm_paired_inrange_05.loc[lab_result_cm_paired['ID'].isin(p_id)][['LAB_LOINC','LAB_RESULT_CM_ID']].drop_duplicates().\
                groupby('LAB_LOINC').count().\
                reset_index().\
                sort_values('LAB_RESULT_CM_ID',ascending=False).\
                reset_index(drop=True).\
                rename(columns={"LAB_RESULT_CM_ID": "Num_LabResult_Total_inrange_breast_6_05"})

#3.23 total patient cocunts by LOINC code, only within 2~6 year before index date
loinc_patient_count_inrange_breast_less05 = lab_result_cm_paired_inrange_less05.loc[lab_result_cm_paired['ID'].isin(p_id)][['LAB_LOINC','ID']].drop_duplicates().\
                groupby('LAB_LOINC').count().\
                reset_index().\
                sort_values('ID',ascending=False).\
                reset_index(drop=True).\
                rename(columns={"ID": "Num_Patient_Total_inrange_breast_less05"})
#3.24 total lab result counts by LOINC code, computed on the pre-filtered
# `lab_result_cm_paired_inrange_less05` frame and restricted to patients in `p_id`
# (p_id is set earlier in the notebook; presumably the breast-cancer cohort IDs -- confirm).
# The cohort mask is built from the SAME frame it filters. A mask computed on the
# full `lab_result_cm_paired` table is aligned by index label, so it only picks the
# right rows while both frames still share index labels.
loinc_lab_count_inrange_breast_less05 = (
    lab_result_cm_paired_inrange_less05
    .loc[lab_result_cm_paired_inrange_less05['ID'].isin(p_id), ['LAB_LOINC', 'LAB_RESULT_CM_ID']]
    .drop_duplicates()                  # one row per distinct (LOINC, lab result id)
    .groupby('LAB_LOINC').count()       # -> number of distinct lab results per LOINC
    .reset_index()
    .sort_values('LAB_RESULT_CM_ID', ascending=False)
    .reset_index(drop=True)
    .rename(columns={"LAB_RESULT_CM_ID": "Num_LabResult_Total_inrange_breast_less05"})
)

***assemble all stats***

In [42]:
# Sanity check: number of distinct LOINC codes (rows) in every summary table.
# Each entry pairs a heading with that cohort's tables, listed as
# (patient count, lab-result count) for the unrestricted data and then for each
# of the `inrange`, `_1`, `_less1`, `_05`, `_less05` variants.
loinc_type_report = [
    ("number of LOINC labs types (whole data set)\n", [
        loinc_patient_count, loinc_lab_count,
        loinc_patient_count_inrange, loinc_lab_count_inrange,
        loinc_patient_count_inrange_1, loinc_lab_count_inrange_1,
        loinc_patient_count_inrange_less1, loinc_lab_count_inrange_less1,
        loinc_patient_count_inrange_05, loinc_lab_count_inrange_05,
        loinc_patient_count_inrange_less05, loinc_lab_count_inrange_less05,
    ]),
    ("number of LOINC labs types (lung cancer, identified by icd in data request form)\n", [
        loinc_patient_count_lung, loinc_lab_count_lung,
        loinc_patient_count_inrange_lung, loinc_lab_count_inrange_lung,
        loinc_patient_count_inrange_lung_1, loinc_lab_count_inrange_lung_1,
        loinc_patient_count_inrange_lung_less1, loinc_lab_count_inrange_lung_less1,
        loinc_patient_count_inrange_lung_05, loinc_lab_count_inrange_lung_05,
        loinc_patient_count_inrange_lung_less05, loinc_lab_count_inrange_lung_less05,
    ]),
    ("number of LOINC labs types (colorectal cancer, identified by icd in data request form)\n", [
        loinc_patient_count_colorectal, loinc_lab_count_colorectal,
        loinc_patient_count_inrange_colorectal, loinc_lab_count_inrange_colorectal,
        loinc_patient_count_inrange_colorectal_1, loinc_lab_count_inrange_colorectal_1,
        loinc_patient_count_inrange_colorectal_less1, loinc_lab_count_inrange_colorectal_less1,
        loinc_patient_count_inrange_colorectal_05, loinc_lab_count_inrange_colorectal_05,
        loinc_patient_count_inrange_colorectal_less05, loinc_lab_count_inrange_colorectal_less05,
    ]),
    ("number of LOINC labs types (breast cancer, identified by icd in data request form)\n", [
        loinc_patient_count_breast, loinc_lab_count_breast,
        loinc_patient_count_inrange_breast, loinc_lab_count_inrange_breast,
        loinc_patient_count_inrange_breast_1, loinc_lab_count_inrange_breast_1,
        loinc_patient_count_inrange_breast_less1, loinc_lab_count_inrange_breast_less1,
        loinc_patient_count_inrange_breast_05, loinc_lab_count_inrange_breast_05,
        loinc_patient_count_inrange_breast_less05, loinc_lab_count_inrange_breast_less05,
    ]),
]

for heading, frames in loinc_type_report:
    # Heading (ending in a newline) followed by the space-separated row counts.
    print(heading, *map(len, frames))

number of LOINC labs types (whole data set)
 4585 4585 2420 2420 2836 2836 2855 2855 3059 3059 3078 3078
number of LOINC labs types (lung cancer, identified by icd in data request form)
 3627 3627 1910 1910 2232 2232 2246 2246 2439 2439 2450 2450
number of LOINC labs types (colorectal cancer, identified by icd in data request form)
 3527 3527 1780 1780 2201 2201 2220 2220 2388 2388 2406 2406
number of LOINC labs types (breast cancer, identified by icd in data request form)
 3899 3899 1929 1929 2134 2134 2147 2147 2337 2337 2347 2347


In [43]:
# All per-LOINC summary tables, in the column order wanted in the final table.
# Each line holds a (patient count, lab-result count) pair.
dfs = [
    # whole data set
    loinc_patient_count, loinc_lab_count,
    loinc_patient_count_inrange, loinc_lab_count_inrange,
    loinc_patient_count_inrange_1, loinc_lab_count_inrange_1,
    loinc_patient_count_inrange_less1, loinc_lab_count_inrange_less1,
    loinc_patient_count_inrange_05, loinc_lab_count_inrange_05,
    loinc_patient_count_inrange_less05, loinc_lab_count_inrange_less05,

    # lung
    loinc_patient_count_lung, loinc_lab_count_lung,
    loinc_patient_count_inrange_lung, loinc_lab_count_inrange_lung,
    loinc_patient_count_inrange_lung_1, loinc_lab_count_inrange_lung_1,
    loinc_patient_count_inrange_lung_less1, loinc_lab_count_inrange_lung_less1,
    loinc_patient_count_inrange_lung_05, loinc_lab_count_inrange_lung_05,
    loinc_patient_count_inrange_lung_less05, loinc_lab_count_inrange_lung_less05,

    # colorectal
    loinc_patient_count_colorectal, loinc_lab_count_colorectal,
    loinc_patient_count_inrange_colorectal, loinc_lab_count_inrange_colorectal,
    loinc_patient_count_inrange_colorectal_1, loinc_lab_count_inrange_colorectal_1,
    loinc_patient_count_inrange_colorectal_less1, loinc_lab_count_inrange_colorectal_less1,
    loinc_patient_count_inrange_colorectal_05, loinc_lab_count_inrange_colorectal_05,
    loinc_patient_count_inrange_colorectal_less05, loinc_lab_count_inrange_colorectal_less05,

    # breast
    loinc_patient_count_breast, loinc_lab_count_breast,
    loinc_patient_count_inrange_breast, loinc_lab_count_inrange_breast,
    loinc_patient_count_inrange_breast_1, loinc_lab_count_inrange_breast_1,
    loinc_patient_count_inrange_breast_less1, loinc_lab_count_inrange_breast_less1,
    loinc_patient_count_inrange_breast_05, loinc_lab_count_inrange_breast_05,
    loinc_patient_count_inrange_breast_less05, loinc_lab_count_inrange_breast_less05,
]

# Left-join every table onto the first one by LOINC code, one after another.
# The first table (`loinc_patient_count`) therefore fixes the set and order of
# rows; a code missing from a later table gets NaN in that table's column.
loinc_count_assemble = dfs[0]
for frame in dfs[1:]:
    loinc_count_assemble = loinc_count_assemble.merge(frame, on='LAB_LOINC', how='left')

In [44]:
# A LOINC code absent from one of the left-merged tables is NaN in that table's
# column; record it as a count of 0 instead.
loinc_count_assemble = loinc_count_assemble.fillna(value=0)

In [45]:
# Preview the first 20 rows of the assembled table.
loinc_count_assemble.head(n=20)

Unnamed: 0,LAB_LOINC,Num_Patient_Total,Num_LabResult_Total,Num_Patient_Total_inrange_6_2,Num_LabResult_Total_inrange_6_2,Num_Patient_Total_inrange_6_1,Num_LabResult_Total_inrange_6_1,Num_Patient_Total_inrange_less1,Num_LabResult_Total_inrange_less1,Num_Patient_Total_inrange_6_05,...,Num_Patient_Total_inrange_breast_6_2,Num_LabResult_Total_inrange_breast_6_2,Num_Patient_Total_inrange_breast_6_1,Num_LabResult_Total_inrange_breast_6_1,Num_Patient_Total_inrange_breast_less1,Num_LabResult_Total_inrange_breast_less1,Num_Patient_Total_inrange_breast_6_05,Num_LabResult_Total_inrange_breast_6_05,Num_Patient_Total_inrange_breast_less05,Num_LabResult_Total_inrange_breast_less05
0,718-7,143364,2816725,27045.0,209031.0,36656.0,326349.0,38646.0,363425.0,43479.0,...,11332.0,66530.0,15447.0,101510.0,16389.0,115312.0,18376.0,126193.0,19257.0,139995.0
1,2160-0,143104,3000442,27206.0,221019.0,36867.0,346025.0,38717.0,384140.0,43699.0,...,11227.0,70119.0,15387.0,107603.0,16239.0,121224.0,18331.0,133805.0,19128.0,147426.0
2,3094-0,133077,2197373,21598.0,142239.0,30156.0,230569.0,31291.0,252162.0,36233.0,...,8663.0,42184.0,12237.0,67107.0,12732.0,74405.0,14771.0,85184.0,15241.0,92482.0
3,2345-7,131690,2191883,21105.0,142812.0,29522.0,231294.0,30557.0,252545.0,35520.0,...,8479.0,42580.0,11985.0,67689.0,12434.0,74872.0,14485.0,85867.0,14909.0,93050.0
4,2823-3,131682,2241476,21011.0,144335.0,29436.0,234278.0,30457.0,255722.0,35419.0,...,8421.0,42595.0,11930.0,67851.0,12371.0,75087.0,14416.0,86230.0,14835.0,93466.0
5,17861-6,131649,2175532,21005.0,140340.0,29428.0,227634.0,30452.0,248460.0,35416.0,...,8424.0,41617.0,11934.0,66239.0,12379.0,73238.0,14425.0,84129.0,14847.0,91128.0
6,2951-2,131625,2202062,20970.0,141205.0,29369.0,228917.0,30389.0,249857.0,35336.0,...,8404.0,41685.0,11898.0,66310.0,12338.0,73339.0,14375.0,84302.0,14793.0,91331.0
7,2075-0,131549,2164607,20967.0,138999.0,29372.0,225739.0,30390.0,246352.0,35344.0,...,8405.0,41189.0,11901.0,65604.0,12339.0,72521.0,14385.0,83355.0,14801.0,90272.0
8,1975-2,123443,1325111,18665.0,80563.0,26300.0,131089.0,27295.0,143561.0,31712.0,...,7516.0,26649.0,10690.0,42199.0,11134.0,46829.0,12901.0,53185.0,13323.0,57815.0
9,1920-8,122342,1285056,18308.0,76906.0,25840.0,125578.0,26783.0,137082.0,31160.0,...,7342.0,25204.0,10436.0,40082.0,10848.0,44217.0,12591.0,50533.0,12979.0,54668.0


In [46]:
# List every column of the assembled table (the preview above elides the middle ones).
loinc_count_assemble.columns

Index(['LAB_LOINC', 'Num_Patient_Total', 'Num_LabResult_Total',
       'Num_Patient_Total_inrange_6_2', 'Num_LabResult_Total_inrange_6_2',
       'Num_Patient_Total_inrange_6_1', 'Num_LabResult_Total_inrange_6_1',
       'Num_Patient_Total_inrange_less1', 'Num_LabResult_Total_inrange_less1',
       'Num_Patient_Total_inrange_6_05', 'Num_LabResult_Total_inrange_6_05',
       'Num_Patient_Total_inrange_less05',
       'Num_LabResult_Total_inrange_less05', 'Num_Patient_Total_lung',
       'Num_LabResult_Total_lung', 'Num_Patient_Total_inrange_lung_6_2',
       'Num_LabResult_Total_inrange_lung_6_2',
       'Num_Patient_Total_inrange_lung_6_1',
       'Num_LabResult_Total_inrange_lung_6_1',
       'Num_Patient_Total_inrange_lung_less1',
       'Num_LabResult_Total_inrange_lung_less1',
       'Num_Patient_Total_inrange_lung_6_05',
       'Num_LabResult_Total_inrange_lung_6_05',
       'Num_Patient_Total_inrange_lung_less05',
       'Num_LabResult_Total_inrange_lung_less05',
       'Num_Pati

In [47]:
# Save the assembled table as a CSV in the current working directory.
# `index` is left at its default, so the row index is written as the first,
# unnamed column of the file.
loinc_count_assemble.to_csv(path_or_buf='LAB_LOINC_STAT.csv')