# Data Preprocessing

In [2]:
import numpy as np # Linear algebra
import pandas as pd # Data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt  # Matlab-style plotting
# Make sure plot shows immediately
%matplotlib inline 
import seaborn as sns # Library for plotting
color = sns.color_palette()
sns.set_style('darkgrid')

In [3]:
df_claim = pd.read_csv("./data/Claims.csv")
df_member = pd.read_csv("./data/Members.csv", index_col=["MemberID"])
df_lab = pd.read_csv("./data/LabCount.csv")
df_drug = pd.read_csv("./data/DrugCount.csv")

# Preprocessing

Convert `PayDelay` to numeric by converting `162+` to `162

In [4]:
df_claim["PayDelay"].replace({'162+': 162}, inplace=True)
df_claim["PayDelay"] = df_claim["PayDelay"].astype("int32")

Covert `LengthOfStay` to average number of days

In [5]:
%%time
# convert los to day
los_value = [ "1 day", "2 days", "3 days", "4 days", "5 days", "6 days", "1- 2 weeks", "2- 4 weeks", "4- 8 weeks", "8- 12 weeks", "12- 26 weeks", "26+ weeks"]
los_replace = [ 1, 2, 3, 4, 5, 6, 11, 21, 42, 84, 133, 182]

for i, j in zip(los_value, los_replace):
    df_claim.loc[df_claim["LengthOfStay"] == i, "LengthOfStay"] = j

df_claim["LengthOfStay"] = df_claim["LengthOfStay"].astype("float64")

CPU times: user 1.25 s, sys: 132 ms, total: 1.38 s
Wall time: 1.9 s


Convert `DSFS` to number of months

In [6]:
%%time
# convert to month
dsfs_value = [ '0- 1 month', '1- 2 months', '2- 3 months', '3- 4 months', '4- 5 months', '5- 6 months', '6- 7 months', '7- 8 months', '8- 9 months', '9-10 months', '10-11 months', '11-12 months']
dsfs_replace = [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 ]

try:
    for i, j in zip(dsfs_value, dsfs_replace):
        df_claim["DSFS"].replace({i: j}, inplace=True)

    df_claim["DSFS"] = df_claim["DSFS"].astype("float64")
except: 
    pass

CPU times: user 1.48 s, sys: 173 ms, total: 1.65 s
Wall time: 2.33 s


Convert `CharlsonIndex` to number

In [7]:
%%time
# convert to month
ch_value = [ '0', '1-2', '3-4', '5+' ]
ch_replace = [ 0, 2, 4, 6 ]

try:
    for i, j in zip(ch_value, ch_replace):
        df_claim["CharlsonIndex"].replace({i: j}, inplace=True)
       
    df_claim["CharlsonIndex"] = df_claim["CharlsonIndex"].astype("float64")
except: 
    pass

CPU times: user 847 ms, sys: 116 ms, total: 963 ms
Wall time: 1.16 s


Convert `LabCount` to number

In [8]:
df_lab["LabCount"].replace({ "10+": 10 }, inplace=True)
df_lab["LabCount"] = df_lab["LabCount"].astype("float64")

Convert `DrugCount` to number

In [9]:
df_drug["DrugCount"].replace({ "7+": 7 }, inplace=True)
df_drug["DrugCount"] = df_drug["DrugCount"].astype("float64")

## Member

Convert `AgeAtFirstClaim` to mean value. Fill `NaN` by `-1`

In [10]:
age_list = list(df_member["AgeAtFirstClaim"].value_counts().index)
age_list.sort()
age_list

['0-9', '10-19', '20-29', '30-39', '40-49', '50-59', '60-69', '70-79', '80+']

In [11]:
mean_age_list = [ 5, 15, 25, 35, 45, 55, 65, 75, 85]

In [12]:
for age, mean_age in zip(age_list, mean_age_list):
    df_member.loc[df_member["AgeAtFirstClaim"] == age, "AgeAtFirstClaim"] = mean_age

df_member.loc[df_member["AgeAtFirstClaim"].isna(), "AgeAtFirstClaim"] = -1

In [13]:
df_member.to_csv("./processed/Members.csv")

## ClaimCount

Calculate `ClaimCount` for each `MemberID` in `Y1`, `Y2` and both `Y1-Y2`

In [14]:
df_claim_y1 = df_claim[df_claim["Year"] == "Y1"]
print(df_claim_y1.shape)
df_claim_y2 = df_claim[df_claim["Year"] == "Y2"]
print(df_claim_y2.shape)
df_claim_y12 = df_claim[(df_claim["Year"] == "Y1") | (df_claim["Year"] == "Y2")]
print(df_claim_y12.shape)

(865689, 14)
(898872, 14)
(1764561, 14)


In [15]:
def value_counts(df, count_by, col_name):
    cnt = df[count_by].value_counts()
    return pd.DataFrame({ count_by: list(cnt.index), col_name: list(cnt)}).set_index(count_by)

In [16]:
df_claim_count_y1 = value_counts(df_claim_y1, "MemberID", "ClaimCount")
df_claim_count_y2 = value_counts(df_claim_y2, "MemberID", "ClaimCount")
df_claim_count_y12 = value_counts(df_claim_y12, "MemberID", "ClaimCount")

In [17]:
df_claim_count_y1.to_csv("./processed/ClaimCount_Y1.csv")
df_claim_count_y2.to_csv("./processed/ClaimCount_Y2.csv")
df_claim_count_y12.to_csv("./processed/ClaimCount_Y12.csv")

## Provider

In [18]:
def value_counts_unique(df, count_by, count_unique_col, col_name):
    cnt = df.groupby(count_by)[count_unique_col].nunique(dropna=True)
    return pd.DataFrame({ count_by: cnt.index, col_name: cnt}).set_index(count_by)

Count unique `ProviderID` each `MemberID` in `Y1`, `Y2`, `Y1-Y2`

In [19]:
df_pvd_count_y1 = value_counts_unique(df_claim_y1, "MemberID", "ProviderID", "ProviderID")
df_pvd_count_y2 = value_counts_unique(df_claim_y2, "MemberID", "ProviderID", "ProviderID")
df_pvd_count_y12 = value_counts_unique(df_claim_y12, "MemberID", "ProviderID", "ProviderID")

In [20]:
df_pvd_count_y1.to_csv("./processed/ProviderID_Y1.csv")
df_pvd_count_y2.to_csv("./processed/ProviderID_Y2.csv")
df_pvd_count_y12.to_csv("./processed/ProviderID_Y12.csv")

## Vendor

Count unique `Vendor` each `MemberID` in `Y1`, `Y2`, `Y1-Y2`

In [21]:
df_v_count_y1 = value_counts_unique(df_claim_y1, "MemberID", "Vendor", "Vendor")
df_v_count_y2 = value_counts_unique(df_claim_y1, "MemberID", "Vendor", "Vendor")
df_v_count_y12 = value_counts_unique(df_claim_y1, "MemberID", "Vendor", "Vendor")

In [22]:
df_v_count_y1.to_csv("./processed/Vendor_Y1.csv")
df_v_count_y2.to_csv("./processed/Vendor_Y2.csv")
df_v_count_y12.to_csv("./processed/Vendor_Y12.csv")

## PCP

Count unique `PCP` each `MemberID` in `Y1`, `Y2`, `Y1-Y2`

In [23]:
df_pcp_count_y1 = value_counts_unique(df_claim_y1, "MemberID", "PCP", "PCP")
df_pcp_count_y2 = value_counts_unique(df_claim_y2, "MemberID", "PCP", "PCP")
df_pcp_count_y12 = value_counts_unique(df_claim_y12, "MemberID", "PCP", "PCP")

In [24]:
df_pcp_count_y1.to_csv("./processed/PCP_Y1.csv")
df_pcp_count_y2.to_csv("./processed/PCP_Y2.csv")
df_pcp_count_y12.to_csv("./processed/PCP_Y12.csv")

## Specialty

Count each kind of `Speciaty` each `MemberID` in `Y1`, `Y2`, `Y1-Y2`

In [28]:
def one_hot_count(index_col, count_value_col, df, one_hot_cols):
    # get unique index
    index_list = list(df[index_col].unique())
    # create df
    result_df = pd.DataFrame(index=index_list)
    result_df.index.name = index_col
    # fill one hot columns
    for col in one_hot_cols:
        col_name = "{}Count_{}".format(count_value_col, col)
        result_df[col_name] = 0
    # query and fill count
    query = df.groupby([index_col])[count_value_col].value_counts(dropna=False)
    i = 0
    length = len(index_list)
    for index in index_list:
        for col in one_hot_cols:
            col_name = "{}Count_{}".format(count_value_col, col)
            try:
                result_df.loc[index][col_name] = query.loc[(index, col)]
            except:
                continue

        if i % 10000 == 0:
            print("Processed: {}/{}".format(i, length))
        i += 1
    print("Processed: {}/{} DONE".format(i, length))
    return result_df

In [29]:
spe_unique_y1 = list(df_claim_y1["Specialty"].unique())
spe_unique_y2 = list(df_claim_y2["Specialty"].unique())
spe_unique_y12 = list(df_claim_y12["Specialty"].unique())

In [30]:
%%time
df_spe_count_y1 = one_hot_count("MemberID", "Specialty", df_claim_y1, spe_unique_y1)
df_spe_count_y2 = one_hot_count("MemberID", "Specialty", df_claim_y2, spe_unique_y2)
df_spe_count_y12 = one_hot_count("MemberID", "Specialty", df_claim_y12, spe_unique_y12)

Processed: 0/76038
Processed: 10000/76038
Processed: 20000/76038
Processed: 30000/76038
Processed: 40000/76038
Processed: 50000/76038
Processed: 60000/76038
Processed: 70000/76038
Processed: 76038/76038 DONE
Processed: 0/71435
Processed: 10000/71435
Processed: 20000/71435
Processed: 30000/71435
Processed: 40000/71435
Processed: 50000/71435
Processed: 60000/71435
Processed: 70000/71435
Processed: 71435/71435 DONE
Processed: 0/95507
Processed: 10000/95507
Processed: 20000/95507
Processed: 30000/95507
Processed: 40000/95507
Processed: 50000/95507
Processed: 60000/95507
Processed: 70000/95507
Processed: 80000/95507
Processed: 90000/95507
Processed: 95507/95507 DONE
CPU times: user 6min 34s, sys: 6.93 s, total: 6min 41s
Wall time: 11min 15s


In [31]:
df_spe_count_y1.to_csv("./processed/SpecialtyCount_Y1.csv")
df_spe_count_y2.to_csv("./processed/SpecialtyCount_Y2.csv")
df_spe_count_y12.to_csv("./processed/SpecialtyCount_Y12.csv")

## PlaceSvc

Count each kind of `PlaceSvc` each `MemberID` in `Y1`, `Y2`, `Y1-Y2`

In [32]:
psvc_unique_y1 = list(df_claim_y1["PlaceSvc"].unique())
psvc_unique_y2 = list(df_claim_y2["PlaceSvc"].unique())
psvc_unique_y12 = list(df_claim_y12["PlaceSvc"].unique())

In [33]:
%%time
df_psvc_count_y1 = one_hot_count("MemberID", "PlaceSvc", df_claim_y1, psvc_unique_y1)
df_psvc_count_y2 = one_hot_count("MemberID", "PlaceSvc", df_claim_y2, psvc_unique_y2)
df_psvc_count_y12 = one_hot_count("MemberID", "PlaceSvc", df_claim_y12, psvc_unique_y12)

Processed: 0/76038
Processed: 10000/76038
Processed: 20000/76038
Processed: 30000/76038
Processed: 40000/76038
Processed: 50000/76038
Processed: 60000/76038
Processed: 70000/76038
Processed: 76038/76038 DONE
Processed: 0/71435
Processed: 10000/71435
Processed: 20000/71435
Processed: 30000/71435
Processed: 40000/71435
Processed: 50000/71435
Processed: 60000/71435
Processed: 70000/71435
Processed: 71435/71435 DONE
Processed: 0/95507
Processed: 10000/95507
Processed: 20000/95507
Processed: 30000/95507
Processed: 40000/95507
Processed: 50000/95507
Processed: 60000/95507
Processed: 70000/95507
Processed: 80000/95507
Processed: 90000/95507
Processed: 95507/95507 DONE
CPU times: user 4min 7s, sys: 3.88 s, total: 4min 11s
Wall time: 5min 46s


In [34]:
df_psvc_count_y1.to_csv("./processed/PlaceSvcCount_Y1.csv")
df_psvc_count_y2.to_csv("./processed/PlaceSvcCount_Y2.csv")
df_psvc_count_y12.to_csv("./processed/PlaceSvcCount_Y12.csv")

## PayDelay

Calculate some metrics `min`, `max`, `avg`, `std`, `sum` of `PayDelay` in `Y1`, `Y2`, `Y1-Y2`

In [146]:
def process_paydelay(df):
    pdl_min = df.groupby(["MemberID"])["PayDelay"].min()
    pdl_max = df.groupby(["MemberID"])["PayDelay"].max()
    pdl_avg = df.groupby(["MemberID"])["PayDelay"].mean()
    pdl_std = df.groupby(["MemberID"])["PayDelay"].std()
    pdl_sum = df.groupby(["MemberID"])["PayDelay"].sum()

    result = pd.concat([ 
        pd.DataFrame(np.array(pdl_min).T, columns=["PayDelayMin"], index=pdl_min.index),
        pd.DataFrame(np.array(pdl_max).T, columns=["PayDelayMax"], index=pdl_max.index),
        pd.DataFrame(np.array(pdl_avg).T, columns=["PayDelayAvg"], index=pdl_avg.index),
        pd.DataFrame(np.array(pdl_std).T, columns=["PayDelayStd"], index=pdl_std.index),
        pd.DataFrame(np.array(pdl_sum).T, columns=["PayDelaySum"], index=pdl_sum.index)
    ], axis=1)
    result["PayDelayStd"].fillna(-1)
    result.fillna(0)
    return result

In [147]:
df_pdl_y1 = process_paydelay(df_claim_y1)
df_pdl_y2 = process_paydelay(df_claim_y2)
df_pdl_y12 = process_paydelay(df_claim_y12)

In [148]:
df_pdl_y1.to_csv("./processed/PayDelay_Y1.csv")
df_pdl_y2.to_csv("./processed/PayDelay_Y2.csv")
df_pdl_y12.to_csv("./processed/PayDelay_Y12.csv")

## LengthOfStay

Calculate some metrics `min`, `max`, `avg`, `std`, `sum` of `LengthOfStay` in `Y1`, `Y2`, `Y1-Y2`

In [125]:
def process_los(df):
    los_min = df.groupby(["MemberID"])["LengthOfStay"].min()
    los_max = df.groupby(["MemberID"])["LengthOfStay"].max()
    los_avg = df.groupby(["MemberID"])["LengthOfStay"].mean()
    los_std = df.groupby(["MemberID"])["LengthOfStay"].std()
    los_sum = df.groupby(["MemberID"])["LengthOfStay"].sum()
    los_nan = df[df["LengthOfStay"].isna()].groupby(["MemberID"])["MemberID"].count()

    return pd.concat([ 
        pd.DataFrame(np.array(los_min).T, columns=["LengthOfStayMin"], index=los_min.index),
        pd.DataFrame(np.array(los_max).T, columns=["LengthOfStayMax"], index=los_max.index),
        pd.DataFrame(np.array(los_avg).T, columns=["LengthOfStayAvg"], index=los_avg.index),
        pd.DataFrame(np.array(los_std).T, columns=["LengthOfStayStd"], index=los_std.index),
        pd.DataFrame(np.array(los_sum).T, columns=["LengthOfStaySum"], index=los_sum.index),
        pd.DataFrame(np.array(los_nan).T, columns=["LengthOfStayCountNan"], index=los_nan.index)
    ], axis=1).fillna(0)

In [126]:
df_los_y1 = process_los(df_claim_y1)
df_los_y2 = process_los(df_claim_y2)
df_los_y12 = process_los(df_claim_y12)

In [127]:
df_los_y1.to_csv("./processed/LengthOfStay_Y1.csv")
df_los_y2.to_csv("./processed/LengthOfStay_Y2.csv")
df_los_y12.to_csv("./processed/LengthOfStay_Y12.csv")

## DSFS

Calculate some metrics `min`, `max` of `DSFS` in `Y1`, `Y2`, `Y1-Y2`

In [42]:
def process_dsfs(df):
    dsfs_min = df.groupby(["MemberID"])["DSFS"].min()
    dsfs_max = df.groupby(["MemberID"])["DSFS"].max()

    return pd.concat([
        pd.DataFrame(np.array(dsfs_min).T, columns=["DSFSMin"], index=dsfs_min.index),
        pd.DataFrame(np.array(dsfs_max).T, columns=["DSFSMax"], index=dsfs_max.index),
    ], axis=1)

In [43]:
%%time
df_dsfs_y1 = process_dsfs(df_claim_y1)
df_dsfs_y2 = process_dsfs(df_claim_y2)
df_dsfs_y12 = process_dsfs(df_claim_y12)

CPU times: user 483 ms, sys: 68.3 ms, total: 552 ms
Wall time: 758 ms


In [44]:
df_dsfs_y1.to_csv("./processed/DSFS_Y1.csv")
df_dsfs_y2.to_csv("./processed/DSFS_Y2.csv")
df_dsfs_y12.to_csv("./processed/DSFS_Y12.csv")

## CharlsonIndex

In [None]:
Calculate some metrics `min`, `max`, `avg` of `CharlsonIndex` in `Y1`, `Y2`, `Y1-Y2`

In [45]:
def process_charlsonindex(df):
    ch_min = df.groupby(["MemberID"])["CharlsonIndex"].min()
    ch_max = df.groupby(["MemberID"])["CharlsonIndex"].max()
    ch_avg = df.groupby(["MemberID"])["CharlsonIndex"].mean()

    return pd.concat([
        pd.DataFrame(np.array(ch_min).T, columns=["CharlsonIndexMin"], index=ch_min.index),
        pd.DataFrame(np.array(ch_max).T, columns=["CharlsonIndexMax"], index=ch_max.index),
        pd.DataFrame(np.array(ch_avg).T, columns=["CharlsonIndexAvg"], index=ch_avg.index),
    ], axis=1)

In [46]:
df_ch_y1 = process_charlsonindex(df_claim_y1)
df_ch_y2 = process_charlsonindex(df_claim_y2)
df_ch_y12 = process_charlsonindex(df_claim_y12)

In [47]:
df_ch_y1.to_csv("./processed/CharlsonIndex_Y1.csv")
df_ch_y2.to_csv("./processed/CharlsonIndex_Y2.csv")
df_ch_y12.to_csv("./processed/CharlsonIndex_Y12.csv")

## PrimaryConditionGroup

Count each kind of `PrimaryConditionGroup` each `MemberID` in `Y1`, `Y2`, `Y1-Y2`

In [48]:
pcg_unique_y1 = list(df_claim_y1["PrimaryConditionGroup"].unique())
pcg_unique_y2 = list(df_claim_y2["PrimaryConditionGroup"].unique())
pcg_unique_y12 = list(df_claim_y12["PrimaryConditionGroup"].unique())

In [49]:
%%time
df_pcg_count_y1 = one_hot_count("MemberID", "PrimaryConditionGroup", df_claim_y1, pcg_unique_y1)
df_pcg_count_y2 = one_hot_count("MemberID", "PrimaryConditionGroup", df_claim_y2, pcg_unique_y2)
df_pcg_count_y12 = one_hot_count("MemberID", "PrimaryConditionGroup", df_claim_y12, pcg_unique_y12)

Processed: 0/76038
Processed: 10000/76038
Processed: 20000/76038
Processed: 30000/76038
Processed: 40000/76038
Processed: 50000/76038
Processed: 60000/76038
Processed: 70000/76038
Processed: 76038/76038 DONE
Processed: 0/71435
Processed: 10000/71435
Processed: 20000/71435
Processed: 30000/71435
Processed: 40000/71435
Processed: 50000/71435
Processed: 60000/71435
Processed: 70000/71435
Processed: 71435/71435 DONE
Processed: 0/95507
Processed: 10000/95507
Processed: 20000/95507
Processed: 30000/95507
Processed: 40000/95507
Processed: 50000/95507
Processed: 60000/95507
Processed: 70000/95507
Processed: 80000/95507
Processed: 90000/95507
Processed: 95507/95507 DONE
CPU times: user 16min, sys: 14.8 s, total: 16min 15s
Wall time: 23min 14s


In [50]:
df_pcg_count_y1.to_csv("./processed/PrimaryConditionGroup_Y1.csv")
df_pcg_count_y2.to_csv("./processed/PrimaryConditionGroup_Y2.csv")
df_pcg_count_y12.to_csv("./processed/PrimaryConditionGroup_Y12.csv")

## ProcedureGroup

Count each kind of `ProcedureGroup` each `MemberID` in `Y1`, `Y2`, `Y1-Y2`

In [51]:
pg_unique_y1 = list(df_claim_y1["ProcedureGroup"].unique())
pg_unique_y2 = list(df_claim_y2["ProcedureGroup"].unique())
pg_unique_y12 = list(df_claim_y12["ProcedureGroup"].unique())

In [52]:
%%time
df_pg_count_y1 = one_hot_count("MemberID", "ProcedureGroup", df_claim_y1, pg_unique_y1)
df_pg_count_y2 = one_hot_count("MemberID", "ProcedureGroup", df_claim_y2, pg_unique_y2)
df_pg_count_y12 = one_hot_count("MemberID", "ProcedureGroup", df_claim_y12, pg_unique_y12)

Processed: 0/76038
Processed: 10000/76038
Processed: 20000/76038
Processed: 30000/76038
Processed: 40000/76038
Processed: 50000/76038
Processed: 60000/76038
Processed: 70000/76038
Processed: 76038/76038 DONE
Processed: 0/71435
Processed: 10000/71435
Processed: 20000/71435
Processed: 30000/71435
Processed: 40000/71435
Processed: 50000/71435
Processed: 60000/71435
Processed: 70000/71435
Processed: 71435/71435 DONE
Processed: 0/95507
Processed: 10000/95507
Processed: 20000/95507
Processed: 30000/95507
Processed: 40000/95507
Processed: 50000/95507
Processed: 60000/95507
Processed: 70000/95507
Processed: 80000/95507
Processed: 90000/95507
Processed: 95507/95507 DONE
CPU times: user 9min 35s, sys: 9.96 s, total: 9min 45s
Wall time: 14min 35s


In [53]:
df_pg_count_y1.to_csv("./processed/ProcedureGroup_Y1.csv")
df_pg_count_y2.to_csv("./processed/ProcedureGroup_Y2.csv")
df_pg_count_y12.to_csv("./processed/ProcedureGroup_Y12.csv")

## LabCount

In [54]:
df_lab_y1 = df_lab[df_lab["Year"] == "Y1"]
print(df_lab_y1.shape)
df_lab_y2 = df_lab[df_lab["Year"] == "Y2"]
print(df_lab_y2.shape)
df_lab_y12 = df_lab[(df_lab["Year"] == "Y1") | (df_lab["Year"] == "Y2")]
print(df_lab_y12.shape)

(120162, 4)
(122416, 4)
(242578, 4)


Calculate some metrics `min`, `max`, `avg`, `std`, `sum` of `LabCount` in `Y1`, `Y2`, `Y1-Y2`

In [157]:
def process_lab(df):
    lab_min = df.groupby(["MemberID"])["LabCount"].min()
    lab_max = df.groupby(["MemberID"])["LabCount"].max()
    lab_avg = df.groupby(["MemberID"])["LabCount"].mean()
    lab_std = df.groupby(["MemberID"])["LabCount"].std()
    lab_sum = df.groupby(["MemberID"])["LabCount"].sum()
    lab_claim_count = df.groupby(["MemberID"])["MemberID"].count()

    result = pd.concat([
        pd.DataFrame(np.array(lab_min).T, columns=["LabCountMin"], index=lab_min.index),
        pd.DataFrame(np.array(lab_max).T, columns=["LabCountMax"], index=lab_max.index),
        pd.DataFrame(np.array(lab_avg).T, columns=["LabCountAvg"], index=lab_avg.index),
        pd.DataFrame(np.array(lab_std).T, columns=["LabCountStd"], index=lab_std.index),
        pd.DataFrame(np.array(lab_sum).T, columns=["LabCountSum"], index=lab_sum.index),
        pd.DataFrame(np.array(lab_claim_count).T, columns=["LabClaimCount"], index=lab_claim_count.index),
    ], axis=1)
    result["LabCountStd"].fillna(-1.0)
    result.fillna(0.0)
    return result


In [158]:
%%time
df_lab_count_y1 = process_lab(df_lab_y1)
df_lab_count_y2 = process_lab(df_lab_y2)
df_lab_count_y12 = process_lab(df_lab_y12)

CPU times: user 384 ms, sys: 62.8 ms, total: 447 ms
Wall time: 998 ms


In [159]:
df_lab_count_y1.to_csv("./processed/LabCount_Y1.csv")
df_lab_count_y2.to_csv("./processed/LabCount_Y2.csv")
df_lab_count_y12.to_csv("./processed/LabCount_Y12.csv")

## DrugCount

In [58]:
df_drug_y1 = df_drug[df_drug["Year"] == "Y1"]
print(df_drug_y1.shape)
df_drug_y2 = df_drug[df_drug["Year"] == "Y2"]
print(df_drug_y2.shape)
df_drug_y12 = df_drug[(df_drug["Year"] == "Y1") | (df_drug["Year"] == "Y2")]
print(df_drug_y12.shape)

(281619, 4)
(276027, 4)
(557646, 4)


Calculate some metrics `min`, `max`, `avg`, `std`, `sum` of `DrugCount` in `Y1`, `Y2`, `Y1-Y2`

In [156]:
def process_drug(df):
    drug_min = df.groupby(["MemberID"])["DrugCount"].min()
    drug_max = df.groupby(["MemberID"])["DrugCount"].max()
    drug_avg = df.groupby(["MemberID"])["DrugCount"].mean()
    drug_std = df.groupby(["MemberID"])["DrugCount"].std()
    drug_sum = df.groupby(["MemberID"])["DrugCount"].sum()
    drug_claim_count = df.groupby(["MemberID"])["DrugCount"].count()

    result = pd.concat([
        pd.DataFrame(np.array(drug_min).T, columns=["DrugCountMin"], index=drug_min.index),
        pd.DataFrame(np.array(drug_max).T, columns=["DrugCountMax"], index=drug_max.index),
        pd.DataFrame(np.array(drug_avg).T, columns=["DrugCountAvg"], index=drug_avg.index),
        pd.DataFrame(np.array(drug_std).T, columns=["DrugCountStd"], index=drug_std.index),
        pd.DataFrame(np.array(drug_sum).T, columns=["DrugCountSum"], index=drug_sum.index),
        pd.DataFrame(np.array(drug_claim_count).T, columns=["DrugClaimCount"], index=drug_claim_count.index),
    ], axis=1)
    result["DrugCountStd"].fillna(-1.0)
    result.fillna(0.0)
    return result


In [160]:
%%time
df_drug_count_y1 = process_drug(df_drug_y1)
df_drug_count_y2 = process_drug(df_drug_y2)
df_drug_count_y12 = process_drug(df_drug_y12)

CPU times: user 543 ms, sys: 111 ms, total: 653 ms
Wall time: 1.56 s


In [161]:
df_drug_count_y1.to_csv("./processed/DrugCount_Y1.csv")
df_drug_count_y2.to_csv("./processed/DrugCount_Y2.csv")
df_drug_count_y12.to_csv("./processed/DrugCount_Y12.csv")

## Build Strategy A dataset

In [62]:
df_dih_y2 = pd.read_csv("./data/DaysInHospital_Y2.csv", index_col=["MemberID"])

In [162]:
df_a_y1 = pd.concat([
        df_member,
        df_claim_count_y1,
        df_pvd_count_y1,
        df_v_count_y1,
        df_pcp_count_y1,
        df_spe_count_y1,
        df_psvc_count_y1,
        df_pdl_y1,
        df_los_y1,
        df_dsfs_y1,
        df_ch_y1,
        df_pcg_count_y1,
        df_pg_count_y1,
        df_lab_count_y1,
        df_drug_count_y1,
        df_dih_y2["DaysInHospital"]
    ], axis=1)

In [129]:
df_a_y1

Unnamed: 0_level_0,AgeAtFirstClaim,Sex,ClaimCount,ProviderID,Vendor,PCP,SpecialtyCount_Surgery,SpecialtyCount_Rehabilitation,SpecialtyCount_Diagnostic Imaging,SpecialtyCount_Emergency,...,LabCountStd,LabCountSum,LabClaimCount,DrugCountMin,DrugCountMax,DrugCountAvg,DrugCountStd,DrugCountSum,DrugClaimCount,DaysInHospital
MemberID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,5,M,,,,,,,,,...,,,,,,,,,,
210,35,,8.0,4.0,4.0,2.0,0.0,0.0,0.0,2.0,...,0.000000,2.0,1.0,1.0,2.0,1.666667,0.57735,5.0,3.0,0.0
3197,5,F,5.0,3.0,3.0,1.0,0.0,0.0,0.0,2.0,...,,,,1.0,2.0,1.250000,0.50000,5.0,4.0,0.0
3457,5,M,,,,,,,,,...,,,,,,,,,,
3713,45,F,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99996214,45,M,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,0.0
99997485,15,M,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,0.0
99997895,45,M,14.0,5.0,4.0,1.0,2.0,0.0,1.0,0.0,...,1.414214,10.0,2.0,,,,,,,0.0
99998627,35,F,10.0,7.0,7.0,1.0,0.0,0.0,1.0,0.0,...,0.000000,5.0,1.0,1.0,1.0,1.000000,0.00000,1.0,1.0,0.0


In [163]:
df_a_y1 = df_a_y1[df_a_y1["DaysInHospital"].notna()]

In [164]:
df_a_y1.to_csv("./processed/DatasetATrain.csv")

In [72]:
df_dih_y3 = pd.read_csv("./data/DaysInHospital_Y3.csv", index_col=["MemberID"])

In [165]:
df_a_y2 = pd.concat([
        df_member,
        df_claim_count_y2,
        df_pvd_count_y2,
        df_v_count_y2,
        df_pcp_count_y2,
        df_spe_count_y2,
        df_psvc_count_y2,
        df_pdl_y2,
        df_los_y2,
        df_dsfs_y2,
        df_ch_y2,
        df_pcg_count_y2,
        df_pg_count_y2,
        df_lab_count_y2,
        df_drug_count_y2,
        df_dih_y3["DaysInHospital"]
    ], axis=1)

In [166]:
df_a_y2 = df_a_y2[df_a_y2["DaysInHospital"].notna()]

In [167]:
df_a_y2.to_csv("./processed/DatasetATest.csv")

## Build Strategy B dataset

In [172]:
df_b = pd.concat([
        df_member,
        df_claim_count_y12,
        pd.DataFrame(list(df_claim_count_y2["ClaimCount"]), columns=["ClaimCountLatestYear"], index=df_claim_count_y2.index),
        df_pvd_count_y12,
        pd.DataFrame(list(df_pvd_count_y2["ProviderID"]), columns=["ProviderCountLatestYear"], index=df_pvd_count_y2.index),
        df_v_count_y12,
        pd.DataFrame(list(df_v_count_y2["Vendor"]), columns=["VendorCountLatestYear"], index=df_v_count_y2.index),
        df_pcp_count_y12,
        pd.DataFrame(list(df_pcp_count_y2["PCP"]), columns=["PCPCountLatestYear"], index=df_pcp_count_y2.index),
        df_spe_count_y12,
        df_psvc_count_y12,
        df_pdl_y12,
        pd.DataFrame(df_pdl_y2.to_numpy(), columns=map(lambda x: x + "LatestYear", df_pdl_y2.columns), index=df_pdl_y2.index),
        df_los_y12,
        pd.DataFrame(df_los_y2[["LengthOfStaySum", "LengthOfStayMax", "LengthOfStayCountNan"]].to_numpy(), columns=["LengthOfStaySumLatestYear", "LengthOfStayMaxLatestYear", "LengthOfStayCountNanLatestYear"], index=df_los_y2.index),
        df_dsfs_y12,
        pd.DataFrame(list(df_dsfs_y2["DSFSMax"]), columns=["DSFSMaxLatestYear"], index=df_dsfs_y2.index),
        df_ch_y12,
        pd.DataFrame(list(df_ch_y2["CharlsonIndexMax"]), columns=["CharlsonIndexMaxLatestYear"], index=df_ch_y2.index),
        df_pcg_count_y12,
        df_pg_count_y12,
        df_lab_count_y12,
        pd.DataFrame(df_lab_count_y2[["LabCountSum","LabClaimCount"]].to_numpy(), columns=["LabCountSumLatestYear", "LabClaimCountLatestYear"], index=df_lab_count_y2.index),
        df_drug_count_y12,
        pd.DataFrame(df_drug_count_y2[["DrugCountSum","DrugClaimCount"]].to_numpy(), columns=["DrugCountSumLatestYear", "DrugClaimCountLatestYear"], index=df_drug_count_y2.index),
        df_dih_y3["DaysInHospital"]
    ], axis=1)

In [173]:
df_b = df_b[df_b["DaysInHospital"].notna()]

In [170]:
df_b

Unnamed: 0_level_0,AgeAtFirstClaim,Sex,ClaimCount,ClaimCountLatestYear,ProviderID,ProviderCountLatestYear,Vendor,VendorCountLatestYear,PCP,PCPCountLatestYear,...,LabClaimCountLatestYear,DrugCountMin,DrugCountMax,DrugCountAvg,DrugCountStd,DrugCountSum,DrugClaimCount,DrugCountSumLatestYear,DrugClaimCountLatestYear,DaysInHospital
MemberID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,5,M,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,,,,,,,,,0.0
210,35,,14.0,6.0,5.0,3.0,5.0,3.0,2.0,1.0,...,1.0,1.0,2.0,1.666667,0.577350,5.0,3.0,,,0.0
3197,5,F,10.0,5.0,6.0,4.0,5.0,3.0,1.0,1.0,...,1.0,1.0,2.0,1.333333,0.516398,8.0,6.0,3.0,2.0,0.0
3457,5,M,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,,,,,,,,,0.0
3713,45,F,10.0,10.0,5.0,5.0,5.0,5.0,2.0,2.0,...,2.0,1.0,6.0,4.250000,2.362908,17.0,4.0,17.0,4.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99985996,-1,F,5.0,5.0,2.0,2.0,2.0,2.0,1.0,1.0,...,,3.0,3.0,3.000000,0.000000,18.0,6.0,18.0,6.0,0.0
99987030,25,M,4.0,4.0,3.0,3.0,2.0,2.0,1.0,1.0,...,,2.0,2.0,2.000000,,2.0,1.0,2.0,1.0,0.0
99995391,65,,11.0,11.0,2.0,2.0,2.0,2.0,1.0,1.0,...,1.0,1.0,3.0,2.333333,0.816497,14.0,6.0,14.0,6.0,0.0
99995554,45,M,50.0,15.0,3.0,3.0,3.0,3.0,1.0,1.0,...,1.0,1.0,4.0,2.000000,1.224745,10.0,5.0,2.0,1.0,0.0


In [174]:
df_b.to_csv("./processed/DatasetB.csv")