In [1]:
import os
import pandas as pd
import numpy as np
import sys
import win32com.client
import getpass
import datetime
import pywintypes
import matplotlib.pyplot as plt 

# Function

In [2]:
# Display configuration: let pandas auto-detect the output width and never
# truncate columns or rows when a frame is shown.
# Full option names are spelled out because pandas resolves abbreviated keys
# by pattern matching, which stops working as soon as a second option matches.
# NOTE: with max_rows=None a bare DataFrame expression prints every row;
# prefer .head()/.sample() when inspecting large frames.
pd.set_option('display.width', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# excel to df
def _excelDateToNaive(value, current_year):
    """Return a COM date cell as a naive midnight datetime; pass other values through."""
    # pywintypes.TimeType is a datetime.datetime subclass on Python 3 builds of
    # pywin32, so the stdlib base class is enough to recognise COM date cells.
    if not isinstance(value, datetime.datetime):
        return value
    year = value.year
    # A 20xx year that is the current year or later is moved back one century
    # (same rule as before: the year is assumed to be a mis-expanded 19xx).
    if 2000 <= year <= 2099 and year >= current_year:
        year -= 100
    # Keep the calendar date only; the time of day and UTC offset are dropped.
    return datetime.datetime(year, value.month, value.day)


def getDataToDF(xlws,start,end,first = False):
    """Read a block of worksheet rows through Excel COM into a DataFrame.

    Column names always come from worksheet row 1. Data comes from rows
    ``start``..``end`` (1-based, inclusive) over every used column.

    Parameters
    ----------
    xlws : Excel worksheet COM object
        Must provide ``UsedRange``, ``Range`` and ``Cells``.
    start, end : int
        First and last worksheet row to read.
    first : bool, default False
        Set to True when ``start`` is the header row; that row is then
        skipped so it does not appear as data.

    Returns
    -------
    pandas.DataFrame
        One row per worksheet row read. Date cells are converted to naive
        ``datetime.datetime`` values at midnight.

    Notes
    -----
    The previous implementation cleaned dates through
    ``str(value).rstrip("+00:00")``, which strips a *set of characters* and
    therefore mangled any timestamp whose time was not midnight. It also read
    a notebook-level ``now`` variable. Both are avoided here.
    """
    last_col = xlws.UsedRange.Columns.Count
    header = xlws.Range(xlws.Cells(1, 1), xlws.Cells(1, last_col)).Value
    content = xlws.Range(xlws.Cells(start, 1), xlws.Cells(end, last_col)).Value

    # Taken here rather than from a global so the function works on a fresh kernel.
    current_year = datetime.datetime.now().year

    rows = content[1:] if first else content
    data = [[_excelDateToNaive(cell, current_year) for cell in row] for row in rows]
    return pd.DataFrame(data, columns=header[0])

# Date-like columns that are dropped for now (some may be useful later).
CDM_OPTIONAL_DATE_COLUMNS = (
    # on hold, can be useful
    'Dx_Date > 2005',
    'dob',
    'previous_dx_date',
    'previous_stagingtype', 'Previous_T', 'Previous_N', 'Previous_M', 'Previous_stage',
    'lastseen',

    # confirmed droppable
    'sync_ca_date', 'sync_ca_desc', 'hx_other_ca_date', 'hx_other_ca_desc',
    'Rx_date', 'srgdate',
    'X.1st_Tamoxifen', 'Last_Tamoxifen',
    'X.1st_Herceptin', 'Last_Herceptin',
    'neo_adj_chemo_start', 'neo_adj_chemo_end',

    # could be used to check whether chemo happened
    'chemo_start', 'chemo_end', 'other_chemo_regimen',
)

# Columns that are not used by the analysis.
CDM_UNUSED_COLUMNS = (
    'Has Bills?',
    'Marital_Status', 'Hospital', 'KKH', "NCCS", 'SGH',
    'comorbidity', 'Presentation', 'Doctor_In_Charge', 'firstsee', 'operation_summ', 'Age.Menopause',
    'Surgeon', 'Recurrence_Score', 'HistCode_WWXX',
    'er_intensity', 'er_percentage',
    'pr_intensity', 'pr_percentage',
    'cerbB2_intensity', 'cerbB2_percentage',
    'Size', 'slnb', 'diff', 'firstfail',

    # left out as of now
    'FISH', 'fish_ratio', 'hrt2_desc',
    # =========================
    'margins', 'margins_calc',
    # 'neo_adj_chemo_regimen_desc',
    'treatment_remarks',
    'New_mets', 'Site_mets', 'Mets.Dx',
    'remarks', 'New_primary',
    'Local_Desc', 'Local_nontrue_desc', 'Nodal_recur', 'contra_remarks',
    'deathcode', 'MHA_COD',

    # manually entered data, all can be removed
    "Date_for_DDFS", "Date_for_IBTR",
    "Date_for_True_Local_Recurrence", "Date_for_Other_Local_Recurrence",
    "Date_for_Nodal_Recurrence",
    "Date_for_Contra_Rec", "event_dt", "Date_for_DFS", "Date_for_OS", "Date_for_CSS",
    "Count_as_EVENTS", "Count_as_DFS", "Count_as_DDFS",
    "Count_as_IBTR", "Count_as_True_Local_Recurrence", "Count_as_Other_Local_Recurrence",
    "Count_as_Nodal_Recurrence", "Count_as_Contra_Rec", "Count_as_OS", "Count_as_CSS",
    "chestsiz_recode", "Chestsiz_dichotomise",
    "TNM_collapse", "TNM_I_vs_II", "Time_OS", "Time_CSS", "Time_IBTR",
    "Time_IBTRTrue", "Time_DFS", "Time_DDFS", "Time_ContraRec",
    "Age_@_Dx",
    "Histo_subtype", "Histo_subtype_collapse", "Histo_subtype_collapser",
    "subtype_stg2013", "subtype_stg2013_lumb",
    "Triple_Neg", "resident", "mastect", "surgfirst",
    "Date_for_IDFS", "Date_for_DSS", "Date_for_RFS",
    "Count_as_IDFS", "Count_as_DSS", "Count_as_RFS",
    "Time_IDFS", "Time_DSS", "Time_RFS", "END_OF_ENTRY",
)


def dropColCDM(CDM):
    """Add an ``age`` column and drop the columns the analysis does not use.

    Parameters
    ----------
    CDM : pandas.DataFrame
        Clinical data. Must contain ``dx_date``, ``dob`` and every column
        named in ``CDM_OPTIONAL_DATE_COLUMNS`` and ``CDM_UNUSED_COLUMNS``.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) in which ``dx_date`` is
        parsed to datetime, ``age`` holds the whole years between ``dob`` and
        ``dx_date`` as a float (NaN where either date is missing), and the
        listed columns are removed.

    Raises
    ------
    KeyError
        If a required or to-be-dropped column is absent.
    """
    dx_date = pd.to_datetime(CDM['dx_date'])
    dob = pd.to_datetime(CDM['dob'])

    # Whole years using the mean Gregorian year (365.2425 days). Dividing by a
    # Timedelta works on every pandas version, unlike casting to timedelta64[Y].
    age = np.floor((dx_date - dob) / pd.Timedelta(days=365.2425))

    CDM = CDM.assign(dx_date=dx_date, dob=dob, age=age)

    # Normalise missing markers (e.g. None) to NaN.
    CDM = CDM.fillna(value=np.nan)

    CDM = CDM.drop(columns=list(CDM_OPTIONAL_DATE_COLUMNS))
    CDM = CDM.drop(columns=list(CDM_UNUSED_COLUMNS))
    return CDM

# Categorical columns that are replaced by indicator (dummy) columns.
ONE_HOT_COLUMNS = (
    'Side', 'Gender', 'Race', 'Smoker', 'Alcohol',
    'ECOG', 'Oral_Contraceptive', 'famhx', 'Breast_Feed', 'Menopause_Status',
    'Hor_replacement', 'Breast_surgery', 'Recon', 'Reconstruction', 'Genomic_Test', 'Site', 'Histology',
    'c_tstage', 'cNstage', 'cMstage', 'c_Staging', 'tstage', 'nstage', 'Mstage',
    'p_Staging', 'Ajcc8_phyllodes_tstage', 'TNM_Stage', 'ProgStage_AJCC8', 'AJCC8_Staging', 'ER',
    'PR', 'cerbB2', 'Her2', 'Multi_focality', 'Multi_centricity', 'Extensive_Intraductal_Component',
    'Comedo_Necrosis', 'Pleomorphism', 'VNPI', 'invasion', 'False_Negative_SLNB',
    'Non_SLN', 'ac', 'AClevel', 'TMX', 'hrt', 'hrt2', 'Targeted', 'technique',
    'Neo_Adjuvant', 'neo_adj_chemo_regimen', 'chemo', 'Chemo_Intent_not_neoadj', 'chemo_regimen', 'RT',
    'RT_intent', 'RT_Technique', 'field', 'dose', 'scdose', 'midplane', 'intmamm', 'TreatmentCycleType',
    'First_Fail', 'Second_Fail', 'Third_Fail', 'status', 'cause_of_death',
)


def oneHotEncode(CDM):
    """One-hot encode every column listed in ``ONE_HOT_COLUMNS``.

    Each listed column is removed and replaced by ``<column>_<value>``
    indicator columns, plus a ``<column>_nan`` indicator for missing values.
    The untouched columns keep their order and come first; the indicator
    groups follow in the order of ``ONE_HOT_COLUMNS``.

    Parameters
    ----------
    CDM : pandas.DataFrame
        Must contain every column in ``ONE_HOT_COLUMNS``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified.

    Notes
    -----
    The indicator columns depend on the values present in ``CDM``. Encoding
    separate chunks of one dataset therefore yields different column sets;
    encode the complete frame in a single call.
    """
    # One call instead of a concat + drop per column: same result, but the
    # frame is copied once rather than roughly seventy times.
    return pd.get_dummies(CDM, columns=list(ONE_HOT_COLUMNS), dummy_na=True)
        
def outToCSV(df,filename):
    """Write ``df`` to ``filename`` as CSV, appending if the file already exists.

    A new file gets a header row. When the file exists the rows are appended
    without a header, so the caller is responsible for passing frames whose
    columns match the ones already in the file.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to write; the index is never written.
    filename : str
        Target path, relative or absolute.
    """
    # Test the exact path that is written to. The earlier './' + filename
    # check missed existing files whenever an absolute path was passed.
    append = os.path.exists(filename)
    df.to_csv(
        path_or_buf=filename,
        header=not append,
        index=False,
        mode="a" if append else "w",
    )

# Main Process

In [3]:
# Cached result of the Excel extraction; delete this file to force a rebuild.
OUTPUT_CSV = 'output.csv'
# Number of worksheet rows requested from Excel per COM call.
CHUNK_ROWS = 5000

if not os.path.exists(OUTPUT_CSV):
    # NOTE: hard-coded source folder (C:\SMU_v2\).
    # cwd = os.getcwd()
    cwd = "C:"
    path = str(cwd + "\\SMU_v2\\")
    files = os.listdir(path)

    # Placeholder for per-workbook bill frames; nothing fills it yet.
    df_list = {}

    # The workbooks are password protected; ask instead of hard-coding it.
    pwd = getpass.getpass('Enter file password: ')

    # Only real .xlsx workbooks, skipping Excel's '~$' lock files.
    files_xls = [f for f in files if f.endswith('.xlsx') and not f.startswith('~$')]

    x = 1
    # Kept at notebook level for any helper that reads `now` as a global.
    now = datetime.datetime.now()

    # Raw chunks of the clinical sheet. They are combined BEFORE encoding:
    # one-hot encoding chunk by chunk yields a different set of indicator
    # columns per chunk, which misaligns columns in a header-less CSV append.
    clinical_chunks = []

    xlApp = win32com.client.Dispatch("Excel.Application")
    xlApp.Interactive = False
    xlApp.Visible = False
    try:
        for f in files_xls:
            if "Clinical" not in f:
                # TODO: bill workbooks are not loaded yet; only a name is reserved.
                billName = 'bill{}'.format(x)
                x += 1
                continue

            xlwb = xlApp.Workbooks.Open(path+f, False, True, None, pwd)
            try:
                xlws = xlwb.Worksheets(1)
                last_row = xlws.UsedRange.Rows.Count

                # Step through the sheet in CHUNK_ROWS blocks. The last block
                # ends on the last used row, so nothing beyond the data is
                # requested. The first block contains the header row.
                for start in range(1, last_row + 1, CHUNK_ROWS):
                    end = min(start + CHUNK_ROWS - 1, last_row)
                    clinical_chunks.append(getDataToDF(xlws, start, end, start == 1))
                    print("Rows {} to {} read from {}.".format(start, end, f))
            finally:
                # Release the sheet and close the workbook even on failure.
                xlws = None
                xlwb.Close(False)
                xlwb = None
    finally:
        # Always give Excel back to the user and shut the hidden instance down.
        xlApp.Interactive = True
        xlApp.Visible = True
        xlApp.Quit()
        xlApp = None

    if not clinical_chunks:
        raise FileNotFoundError(
            "No workbook with 'Clinical' in its name was found in {}".format(path))

    clinical_raw = pd.concat(clinical_chunks, ignore_index=True)
    # Drop unused columns, encode the full data set once, then cache it.
    outToCSV(oneHotEncode(dropColCDM(clinical_raw)), OUTPUT_CSV)

# Always load from the cache so CDM is the same whether or not it was just built.
CDM = pd.read_csv(OUTPUT_CSV)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Summarise CDM (index range, column count, dtypes). memory_usage='deep'
# asks pandas to inspect object columns for their real memory footprint.
CDM.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26139 entries, 0 to 26138
Columns: 657 entries, NRIC to cause_of_death_nan
dtypes: float64(127), int64(518), object(12)
memory usage: 141.4 MB


In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Only the first N_CANDIDATE_COLUMNS columns of CDM are screened.
N_CANDIDATE_COLUMNS = 20
# Upper bound on the number of features to keep and report.
N_TOP_FEATURES = 100

y = CDM.iloc[:,-1]    # target: the last column of CDM
candidates = CDM.iloc[:,0:N_CANDIDATE_COLUMNS]

# chi2 only accepts numeric, non-negative input without missing values, so
# identifier/text columns and columns with NaN or negative values are left out.
numeric = candidates.select_dtypes(include=['number', 'bool'])
usable = numeric.notna().all() & (numeric.min() >= 0)
X = numeric.loc[:, usable]  # independent columns

skipped = [c for c in candidates.columns if c not in X.columns]
if skipped:
    print("Skipped (non-numeric, negative or missing values): {}".format(skipped))
if X.shape[1] == 0:
    raise ValueError("None of the candidate columns can be scored with chi2")

# k may not exceed the number of available features.
k = min(N_TOP_FEATURES, X.shape[1])

# apply SelectKBest to score the candidate features
bestfeatures = SelectKBest(score_func=chi2, k=k)
fit = bestfeatures.fit(X,y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)
# concat two dataframes for better visualization
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']  # naming the dataframe columns
print(featureScores.nlargest(k,'Score'))  # print the k best features

# MY

In [None]:
# Scratch exploration of the bills of one (masked) patient.
test_patient = "4b6ac0036ae2a4e8e6c3"
# Presumably the workbook names available in the data folder:
# 'Bills Data_10-12k (MASKED)v2.xlsx', 'Bills Data_12-14k (MASKED)v2.xlsx', 'Bills Data_14-16k (MASKED)v2.xlsx', 'Bills Data_16-18k (MASKED)v2.xlsx', 'Bills Data_18-20k (MASKED)v2.xlsx', 'Bills Data_1st 2k (MASKED)v2.xlsx', 'Bills Data_2-4k (MASKED)v2.xlsx', 'Bills Data_20-22k (MASKED)v2.xlsx', 'Bills Data_22-24k (MASKED)v2.xlsx', 'Bills Data_24-26k (MASKED)v2.xlsx', 'Bills Data_4-6k (MASKED)v2.xlsx', 'Bills Data_6-8k (MASKED)v2.xlsx', 'Bills Data_8-10k (MASKED)v2.xlsx', 'Bills Data_last 1k (MASKED)v2.xlsx', 'Breast Database Dictionary SMU_v2.xlsx', 'Clinical Data_Masked_v2.xlsx', 'Mapping for service code.xlsx'
# NOTE(review): df_list is created empty in the loading cell and no visible
# code adds entries to it, so these lookups look like they would raise
# KeyError (or NameError when the cached CSV is used) - confirm where the
# per-workbook frames are meant to be loaded.
bills = df_list['Bills Data_last 1k (MASKED)v2.xlsx']
patients = df_list['Clinical Data_Masked_v2.xlsx']

In [None]:
# Bills of the selected test patient.
test_patient_bills = bills[bills["Patient.ID"]==test_patient]
# Diagnosis date(s) of the selected test patient. This was assigned to the
# misspelled name `est_patient_dx`, which left `test_patient_dx` (used by the
# following cells) undefined.
test_patient_dx = patients[patients["NRIC"]==test_patient]['dx_date']

In [None]:
# Start of the billing window: six calendar months before the diagnosis date.
# pd.DateOffset(months=...) is the public calendar-month offset; MonthOffset
# is not available from pandas.tseries.offsets in current pandas releases.
# The first matching diagnosis date is taken, as a numpy datetime64.
earliest_date = (test_patient_dx - pd.DateOffset(months=6)).values[0]

In [None]:
# End of the billing window: 120 calendar months (10 years) after the
# diagnosis date, capped at the current time. pd.DateOffset(months=...)
# replaces MonthOffset, which current pandas no longer exposes.
latest_date = min(
    pd.to_datetime('now').to_datetime64(),
    (test_patient_dx + pd.DateOffset(months=120)).values[0],
)

In [None]:
# Keep only the bills whose institution code is NCC.
test_patient_bills = test_patient_bills.loc[
    test_patient_bills['Institution.Code'].eq("NCC")
]

In [None]:
# Intended step: remove all bills outside the range from 6 months before
# diagnosis to 10 years after diagnosis.
# NOTE(review): that filter is not applied here - this cell only displays the
# service-date column. TODO: apply the date window before resampling.
test_patient_bills["Service.Date.From.Date"]

In [None]:
# Total net payable per 3-month bin, binned on the service start date.
resampled = (
    test_patient_bills
    .loc[:, ["Net.Payable", "Service.Date.From.Date"]]
    .resample("3M", on="Service.Date.From.Date")
    .sum()
)

In [None]:
# Mean of the 3-month totals computed in `resampled`.
resampled.mean()

In [None]:
# Display the diagnosis date(s) selected for the test patient.
test_patient_dx

In [None]:
# NOTE(review): the indexer here is the result of a subtraction
# (Admit.Date minus diagnosis date), not a boolean mask, so a comparison
# appears to be missing - confirm the intended condition before relying on it.
test_patient_bills[test_patient_bills["Admit.Date"] - test_patient_dx]