# Main script to clean Health data

Modules: N/A <br>
Author: Cornelia Ilin <br>
Email: cilin@ischool.berkeley.edu <br>
Date created: March 24, 2022 <br>

**Citations (online sources)**
1. Birth Data source:
    - California Department of Public Health
2. Patient Discharge and Emergency Room:
    - California Office of Statewide Health Planning and Development
    
**Citations (persons)**
1. N/A

**Preferred environment**
1. Code written in Jupyter Notebooks

### Step 1: Import packages

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Step 2: Set working directories

In [None]:
# directory the raw inputs are read from (per-dataset sub-folders and data_selection.xlsx);
# the trailing '/' is required because some paths are built by plain string concatenation
in_dir = 'C:/Users/cilin/Research/CA_hospitals/Input/raw_data/health/'
# directory the combined per-dataset .csv files are written to;
# the trailing '/' is required because the output path is built by string concatenation
out_dir = 'C:/Users/cilin/Research/CA_hospitals/Input/interm_data/health/'

### Step 3: Define classes

In [None]:
class ReadFile:
    ''' Reader for a single .xlsx or .csv file stored in a local directory
    '''
    def __init__(self, localdir, filename):
        ''' Remember where the file lives
        # param localdir: string, directory that contains the file
        # param filename: string, name of the file inside localdir
        '''
        self.localdir = localdir
        self.filename = filename
        # join directory and file name, then normalize to forward slashes
        joined = os.path.join(self.localdir, self.filename)
        self.filepath = joined.replace('\\', '/')

    def get_xlsx(self, sheetname, skiprows, header):
        ''' Load one tab of the .xlsx file with pandas.read_excel
        # param sheetname: string, name of the tab in the Excel file
        # param skiprows: integer, number of rows to skip in the tab
        # param header: integer, forwarded to pandas.read_excel as `header`
        # return df (also cached on self._data)
        '''
        frame = pd.read_excel(
            self.filepath,
            sheetname,
            skiprows=skiprows,
            header=header,
        )
        self._data = frame
        return frame

    def get_csv(self):
        ''' Load the comma-separated file, reading every column as string
        # param: none
        # return df (also cached on self._data)
        '''
        frame = pd.read_csv(self.filepath, sep=',', dtype='str')
        self._data = frame
        return frame

### Step 4: Define functions

 ``read csv file names``

In [None]:
def csv_file_names(names=None, prefixes=None, base_dir=None):
    ''' Read .csv file names for each dataset
    params:
    -------
    names (list of str, optional): dataset sub-directory names;
        defaults to the global data_names
    prefixes (list of str, optional): file-name prefix to match for each dataset,
        aligned with names; defaults to the global file_names
    base_dir (str, optional): directory that contains the dataset sub-directories;
        defaults to the global in_dir
    
    return:
    -------
    csv_files (nested list): csv_files[i], where i indicates the index in data_names;
        each inner list is sorted by file name
    '''
    # fall back to the notebook-level configuration when no argument is given
    if names is None:
        names = data_names
    if prefixes is None:
        prefixes = file_names
    if base_dir is None:
        base_dir = in_dir

    csv_files = []

    # for each dataset
    for i, name in enumerate(names):
        prefix = prefixes[i]
        dir_name = os.path.join(base_dir, name).replace('\\', '/')
        # os.listdir returns entries in arbitrary order; sort them so that callers
        # pairing files with years by position get a deterministic sequence
        matches = sorted(
            file for file in os.listdir(dir_name) if file.startswith(prefix)
        )
        csv_files.append(matches)

    print('Read .csv file names: Done')

    return csv_files

``select variables of interest``

In [None]:
def var_selection():
    ''' Build the table of variables to keep for each dataset
    params:
    -------
    None (reads the globals data_names, tab_names and in_dir)
    
    return:
    -------
    var_dict (dict): one DataFrame per entry of data_names, holding the rows of the
        matching data_selection.xlsx tab whose 'Use in the analysis [behrt]' value equals 1
    '''
    use_flag = 'Use in the analysis [behrt]'
    var_dict = {}

    # one tab of data_selection.xlsx per dataset
    for pos, data in enumerate(data_names):
        # load the tab, skipping the first 2 rows and taking the next row as header
        reader = ReadFile(in_dir, 'data_selection.xlsx')
        selection = reader.get_xlsx(tab_names[pos], 2, 0)

        # retain only the variables flagged for use in the analysis
        flagged = selection[use_flag].eq(1)
        var_dict[data] = selection[flagged]

        print('Variable selection for', data_names[pos], 'data: Done')

    return var_dict

``read data``

In [None]:
def read_data():
    ''' Read the yearly .csv files of each dataset and stack them into one DataFrame
    params:
    -------
    None (reads the globals data_names, csv_files, var_dict, years and in_dir)

    return:
    -------
    df_dict (dict): one DataFrame per entry of data_names, with the selected columns
        renamed to var_dict[data]['std_variable_name'] and all files stacked row-wise
        under a fresh 0..n-1 index; an empty DataFrame if a dataset has no files
    '''
    df_dict = {}

    # for each dataset (Birth/PDD/EDD), read files and concatenate years
    for i, data in enumerate(data_names):
        print('-----------------------------------')
        print(data, 'cleaning for each year...')
        print('-----------------------------------')

        # collect the per-file frames and concatenate once at the end; concatenating
        # inside the loop would re-copy all previously read rows for every file
        yearly_frames = []

        # read .csv files
        for idx, file in enumerate(csv_files[i]):
            print('File:', file)
            temp_df = ReadFile(in_dir + data + '/', file).get_csv()

            # source column names listed for the year at the same position as this file
            year_cols = var_dict[data][years[i][idx]]

            # columns listed for this year but absent from the file are added as missing
            for col in np.setdiff1d(year_cols, temp_df.columns):
                temp_df[col] = np.nan

            # keep only vars of interest, in the order given by var_dict
            temp_df = temp_df.loc[:, year_cols]

            # standardize var name over time
            temp_df.columns = var_dict[data]['std_variable_name']

            yearly_frames.append(temp_df)

        # pd.concat raises on an empty list, so fall back to an empty frame
        if yearly_frames:
            df_dict[data] = pd.concat(yearly_frames, axis=0)
        else:
            df_dict[data] = pd.DataFrame()

        # reset index
        df_dict[data].reset_index(drop=True, inplace=True)

    return df_dict

### Step 5: Create global variables

In [None]:
# dataset names; every list below is aligned with this one by position
data_names = ['Birth', 'PDD', 'EDD']

# file-name prefix of the .csv files belonging to each dataset
file_names = ['lb', 'pdd', 'edd']

# tab of the data_selection.xlsx file that lists the variables of each dataset
tab_names = ['Birth_all_sorted', 'PDD_all_sorted', 'EDD_all_sorted']

# years for each dataset (end of range is exclusive): Birth, PDD, EDD
years = [
    np.arange(1991, 2013),
    np.arange(1991, 2018),
    np.arange(2005, 2018),
]

### Step 6: Read data

In [None]:
## for each dataset (Birth/PDD/EDD) and each year ##
# NOTE: the order of these calls matters; read_data() looks up the module-level
# csv_files and var_dict created by the two preceding statements

# create list with .csv file names 
csv_files = csv_file_names()

# keep only vars of interest
var_dict = var_selection()

# read data
df_dict = read_data()

In [None]:
# print shape of data
obs = 0
for data in data_names:
    print(data + ' shape:', df_dict[data].shape)
    obs += df_dict[data].shape[0]

# report the total once every dataset has been counted, rather than at a
# hard-coded loop index that only matches a list of exactly three datasets
print('Total obs across datasets:', obs/1000000, 'mil')

### Step 7: Export data to .csv

In [None]:
# write one .csv per dataset into out_dir, named after the dataset
for data in data_names:
    print('Writing ' + data + ' to csv')
    frame = df_dict[data]
    frame.to_csv(out_dir + data + '.csv')