-
Notifications
You must be signed in to change notification settings - Fork 1
/
post_process_one_hot_table.py
73 lines (56 loc) · 2.7 KB
/
post_process_one_hot_table.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import pandas as pd
import numpy as np
from datetime import datetime
from tqdm import tqdm
import os
one_hot_df_path = 'Processed_data/one_hot_encoded_DAD.csv'
main_df_path = 'Processed_data/sorted_DAD.csv'
# get column names of the one-hot-encoded df
one_hot_sample = pd.read_csv(one_hot_df_path, index_col=0, nrows=1)
one_hot_cols = one_hot_sample.columns
# Initialize an empty list to store the arrays
arrays = []
def func(x, itr_num):
"""
This function takes a chunk of the main one-hot encoded dataframe and processes it (by resampling to 3 months)
:param x: chunk of the main one-hot encoded dataframe
:param itr_num: which iteration is this (used for debugging)
:return: 0 -- it appends the processed array to the global list of arrays
"""
global arrays
skiprows = x.index[0] + 1
nrows = x.index[-1] - x.index[0] + 1
one_hot_df_group = pd.read_csv(one_hot_df_path, index_col=0, skiprows=skiprows, nrows=nrows, names=one_hot_cols)
df2 = pd.concat([x, one_hot_df_group], axis=1)
df_resampled = df2.groupby('FILE_NO').resample('3M', on='ADMIT_DTTM').max()
df_resampled.dropna(inplace=True)
# # Write to CSV
# # Set writing mode to append after first chunk
# mode = 'w' if itr_num == 0 else 'a'
# # Add header if it is the first chunk
# header = itr_num == 0
# df_resampled.to_csv('one_hot_encoded_resampled.csv', header=header, mode=mode)
# Store in a Numpy array
bool_chunk = df_resampled[one_hot_cols].astype(bool) # Convert the dataframe to boolean values
arrays.append(bool_chunk.values) # Append the boolean array to the list of arrays
return 0
# Read the main dataframe
df = pd.read_csv(main_df_path, dtype=str, usecols=['FILE_NO', 'ADMIT_DTTM'])
df['ADMIT_DTTM'] = pd.to_datetime(df['ADMIT_DTTM'])
df['FILE_NO'] = pd.to_numeric(df['FILE_NO'], downcast='integer')
# find unique patients to iterate over them (to avoid a situation that one patient is split into multiple chunks)
unique_patients = df['FILE_NO'].unique()
# iterate over certain number of unique patients and analyze their data
print("Start Time =", datetime.now().strftime("%H:%M:%S"))
batch_size = 2000 # how many patients in each batch should be processed (we have about 5 samples per patients)
for i in tqdm(range(0, len(unique_patients), batch_size)):
i2 = min(i + batch_size, len(unique_patients)) # to handle the last batch
batch = unique_patients[i:i2]
batch_df = df.loc[df['FILE_NO'].isin(batch)]
func(batch_df, i)
# Concatenate the arrays along the rows axis
np_result = np.concatenate(arrays, axis=0)
print(np_result.shape)
np.save('Processed_data/resampled_one_hot_data', np_result)
print("End Time =", datetime.now().strftime("%H:%M:%S"))
print('Done')