In [1]:
# Directory holding the input CSV files. File names are appended with plain string
# concatenation, so the trailing "/" is required.
DATA_PATH = "../one/csv/"
# Sentinel used as the fill value for missing entries in the pivoted tables and output arrays.
FILL_VALUE = -1

In [2]:
### 0- IMPORTS
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import glob


In [3]:
### 1- LOAD DATA
# Read the four source tables from DATA_PATH, in the same order as the names on the left.
patient_df, medication_df, conditions_df, observations_df = (
    pd.read_csv(DATA_PATH + file_name)
    for file_name in (
        "patients.csv",
        "medications.csv",
        "conditions.csv",
        "observations.csv",
    )
)

In [4]:
### 2- DELETE UNUSED COLUMNS
# Column whitelists, one per table.
patient_cols = [
    "Id",
    "BIRTHDATE",
    "DEATHDATE",
    "RACE",
    "ETHNICITY",
    "GENDER",
]
medication_cols = [
    "START",
    "STOP",
    "PATIENT",
    "ENCOUNTER",
    "CODE",
    "DESCRIPTION",
]
conditions_cols = [
    "START",
    "STOP",
    "PATIENT",
    "ENCOUNTER",
    "CODE",
    "DESCRIPTION",
]
observations_cols = [
    "DATE",
    "PATIENT",
    "ENCOUNTER",
    "CODE",
    "DESCRIPTION",
    "VALUE",
    "UNITS",
]

# Restrict every table to its whitelist (a missing column raises KeyError).
patient_df = patient_df[patient_cols]
medication_df = medication_df[medication_cols]
conditions_df = conditions_df[conditions_cols]
observations_df = observations_df[observations_cols]

In [5]:
### 3- DELETE RARE CODES (TODO: filter using an explicit list of codes)

# A code is kept only when it appears in strictly more than THRESHOLD rows of its table.
THRESHOLD = 1000

# Per-table occurrence counts, already restricted to the codes above the threshold.
medication_codes = (
    medication_df["CODE"].value_counts().loc[lambda counts: counts > THRESHOLD]
)
conditions_codes = (
    conditions_df["CODE"].value_counts().loc[lambda counts: counts > THRESHOLD]
)
observations_codes = (
    observations_df["CODE"].value_counts().loc[lambda counts: counts > THRESHOLD]
)

# Drop every row whose code did not pass the prevalence filter.
medication_df = medication_df.loc[medication_df["CODE"].isin(medication_codes.index)]
conditions_df = conditions_df.loc[conditions_df["CODE"].isin(conditions_codes.index)]
observations_df = observations_df.loc[observations_df["CODE"].isin(observations_codes.index)]

In [6]:
### 4- CONVERT DATE COLUMNS TO DATETIME
# Parse every date column as timezone-aware UTC, overwriting the column in its frame.
for frame, date_columns in (
    (patient_df, ("BIRTHDATE", "DEATHDATE")),
    (medication_df, ("START", "STOP")),
    (conditions_df, ("START", "STOP")),
    (observations_df, ("DATE",)),
):
    for column in date_columns:
        frame[column] = pd.to_datetime(frame[column], utc=True)

In [7]:
### 5- CREATE AGE FROM BIRTHDATE COLUMN
# Width of one age bin, in days.
BIN_LENTGH = 365

# Patient Id -> birth timestamp lookup.
birth_dict = dict(zip(patient_df["Id"], patient_df["BIRTHDATE"]))

# For each event table, AGE is the number of whole bins elapsed between the patient's
# birth and the event's reference date (truncated toward zero by the int cast).
for frame, event_column in (
    (medication_df, "START"),
    (conditions_df, "START"),
    (observations_df, "DATE"),
):
    elapsed = frame[event_column] - frame["PATIENT"].map(birth_dict)
    frame["AGE"] = (elapsed.dt.days / BIN_LENTGH).astype(int)

In [8]:
### 6- CREATE ONE-HOT ENCODING
# Each table becomes one row per (PATIENT, AGE) and one column per CODE.
# Medications and conditions hold the number of events in the cell; observations hold
# the mean numeric VALUE (non-numeric values are coerced to NaN first).
# Cells with no data receive FILL_VALUE.
medication_df = medication_df.assign(ONE=1).pivot_table(
    index=["PATIENT", "AGE"],
    columns="CODE",
    values="ONE",
    aggfunc="count",
    fill_value=FILL_VALUE,
)
conditions_df = conditions_df.assign(ONE=1).pivot_table(
    index=["PATIENT", "AGE"],
    columns="CODE",
    values="ONE",
    aggfunc="count",
    fill_value=FILL_VALUE,
)
observations_df = observations_df.assign(
    VALUE=pd.to_numeric(observations_df["VALUE"], errors="coerce")
).pivot_table(
    index=["PATIENT", "AGE"],
    columns="CODE",
    values="VALUE",
    aggfunc="mean",
    fill_value=FILL_VALUE,
)


In [9]:
### 7- COMBINE DATAFRAMES
# Align the three pivoted tables side by side on their (PATIENT, AGE) index; index
# entries missing from one of the tables are filled with FILL_VALUE.
df = pd.concat(
    [medication_df, conditions_df, observations_df],
    axis=1,
).fillna(FILL_VALUE)

In [10]:
### 8- PREPARE PATIENT DATAFRAME
# Static per-patient features: index by patient Id, drop the date columns, and one-hot
# encode the categorical demographics.
# dtype=int requests 0/1 integer indicator columns directly. The previous
# `.replace({True: 1, False: 0})` relied on `replace` downcasting the boolean dummies,
# which pandas has deprecated.
patient_df = pd.get_dummies(
    patient_df.set_index("Id").drop(columns=["BIRTHDATE", "DEATHDATE"]),
    columns=["RACE", "ETHNICITY", "GENDER"],
    dtype=int,
)


In [11]:
### 9- CREATE 3D NDARRAY
# Outputs:
#   data_temporal[p, a, f]: temporal feature f (column order of df) for patient
#                           patient_list[p] at age bin age_list[a]
#   data_static[p, s]:      static feature s (column order of patient_df) for patient_list[p]
# Cells with no row in df keep FILL_VALUE.
patient_ids = df.index.get_level_values(0)
ages = df.index.get_level_values(1)

patient_list = patient_ids.unique()
# Contiguous age axis from the youngest to the oldest age bin present in df.
age_list = np.arange(ages.min(), ages.max() + 1)

data_temporal = np.full(
    (len(patient_list), len(age_list), df.shape[1]), FILL_VALUE, dtype=float
)

# Position of every df row along the patient and age axes. The age position must be
# offset by the first age bin: indexing with the raw age value overruns the axis when
# the minimum age is >= 1 and silently wraps around for negative ages.
patient_pos = patient_list.get_indexer(patient_ids)
age_pos = ages.to_numpy() - age_list[0]
# Single vectorized scatter; assumes (PATIENT, AGE) pairs are unique in df.
data_temporal[patient_pos, age_pos, :] = df.to_numpy()

# Static features, one row per patient in patient_list order. Raises KeyError if a
# patient present in df is missing from patient_df.
data_static = np.full((len(patient_list), patient_df.shape[1]), FILL_VALUE, dtype=float)
data_static[:, :] = patient_df.loc[patient_list, :].to_numpy()

In [20]:
### 10- SAVE NAME OF THE COLUMNS
# Write the column labels of each frame to its own CSV file (single column "cols"),
# so array feature positions can be mapped back to their codes.
for file_name, frame in {
    "temporal_features_list.csv": df,
    "static_features_list.csv": patient_df,
    "medication_list.csv": medication_df,
    "condition_list.csv": conditions_df,
    "observation_list.csv": observations_df,
}.items():
    pd.DataFrame({"cols": frame.columns.tolist()}).to_csv(file_name)