In [None]:
import fileinput
import glob
import pandas as pd
import numpy as np
import os
from datetime import datetime
from datetime import timedelta

### Load in the data

In [None]:
# Folder holding the cleaned input CSVs. File paths are built explicitly
# instead of calling os.chdir, so this cell neither depends on nor changes the
# current working directory and can be re-run safely (the previous version
# called os.chdir(DIR) twice with the same relative path).
DIR = "cleaned_files"

# Load the height measurements and keep only the columns used below
height = pd.read_csv(os.path.join(DIR, "height.csv"))
height = height[["e_patid", "height (cm)", "measuredate"]]

# Load the outcome data and keep only the patient id and obsdate columns
outcome = pd.read_csv(os.path.join(DIR, "outcomes.csv"))
outcome = outcome[["e_patid", "obsdate"]]

# Merge the data frames on patient id. This is an inner join (the pd.merge
# default), so patients without any row in height.csv do not appear in `merged`.
merged = pd.merge(outcome, height, on = "e_patid")

# Convert the date columns to datetime objects; unparseable measurement dates
# become NaT (errors="coerce"), while an unparseable obsdate raises.
merged["obsdate"] = pd.to_datetime(merged["obsdate"])
merged["measuredate"] = pd.to_datetime(merged["measuredate"], errors = "coerce")

# Sort the data by patient id and date of height measurement
merged = merged.sort_values(["e_patid", "measuredate"])

### Try different fixed time windows to assess % missingness

In [None]:
# Folder holding the cleaned data. Output paths are built explicitly with
# os.path.join, so this cell does not change the working directory.
DIR = 'cleaned_files'

# Remove invalid height values (negative heights are treated as missing)
merged.loc[merged['height (cm)'] < 0, 'height (cm)'] = np.nan

# Sort by patient ID and measurement date
merged = merged.sort_values(['e_patid', 'measuredate'])

# Create empty dictionary to store data frames for different time windows,
# keyed by folder name ("1yr", "2yr", ...)
dfs = {}

# Loop over different time windows (in years before obsdate)
for time_window in [1, 2, 3, 4, 5, 10]:

    # Create a copy of the data frame for the current time window
    merged_window = merged.copy()

    # Ensure the dates are in valid datetime format (bad measuredates -> NaT)
    merged_window['measuredate'] = pd.to_datetime(merged_window['measuredate'], errors = 'coerce')
    merged_window['obsdate'] = pd.to_datetime(merged_window['obsdate'])

    # Create a start date for the time window range (use obsdate as the end)
    merged_window['date_start'] = merged_window['obsdate'] - pd.DateOffset(years=time_window)

    # Get T/F array of values that are in the timeframe (T) and that are not (F).
    # Rows with a NaT measuredate compare False and so count as outside.
    is_between_dates = (merged_window['measuredate'] >= merged_window['date_start']) & (merged_window['measuredate'] <= merged_window['obsdate'])

    # Set values outside this valid window to NaN (because they are missing if they are not in the window!)
    merged_window.loc[~is_between_dates, 'height (cm)'] = np.nan

    # Rank only usable rows (inside the window AND with a non-missing height).
    # Previously the distance was computed for every row, so a measurement just
    # outside the window (e.g. a few days after obsdate) could be "closest",
    # win the de-duplication with its blanked height, and make the patient look
    # missing even though a valid in-window height existed.
    usable = is_between_dates & merged_window['height (cm)'].notna()
    merged_window['days_diff'] = (merged_window['obsdate'] - merged_window['measuredate']).dt.days.where(usable)

    # Drop duplicates: unusable rows have NaN days_diff and sort last, so the
    # first row per patient is the closest usable measurement when one exists.
    # NOTE(review): de-duplication is per e_patid only; if a patient can have
    # several obsdates, only the earliest obsdate's rows are considered and the
    # result is attached to every outcome row for that patient - confirm that
    # outcomes.csv has one row per patient.
    merged_window = merged_window.sort_values(['e_patid', 'obsdate', 'days_diff'], ascending=[True, True, True], na_position='last')
    merged_window = merged_window.drop_duplicates(subset=['e_patid'], keep='first')

    # Only keep relevant columns, then left-merge onto outcome so patients
    # without any height row are retained with a missing height
    merged_window = merged_window[["e_patid", "height (cm)", "measuredate"]]
    merged_outcome = pd.merge(outcome, merged_window, on = 'e_patid', how = 'left')

    # Save data frame to dictionary
    dfs[f"{time_window}yr"] = merged_outcome[['e_patid', 'obsdate', 'height (cm)']]

# Save data frames to CSV files in separate folders under DIR, creating each
# folder if it does not exist yet
for folder_name, df in dfs.items():
    out_dir = os.path.join(DIR, folder_name)
    os.makedirs(out_dir, exist_ok=True)
    df.to_csv(os.path.join(out_dir, 'height_no_duplicates_merged.csv'), index=False)

### Now try closest measurement to index date to assess % missingness

In [None]:
# See what taking the closest measuredate on or before obsdate (no lower bound
# on how old the measurement may be) gives in terms of missing values.

# Remove invalid height values (negative heights are treated as missing)
merged.loc[merged['height (cm)'] < 0, 'height (cm)'] = np.nan

merged = merged.sort_values(['e_patid', 'measuredate'])

merged_window = merged.copy()

# Get T/F array of values that are in the timeframe (T) and that are not (F).
# Rows with a NaT measuredate compare False and so count as outside.
is_between_dates = (merged_window['measuredate'] <= merged_window['obsdate'])

# Set values outside this valid window to NaN (because they are missing if they are not in the window!)
merged_window.loc[~is_between_dates, 'height (cm)'] = np.nan

# Rank only usable rows (measured on/before obsdate AND with a non-missing
# height). Previously the absolute distance was computed for every row, so a
# measurement taken shortly AFTER obsdate could be "closest", win the
# de-duplication with its blanked height, and make the patient look missing
# even though an earlier valid height existed.
usable = is_between_dates & merged_window['height (cm)'].notna()
merged_window['days_diff'] = (merged_window['obsdate'] - merged_window['measuredate']).dt.days.where(usable)

# Drop duplicates: unusable rows have NaN days_diff and sort last, so the first
# row per patient is the closest usable measurement when one exists.
# NOTE(review): de-duplication is per e_patid only - confirm that outcomes.csv
# has one row (one obsdate) per patient.
merged_window = merged_window.sort_values(['e_patid', 'obsdate', 'days_diff'], ascending = [True, True, True], na_position = 'last')
merged_window = merged_window.drop_duplicates(subset = ['e_patid'], keep = 'first')

# Left-merge onto outcome so patients without any height row are retained
merged_window = merged_window[["e_patid", "height (cm)", "measuredate"]]
merged_outcome = pd.merge(outcome, merged_window, on = 'e_patid', how = 'left')

# Output folder, built with os.path.join so the separator is correct on every
# platform; the file is written by explicit path rather than via os.chdir.
DIR = os.path.join('cleaned_files', 'Closest')

merged_outcome.to_csv(os.path.join(DIR, 'height_no_duplicates_merged.csv'), index = False)

# Number of outcome rows left without a usable height
merged_outcome['height (cm)'].isna().sum()