In [None]:
import fileinput
import glob
import pandas as pd
import numpy as np
import os
from datetime import datetime
from datetime import timedelta

### Examine CRP units

In [None]:
# Load the CRP observations, parse their dates, and attach a human-readable
# unit description to every row so the units can be inspected.
crp = pd.read_csv(os.path.join("merged_files", "crp.csv"))

# first check what the invalid dates are --> thankfully it is just nans
parsed_obsdate = pd.to_datetime(crp["obsdate"], errors = 'coerce')
not_datetime_crp = crp.loc[parsed_obsdate.isna(), "obsdate"].unique()

# change the dates to be valid datetimes (unparseable entries become NaT)
crp["obsdate"] = parsed_obsdate

# Sort by patient id and date, then rebuild the index. sort_values keeps the
# original (now shuffled) index labels, and any later index-aligned operation
# would silently pair rows up in the wrong order.
crp = crp.sort_values(["e_patid", "obsdate"]).reset_index(drop = True)

# get units table from dataset (text file with measurement units of the predictor)
units_crp_id = pd.read_table("NumUnit.txt")

# Link crp and the units table on numunitid. validate='many_to_one' makes the
# merge fail loudly if a numunitid appears more than once in the units table,
# which would otherwise duplicate CRP rows.
crp_units_merged = pd.merge(
    crp, units_crp_id, on = 'numunitid', how = 'left', validate = 'many_to_one'
)

# Add the unit description to crp. The merge result carries a fresh 0..n-1
# index in the same row order as `crp`, so copy the column by position rather
# than relying on index alignment.
crp['unitdescription'] = crp_units_merged['Description'].to_numpy()

# explore the different crp units (rows without a description are not counted
# because groupby drops NaN keys)
unit_counts_crp = crp.groupby('unitdescription').size().reset_index(name = 'counts')
unit_counts_crp = unit_counts_crp.sort_values(by = 'counts', ascending = False)
most_frequent_crp_units = unit_counts_crp[unit_counts_crp["counts"] >= 1]

##### Plot a histogram of CRP values for each measurement unit

In [None]:
import matplotlib.pyplot as plt

# Draw one histogram of CRP values per unit description and save the grid
# of panels as a PDF.

# Filter out the rows with NaN values in the "value" column
crp_filtered = crp.dropna(subset=['value'])

# Get the unique unit descriptions (may include NaN for rows with no unit)
unique_units = crp_filtered['unitdescription'].unique()

# Grid of at most 3 columns; always at least 1x1 so an empty selection
# does not make plt.subplots raise.
num_rows = max(1, (len(unique_units) + 2) // 3)
num_cols = max(1, min(len(unique_units), 3))

# squeeze=False keeps `axes` two-dimensional even for a single row or a
# single panel, where the default would return a 1-D array or a bare Axes.
fig, axes = plt.subplots(num_rows, num_cols, figsize=(12, 4*num_rows), squeeze=False)
panels = axes.ravel()

# Iterate over each unique unit description and plot a histogram
for ax, unit in zip(panels, unique_units):
    # `== NaN` never matches, so rows without a unit need an explicit isna()
    if pd.isna(unit):
        unit_mask = crp_filtered['unitdescription'].isna()
    else:
        unit_mask = crp_filtered['unitdescription'] == unit

    values = crp_filtered.loc[unit_mask, 'value']
    ax.hist(values, bins=10, range=(0, 600))
    ax.set_xlabel('Value')
    ax.set_ylabel('Frequency')
    ax.set_title(f'Histogram of CRP values ({unit})')
    ax.set_xlim([0, 600])

# Hide the panels of the last row that have no unit to show
for unused_ax in panels[len(unique_units):]:
    unused_ax.set_visible(False)

# Adjust spacing once all labels and titles exist
fig.tight_layout(pad=3.0)

# Save the plots as a PDF document
output_filepath = 'crp_histograms.pdf'
fig.savefig(output_filepath, format='pdf')

# Display the plots
plt.show()

### Convert the CRP units that make sense to convert 

In [None]:
# scale the only values that I can... leave the rest as is because based on inspection, they were deemed as mg/L
# Multiplicative factors that bring each convertible unit onto the mg/L scale.
# NOTE(review): 'mg/d1' is kept exactly as written; presumably that is how the
# unit is spelled in NumUnit.txt -- confirm.
# NOTE: the scaling is applied in place, so re-running this cell without
# re-running the cells above rescales the values a second time.
TO_MG_PER_L = {
    'mg/d1': 10,
    'MG/DL': 10,
    'mg/100mL': 10,
    'ug/L': 0.001,
}
for unit_label, factor in TO_MG_PER_L.items():
    crp.loc[crp.unitdescription == unit_label, 'value'] *= factor

# set the **ERROR** value to nan
crp.loc[crp.unitdescription == '**ERROR**', 'value'] = np.nan

# Write through an explicit path instead of os.chdir: changing the working
# directory here would shift every relative path used by later cells.
DIR = 'cleaned_files'
crp.to_csv(os.path.join(DIR, 'crp.csv'), index = False)

### Plot updated CRP values

In [None]:
import matplotlib.pyplot as plt

# Draw one histogram of the current CRP values per unit description.

# Filter out the rows with NaN values in the "value" column
crp_filtered = crp.dropna(subset=['value'])

# Get the unique unit descriptions (may include NaN for rows with no unit)
unique_units = crp_filtered['unitdescription'].unique()

# Grid of at most 3 columns; always at least 1x1 so an empty selection
# does not make plt.subplots raise.
num_rows = max(1, (len(unique_units) + 2) // 3)
num_cols = max(1, min(len(unique_units), 3))

# squeeze=False keeps `axes` two-dimensional even for a single row or a
# single panel, where the default would return a 1-D array or a bare Axes.
fig, axes = plt.subplots(num_rows, num_cols, figsize=(12, 4*num_rows), squeeze=False)
panels = axes.ravel()

# Iterate over each unique unit description and plot a histogram
for ax, unit in zip(panels, unique_units):
    # `== NaN` never matches, so rows without a unit need an explicit isna()
    if pd.isna(unit):
        unit_mask = crp_filtered['unitdescription'].isna()
    else:
        unit_mask = crp_filtered['unitdescription'] == unit

    values = crp_filtered.loc[unit_mask, 'value']
    ax.hist(values, bins=10, range=(0, 600))
    ax.set_xlabel('Value')
    ax.set_ylabel('Frequency')
    ax.set_title(f'Histogram of CRP values ({unit})')
    ax.set_xlim([0, 600])

# Hide the panels of the last row that have no unit to show
for unused_ax in panels[len(unique_units):]:
    unused_ax.set_visible(False)

# Adjust spacing once all labels and titles exist
fig.tight_layout(pad=3.0)

# Display the plots
plt.show()

### Link CRP measurements to outcome dates

In [None]:
# Pair every outcome row with all CRP measurements of the same patient.

# only keep relevant CRP data
# Rename first and select afterwards: rename ignores labels that are already
# renamed, so re-running this cell does not raise, and no inplace edit is
# made on a column slice (which triggers SettingWithCopyWarning).
crp = crp.rename(columns = {"value": "crp value (mg/L)", "obsdate": "measuredate"})
crp = crp[["e_patid", "crp value (mg/L)", "measuredate"]]

# load in outcome data
# Read through an explicit path instead of os.chdir, so the working directory
# (and every other relative path in the notebook) stays untouched.
DIR = "cleaned_files"
outcome = pd.read_csv(os.path.join(DIR, 'outcomes.csv'))
outcome = outcome[["e_patid", "obsdate"]]

# Merge the data frames on patient id (inner join: patients without any CRP
# row do not appear in `merged`)
merged = pd.merge(outcome, crp, on = "e_patid")

# Convert the date columns to datetime objects
merged["obsdate"] = pd.to_datetime(merged["obsdate"])
merged["measuredate"] = pd.to_datetime(merged["measuredate"], errors = "coerce")

# Sort the data by patient id and date of CRP measurement
merged = merged.sort_values(["e_patid", "measuredate"])

### Try different fixed time windows to assess % missingness

In [None]:
# For several look-back windows, keep per patient the valid CRP measurement
# closest to (and not after) obsdate, and write one CSV per window.
DIR = 'cleaned_files'

# Remove invalid crp values
merged.loc[merged['crp value (mg/L)'] < 0, 'crp value (mg/L)'] = np.nan

# Ensure the dates are in valid datetime format (done once, not per window)
merged['measuredate'] = pd.to_datetime(merged['measuredate'], errors = 'coerce')
merged['obsdate'] = pd.to_datetime(merged['obsdate'])

# Sort by patient ID and measurement date
merged = merged.sort_values(['e_patid', 'measuredate'])

# A row can only supply a value if it actually holds one
has_value = merged['crp value (mg/L)'].notna()

# Create empty dictionary to store data frames for different time windows
dfs = {}

# Loop over different time windows
for time_window in [1, 2, 3, 4, 5, 10]:

    # Start of the time window range (obsdate is the end)
    date_start = merged['obsdate'] - pd.DateOffset(years=time_window)

    # Get T/F array of values that are in the timeframe (T) and that are not (F)
    is_between_dates = (merged['measuredate'] >= date_start) & (merged['measuredate'] <= merged['obsdate'])

    # Keep only usable rows BEFORE picking the closest one. If out-of-window or
    # missing rows were merely blanked and left in, a measurement just after
    # obsdate could be selected as "closest" and hide a valid in-window value,
    # overstating missingness.
    merged_window = merged.loc[is_between_dates & has_value].copy()

    # Drop duplicates: keep the measurement nearest to obsdate per patient
    merged_window['days_diff'] = (merged_window['obsdate'] - merged_window['measuredate']).dt.days
    merged_window = merged_window.sort_values(['e_patid', 'obsdate', 'days_diff'], ascending=[True, True, True])
    # NOTE(review): this keeps one row per patient, which assumes `outcome`
    # has a single obsdate per e_patid -- confirm.
    merged_window = merged_window.drop_duplicates(subset=['e_patid'], keep='first')

    # Only keep relevant columns; patients without a usable measurement get
    # NaN from the left merge
    merged_window = merged_window[["e_patid", "crp value (mg/L)", "measuredate"]]
    merged_outcome = pd.merge(outcome, merged_window, on = 'e_patid', how = 'left')

    # Save data frame to dictionary
    dfs[f"{time_window}yr"] = merged_outcome[['e_patid', 'obsdate', 'crp value (mg/L)']]

# Save data frames to CSV files in separate folders. The path is built
# explicitly instead of calling os.chdir, so the working directory is not
# changed for the cells that follow.
for folder_name, df in dfs.items():
    df.to_csv(os.path.join(DIR, folder_name, 'crp_no_duplicates_merged.csv'), index=False)

### Now try closest measurement to index date to assess % missingness

In [None]:
# try to see what closest measuredate to obsdate gets us in terms of missing values

# Remove invalid CRP values
merged.loc[merged['crp value (mg/L)'] < 0, 'crp value (mg/L)'] = np.nan

merged = merged.sort_values(['e_patid', 'measuredate'])

# Get T/F array of measurements taken on or before obsdate (T) and after it (F)
is_between_dates = (merged['measuredate'] <= merged['obsdate'])

# Keep only usable rows BEFORE picking the closest one. If later or missing
# measurements were merely blanked and left in, one taken shortly after
# obsdate could be selected as "closest" and hide a valid earlier value,
# overstating missingness.
merged_window = merged.loc[is_between_dates & merged['crp value (mg/L)'].notna()].copy()

# Drop duplicates: keep the measurement nearest to obsdate per patient
merged_window['days_diff'] = (merged_window['obsdate'] - merged_window['measuredate']).dt.days
merged_window = merged_window.sort_values(['e_patid', 'obsdate', 'days_diff'], ascending = [True, True, True])
# NOTE(review): this keeps one row per patient, which assumes `outcome` has a
# single obsdate per e_patid -- confirm.
merged_window = merged_window.drop_duplicates(subset = ['e_patid'], keep = 'first')

# Patients without a usable measurement get NaN from the left merge
merged_window = merged_window[["e_patid", "crp value (mg/L)", "measuredate"]]
merged_outcome = pd.merge(outcome, merged_window, on = 'e_patid', how = 'left')

# Write through an explicit, OS-independent path instead of os.chdir, so the
# working directory is left unchanged.
DIR = os.path.join('cleaned_files', 'Closest')

merged_outcome.to_csv(os.path.join(DIR, 'crp_no_duplicates_merged.csv'), index = False)

# Number of outcome rows left without a CRP value
merged_outcome['crp value (mg/L)'].isna().sum()