In [1]:
import os
import sys
# Append the parent of the current working directory to the import path.
# This must run before the `config_utils` import below -- presumably that
# module lives in the parent directory (TODO confirm). Because the path is
# derived from os.getcwd(), the notebook has to be launched from a directory
# whose parent is the intended project root.
project_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.append(project_dir)
import numpy as np

# Project-local configuration helpers (resolved via the sys.path entry above).
from config_utils import CONFIG_FILE, Config, read_config_from_file
import pandas as pd
# NOTE(review): `sm` and `Config` are not referenced in the cells visible here;
# confirm whether later cells use them before removing.
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression

In [2]:
# A straight line needs at least two points to be fitted.
MIN_CONTROL_SAMPLES = 2


def age_adjust_values(values, ages, control_mask, reference_age):
    """Remove the linear age trend, estimated on controls only, from one column.

    A linear regression ``value ~ age`` is fitted on the control rows that have
    both a value and an age. Every row is then shifted by the difference
    between the model's prediction at that row's age and its prediction at
    ``reference_age``.

    Parameters
    ----------
    values : numpy.ndarray
        Float array with the measurements for all rows (NaN = missing).
    ages : numpy.ndarray
        Float array of the same length with the age of each row (NaN = missing).
    control_mask : numpy.ndarray
        Boolean array of the same length, True for rows used to fit the model.
    reference_age : float
        Age at which the adjusted values are expressed.

    Returns
    -------
    numpy.ndarray or None
        The adjusted values (NaN wherever the value or the age is missing), or
        ``None`` when fewer than ``MIN_CONTROL_SAMPLES`` control rows are
        usable and no model can be fitted.
    """
    has_age = ~np.isnan(ages)
    fit_mask = control_mask & has_age & ~np.isnan(values)
    if fit_mask.sum() < MIN_CONTROL_SAMPLES:
        return None

    model = LinearRegression()
    model.fit(ages[fit_mask].reshape(-1, 1), values[fit_mask])

    # Model prediction at the reference age.
    expected_at_reference = model.intercept_ + model.coef_[0] * reference_age

    # scikit-learn rejects NaN inputs, so predict only where the age is known;
    # rows without an age stay NaN because they cannot be adjusted.
    predicted = np.full(values.shape, np.nan)
    predicted[has_age] = model.predict(ages[has_age].reshape(-1, 1))

    return values - (predicted - expected_at_reference)


config = read_config_from_file(CONFIG_FILE)
data_path = os.path.join(config.data_dir, 'raw', config.raw_file_name)
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, which is the remedy pandas suggests for the
# mixed-type DtypeWarning this read otherwise emits.
df = pd.read_csv(data_path, low_memory=False)

age_adjusted_df = df.copy()

# Control subjects are the rows with Carrier.Status == "CTL".
is_control = (df['Carrier.Status'] == 'CTL').to_numpy()
df_ctl = df[is_control]

# Protein columns are those whose name ends with |PLASMA or |CSF.
protein_columns = [col for col in df_ctl.columns if col.endswith(('|PLASMA', '|CSF'))]

# Ages for all subjects and for the control subset.
ages_all = df['AGE_AT_VISIT'].to_numpy(dtype=float)
ages_ctl = ages_all[is_control]
if np.isnan(ages_ctl).all():
    # Also covers the case of no control rows at all (empty array).
    raise ValueError("No control (CTL) subject has a valid AGE_AT_VISIT; cannot age-adjust.")
# Reference age: mean age of the controls, ignoring missing ages.
mean_age = np.nanmean(ages_ctl)

n_missing_age = int(np.isnan(ages_all).sum())
if n_missing_age:
    print(f"Warning: {n_missing_age} rows have no AGE_AT_VISIT; "
          "their protein values cannot be age-adjusted and are set to NaN.")

# Age-adjust every protein column, writing the result into the copy.
for protein in protein_columns:
    protein_values_all = df[protein].to_numpy(dtype=float)

    if np.isnan(protein_values_all[is_control]).any():
        print(f"Warning: NaN values found in {protein} for control group. Dropping these rows for regression.")

    age_adjusted_values_all = age_adjust_values(protein_values_all, ages_all, is_control, mean_age)
    if age_adjusted_values_all is None:
        print(f"Warning: fewer than {MIN_CONTROL_SAMPLES} usable control rows for {protein}; "
              "leaving it unadjusted.")
        continue

    # Replace the original protein values with the age-adjusted ones.
    age_adjusted_df[protein] = age_adjusted_values_all

new_filename = os.path.splitext(config.raw_file_name)[0] + "_age_adjusted.csv"
new_data_path = os.path.join(config.data_dir, 'raw', new_filename)

# Save the adjusted dataframe as a new CSV next to the raw file.
age_adjusted_df.to_csv(new_data_path, index=False)

print(f"Age-adjusted data saved to {new_data_path}")




  df = pd.read_csv(data_path)


Age-adjusted data saved to /scratch/lcornelis/data/data_louisa/raw/ALLFTD_dataset_for_nina_louisa_071124_age_adjusted.csv
