In [None]:
import os
import dxpy
import pandas as pd
import numpy as np
from scipy.stats import norm

In [None]:
# Load the phenotype table from the mounted project folder.
pheno_df = pd.read_csv(
    filepath_or_buffer="/mnt/project/notebooks/bmi/data/pheno.csv.gz",
)

In [None]:
def rint_normalization(ser):
    """Rank-based inverse normal transform of a pandas Series.

    Each value is replaced by the standard-normal quantile of
    ``(rank - 0.5) / n``, where ``rank`` comes from ``Series.rank()`` and
    ``n`` is the number of non-missing values. Missing inputs stay NaN.

    Returns the result of ``scipy.stats.norm.ppf``, aligned positionally
    with ``ser``.
    """
    ranks = ser.rank()
    n_observed = ranks.notna().sum()
    quantiles = (ranks - 0.5) / n_observed
    return norm.ppf(quantiles)

In [None]:
# Number of samples per predicted ancestry label.
pheno_df["ancestry_pred"].value_counts()

# Normalize BMI by ancestry and sex

In [None]:
# Apply the rank-based inverse normal transform to BMI separately within
# each (ancestry_pred, sex) group.
pheno_df["bmi_rint"] = (
    pheno_df
    .groupby(by=["ancestry_pred", "sex"])["bmi"]
    .transform(rint_normalization)
)

In [None]:
# Sanity check of the transformed phenotype per (ancestry_pred, sex) group.
# `len` reports the group size, including rows whose bmi_rint is missing.
(
    pheno_df
    .groupby(by=["ancestry_pred", "sex"])
    .agg({"bmi_rint": ["mean", "min", "max", len]})
)

# Process covariates

- Encode genetic sex as 1 (Female) or 0 (otherwise)
- Create age^2 and the age x sex interaction
- Encode the exome release batch as 1 ("50K Release") or 0 (otherwise)

In [None]:
# Encode the string-valued covariates as 0/1 indicators.
# The dtype guards keep this cell safe to re-run: once a column holds the
# encoded integers, comparing it to the string label again would turn every
# value into 0.
# NOTE: missing values compare unequal to the label and are encoded as 0.
if not pd.api.types.is_numeric_dtype(pheno_df["genetic_sex"]):
    pheno_df["genetic_sex"] = pheno_df["genetic_sex"].eq("Female").astype(int)
if not pd.api.types.is_numeric_dtype(pheno_df["exome_release_batch"]):
    pheno_df["exome_release_batch"] = (
        pheno_df["exome_release_batch"].eq("50K Release").astype(int)
    )

# Derived covariates: squared age and the age-by-sex interaction
# (uses the 0/1 encoded genetic_sex from above).
pheno_df["age_2"] = pheno_df["age"] ** 2
pheno_df["age_sex"] = pheno_df["age"] * pheno_df["genetic_sex"]

# Create ancestry normalized phenotypes and save in regenie format

In [None]:
def upload_file_to_project(filename, proj_dir):
    """Upload a local file to a project folder and print a confirmation.

    Args:
        filename: Path of the local file handed to ``dxpy.upload_local_file``.
        proj_dir: Destination folder, passed as ``folder=`` together with
            ``parents=True``.

    Returns:
        None.
    """
    dxpy.upload_local_file(filename, folder=proj_dir, parents=True)
    print(f"*********{filename} uploaded!!*********")

In [None]:
def save_files_in_regenie_fmt(
    df, proj_dir, save_file,
    phenos=['bmi', 'bmi_rint'],
    covariates=[
        'genetic_sex', 'age', 'age_2', 'age_sex', 'exome_release_batch',
        'genetic_pca1', 'genetic_pca2', 'genetic_pca3', 'genetic_pca4', 'genetic_pca5',
        'genetic_pca6', 'genetic_pca7', 'genetic_pca8', 'genetic_pca9', 'genetic_pca10'
    ],
    prs=['bmi_prs'],
    lifestyle=['pa', 'alcohol', 'smoke', 'sleep', 'sedentary', 'diet']):
    """Write a tab-separated phenotype/covariate table and upload it.

    The output starts with ``FID`` and ``IID`` columns (both copied from
    ``df.sample_names``), followed by the requested phenotype, covariate,
    PRS and lifestyle columns in that order. Missing values are written as
    the literal ``NA``.

    Args:
        df: DataFrame containing ``sample_names`` and every requested column.
            It is not modified.
        proj_dir: Project folder passed to ``upload_file_to_project``.
        save_file: Local output path handed to ``DataFrame.to_csv``.
        phenos: Phenotype column names.
        covariates: Covariate column names.
        prs: Polygenic-score column names.
        lifestyle: Lifestyle column names.

    Returns:
        None.
    """
    # Unpacking builds a fresh list, so the shared default lists are never
    # mutated and any iterable of column names is accepted.
    out_columns = ["FID", "IID", *phenos, *covariates, *prs, *lifestyle]

    # assign() returns a new frame, so the caller's DataFrame does not gain
    # FID/IID columns as a side effect.
    out_df = df.assign(FID=df["sample_names"], IID=df["sample_names"])

    # na_rep writes missing values as "NA" without first casting the whole
    # frame to object dtype via fillna("NA").
    out_df.loc[:, out_columns].to_csv(save_file, index=False, sep="\t", na_rep="NA")
    upload_file_to_project(save_file, proj_dir)

In [None]:
# Write and upload one phenotype file per predicted ancestry.
proj_dir = "/notebooks/bmi/data/processed/"

# dropna(): a missing ancestry label never matches `==`, so without it an
# empty "nan_phenotype.tsv.gz" would be written and uploaded.
for ancestry in pheno_df.ancestry_pred.dropna().unique():
    # .copy() gives the helper an independent frame rather than a slice of pheno_df.
    anc_df = pheno_df.loc[pheno_df.ancestry_pred == ancestry].copy()
    filename = f"{ancestry}_phenotype.tsv.gz"
    save_files_in_regenie_fmt(anc_df, proj_dir, filename)

# Normalize covariates in Europeans

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Reload the per-ancestry phenotype file for the "eur" group from the mounted project.
eur_df = pd.read_csv(
    filepath_or_buffer="/mnt/project/notebooks/bmi/data/processed/eur_phenotype.tsv.gz",
    sep="\t",
)

In [None]:
def normalize_covariates(pheno_df, covariates):
    """Return a copy of ``pheno_df`` with the given columns rescaled.

    Each column named in ``covariates`` is replaced by the output of a
    freshly constructed ``StandardScaler().fit_transform`` applied to that
    column alone. The input frame is left untouched.

    Args:
        pheno_df: DataFrame holding the covariate columns.
        covariates: Iterable of column names to rescale.

    Returns:
        A new DataFrame with the listed columns rescaled.
    """
    scaled_df = pheno_df.copy()
    for column in covariates:
        # Double brackets keep the selection 2-D, as fit_transform is given
        # a single-column frame rather than a Series.
        scaled_df[column] = StandardScaler().fit_transform(scaled_df[[column]])
    return scaled_df

# NOTE(review): this re-defines upload_file_to_project with the same body as
# the definition earlier in this notebook.
def upload_file_to_project(filename, proj_dir):
    """Upload a local file to a project folder and print a confirmation.

    Args:
        filename: Path of the local file handed to ``dxpy.upload_local_file``.
        proj_dir: Destination folder, passed as ``folder=`` together with
            ``parents=True``.

    Returns:
        None.
    """
    dxpy.upload_local_file(filename, folder=proj_dir, parents=True)
    print(f"*********{filename} uploaded!!*********")

In [None]:
# normalized pheno df
# Columns to rescale: age terms, encoded sex, the BMI polygenic score and the
# first ten genetic principal components.
covariates = ["age", "age_2", "age_sex", "genetic_sex", "bmi_prs"] + [f"genetic_pca{i}" for i in range(1, 11)]
# "bmi_prs" is already in `covariates`; appending it again would scale that
# column twice.
norm_eur_df = normalize_covariates(eur_df, covariates)

In [None]:
# Write the normalized European table as a tab-separated file and upload it.
proj_dir = "/notebooks/bmi/data/processed/"
filename = "eur_phenotype_norm.tsv.gz"
# na_rep writes missing values as "NA" without casting every column to
# object dtype the way fillna("NA") does.
norm_eur_df.to_csv(filename, index=False, sep="\t", na_rep="NA")
upload_file_to_project(filename, proj_dir)
