In [None]:
# Import packages
import pyspark
import dxpy
import dxdata

In [None]:
# Spark initialization. getOrCreate() reuses the SparkContext already running in this kernel,
# so re-running this cell does not fail the way a second bare SparkContext() call would.
sc = pyspark.SparkContext.getOrCreate()
spark = pyspark.sql.SparkSession(sc)

In [None]:
# Find the dispensed database and dataset by glob-matching their names in folder '/'.
dispensed_database = dxpy.find_one_data_object(
    classname='database', name='app*', name_mode='glob', folder='/', describe=True
)
dispensed_dataset = dxpy.find_one_data_object(
    typename='Dataset', name='app*.dataset', name_mode='glob', folder='/'
)

# Keep only the pieces needed downstream: the database name and the dataset id.
dispensed_database_name = dispensed_database['describe']['name']
dispensed_dataset_id = dispensed_dataset['id']

In [None]:
# Load the dataset whose id was discovered above.
dataset = dxdata.load_dataset(id=dispensed_dataset_id)

In [None]:
# Look up the 'participant' entry of the dataset; fields are retrieved from it further down.
participant = dataset['participant']

# Download phenotype and covariate data

Covariates to extract: age, sex, release batch, and genetic principal components

Covariates to use: age, age^2, sex, an age-by-sex interaction term, experimental batch-related covariates, and genetic principal components

The first exome release is added as a batch covariate; for background, see: https://dnanexus.gitbook.io/uk-biobank-rap/science-corner/whole-exome-sequencing-oqfe-protocol/generation-and-utilization-of-quality-control-set-90pct10dp-on-oqfe-data/details-on-processing-the-300k-exome-data-to-generate-the-quality-control-set

In [None]:
# Readable column name -> field identifier requested from the participant data.
# Insertion order is kept deliberately: field_names is later built from these values in order.
field_name_dict = dict(
    sample_names='eid',
    age_assessment0='p21003_i0',
    age_assessment1='p21003_i1',
    age_assessment2='p21003_i2',
    age_assessment3='p21003_i3',
    ethnic_background0='p21000_i0',
    ethnic_background1='p21000_i1',
    ethnic_background2='p21000_i2',
    genetic_kinship_to_other_participants='p22021',
    sex='p31',
    genetic_sex='p22001',
    exome_release_batch='p32050',
    bmi0='p21001_i0',
    bmi1='p21001_i1',
    bmi2='p21001_i2',
    bmi3='p21001_i3',
    bmi_prs='p26216',
)

# Append genetic principal components 1 through 10 after the fixed entries.
for idx in range(1, 11):
    field_name_dict.update({f'genetic_pca{idx}': f'p22009_a{idx}'})

In [None]:
# Field identifiers to request, in the insertion order of field_name_dict.
field_names = list(field_name_dict.values())

In [None]:
# Retrieve the selected fields through a fresh dxdata connection.
# NOTE(review): coding_values="replace" presumably returns coded values as their labels
# rather than raw codes - confirm against the dxdata documentation.
df = participant.retrieve_fields(
    names=field_names,
    engine=dxdata.connect(),
    coding_values="replace",
)

In [None]:
def upload_file_to_project(filename, proj_dir):
    """Upload a local file with ``dxpy.upload_local_file`` and print a confirmation.

    Args:
        filename: Path of the local file to upload.
        proj_dir: Destination folder passed as ``folder``. ``parents=True`` is
            passed as well - presumably so missing parent folders are created;
            confirm against the dxpy documentation.

    Returns:
        None.
    """
    dxpy.upload_local_file(filename, parents=True, folder=proj_dir)
    print(f"*********{filename} uploaded!!*********")

In [None]:
# Convert the retrieved frame to a pandas DataFrame.
# NOTE(review): df is presumably a Spark DataFrame, in which case this pulls every row
# into local memory - confirm the result fits before running on a larger field set.
pandas_df = df.toPandas()

In [None]:
# Replace field identifiers in the column names with the readable names from field_name_dict.
pandas_df = pandas_df.rename(
    columns={field_id: label for label, field_id in field_name_dict.items()}
)

In [None]:
# Preview the first rows to check the renamed columns.
pandas_df.head()

In [None]:
# Show the dtype of each column.
pandas_df.dtypes

In [None]:
# Destination folder for the uploaded output (plain literal; no interpolation is needed).
proj_dir = "/notebooks/bmi/data/"

In [None]:
# Write the table as a gzip-compressed CSV and upload it to proj_dir.
# Compression is stated explicitly so the ".gz" file is really gzipped even on pandas
# versions whose to_csv does not infer compression from the file extension.
filename = "bmi_with_cov_raw.csv.gz"
pandas_df.to_csv(filename, index=False, compression="gzip")
upload_file_to_project(filename, proj_dir)