In [None]:
# Import packages
import pyspark
import dxpy
import dxdata

# Spark initialization. getOrCreate() returns the SparkContext already running in this
# kernel (or starts one if none exists), so re-running this cell no longer fails with
# "Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate()
spark = pyspark.sql.SparkSession(sc)

In [None]:
# Look up the dispensed database in folder "/" by glob-matching its name against "app*";
# describe=True so that the object's name is available in the result.
dispensed_database = dxpy.find_one_data_object(
    classname="database",
    name="app*",
    name_mode="glob",
    folder="/",
    describe=True,
)
dispensed_database_name = dispensed_database["describe"]["name"]

# Look up the dispensed dataset ("app*.dataset") in the same folder; only its id is used.
dispensed_dataset = dxpy.find_one_data_object(
    typename="Dataset",
    name="app*.dataset",
    name_mode="glob",
    folder="/",
)
dispensed_dataset_id = dispensed_dataset["id"]

# Load the dataset by id and take its "participant" entry, which is queried further below.
dataset = dxdata.load_dataset(id=dispensed_dataset_id)
participant = dataset["participant"]

In [None]:
# Readable column label -> dataset column id of the form "p<field>_i<instance>".
# Insertion order matters: it determines the column order of the retrieved table.
field_name_dict = {"sample_names": "eid"}

# (label prefix, UKB field id, instances to request)
lifestyle_fields = [
    ("sleep", 1160, (0, 1, 2)),      # https://biobank.ndph.ox.ac.uk/ukb/label.cgi?id=100057
    ("met", 22036, (0,)),            # https://biobank.ndph.ox.ac.uk/ukb/field.cgi?id=22036 (instances 1 and 2 are not requested)
    ("tv", 1070, (0, 1, 2)),
    ("computer", 1080, (0, 1, 2)),
    ("alcohol", 1558, (0, 1, 2)),    # https://biobank.ndph.ox.ac.uk/ukb/field.cgi?id=1558
    ("smokecurr", 1239, (0, 1, 2)),  # https://biobank.ndph.ox.ac.uk/ukb/field.cgi?id=1239
    ("smokepast", 1249, (0, 1, 2)),  # https://biobank.ndph.ox.ac.uk/ukb/field.cgi?id=1249
]
for label, field_id, instances in lifestyle_fields:
    for ins in instances:
        field_name_dict[f"{label}{ins}"] = f"p{field_id}_i{ins}"

## diet https://biobank.ndph.ox.ac.uk/ukb/label.cgi?id=100052
# (label prefix, UKB field id); every diet field is requested for instances 0, 1 and 2.
diet_fields = [
    ("cookedvegetable", 1289),
    ("salad", 1299),
    ("freshfruit", 1309),
    ("driedfruit", 1319),
    ("oilyfish", 1329),
    ("nonoilyfish", 1339),
    ("procmeat", 1349),
    ("poultry", 1359),
    ("beef", 1369),
    ("mutton", 1379),
    ("pork", 1389),
    ("bread", 1438),
    ("cereal", 1458),
    ("tea", 1488),
    ("coffee", 1498),
    ("water", 1528),
]
# Instance is the outer loop so that all diet fields of instance 0 come first, then 1, then 2.
for ins in range(3):
    for label, field_id in diet_fields:
        field_name_dict[f"{label}{ins}"] = f"p{field_id}_i{ins}"



In [None]:

# Column ids to request, in the same order as they were inserted into field_name_dict.
field_names = [column_id for column_id in field_name_dict.values()]



In [None]:
# Fetch the selected columns from the participant table.
# NOTE(review): coding_values="replace" presumably returns decoded labels instead of raw codes — confirm against dxdata docs.
df = participant.retrieve_fields(
    names=field_names,
    engine=dxdata.connect(),
    coding_values="replace",
)

In [None]:
def upload_file_to_project(filename, proj_dir):
    """Upload a local file to a project folder and print a confirmation.

    Args:
        filename: Path of the local file to upload.
        proj_dir: Destination folder; passed as ``folder`` with ``parents=True``.

    Returns:
        None.
    """
    dxpy.upload_local_file(filename, folder=proj_dir, parents=True)
    message = f"*********{filename} uploaded!!*********"
    print(message)

In [None]:
# Convert the retrieved table to a pandas DataFrame; toPandas() collects the full result
# into local memory, so this step can be slow and memory-hungry.
pandas_df = df.toPandas()

In [None]:
# Rename columns from dataset column ids back to the readable labels by inverting field_name_dict.
pandas_df = pandas_df.rename(
    columns={column_id: label for label, column_id in field_name_dict.items()}
)

In [None]:
# Destination folder in the project and the number of rows written per output file.
proj_dir = "/phenotype_processing/lifestyle_info/"
nrows = 50000
final = len(pandas_df)

# Walk the table in consecutive slices of `nrows` rows; each slice is written to its own
# numbered file (lifestyle_block0, lifestyle_block1, ...) and uploaded right away.
for block, start in enumerate(range(0, final, nrows)):
    end = start + nrows  # iloc clips the slice at the end of the table for the last block
    table = pandas_df.iloc[start:end]
    filename = f"lifestyle_block{block}.csv.gz"
    # With pandas' default compression="infer", the ".gz" suffix makes this a gzip file.
    table.to_csv(filename, index=False)
    upload_file_to_project(filename, proj_dir)

# Resources
1. UKB: https://biobank.ndph.ox.ac.uk/ukb/label.cgi?id=100050