## Retrieving biobank data using dxdata and plotting results

In [5]:
import dxdata
import dxpy
import pandas as pd
import pyspark
import seaborn as sns

In [7]:
# Quick check that dxdata imported: displays the module's package name
dxdata.__package__

'dxdata'

In [None]:
# Spark initialization
# NOTE(review): `spark` is not referenced again in the visible cells; presumably
# dxdata.connect() relies on this running Spark context — confirm.
sc = pyspark.SparkContext() # connection to a Spark cluster
spark = pyspark.sql.SparkSession(sc)  # session object wrapping the context above

In [None]:
# Discover the dispensed database and dataset instead of hard-coding them.
# Both lookups glob on names starting with "app" in the root folder ('/').

# describe=True makes the result carry a 'describe' mapping, from which the
# database name is read.
dispensed_database = dxpy.find_one_data_object(
    name='app*', name_mode='glob', folder='/', classname='database', describe=True
)
dispensed_database_name = dispensed_database['describe']['name']

# Only the id of the matching Dataset record is kept.
dispensed_dataset = dxpy.find_one_data_object(
    name='app*.dataset', name_mode='glob', folder='/', typename='Dataset'
)
dispensed_dataset_id = dispensed_dataset['id']

## Phenotype data

# Load the dispensed dataset by the id discovered above.
# (`dataset` was indexed below without ever being defined.)
dataset = dxdata.load_dataset(id=dispensed_dataset_id)

# Entity whose fields are looked up and retrieved in the cells below
pheno = dataset['participant']

# Find by field name
field_eid = pheno.find_field(name='eid') #Participant ID

# Find by exact title
field_sex = pheno.find_field(title='Sex')
field_age = pheno.find_field(title='Age at recruitment')
field_own_rent = pheno.find_field(title='Own or rent accommodation lived in | Instance 0')

# Find by title pattern
# Raw string: `\|` is a regex escape for a literal pipe, not a Python string
# escape, so a non-raw literal triggers an invalid-escape-sequence warning.
pattern = r'Length of time at current address \| Instance [0-2]'
fields_len = list(pheno.find_fields(title_regex=pattern))

# Extract phenotype data for selected fields

# Final list of fields: the four single fields followed by every
# "Length of time at current address" instance matched by the pattern
field_list = [field_eid, field_sex, field_own_rent, field_age] + fields_len

# Retrieve the selected fields through the dxdata engine and convert the result
# with .to_koalas(), so pheno_data is a Koalas DataFrame (not a raw Spark one)
pheno_data = pheno.retrieve_fields(fields=field_list, engine=dxdata.connect()).to_koalas()

# See first five entries
pheno_data.head()

Let's list each column's name and title.

In [None]:
# Lookup table with one row per selected field: its name next to its title
pd.DataFrame(dict(
    Name=[fld.name for fld in field_list],
    Title=[fld.title for fld in field_list],
))

## Summarize data

In [None]:
# Display summary statistics for the retrieved columns
pheno_data.describe()

## Get averages and group counts by sex


In [None]:
# Show average of numeric columns (age, own or rent accommodation lived in, length of time at current address) by sex
# 'p31' is the column used here as the sex grouping key
pheno_data.groupby('p31').mean()

## Visually display correlation

# Pull the columns to be plotted out of pheno_data as NumPy arrays
age = pheno_data['p21022'].to_numpy()
len_address_inst0 = pheno_data['p699_i0'].to_numpy()
len_address_inst1 = pheno_data['p699_i1'].to_numpy()

In [None]:
# Scatter plot of length of time at current address:
# instance 0 on the x-axis against instance 1 on the y-axis
ax = sns.jointplot(
    x=len_address_inst0,
    y=len_address_inst1,
    kind='scatter',
    space=0,
    color='black',
    alpha=0.1,
    s=4,
)
# NOTE(review): assumes fields_len[0] and fields_len[1] are instances 0 and 1,
# i.e. that find_fields returned them in instance order — confirm.
ax.set_axis_labels(fields_len[0].title, fields_len[1].title, fontsize=16)

In [None]:
# Kernel-density joint plot: age (x) against length of time at current
# address, instance 0 (y)
ax = sns.jointplot(
    x=age,
    y=len_address_inst0,
    kind='kde',
)
# NOTE(review): assumes fields_len[0] is the instance-0 field — confirm ordering.
ax.set_axis_labels(field_age.title, fields_len[0].title, fontsize=16)