In [1]:
# Import packages
import pyspark
import dxpy
import dxdata
from pyspark.sql.functions import array_join

In [2]:
# Spark initialization (Done only once; do not rerun this cell unless you select Kernel -> Restart kernel).
# The SparkContext is created first and then handed to the SparkSession constructor.
sc = pyspark.SparkContext()
spark = pyspark.sql.SparkSession(sc)

In [3]:
# Discover the dispensed database and dataset by glob-matching names that
# start with "app" in folder '/', then keep the database name and dataset id.
dispensed_database = dxpy.find_one_data_object(
    classname='database',
    name='app*',
    name_mode='glob',
    folder='/',
    describe=True,
)
dispensed_database_name = dispensed_database['describe']['name']

dispensed_dataset = dxpy.find_one_data_object(
    typename='Dataset',
    name='app*.dataset',
    name_mode='glob',
    folder='/',
)
dispensed_dataset_id = dispensed_dataset['id']

In [4]:
# Load the dataset identified by the id discovered in the previous cell.
dataset = dxdata.load_dataset(id=dispensed_dataset_id)

In [5]:
# Look up the 'participant' entry of the dataset; fields are retrieved from it below.
participant = dataset['participant']

In [6]:
# Descriptive output column name -> dataset field name to retrieve.
field_name_dict = dict(
    sample_names='eid',
    icd='p41270',
)

In [7]:
# Dataset field names to request, in the order they appear in field_name_dict.
field_names = [*field_name_dict.values()]

In [8]:
# Retrieve the requested fields with raw coding values.
df = participant.retrieve_fields(
    names=field_names,
    coding_values="raw",
    engine=dxdata.connect(),
)

In [9]:
# Collapse the array-valued ICD column into one '|'-delimited string per row.
# The column name comes from field_name_dict so it is defined in one place.
# The dtype guard keeps this cell re-runnable: after the first run the column
# is already a string, and applying array_join to it again would fail.
icd_col = field_name_dict['icd']
if dict(df.dtypes)[icd_col].startswith('array'):
    df = df.withColumn(icd_col, array_join(df[icd_col], '|'))

In [10]:
# See the first five entries as a Pandas DataFrame:
# NOTE(review): the rendered output contains participant-level records (eid and
# ICD codes) — consider clearing it before sharing or committing the notebook.
df.limit(5).toPandas()

Unnamed: 0,eid,p41270
0,1270557,C210|C218|D129|H020|M2551|Z087|Z452|Z850
1,4366755,B954|D62|E780|H409|I10|I839|K802|K839|L031|L97...
2,5448800,A400|A418|C920|D70|E833|E834|E872|F329|F640|I1...
3,3351884,D860|J459|M4792
4,2896181,A099|E780|F009|F028|F03|F321|F329|F412|F419|G3...


In [11]:
def upload_file_to_project(filename, proj_dir):
    """Upload a local file to a project folder and print a confirmation line.

    Args:
        filename: Path of the local file to upload.
        proj_dir: Destination folder; passed as ``folder`` together with
            ``parents=True``.

    Returns:
        None.
    """
    upload_options = {"folder": proj_dir, "parents": True}
    dxpy.upload_local_file(filename, **upload_options)
    print(f"*********{filename} uploaded!!*********")

In [12]:
# Convert the whole Spark DataFrame to pandas (not just the 5-row preview shown earlier).
pandas_df = df.toPandas()

In [13]:
# Swap the retrieved field names for the descriptive keys of field_name_dict.
pandas_df = pandas_df.rename(
    columns=dict(zip(field_name_dict.values(), field_name_dict.keys()))
)

In [14]:
# Write pandas_df out in consecutive blocks of `nrows` rows and upload each
# block to `proj_dir` as its own CSV file.
proj_dir = "/phenotype_processing/icd_info/"
nrows = 50000
final = len(pandas_df)

# enumerate(range(...)) yields the block number and its starting row together,
# so neither counter has to be advanced by hand.
for block, start in enumerate(range(0, final, nrows)):
    end = start + nrows
    # iloc clips a slice that runs past the last row, so the final block may be shorter.
    table = pandas_df.iloc[start:end, :]
    # The ".gz" suffix matters: to_csv infers gzip compression from the extension by default.
    filename = f"icd_block{block}.csv.gz"
    table.to_csv(filename, index=False)
    upload_file_to_project(filename, proj_dir)

*********icd_block0.csv.gz uploaded!!*********
*********icd_block1.csv.gz uploaded!!*********
*********icd_block2.csv.gz uploaded!!*********
*********icd_block3.csv.gz uploaded!!*********
*********icd_block4.csv.gz uploaded!!*********
*********icd_block5.csv.gz uploaded!!*********
*********icd_block6.csv.gz uploaded!!*********
*********icd_block7.csv.gz uploaded!!*********
*********icd_block8.csv.gz uploaded!!*********
*********icd_block9.csv.gz uploaded!!*********
*********icd_block10.csv.gz uploaded!!*********
