# Module 1 Lab 1 - Reading the SDTM

In this lab you will learn how to read parts of the SDTM and familiarize yourself with the format.  The data you will use is a sample dataset that does not contain actual patient data.


In [1]:
import sys
# ensure we are on a version of pandas that supports the read_sas method (this will take approximately 10 minutes)
!{sys.executable} -m pip install --upgrade "pandas>=1.1"
!{sys.executable} -m pip install xmltodict

import pandas as pd
import numpy as np


Collecting pandas>=1.1
[?25l  Downloading https://files.pythonhosted.org/packages/99/f0/f99700ef327e51d291efdf4a6de29e685c4d198cbf8531541fc84d169e0e/pandas-1.3.5.tar.gz (4.7MB)
[K     |████████████████████████████████| 4.7MB 3.5MB/s eta 0:00:01
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h    Preparing wheel metadata ... [?25ldone
Building wheels for collected packages: pandas
  Building wheel for pandas (PEP 517) ... [?25ldone
[?25h  Created wheel for pandas: filename=pandas-1.3.5-cp37-cp37m-linux_x86_64.whl size=30216468 sha256=55d890eac279abf6f1907c20c7ede5f9b8df913cf6cfff1eb35d388e5d4fc879
  Stored in directory: /home/dcphw2/.cache/pip/wheels/5c/f4/45/389dc711f0c5ff9adeb5245397ab18bf75182e8cff9fbfa916
Successfully built pandas
Installing collected packages: pandas
  Found existing installation: pandas 0.25.2
    Uninstalling pandas-0.25.2:
      Successfully uninstalled pandas-0.25.2
Successfully installe



## Reading Data
Data in the SDTM is stored in a format created by SAS, the SAS Transport File Format, with a file extension of either `xport` or `xpt`.  Fortunately, we do not need SAS to read the format, as it is an open format.  Pandas can read this format using the `read_sas` method.

### Demographics
We will read the demographics from our example study.  Demographics are stored in the `dm` file.  See the specification for details on the fields for SDTM: [SDTM_v1.8.pdf](../resources/SDTM_v1.8.pdf).  Demographics are described on page 20.

Read [An Introduction to SDTM](../resources/IS04.pdf) for an overview of the different common datasets.

Note that we must read the file as a binary format for Pandas to effectively load the data (`'rb'` in the `open()` method).  If you receive errors when trying to read, be sure you've opened the file in binary format.

In [2]:
with open('../resources/SDTM_sample/dm.xpt', 'rb') as f:
    dm = pd.read_sas(f, format='xport', encoding='utf-8')
    
display(dm.head())

Unnamed: 0,STUDYID,DOMAIN,USUBJID,SUBJID,RFSTDTC,RFENDTC,RFXSTDTC,RFXENDTC,RFICDTC,RFPENDTC,...,SEX,RACE,ETHNIC,ARMCD,ARM,ACTARMCD,ACTARM,COUNTRY,DMDTC,DMDY
0,CDISCPILOT01,DM,01-701-1015,1015,2014-01-02,2014-07-02,2014-01-02,2014-07-02,,2014-07-02T11:45,...,F,WHITE,HISPANIC OR LATINO,Pbo,Placebo,Pbo,Placebo,USA,2013-12-26,-7.0
1,CDISCPILOT01,DM,01-701-1023,1023,2012-08-05,2012-09-02,2012-08-05,2012-09-01,,2013-02-18,...,M,WHITE,HISPANIC OR LATINO,Pbo,Placebo,Pbo,Placebo,USA,2012-07-22,-14.0
2,CDISCPILOT01,DM,01-701-1028,1028,2013-07-19,2014-01-14,2013-07-19,2014-01-14,,2014-01-14T11:10,...,M,WHITE,NOT HISPANIC OR LATINO,Xan_Hi,Xanomeline High Dose,Xan_Hi,Xanomeline High Dose,USA,2013-07-11,-8.0
3,CDISCPILOT01,DM,01-701-1033,1033,2014-03-18,2014-04-14,2014-03-18,2014-03-31,,2014-09-15,...,M,WHITE,NOT HISPANIC OR LATINO,Xan_Lo,Xanomeline Low Dose,Xan_Lo,Xanomeline Low Dose,USA,2014-03-10,-8.0
4,CDISCPILOT01,DM,01-701-1034,1034,2014-07-01,2014-12-30,2014-07-01,2014-12-30,,2014-12-30T09:50,...,F,WHITE,NOT HISPANIC OR LATINO,Xan_Hi,Xanomeline High Dose,Xan_Hi,Xanomeline High Dose,USA,2014-06-24,-7.0


### Vital Signs
We can read vital signs from the study as well.  Vitals are linked to demographics using the `USUBJID` column.  Unlike demographics, the vitals format is not predefined.  It can contain additional fields beyond the required and expected variables (these are known as permissible variables).

In [3]:
with open('../resources/SDTM_sample/vs.xpt', 'rb') as f:
    vs = pd.read_sas(f, format='xport', encoding='utf-8')
    
display(vs.head())

display(dm.join(vs.set_index(['USUBJID']), how='left', on=['USUBJID'], rsuffix='_vs'))

Unnamed: 0,STUDYID,DOMAIN,USUBJID,VSSEQ,VSTESTCD,VSTEST,VSPOS,VSORRES,VSORRESU,VSSTRESC,...,VISITNUM,VISIT,VISITDY,EPOCH,VSDTC,VSDY,VSTPT,VSTPTNUM,VSELTM,VSTPTREF
0,CDISCPILOT01,VS,01-701-1015,1.0,DIABP,Diastolic Blood Pressure,SUPINE,64,mmHg,64,...,1.0,SCREENING 1,-7.0,SCREENING,2013-12-26,-7.0,AFTER LYING DOWN FOR 5 MINUTES,815.0,PT5M,PATIENT SUPINE
1,CDISCPILOT01,VS,01-701-1015,2.0,DIABP,Diastolic Blood Pressure,STANDING,83,mmHg,83,...,1.0,SCREENING 1,-7.0,SCREENING,2013-12-26,-7.0,AFTER STANDING FOR 1 MINUTE,816.0,PT1M,PATIENT STANDING
2,CDISCPILOT01,VS,01-701-1015,3.0,DIABP,Diastolic Blood Pressure,STANDING,57,mmHg,57,...,1.0,SCREENING 1,-7.0,SCREENING,2013-12-26,-7.0,AFTER STANDING FOR 3 MINUTES,817.0,PT3M,PATIENT STANDING
3,CDISCPILOT01,VS,01-701-1015,4.0,DIABP,Diastolic Blood Pressure,SUPINE,68,mmHg,68,...,2.0,SCREENING 2,-1.0,SCREENING,2013-12-31,-2.0,AFTER LYING DOWN FOR 5 MINUTES,815.0,PT5M,PATIENT SUPINE
4,CDISCPILOT01,VS,01-701-1015,5.0,DIABP,Diastolic Blood Pressure,STANDING,59,mmHg,59,...,2.0,SCREENING 2,-1.0,SCREENING,2013-12-31,-2.0,AFTER STANDING FOR 1 MINUTE,816.0,PT1M,PATIENT STANDING


Unnamed: 0,STUDYID,DOMAIN,USUBJID,SUBJID,RFSTDTC,RFENDTC,RFXSTDTC,RFXENDTC,RFICDTC,RFPENDTC,...,VISITNUM,VISIT,VISITDY,EPOCH,VSDTC,VSDY,VSTPT,VSTPTNUM,VSELTM,VSTPTREF
0,CDISCPILOT01,DM,01-701-1015,1015,2014-01-02,2014-07-02,2014-01-02,2014-07-02,,2014-07-02T11:45,...,1.0,SCREENING 1,-7.0,SCREENING,2013-12-26,-7.0,AFTER LYING DOWN FOR 5 MINUTES,815.0,PT5M,PATIENT SUPINE
0,CDISCPILOT01,DM,01-701-1015,1015,2014-01-02,2014-07-02,2014-01-02,2014-07-02,,2014-07-02T11:45,...,1.0,SCREENING 1,-7.0,SCREENING,2013-12-26,-7.0,AFTER STANDING FOR 1 MINUTE,816.0,PT1M,PATIENT STANDING
0,CDISCPILOT01,DM,01-701-1015,1015,2014-01-02,2014-07-02,2014-01-02,2014-07-02,,2014-07-02T11:45,...,1.0,SCREENING 1,-7.0,SCREENING,2013-12-26,-7.0,AFTER STANDING FOR 3 MINUTES,817.0,PT3M,PATIENT STANDING
0,CDISCPILOT01,DM,01-701-1015,1015,2014-01-02,2014-07-02,2014-01-02,2014-07-02,,2014-07-02T11:45,...,2.0,SCREENING 2,-1.0,SCREENING,2013-12-31,-2.0,AFTER LYING DOWN FOR 5 MINUTES,815.0,PT5M,PATIENT SUPINE
0,CDISCPILOT01,DM,01-701-1015,1015,2014-01-02,2014-07-02,2014-01-02,2014-07-02,,2014-07-02T11:45,...,2.0,SCREENING 2,-1.0,SCREENING,2013-12-31,-2.0,AFTER STANDING FOR 1 MINUTE,816.0,PT1M,PATIENT STANDING
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305,CDISCPILOT01,DM,01-718-1427,1427,2012-12-17,2013-02-18,2012-12-17,2013-02-11,,2013-06-03,...,3.0,BASELINE,1.0,TREATMENT,2012-12-17,1.0,,,,
305,CDISCPILOT01,DM,01-718-1427,1427,2012-12-17,2013-02-18,2012-12-17,2013-02-11,,2013-06-03,...,4.0,WEEK 2,14.0,TREATMENT,2012-12-31,15.0,,,,
305,CDISCPILOT01,DM,01-718-1427,1427,2012-12-17,2013-02-18,2012-12-17,2013-02-11,,2013-06-03,...,5.0,WEEK 4,28.0,TREATMENT,2013-01-17,32.0,,,,
305,CDISCPILOT01,DM,01-718-1427,1427,2012-12-17,2013-02-18,2012-12-17,2013-02-11,,2013-06-03,...,7.0,WEEK 6,42.0,TREATMENT,2013-01-28,43.0,,,,


## SDTM datasets
The SDTM encompasses a standard for communicating data to the Food and Drug Administration (FDA) in the United States, and the Pharmaceuticals and Medical Devices Agency (PMDA) in Japan.  Datasets such as demographics and vital signs have specific filenames that must be adhered to, as well as locations for these files.  Section 3 of the SDTM Implementation Guide for Human Clinical Trials contains the specifications for the file naming conventions: [SDTMIG_v3.3_FINAL.pdf](../resources/SDTMIG_v3.3_FINAL.pdf).  Also in this section are details about the keys included in each file. 

(Note that there are other Implementation Guides for submission of data in SDTM format for other purposes, such as non-clinical data, medical devices, etc.  Those guides are outside the scope of this course.  Section 7 of the SDTM provides more information for those interested.)

Metadata about the study is encoded in XML format in the `define.xml` file.  This file describes the datasets, their field contents, and how the datasets are joined.  This file defines the permissible variables that are included in any of the datasets being submitted, as well as the required and expected variables.

This file is required to be part of any submission to the FDA.  An overview of the format can be found here: [PharmaSUG-2014-AD02.pdf](../resources/PharmaSUG-2014-AD02.pdf)


Lets take a look at the sample define.xml.



In [4]:
!cat ../resources/SDTM_sample/define.xml

<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="define2-0-0.xsl"?>
<ODM
     xmlns:xlink="http://www.w3.org/1999/xlink"
     xmlns="http://www.cdisc.org/ns/odm/v1.3"
     xmlns:def="http://www.cdisc.org/ns/def/v2.0"
     ODMVersion="1.3.2"
     FileType="Snapshot"
     FileOID="TDF_SDTM.CDISC SDTM.3.2"
     CreationDateTime="2018-11-19T08:39:20">
   <Study OID="TDF_SDTM.CDISC SDTM.3.2">
      <GlobalVariables>
         <StudyName>TDF_SDTM</StudyName>
         <StudyDescription>Test datasets created by updating existing CDISCPILOT SDTM datasets</StudyDescription>
         <ProtocolName>TDF_Datasets</ProtocolName>
      </GlobalVariables>
      <MetaDataVersion OID="MDV.TDF_SDTM.CDISC SDTM.3.2" Name="Study TDF_SDTM Data Definitions"
                       Description="Test datasets created by updating existing CDISCPILOT SDTM datasets"
                       def:DefineVersion="2.0.0"
                       def:StandardName="CDISC SDTM"
  

            <CodeListRef CodeListOID="CL.NYNAN"/>
            <def:Origin Type="CRF"/>
         </ItemDef>
         <ItemDef OID="IT.QSDA.QSORRES.QSDA.QSTESTCD.EQ.DAITM16-aee44dbf"
                  Name="QSORRES.QSDA.QSTESTCD.EQ.DAITM16-aee44dbf"
                  DataType="integer"
                  Length="8"
                  SASFieldName="QSORRES">
            <CodeListRef CodeListOID="CL.NYNAN"/>
            <def:Origin Type="CRF"/>
         </ItemDef>
         <ItemDef OID="IT.QSDA.QSORRES.QSDA.QSTESTCD.EQ.DAITM17-b0a58a2b"
                  Name="QSORRES.QSDA.QSTESTCD.EQ.DAITM17-b0a58a2b"
                  DataType="integer"
                  Length="8"
                  SASFieldName="QSORRES">
            <CodeListRef CodeListOID="CL.NYNAN"/>
            <def:Origin Type="CRF"/>
         </ItemDef>
         <ItemDef OID="IT.QSDA.QSORRES.QSDA.QSTESTCD.EQ.DAITM18-2e9d0b57"
                  Name="QSORRES.QSDA.QSTESTCD.EQ.DAITM18-2e9d0b57"
                  D

### define.xml

As you can see, this file is complex and lengthy!  Here are some define.xml tags worth noting:

  * `<ItemDef>` - These describe the variables included in the datasets.
  * `<CodeList>` - These describe the codes in use that are possible for various data fields, and provides order and translations of the codes.
  * `<def:leaf>` - These are referenced documents.  In the sample, you can find the name of the annotated Case Report Form (CRF) here, although the standard location is in the `<def:AnnotatedCRF>` tag.
  * `<MethodDef>` - these tags desribe how some field values may be computed
  
Since XML is an unstructured format, Pandas cannot directly deal with it without some preprocessing.  Instead of using Pandas, we will use `xmltodict` to read a portion of the define.xml file for specific tags of interest and build a dataframe.

In [5]:
import xmltodict
from pprint import pprint

xml_data = open('../resources/SDTM_sample/define.xml', 'r').read()  # Read data
xmlDict = xmltodict.parse(xml_data, force_list=('Decode','TranslatedText','CodeListItem'))  # Parse XML (forcing some tags to always create a list even if there is only one child element)

In [6]:
pprint(xmlDict)

OrderedDict([('ODM',
              OrderedDict([('@xmlns:xlink', 'http://www.w3.org/1999/xlink'),
                           ('@xmlns', 'http://www.cdisc.org/ns/odm/v1.3'),
                           ('@xmlns:def', 'http://www.cdisc.org/ns/def/v2.0'),
                           ('@ODMVersion', '1.3.2'),
                           ('@FileType', 'Snapshot'),
                           ('@FileOID', 'TDF_SDTM.CDISC SDTM.3.2'),
                           ('@CreationDateTime', '2018-11-19T08:39:20'),
                           ('Study',
                            OrderedDict([('@OID', 'TDF_SDTM.CDISC SDTM.3.2'),
                                         ('GlobalVariables',
                                          OrderedDict([('StudyName',
                                                        'TDF_SDTM'),
                                                       ('StudyDescription',
                                                        'Test datasets created '
                             

                                                                                     ('def:WhereClauseRef',
                                                                                      OrderedDict([('@WhereClauseOID',
                                                                                                    'WC.LBUR.LBCAT.EQ.URINALYSIS-fbe6266d')]))]),
                                                                        OrderedDict([('@ItemOID',
                                                                                      'IT.LBUR.LBORRES.LBUR.LBTESTCD.EQ.TSH.LBUR.LBCAT.EQ.OTHER-d747e2b6'),
                                                                                     ('@OrderNumber',
                                                                                      '3'),
                                                                                     ('@Mandatory',
                                                                               

                                                                                      '37'),
                                                                                     ('@Mandatory',
                                                                                      'No'),
                                                                                     ('def:WhereClauseRef',
                                                                                      OrderedDict([('@WhereClauseOID',
                                                                                                    'WC.QSNI.QSTESTCD.EQ.NPITM08D-791c4267')]))]),
                                                                        OrderedDict([('@ItemOID',
                                                                                      'IT.QSNI.QSORRES.QSNI.QSTESTCD.EQ.NPITM08F-7fe633d1'),
                                                                                     ('@OrderNumber',

                                                                                      'HEMATOLOGY')]),
                                                                        OrderedDict([('@SoftHard',
                                                                                      'Soft'),
                                                                                     ('@def:ItemOID',
                                                                                      'IT.LBHE.LBTESTCD'),
                                                                                     ('@Comparator',
                                                                                      'EQ'),
                                                                                     ('CheckValue',
                                                                                      'EOS')])])]),
                                                         OrderedDict([('@OID',
                     

                                                                                     'EQ'),
                                                                                    ('CheckValue',
                                                                                     'MHITM09')]))]),
                                                         OrderedDict([('@OID',
                                                                       'WC.QSHI.QSTESTCD.EQ.MHITM10-a92f7298'),
                                                                      ('RangeCheck',
                                                                       OrderedDict([('@SoftHard',
                                                                                     'Soft'),
                                                                                    ('@def:ItemOID',
                                                                                     'IT.QSHI.QSTESTCD'),
                                

                                                                                     'EQ'),
                                                                                    ('CheckValue',
                                                                                     'TRT')]))]),
                                                         OrderedDict([('@OID',
                                                                       'WC.TS.TSPARMCD.EQ.TTYPE-82bc9eca'),
                                                                      ('RangeCheck',
                                                                       OrderedDict([('@SoftHard',
                                                                                     'Soft'),
                                                                                    ('@def:ItemOID',
                                                                                     'IT.TS.TSPARMCD'),
                                          

                                                                                      'Yes'),
                                                                                     ('@KeySequence',
                                                                                      '1'),
                                                                                     ('@Role',
                                                                                      'IDENTIFIER')]),
                                                                        OrderedDict([('@ItemOID',
                                                                                      'IT.CM.DOMAIN'),
                                                                                     ('@OrderNumber',
                                                                                      '2'),
                                                                                     ('@Mandatory',
                   

                                                                                      'Yes'),
                                                                                     ('@Role',
                                                                                      'SYNONYM '
                                                                                      'QUALIFIER')]),
                                                                        OrderedDict([('@ItemOID',
                                                                                      'IT.LBCH.LBCAT'),
                                                                                     ('@OrderNumber',
                                                                                      '7'),
                                                                                     ('@Mandatory',
                                                                                      'No'),
                       

                                                                                      'RECORD '
                                                                                      'QUALIFIER')]),
                                                                        OrderedDict([('@ItemOID',
                                                                                      'IT.QSGI.QSREASND'),
                                                                                     ('@OrderNumber',
                                                                                      '15'),
                                                                                     ('@Mandatory',
                                                                                      'No')]),
                                                                        OrderedDict([('@ItemOID',
                                                                                      'IT.QSGI.QSBLFL'),
    

                                                                                      '13'),
                                                                                     ('@Mandatory',
                                                                                      'No'),
                                                                                     ('@Role',
                                                                                      'TIMING')]),
                                                                        OrderedDict([('@ItemOID',
                                                                                      'IT.SC.SCDY'),
                                                                                     ('@OrderNumber',
                                                                                      '14'),
                                                                                     ('@Mandatory',
                          

                                                       ('ItemDef',
                                                        [OrderedDict([('@OID',
                                                                       'IT.TA.STUDYID'),
                                                                      ('@Name',
                                                                       'STUDYID'),
                                                                      ('@DataType',
                                                                       'text'),
                                                                      ('@Length',
                                                                       '12'),
                                                                      ('@SASFieldName',
                                                                       'STUDYID'),
                                                                      ('Description',
                     

                                                                                                   ('#text',
                                                                                                    'Subject '
                                                                                                    'Death '
                                                                                                    'Flag')])])])),
                                                                      ('CodeListRef',
                                                                       OrderedDict([('@CodeListOID',
                                                                                     'CL.Y_BLANK')])),
                                                                      ('def:Origin',
                                                                       OrderedDict([('@Type',
                                                                                     '

                                                                       '8.1'),
                                                                      ('Description',
                                                                       OrderedDict([('TranslatedText',
                                                                                     [OrderedDict([('@xml:lang',
                                                                                                    'en'),
                                                                                                   ('#text',
                                                                                                    'Visit '
                                                                                                    'Number')])])])),
                                                                      ('CodeListRef',
                                                                       OrderedDict([('@Cod

                                                                       'text'),
                                                                      ('@Length',
                                                                       '200'),
                                                                      ('@SASFieldName',
                                                                       'AEOUT'),
                                                                      ('Description',
                                                                       OrderedDict([('TranslatedText',
                                                                                     [OrderedDict([('@xml:lang',
                                                                                                    'en'),
                                                                                                   ('#text',
                                                                        

                                                                                                    'Collection')])])])),
                                                                      ('def:Origin',
                                                                       OrderedDict([('@Type',
                                                                                     'Derived')]))]),
                                                         OrderedDict([('@OID',
                                                                       'IT.LBCH.STUDYID'),
                                                                      ('@Name',
                                                                       'STUDYID'),
                                                                      ('@DataType',
                                                                       'text'),
                                                                      ('@Length',
                  

                                                                                                    'Name')])])])),
                                                                      ('CodeListRef',
                                                                       OrderedDict([('@CodeListOID',
                                                                                     'CL.VISIT')])),
                                                                      ('def:Origin',
                                                                       OrderedDict([('@Type',
                                                                                     'eDT')]))]),
                                                         OrderedDict([('@OID',
                                                                       'IT.LBHE.VISITDY'),
                                                                      ('@Name',
                                                                     

                                                                                                   ('#text',
                                                                                                    'Derived '
                                                                                                    'Flag')])])])),
                                                                      ('CodeListRef',
                                                                       OrderedDict([('@CodeListOID',
                                                                                     'CL.Y_BLANK')])),
                                                                      ('def:Origin',
                                                                       OrderedDict([('@Type',
                                                                                     'Assigned')]))]),
                                                         OrderedDict([('@OID',
             

                                                                       'text'),
                                                                      ('@Length',
                                                                       '1'),
                                                                      ('@SASFieldName',
                                                                       'QSBLFL'),
                                                                      ('Description',
                                                                       OrderedDict([('TranslatedText',
                                                                                     [OrderedDict([('@xml:lang',
                                                                                                    'en'),
                                                                                                   ('#text',
                                                                         

                                                                                     'Assigned')]))]),
                                                         OrderedDict([('@OID',
                                                                       'IT.QSMM.QSREASND'),
                                                                      ('@Name',
                                                                       'QSREASND'),
                                                                      ('@DataType',
                                                                       'text'),
                                                                      ('@Length',
                                                                       '17'),
                                                                      ('@SASFieldName',
                                                                       'QSREASND'),
                                                                  

                                                                                                    'en'),
                                                                                                   ('#text',
                                                                                                    'Date/Time '
                                                                                                    'of '
                                                                                                    'Collection')])])])),
                                                                      ('def:Origin',
                                                                       OrderedDict([('@Type',
                                                                                     'CRF')]))]),
                                                         OrderedDict([('@OID',
                                                                       'IT.SC.SCDY'),
 

                                                                                                   ('#text',
                                                                                                    'Data '
                                                                                                    'Value')])])])),
                                                                      ('def:Origin',
                                                                       OrderedDict([('@Type',
                                                                                     'Derived')])),
                                                                      ('def:ValueListRef',
                                                                       OrderedDict([('@ValueListOID',
                                                                                     'VL.SUPPDM.QVAL')]))]),
                                                         OrderedDict([('@OID',
      

                                                                       'LBORRES'),
                                                                      ('def:Origin',
                                                                       OrderedDict([('@Type',
                                                                                     'eDT')]))]),
                                                         OrderedDict([('@OID',
                                                                       'IT.LBHE.LBORRES.LBHE.LBTESTCD.EQ.POLYCHR.LBHE.LBCAT.EQ.HEMATOLOGY-3838b53f'),
                                                                      ('@Name',
                                                                       'LBORRES.LBHE.LBTESTCD.EQ.POLYCHR.LBHE.LBCAT.EQ.HEMATOLOGY-3838b53f'),
                                                                      ('@DataType',
                                                                       'integer'),
                      

                                                                                     'CL.NYNAN')])),
                                                                      ('def:Origin',
                                                                       OrderedDict([('@Type',
                                                                                     'CRF')]))]),
                                                         OrderedDict([('@OID',
                                                                       'IT.QSDA.QSORRES.QSDA.QSTESTCD.EQ.DAITM34-0fcba698'),
                                                                      ('@Name',
                                                                       'QSORRES.QSDA.QSTESTCD.EQ.DAITM34-0fcba698'),
                                                                      ('@DataType',
                                                                       'integer'),
                                                      

                                                                       'QSORRES'),
                                                                      ('CodeListRef',
                                                                       OrderedDict([('@CodeListOID',
                                                                                     'CL.FREQSC')])),
                                                                      ('def:Origin',
                                                                       OrderedDict([('@Type',
                                                                                     'CRF')]))]),
                                                         OrderedDict([('@OID',
                                                                       'IT.QSNI.QSORRES.QSNI.QSTESTCD.EQ.NPITM10S-d489bcde'),
                                                                      ('@Name',
                                                                  

                                                                                                                    'Failure')])])])])]),
                                                                        OrderedDict([('@CodedValue',
                                                                                      'Pbo'),
                                                                                     ('@OrderNumber',
                                                                                      '2'),
                                                                                     ('Decode',
                                                                                      [OrderedDict([('TranslatedText',
                                                                                                     [OrderedDict([('@xml:lang',
                                                                                                                    'en'),
      

                                                                                     ('@OrderNumber',
                                                                                      '6'),
                                                                                     ('Decode',
                                                                                      [OrderedDict([('TranslatedText',
                                                                                                     [OrderedDict([('@xml:lang',
                                                                                                                    'en'),
                                                                                                                   ('#text',
                                                                                                                    '5')])])])])])])]),
                                                         OrderedDict([('@OID',

                                                                                                                    'within '
                                                                                                                    'the '
                                                                                                                    'last '
                                                                                                                    '5 '
                                                                                                                    'years '
                                                                                                                    'of '
                                                                                                                    'a '
                                                                                                                    'serious '
           

                                                                                                                    'mU/L')])])])]),
                                                                                     ('Alias',
                                                                                      OrderedDict([('@Name',
                                                                                                    'C67408'),
                                                                                                   ('@Context',
                                                                                                    'nci:ExtCodeID')]))]),
                                                                        OrderedDict([('@CodedValue',
                                                                                      'mmol/L'),
                                                                                     ('@OrderNumber',
                 

                                                                                                                    'DETERIORATION')])])])])]),
                                                                        OrderedDict([('@CodedValue',
                                                                                      'MHITM03'),
                                                                                     ('Decode',
                                                                                      [OrderedDict([('TranslatedText',
                                                                                                     [OrderedDict([('@xml:lang',
                                                                                                                    'en'),
                                                                                                                   ('#text',
                                                                 

                                                                                      'OR '
                                                                                      'ALASKA '
                                                                                      'NATIVE'),
                                                                                     ('@OrderNumber',
                                                                                      '3'),
                                                                                     ('Decode',
                                                                                      [OrderedDict([('TranslatedText',
                                                                                                     [OrderedDict([('@xml:lang',
                                                                                                                    'en'),
                                                      

                                                                                                                    'IIb '
                                                                                                                    'Trial')])])])])]),
                                                                        OrderedDict([('@CodedValue',
                                                                                      'Phase '
                                                                                      'III '
                                                                                      'Trial'),
                                                                                     ('@OrderNumber',
                                                                                      '7'),
                                                                                     ('Decode',
                                                                  

                                                                                                                    'en'),
                                                                                                                   ('#text',
                                                                                                                    'UNSCHEDULED '
                                                                                                                    '12.1')])])])])]),
                                                                        OrderedDict([('@CodedValue',
                                                                                      '13'),
                                                                                     ('@OrderNumber',
                                                                                      '33'),
                                                                                     ('Decode',
 

                                                                       'MT.DM.ETHNIC'),
                                                                      ('@Name',
                                                                       'Algorithm '
                                                                       'to '
                                                                       'derive '
                                                                       'DM.ETHNIC'),
                                                                      ('@Type',
                                                                       'Computation'),
                                                                      ('Description',
                                                                       OrderedDict([('TranslatedText',
                                                                                     [OrderedDict([('@xml:lang',
                                    

                                                                      ('Description',
                                                                       OrderedDict([('TranslatedText',
                                                                                     [OrderedDict([('@xml:lang',
                                                                                                    'en'),
                                                                                                   ('#text',
                                                                                                    'Concatenation '
                                                                                                    'of '
                                                                                                    'STUDYID, '
                                                                                                    'DM.SITEID '
                                  

### Interpret the dictionary

The XML  has been converted to a set of nested OrderedDicts.  This structure mimics the XML structure, so to get the data we want we have to walk down the tree of OrderedDicts until we reach the tag we want.  Attributes are added by prefacing them with the `@` symbol.

We can print the basic information shared about this SDTM from the `GlobalVariables` tag.  The code below will return a list of tuples, where each tuple contains the tag first, and the value second.  This information corresponds to the following snippet from the define.xml:
```
<GlobalVariables>
  <StudyName>TDF_SDTM</StudyName>
  <StudyDescription>Test datasets created by updating existing CDISCPILOT SDTM datasets</StudyDescription>
  <ProtocolName>TDF_Datasets</ProtocolName>
</GlobalVariables>
```

In [7]:
global_vars = xmlDict['ODM']['Study']['GlobalVariables']

pprint(global_vars)

OrderedDict([('StudyName', 'TDF_SDTM'),
             ('StudyDescription',
              'Test datasets created by updating existing CDISCPILOT SDTM '
              'datasets'),
             ('ProtocolName', 'TDF_Datasets')])


### Get information about codes used

One of the things we may want to look at is information about codes used in the data set.  Codes can be either internally defined (specific to data captured by the study), or externally defined (such as LOINC codes, ICD-10, etc).  These are in the `CodeList` tags.  Note that the last several CodeLists in our example reference external codes, which will not be included in the resulting dataframe.  Instead, those are simply printed out and if necessary you would need to find the external definition for those codes via other sources.

In [8]:
code_list = xmlDict['ODM']['Study']['MetaDataVersion']['CodeList']

In [9]:
# we will manually build an array from the multiple levels included in these OrderedDicts
cols = ['OID', 'Name', 'DataType', 'CodedValue', 'OrderNumber', 'lang', 'TranslatedText']
data = []

for code in code_list:
    row1 = [code.get('@OID'), code.get('@Name'), code.get('@DataType')]
    if not code.get('CodeListItem'):
        print('Externally defined code:')
        pprint(code)
        print()
    else:
        for code_list_item in code.get('CodeListItem'):
            row2 = [code_list_item.get('@CodedValue'), code_list_item.get('@OrderNumber')]
            for decode in code_list_item.get('Decode'):
                for trans in decode.get('TranslatedText'):
                    data.append(row1 + row2 + [trans.get('@xml.lang'), trans.get('#text')])

df = pd.DataFrame(data)  # create DataFrame from the array
df.columns = cols # add our columns
display(df)

Externally defined code:
OrderedDict([('@OID', 'CL.AEDICT'),
             ('@Name', 'ADVERSE EVENT DICTIONARY'),
             ('@DataType', 'text'),
             ('ExternalCodeList',
              OrderedDict([('@Dictionary', 'MEDDRA'), ('@Version', '8.0')]))])

Externally defined code:
OrderedDict([('@OID', 'CL.DRUGDICT'),
             ('@Name', 'DRUG DICTIONARY'),
             ('@DataType', 'text'),
             ('ExternalCodeList',
              OrderedDict([('@Dictionary', 'WHODRUG'),
                           ('@Version', '200604')]))])

Externally defined code:
OrderedDict([('@OID', 'CL.MHDICT'),
             ('@Name', 'MEDICAL HISTORY DICTIONARY'),
             ('@DataType', 'text'),
             ('ExternalCodeList',
              OrderedDict([('@Dictionary', 'MEDDRA'), ('@Version', '8.0')]))])



Unnamed: 0,OID,Name,DataType,CodedValue,OrderNumber,lang,TranslatedText
0,CL.AECAUS,AECAUS,text,NONE,1,,NONE
1,CL.AECAUS,AECAUS,text,POSSIBLE,2,,POSSIBLE
2,CL.AECAUS,AECAUS,text,PROBABLE,3,,PROBABLE
3,CL.AECAUS,AECAUS,text,REMOTE,4,,REMOTE
4,CL.AGESPAN,AGESPAN,text,CHILDREN (2-11 YEARS),1,,CHILDREN (2-11 YEARS)
...,...,...,...,...,...,...,...
536,CL.VSUNIT,VSUNIT,text,kg,7,,kg
537,CL.VSUNIT,VSUNIT,text,mmHg,8,,mmHg
538,CL.YN,YN,text,N,1,,No
539,CL.YN,YN,text,Y,2,,Yes
