In [8]:
import csv
import json
import re

# Free-text description stored under the schema's top-level "description" key.
# Written one source line per list element so the trailing spaces that are part
# of the text stay visible and cannot be stripped silently by an editor.
dbdescription = "\n".join([
    "Patient table from Simulacrum data release, version 1.2.0-2017, 2021-01-19",
    "",
    "This is a package of synthetic data based on cancer registration data from ",
    "the Cancer Outcomes and Services Dataset (COSD) and the Systemic Anti-Cancer ",
    "Treatment (SACT) dataset collected by the National Cancer Registration and ",
    "Analysis Service (NCRAS).",
    "",
    "Any analysis performed on this synthetic data will not correspond exactly to ",
    "analysis performed on the real data. The synthetic data does not contain real ",
    "patients but just mimics specific statistical properties of the real data. To ",
    "minimise the potential for confusion with real data, organisational codes in ",
    "this extract have been obfuscated.",
    "",
    "",
    "For more information on the Simulacrum and licensing details, please see",
    "  https://simulacrum.healthdatainsight.org.uk/",
])

# Input/output locations, gathered in one place so the release directory is
# spelled out only once.
DATA_ROOT = '../data/simulacrum_release_v1.2.0.2017'
DICTIONARY_CSV = f'{DATA_ROOT}/dictionary_work/sim_DataDictionary.csv'
LOOKUP_DIR = f'{DATA_ROOT}/data_dictionary_files'
OUTPUT_JSON = 'bigquery.simulacrum.av_patient.data_dict.json'

# Validation types that are copied verbatim into a property's "type".
NUMERIC_VALIDATION_TYPES = ('number', 'integer')
# Field types that are emitted with "type": "string".
CHOICE_FIELD_TYPES = ('dropdown', 'radio')


def load_lookup_choices(lookup_path, lookup_column):
    """Read a lookup CSV and return its rows as ``oneOf`` entries.

    Args:
        lookup_path: Path of the lookup CSV file.
        lookup_column: Name of the column holding the code for each entry.

    Returns:
        A list of ``{'const': <code>, 'title': <SHORTDESC>}`` dicts, one per
        data row, in file order (empty if the file has no data rows).

    Raises:
        OSError: If the file cannot be opened.
        KeyError: If ``lookup_column`` or ``SHORTDESC`` is not a column.
    """
    # newline='' lets the csv module do its own newline handling, as its
    # documentation requires for file objects passed to a reader.
    with open(lookup_path, mode='r', newline='') as val_file:
        return [
            {'const': val_row[lookup_column], 'title': val_row['SHORTDESC']}
            for val_row in csv.DictReader(val_file)
        ]


def build_properties(dictionary_path, lookup_dir):
    """Turn the data-dictionary CSV into a mapping of schema properties.

    Every row yields one entry keyed by its 'Variable / Field Name', holding
    'description' (the 'Field Label') and 'type'. Rows whose validation type is
    'number' or 'integer' also get '$unit' (the 'Field Note'); rows naming a
    'lookup_file' also get 'oneOf', read from that file inside ``lookup_dir``.
    The field name, and a marker for rows with a lookup file, are printed as
    each row is processed.

    Args:
        dictionary_path: Path of the data-dictionary CSV file.
        lookup_dir: Directory containing the files named in 'lookup_file'.

    Returns:
        A ``(properties, line_count)`` tuple: the mapping described above and
        the number of data rows read. A repeated field name overwrites the
        earlier entry but is still counted.

    Raises:
        OSError: If the dictionary or a lookup file cannot be opened.
        KeyError: If an expected column is missing.
    """
    properties = {}
    line_count = 0
    with open(dictionary_path, mode='r', newline='') as csv_file:
        for row in csv.DictReader(csv_file):
            prop_name = row['Variable / Field Name']
            print(prop_name)
            prop = {'description': row['Field Label']}

            validation = row['Text Validation Type OR Show Slider Number']
            if validation in NUMERIC_VALIDATION_TYPES:
                prop['type'] = validation
                prop['$unit'] = row['Field Note']
            elif row['Field Type'] in CHOICE_FIELD_TYPES:
                prop['type'] = 'string'
            else:
                prop['type'] = row['Field Type']

            if row['lookup_file']:
                print('has lookup file')
                prop['oneOf'] = load_lookup_choices(
                    f'{lookup_dir}/{row["lookup_file"]}', row['lookup_column']
                )

            properties[prop_name] = prop
            line_count += 1
    return properties, line_count


def build_schema(properties, description):
    """Wrap ``properties`` in the data-dictionary document that gets exported.

    Args:
        properties: Mapping of property name to property definition; stored
            by reference under ``data_model.properties``.
        description: Text stored under the top-level 'description' key.

    Returns:
        The schema document as a dict.
    """
    return {
        "name": "simulacrum_release_v1.2.0.2017_patient",
        "description": description,
        "data_model": {
            "$id": "",
            "description": "simulacrum_release_v1.2.0.2017",
            "$schema": "http://json-schema.org/draft-07/schema",
            "properties": properties,
        },
    }


# In a notebook cell __name__ is '__main__', so this always runs there; the
# guard only keeps the helpers importable without touching the data files.
if __name__ == '__main__':
    properties, line_count = build_properties(DICTIONARY_CSV, LOOKUP_DIR)
    print(f'Processed {line_count} lines.')

    # The input files are closed by this point; only the output is open below.
    schema = build_schema(properties, dbdescription)
    print(json.dumps(schema, indent=3))

    with open(OUTPUT_JSON, 'w') as f:
        json.dump(schema, f, indent=4)

PATIENTID
SEX
has lookup file
LINKNUMBER
ETHNICITY
has lookup file
DEATHCAUSECODE_1A
DEATHCAUSECODE_1B
DEATHCAUSECODE_1C
DEATHCAUSECODE_2
DEATHCAUSECODE_UNDERLYING
DEATHLOCATIONCODE
has lookup file
NEWVITALSTATUS
has lookup file
VITALSTATUSDATE
Processed 12 lines.
{
   "name": "simulacrum_release_v1.2.0.2017_patient",
   "description": "Patient table from Simulacrum data release, version 1.2.0-2017, 2021-01-19\n\nThis is a package of synthetic data based on cancer registration data from \nthe Cancer Outcomes and Services Dataset (COSD) and the Systemic Anti-Cancer \nTreatment (SACT) dataset collected by the National Cancer Registration and \nAnalysis Service (NCRAS).\n\nAny analysis performed on this synthetic data will not correspond exactly to \nanalysis performed on the real data. The synthetic data does not contain real \npatients but just mimics specific statistical properties of the real data. To \nminimise the potential for confusion with real data, organisational codes in \nthi