In [1]:
from sas7bdat import SAS7BDAT
import pandas as pd
import datetime as dt

# Input/Output Definitions

In [2]:
# Path of the SAS dataset whose column metadata is exported.
sasfilename = 'D:/Data/linkedclaims_casuals_2018m04.sas7bdat'

# Derive the data-dictionary CSV path from the file suffix only.
# str.replace() would rewrite every '.sas7bdat' occurrence in the path and is
# a silent no-op when the suffix is missing, in which case the CSV export
# would be written over the SAS input file itself. Fail fast instead.
_sas_extension = '.sas7bdat'
if not sasfilename.lower().endswith(_sas_extension):
    raise ValueError(
        "sasfilename must end with '.sas7bdat', got: {!r}".format(sasfilename))
ddictionaryfilename = sasfilename[:-len(_sas_extension)] + '_variables_types.csv'

## Predefined Data Dictionary Field Values (LinkedClaims_casuals)

In [3]:
# Format string stored in the 'Format' column for variables typed as 'Date'.
dateformat = '%Y-%m-%d'

# Variable names that receive a '1' in the corresponding data-dictionary
# column (UID, EventStart, EventEnd, AsHDF5Index, DOB, Pinfo).
variable_uid = ['ppsn']
variable_event_start = ['StartDate']
variable_event_end = ['EndDate']
variable_as_index = ['ppsn', 'StartDate', 'EndDate']
variable_date_of_birth = ['life_event_date']
pii_variables = ['ppsn', 'sex']

# Names appended to both summary lists below.
# NOTE(review): 'penaltyflag' differs from 'PenaltyFlag' only by case and both
# are listed; matching is case-sensitive -- confirm which spelling the data uses.
_shared_extra_variables = ['penaltyflag', 'casualflag1', 'CDA_flag1', 'ADA_flag1', 'LM_code']

# Variables flagged in the 'TotalSummary' column.
total_summary_variables = [
    'hist_lr', 'CasualFlag', 'PenaltyFlag', 'End_weekly_rate', 'Start_weekly_rate',
    'occupation', 'location', 'ada_code', 'marital_status', 'nat_code',
    'county', 'family_flag', 'cda_number',
] + _shared_extra_variables

# Variables flagged in the 'EpisodeSummary' column.
episode_summary_variables = [
    'hist_lr', 'CasualFlag', 'PenaltyFlag', 'End_weekly_rate', 'Start_weekly_rate',
    'life_event_date', 'location', 'ada_code', 'marital_status', 'nat_code',
    'occupation', 'county', 'family_flag', 'cda_number',
] + _shared_extra_variables + ['StartDate', 'EndDate']

## Export Data Dictionary

In [4]:
# Build a data dictionary (one row per SAS column) and export it as CSV.
# Output columns, in order: Variable, Type, Format, AsHDF5Index, EventStart,
# EventEnd, DOB, UID, Pinfo, TotalSummary, EpisodeSummary.


def _clean_variable_name(raw_name):
    """Return the column name as stripped text, decoding UTF-8 bytes if needed."""
    if isinstance(raw_name, bytes):
        raw_name = raw_name.decode('UTF-8')
    return raw_name.strip()


def _is_date_variable(name):
    """Return True when the variable name marks a date column.

    A name counts as a date when (case-insensitively) it contains 'date' but
    not 'flag', or contains 'dob', or contains 'proposedfinish'.
    """
    lowered = name.lower()
    return (('date' in lowered and 'flag' not in lowered)
            or 'dob' in lowered
            or 'proposedfinish' in lowered)


# Read only the column metadata; the context manager closes the file even if
# reading the metadata raises.
with SAS7BDAT(sasfilename) as sasfile:
    col_names_types = [sasfile.column_names, sasfile.column_types]

# Two parallel lists -> two rows; transpose to get one row per variable.
names_types = pd.DataFrame(col_names_types).T
names_types.columns = ['Variable', 'Type']
names_types['Variable'] = names_types['Variable'].map(_clean_variable_name)
names_types['Type'] = names_types['Type'].map(
    lambda sas_type: sas_type.replace('string', 'Char').replace('number', 'Num'))

# Retype date-like variables and record their format string.
# A single .loc[mask, column] assignment is used instead of the chained
# `names_types.Type.loc[i] = ...`, which may assign into a temporary copy and
# leave the frame unchanged (always the case under pandas Copy-on-Write).
is_date = pd.Series(
    [_is_date_variable(name) for name in names_types['Variable']],
    index=names_types.index, dtype=bool)
names_types.loc[is_date, 'Type'] = 'Date'
names_types['Format'] = [dateformat if flagged else '' for flagged in is_date]

# Each flag column holds '1' when the variable is in the associated list and
# '' otherwise. Matching is exact and case-sensitive.
flag_columns = [
    ('AsHDF5Index', variable_as_index),
    ('EventStart', variable_event_start),
    ('EventEnd', variable_event_end),
    ('DOB', variable_date_of_birth),
    ('UID', variable_uid),
    ('Pinfo', pii_variables),
    ('TotalSummary', total_summary_variables),
    ('EpisodeSummary', episode_summary_variables),
]
for flag_column, flagged_variables in flag_columns:
    names_types[flag_column] = [
        '1' if name in flagged_variables else ''
        for name in names_types['Variable']
    ]

names_types.to_csv(ddictionaryfilename, index=False)

print ('\nALL DONE')


ALL DONE
