# Compare Datasets

In [1]:
import pandas as pd
import numpy as np
import pyreadstat
from reed import regex_select, compute_confusion
# Raise pandas' display limits so wide / long frames are not truncated when shown.
pd.set_option("display.max_columns", 200)
pd.set_option("display.max_rows", 500)

# Enable IPython's autoreload extension in mode 2: imported modules are reloaded
# before each cell executes, so edits to helper code (presumably the project-local
# `reed` module - confirm) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Load data & compare overlap of instances in datasets

In [2]:
idc = 'xwaveid'  # merge id shared by all three data sources

# Finn's variables (CSV), Anna's variables (Stata) and the raw combined file (SPSS).
# pyreadstat returns a (DataFrame, metadata) pair for each file.
finn = pd.read_csv("data/all_vars.csv", index_col=idc).reset_index()
anna, meta = pyreadstat.read_dta("../reduregvars.dta")
raw, meta_r = pyreadstat.read_sav('../part1/Combined a190c.sav')

# Cast the merge key to int in the pyreadstat-loaded frames.
for frame in (raw, anna):
    frame[idc] = frame[idc].astype(int)

# Membership flags, used after merging to tell which dataset(s) a row came from.
finn['in_finn'] = True
anna['in_anna'] = True

### Check currently studying indicators

In [3]:
# Summary indicators of current study: still-studying / full-time-student flags,
# calendar enrolment (FT / PT), "not looking for work - studying", and
# Austudy/Abstudy receipt.
summary_study = [
    'aedqstdy', 'aedfts', 'acaeft', 'acaept', 'anlreast', 'abncsty', 'abnfsty',
]

# Question C11 ("Qualifications studying for") response columns.
c11_study = [
    'aedcqsl', 'aedcqsh', 'aedcqnq', 'aedcqtq', 'aedcqta', 'aedcqtc',
    'aedcqc1', 'aedcqc2', 'aedcqc3', 'aedcqc4', 'aedcqcd', 'aedcqad',
    'aedcqav', 'aedcqbd', 'aedcqhd', 'aedcqgd', 'aedcqms', 'aedcqdc',
    'aedcqbc', 'aedcqsc', 'aedcqcc', 'aedcqgc', 'aedcqcn', 'aedcqdn',
    'aedcqnei', 'aedcqna', 'aedcqos', 'aedcqdk',
]

# Numerically coded 'aedcq*' columns plus 'aedcqunk'.
# NOTE(review): presumably derived (DV) ASCED-coded versions of C11 - confirm against the metadata labels.
dv_asced_study = [
    'aedcq100', 'aedcq110', 'aedcq120', 'aedcq200', 'aedcq211', 'aedcq221',
    'aedcq310', 'aedcq311', 'aedcq312', 'aedcq400', 'aedcq411', 'aedcq413',
    'aedcq421', 'aedcq500', 'aedcq511', 'aedcq514', 'aedcq521', 'aedcq524',
    'aedcq600', 'aedcq611', 'aedcqunk',
]

# Every "currently studying" indicator, in the order summary -> C11 -> DV.
studying_all = [*summary_study, *c11_study, *dv_asced_study]

In [4]:
# Sanity check: report any study indicator whose maximum exceeds 1, together
# with its full value distribution (missing values included).
exceeds_one = [col for col in studying_all if raw[col].max() > 1]
for col in exceeds_one:
    print(col)
    print(raw[col].value_counts(dropna=False))
    

In [5]:
def print_columns_with_labels(cols, meta, name):
    """Print a titled listing of column names next to their metadata labels.

    Parameters
    ----------
    cols : iterable
        Column names to list, printed in iteration order.
    meta : object
        Object exposing a ``column_names_to_labels`` mapping (name -> label).
    name : str
        Heading printed above the listing.

    A column missing from the mapping is printed with the label ``None``.
    The listing is followed by one blank line. Returns ``None``.
    """
    heading = [name, '---------------------']
    for line in heading:
        print(line)
    for col in cols:
        label = meta.column_names_to_labels.get(col)
        print(f"{col} -> {label}")
    print()
    
# Show the metadata label behind every column in each group of study indicators.
for group_cols, group_title in (
    (summary_study, 'Summary columns'),
    (c11_study, 'C11 Questions'),
    (dv_asced_study, 'DV study'),
):
    print_columns_with_labels(group_cols, meta_r, group_title)

Summary columns
---------------------
aedqstdy -> C7a Qualifications obtained - Still studying
aedfts -> DV: Full-time student
acaeft -> FG1 Any calendar activity - Enrolled FT in school/educational course
acaept -> FG1 Any calendar activity - Enrolled PT in school/educational course
anlreast -> F12 Not looking for work - Studying/returning to studies
abncsty -> G15 Do you currently receive any of these government pensions or allowances - Austudy/Abstudy payment
abnfsty -> G31 Pensions/Allowances received - Austudy/Abstudy

C11 Questions
---------------------
aedcqsl -> C11 Qualifications studying for - Secondary school - lower level
aedcqsh -> C11 Qualifications studying for - Secondary school - highest level
aedcqnq -> C11 Qualifications studying for - Nursing qualification
aedcqtq -> C11 Qualifications studying for - Teaching qualification
aedcqta -> C11 Qualifications studying for - Trade certificate or apprenticeship
aedcqtc -> C11 Qualifications studying for - Technicians cert./A

In [6]:

# Attach the raw study indicators to Anna's sample; the left join keeps every
# row of `anna`, with NaN indicators where no matching id exists in `raw`.
anna_ = pd.merge(anna, raw[studying_all+[idc]], how='left',on=idc)
print('aedcqfpt = ',meta_r.column_names_to_labels['aedcqfpt'])
# Per-indicator totals over Anna's sample. sum() skips NaN, so without
# min_count a column with no valid values at all (e.g. ids that failed to
# match) would also report 0.0 and be indistinguishable from "nobody is
# studying". min_count=1 makes such columns show NaN instead.
anna_[studying_all].sum(axis=0, min_count=1)

aedcqfpt =  C10b Currently studying full or part time


aedqstdy    0.0
aedfts      0.0
acaeft      0.0
acaept      0.0
anlreast    0.0
abncsty     0.0
abnfsty     0.0
aedcqsl     0.0
aedcqsh     0.0
aedcqnq     0.0
aedcqtq     0.0
aedcqta     0.0
aedcqtc     0.0
aedcqc1     0.0
aedcqc2     0.0
aedcqc3     0.0
aedcqc4     0.0
aedcqcd     0.0
aedcqad     0.0
aedcqav     0.0
aedcqbd     0.0
aedcqhd     0.0
aedcqgd     0.0
aedcqms     0.0
aedcqdc     0.0
aedcqbc     0.0
aedcqsc     0.0
aedcqcc     0.0
aedcqgc     0.0
aedcqcn     0.0
aedcqdn     0.0
aedcqnei    0.0
aedcqna     0.0
aedcqos     0.0
aedcqdk     0.0
aedcq100    0.0
aedcq110    0.0
aedcq120    0.0
aedcq200    0.0
aedcq211    0.0
aedcq221    0.0
aedcq310    0.0
aedcq311    0.0
aedcq312    0.0
aedcq400    0.0
aedcq411    0.0
aedcq413    0.0
aedcq421    0.0
aedcq500    0.0
aedcq511    0.0
aedcq514    0.0
aedcq521    0.0
aedcq524    0.0
aedcq600    0.0
aedcq611    0.0
aedcqunk    0.0
dtype: float64

### Agreement in treatment, outcome & instances

In [7]:
# look at agreement of treatment, outcome and instances
treatments = ['^reduhl$', '^rehllt$', '^redudl$', '^redufl$', '^redllt$', '^refllt$']
outcomes = ['^rlwage$', '^mh$', '^mhbm$', '^wkhr$', '^y_']
compare = ['xwaveid',"^in_"]+treatments+outcomes+['ahgage1']

# Restrict both frames to the columns being compared.
# NOTE(review): regex_select presumably returns the column names matching any
# of the given patterns - confirm against reed.
finn = finn[regex_select(finn.columns, compare)]
anna = anna[regex_select(anna.columns, compare)]

# Outer merge keeps ids found in either dataset; columns present in both get
# an '_a' (anna) or '_f' (finn) suffix.
c = pd.merge(anna,finn,on=['xwaveid'],how='outer',suffixes=('_a','_f'))

# After the outer merge each membership flag is True where the row came from
# that dataset and NaN otherwise. Convert to a real boolean column by explicit
# assignment: the previous `c[col].fillna(False, inplace=True)` operated on a
# column selection, which warns in pandas 2.x and leaves `c` unchanged under
# Copy-on-Write, and could leave an object column on which `~` is not a
# boolean negation.
for flag in ('in_finn', 'in_anna'):
    c[flag] = c[flag].eq(True)

both = c[c['in_anna']&c['in_finn']]
finn_only = c[~c['in_anna']&c['in_finn']]
anna_only = c[c['in_anna']&~c['in_finn']]
print(f'In both:{len(both)}, Finn only:{len(finn_only)}, Anna only:{len(anna_only)}')
# The three subsets must partition the merged frame (every row in exactly one).
assert len(both)+len(finn_only)+len(anna_only)==len(c), "merged frames must be same length"

In both:5250, Finn only:191, Anna only:48


## For people in both datasets
### Compare treatment computations

In [8]:
# (anna column, finn column) for each treatment variable to cross-tabulate.
treatment_pairs = [('reduhl_a','reduhl_f'), ('redudl_a','redudl_f'), ('redufl_a','redufl_f')]

for pair in treatment_pairs:
    col_anna, col_finn = pair
    print("Comparing:", pair)
    confusion = compute_confusion(both[col_anna], both[col_finn], 'anna', 'finn')
    display(confusion)
    # Row 1 / column 0 of the displayed table is the anna==1, finn==0 cell.
    anna_not_finn = confusion.iloc[1,0]
    assert anna_not_finn < 20, 'large number of instances treated according to anna but not finn'

Comparing: ('reduhl_a', 'reduhl_f')


Unnamed: 0,finn==0,finn==1
anna==0,4463,56
anna==1,0,450


Comparing: ('redudl_a', 'redudl_f')


Unnamed: 0,finn==0,finn==1
anna==0,3935,31
anna==1,13,1267


Comparing: ('redufl_a', 'redufl_f')


Unnamed: 0,finn==0,finn==1
anna==0,3932,30
anna==1,13,1271


In [9]:
# Export the rows where the two 'redudl' codings disagree, for manual inspection.
# NaN compares unequal to everything, so rows with a missing value on either
# side are exported as well.
redudl_disagrees = both['redudl_a'].ne(both['redudl_f'])
both.loc[redudl_disagrees].to_csv("data/anna_compare.csv", index=False)

### Compare outcome computations
   - These are almost identical. The small difference in hours worked is probably because I haven't replaced the values for those claiming to work more than 100 hours per week.

In [10]:
# (anna column, finn column) for each outcome variable to compare.
outcome_pairs = [('mh','y_ghmh'), ('wkhr','y_jbhruc')]

for pair in outcome_pairs:
    col_anna, col_finn = pair
    print("Comparing:", pair)
    # Missing values are replaced by -1 on both sides before correlating, so a
    # value missing in only one dataset also lowers the correlation.
    filled_anna = both[col_anna].fillna(-1)
    filled_finn = both[col_finn].fillna(-1)
    display(np.corrcoef(filled_anna, filled_finn))

Comparing: ('mh', 'y_ghmh')


array([[1., 1.],
       [1., 1.]])

Comparing: ('wkhr', 'y_jbhruc')


array([[1.        , 0.99991192],
       [0.99991192, 1.        ]])