# Compare Datasets

In [1]:
import pandas as pd
import numpy as np
import pyreadstat
from reed import regex_select, compute_confusion
# Widen the notebook's DataFrame rendering limits so wide/long frames are shown in full.
pd.set_option("display.max_columns", 200)
pd.set_option("display.max_rows", 500)

%load_ext autoreload
%autoreload 2

## Load data & compare overlap of instances in datasets

In [2]:
# Load the two datasets to be compared; rows are matched on `xwaveid`.
finn = pd.read_csv("all_vars.csv",index_col='xwaveid').reset_index()
anna, meta = pyreadstat.read_dta("../reduregvars.dta")
# Give the merge key an integer dtype on the anna side
# (presumably the .dta file stores it as a non-integer type -- confirm).
anna['xwaveid'] = anna['xwaveid'].astype(int)
# Membership flags: after the outer merge they show which source(s) each row came from.
finn['in_finn'] = True
anna['in_anna'] = True

# Regex patterns, passed to regex_select, naming the columns to keep for the comparison.
treatments = ['^reduhl$', '^rehllt$', '^redudl$', '^redufl$', '^redllt$', '^refllt$']
outcomes = ['^rlwage$', '^mh$', '^mhbm$', '^wkhr$', '^y_']
compare = ['xwaveid',"^in_"]+treatments+outcomes+['ahgage1']

finn = finn[regex_select(finn.columns, compare)]
anna = anna[regex_select(anna.columns, compare)]
# Outer merge keeps rows found in either dataset; columns present in both are
# disambiguated with `_a` (anna) and `_f` (finn).
c = pd.merge(anna,finn,on=['xwaveid'],how='outer',suffixes=('_a','_f'))
# Rows found in only one dataset have NaN in the other dataset's flag.
# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: that is chained assignment, which is deprecated and does not
# update `c` under pandas Copy-on-Write. Cast to bool explicitly so the `~` / `&`
# masks below are boolean operations rather than depending on fillna downcasting
# an object column.
for flag in ('in_finn', 'in_anna'):
    c[flag] = c[flag].fillna(False).astype(bool)
both = c[c['in_anna']&c['in_finn']]
finn_only = c[~c['in_anna']&c['in_finn']]
anna_only = c[c['in_anna']&~c['in_finn']]
print(f'In both:{len(both)}, Finn only:{len(finn_only)}, Anna only:{len(anna_only)}')
# The three subsets should partition the merged frame, so the two numbers should match.
print("Check lengths:",len(both)+len(finn_only)+len(anna_only),len(c))

In both:5250, Finn only:296, Anna only:48
Check lengths: 5594 5594


## For people in both datasets
### Compare treatment computations

In [3]:
# Each pair is (anna's column, finn's column) for one treatment indicator in the merged frame.
treatment_pairs = [(f'{stem}_a', f'{stem}_f') for stem in ('reduhl', 'redudl', 'redufl')]
for pair in treatment_pairs:
    print("Comparing:",pair)
    anna_col, finn_col = pair
    confusion = compute_confusion(both[anna_col],both[finn_col],'anna','finn')
    display(confusion)
    # Position [1, 0] of the table is the anna==1 / finn==0 cell.
    anna_only_treated = confusion.iloc[1,0]
    assert anna_only_treated < 20, 'large number of instances treated according to anna but not finn'

Comparing: ('reduhl_a', 'reduhl_f')


Unnamed: 0,finn==0,finn==1
anna==0,4463,56
anna==1,0,450


Comparing: ('redudl_a', 'redudl_f')


Unnamed: 0,finn==0,finn==1
anna==0,3935,31
anna==1,13,1267


Comparing: ('redufl_a', 'redufl_f')


Unnamed: 0,finn==0,finn==1
anna==0,3932,30
anna==1,13,1271


In [4]:
# Select the rows where the two `redudl` columns disagree and write them to CSV.
redudl_differs = both['redudl_a'] != both['redudl_f']
both[redudl_differs].to_csv("data/anna_compare.csv",index=False)

### Compare outcome computations
   - These are almost identical. The small difference is probably because I haven't replaced the values for those claiming to work more than 100 hours per week.

In [5]:
# Each pair is (anna's outcome column, finn's outcome column).
outcome_pairs = [
    ('mh', 'y_ghmh'),
    ('wkhr', 'y_jbhruc'),
]
for pair in outcome_pairs:
    print("Comparing:",pair)
    anna_outcome, finn_outcome = pair
    # Missing values are replaced with -1 in both series before correlating.
    anna_values = both[anna_outcome].fillna(-1)
    finn_values = both[finn_outcome].fillna(-1)
    display(np.corrcoef(anna_values, finn_values))

Comparing: ('mh', 'y_ghmh')


array([[1., 1.],
       [1., 1.]])

Comparing: ('wkhr', 'y_jbhruc')


array([[1.        , 0.99991192],
       [0.99991192, 1.        ]])