In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pyreadstat
import re
import string
from sklearn_pandas import DataFrameMapper
import networkx as nx
import reed
from clean import *


pd.options.display.max_columns=100
pd.options.display.max_colwidth=200
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
s,m,e = 'a','q','s' # select which waves to base analysis on
min_start_age = 25 # the minimum age people must as of the starting wave
missing_threshold = 0.95

In [3]:
# read the combined file for the starting wave
df1, meta1 = pyreadstat.read_sav(f'../part1/Combined {s}190c.sav') 
n0 = len(df1)
print(f"Number of people in initial wave {n0}")

Number of people in initial wave 19914


In [4]:
from treatment_outcomes import compute_treatment_vars, compute_outcomes
treatments = compute_treatment_vars(df1, s, m)
outcomes = compute_outcomes(df1, s, e)
print("Treatments:",treatments.columns)
print("Outcomes:",outcomes.columns)

Treatments: Index(['xwaveid', 'redudl', 'reduhl', 'redufl'], dtype='object')
Outcomes: Index(['xwaveid', 'y_jbhruc', 'y_ghmh', 'y_wsce', 'y_wscei', 'y_employment',
       'y_Djbhruc', 'y_Dghmh', 'y_Dwsce', 'y_Dwscei', 'y_Demployment'],
      dtype='object')


In [5]:
treatment_outcomes = pd.merge(treatments,outcomes,on='xwaveid',how='inner')
treatment_outcomes['xwaveid'] = treatment_outcomes['xwaveid'].astype(int)
treatment_outcomes.shape

(10087, 14)

In [6]:
from reed import compute_confusion
compute_confusion(treatments['redudl'],treatments['reduhl'],'dl','dh')

Unnamed: 0,dh==0,dh==1
dl==0,7833,701
dl==1,1503,929


In [7]:
from treatment_outcomes import simplify_employment
from reed import regex_select

def extract_basic_variables(df):
    # age, sex, education in 2001, employment status in 2001
    basic = df1[['xwaveid','ahgage','ahgsex','aedhigh1','aesdtl']].copy() 

    def simplify_education(v):
        """Simplify down to match categories in paper."""
        if v < 0 or v==10:
            return np.nan # missing
        if v < 3: #(above bachelors)
            return 2
        return v # < year 12:(9), year 12:(8), cert:(5), diploma/adv diploma:(4), bachelors/honours:(3)

    # remove under 25s
    basic = basic[basic['ahgage']>25].copy()
    
    # simplify education & employment in line with baseline paper
    basic['aesdtl']=basic['aesdtl'].apply(simplify_employment)
    basic['aedhigh1'] = basic['aedhigh1'].apply(simplify_education)
    
    # bin age
    basic['ahgage'] = pd.cut(basic['ahgage'],bins=[24,34,44,54,120])
    
    # dummy encode
    basic = pd.get_dummies(basic,columns=['ahgage','ahgsex','aedhigh1','aesdtl'],drop_first=True)
    
    # add interactions between gender and other variables
    age_edu_emp = regex_select(basic.columns,['^ahgage_','^aedhigh1_','^aesdtl_'])
    basic = create_interaction_columns(basic,['ahgsex_2.0'],age_edu_emp)
    basic['xwaveid'] = basic['xwaveid'].astype(int)
    return basic

basic = extract_basic_variables(df1)
l0 = len(basic)
basic = pd.merge(basic,treatment_outcomes,on='xwaveid',how='inner')
l1 = len(basic)
print(f"Dropped {l0-l1} individuals who are not present in waves {m} and {e} ({100*(l0-l1)/l0:.0f}%)")
basic.set_index('xwaveid',inplace=True)
basic.to_csv("basic_variables.csv",index=True)

Dropped 6125 individuals who are not present in waves q and s (50%)


In [8]:
def filter_participants(df1,min_start_age):
    """
    Remove those already studying or below the minimum age in the initial wave.
    """
    n0 = len(df1)
    df = df1.loc[df1[f'{s}hgage'] >= min_start_age].copy()
    print(f"Dropping {n0-len(df)} participants below age {min_start_age}")

    # filter out those already studying

    # If any of the following are > 0, then the respondant was already studying at the beginning of the period
    already_studying_cols = [s+col for col in ['caeft','caept','nlreast','bncsty','bnfsty']]

    already_studying = df[already_studying_cols].sum(axis=1)

    n0 = len(df)
    df = df[already_studying < 1].copy()
    print(f"Dropping {n0-len(df)} participants already studying at period start")
    print(f"Remaining participants:{len(df)}")
    return df

In [9]:
def read_type_information():
    headers = ['variable','vartype','format','label','long_label','varcat','relevance',"0"]
    type_df = pd.read_csv("HILDAw1vardic.csv",skiprows=4,index_col=None, names=headers)
    type_df['relevance'] = type_df['relevance'].fillna(1).astype(int)
    type_df.loc[type_df['label']=='ACAEPT','relevance'] = -1
    return type_df

def drop_irrelevant_columns_inplace(df, type_df):
    irrelevant = list(type_df.loc[type_df['relevance']<1,'variable'])
    irrelevant.remove('xwaveid')
    df.drop(columns=irrelevant,inplace=True)
    print(f"Dropped {len(irrelevant)} irrelevant columns.")
    return irrelevant
    

In [10]:
def fix_types_inplace(df1):
    # Reformat some of the columns
    dates = ["ahhhqivw","ahhcompi","ahhcompf","ahhcomps","ahhidate"]
    string = ['ahhtitle']
    categorical = [
     'acca1',
     'acca2',
     'ahhmgfxd',
     'ahhmgmxd',
     'ahhp1',
     'ahhp2',
     'ahhp3',
     'ahhpgfxd',
     'ahhpgmxd',
     'ahhpno',
     'xwaveid'
    ]

    for c in categorical:
        if c in df1.columns:
            df1[c] = pd.to_numeric(df1[c])

    # turn into days past epoch
    basedate = pd.to_datetime('01/01/1900',format='%d/%m/%Y')    
    for c in dates:
        if c in df1.columns:
            df1[c] = (pd.to_datetime(df1[c],format='%d/%m/%Y',errors='coerce')-basedate).dt.days 

    for c in string:
        df1[c] = df1[c].astype('category').cat.codes
        


In [11]:
def filter_raw_data(min_age=25, threshold=0.99):
    df1, meta1 = pyreadstat.read_sav(f'../part1/Combined {s}190c.sav') 
    print("Read in data, with shape:",df1.shape)
    df1 = filter_participants(df1,min_age)
    
    columns_dropped = {} # keep track of why each column was dropped
    
    type_df = read_type_information()
    irrelevant = drop_irrelevant_columns_inplace(df1,type_df)
    add_list_to_dict(irrelevant,columns_dropped,'invalid/irrelevant')

    fix_types_inplace(df1)
    
    constant = drop_constant_columns(df1)
    add_list_to_dict(constant,columns_dropped,'constant')

    mostly_missing = drop_mostly_missing_columns(df1, threshold = threshold)
    drop_strongly_correlated_columns(df1,columns_dropped,threshold=threshold,inplace=True,fillna=True)
    
    print("Processed data, with shape:",df1.shape)
    return df1, columns_dropped

def drop_strongly_correlated_columns(df, columns_dropped,threshold=0.99, inplace=True, fillna=True):
    """
    Drop columns that are highly correlated with another column.
    
    In each strongly connected component, the variable with the least missing data will be the 
    representative variable kept.
    """
    d = df.fillna(-1)
    c = compute_correlations(d)
    del d
    strong = c[c['correlation'].abs() > threshold]
    print(f"number of strong correlations stronger than {threshold:.2f} is: {len(strong)}")
    g = nx.Graph()
    g.add_edges_from(list((strong[['c1','c2']]).itertuples(index=False,name=None)))
    components = list(nx.connected_components(g))
    print("Number of connected components in strong correlation graph:",len(components))
    
    drop = set({})
    for component in components:
        least_missing = df[list(component)].isnull().sum(axis=0).idxmin()
        component.remove(least_missing)
        drop = drop.union(component)
        add_list_to_dict(component,columns_dropped,f"merged into {least_missing}")
    print("Columns dropped due to almost perfect correlation:",len(drop))
    if inplace:
        df.drop(columns=drop,inplace=True)
    return drop, columns_dropped

In [12]:
df, columns_dropped = filter_raw_data(threshold=missing_threshold)
l0 = len(df)
df = pd.merge(df,treatment_outcomes,on='xwaveid',how='inner')
l1 = len(df)
print(f"Dropped {l0-l1} individuals who are not present in waves {m} and {e} ({100*(l0-l1)/l0:.0f}%)")
df.set_index('xwaveid',inplace=True)
filename = f"all_vars_{int(round(missing_threshold*1000))}.csv"
df.to_csv(filename,index=True)
print("Written data to:",filename)

Read in data, with shape: (19914, 3400)


Dropping 7359 participants below age 25


Dropping 1216 participants already studying at period start
Remaining participants:11339
Dropped 403 irrelevant columns.


Dropping 108 columns that are constant or entirely missing
Dropping 1440 columns with more than 95% missing 


number of strong correlations stronger than 0.95 is: 5781
Number of connected components in strong correlation graph: 137
Columns dropped due to almost perfect correlation: 643
Processed data, with shape: (11339, 806)
Dropped 5793 individuals who are not present in waves q and s (51%)


Written data to: all_vars_950.csv
