In [2]:
# Import modules and set options
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

sns.set(context='notebook', style='ticks')

In [3]:
lsl_dr = (pd.read_csv('../data/clean/lsl_dr.csv', index_col=0, low_memory=False)
                  .rename({'onset_1':'identify_mo'}, axis=1))

In [4]:
lsl_dr.shape

(59512, 231)

In [5]:
mask = (lsl_dr.age_test>=48) & (lsl_dr.age_test<60)
data_4yo = lsl_dr[mask].drop_duplicates(subset='study_id').copy()

In [6]:
data_4yo.shape

(2425, 231)

## Demographics

In [7]:
data_4yo.male.replace({1: 'Male', 0: 'Female', np.nan: 'Missing'}).value_counts()

Male       1284
Female     1122
Missing      19
Name: male, dtype: int64

In [8]:
data_4yo.race.replace({0:'White', 1:'Black', 2:'Hispanic', 3:'Asian', 4:'Other', np.nan:'Missing'}).value_counts()

White       1301
Hispanic     386
Other        252
Black        234
Asian        201
Missing       51
Name: race, dtype: int64

In [9]:
(data_4yo.premature_age.replace({9:None})==8).replace({True: '>=36 weeks', False: '<36 weeks', 
                                                            np.nan: 'Unknown'}).value_counts()

>=36 weeks    1738
<36 weeks      687
Name: premature_age, dtype: int64

In [10]:
data_4yo.sib.replace({0:'1', 1:'2', 2:'3', 3:'4+', np.nan:'Missing'}).value_counts().sort_index()

1          575
2          982
3          472
4+         251
Missing    145
Name: sib, dtype: int64

In [11]:
ed_lookup = {0:"8th grade or less",
1:"Some high school",
2:"High school diploma/GED",
3:"Some college",
4:"Bachelor's degree",
5:"Post graduate degree",
6:"Unknown",
np.nan:"Unknown"}

data_4yo.mother_ed.replace(ed_lookup).value_counts()

Unknown                    730
Some high school           654
High school diploma/GED    607
Some college               334
8th grade or less          100
Name: mother_ed, dtype: int64

In [12]:
data_4yo.father_ed.replace(ed_lookup).value_counts()

Unknown                    858
Bachelor's degree          493
Post graduate degree       327
High school diploma/GED    322
Some college               321
Some high school            74
8th grade or less           30
Name: father_ed, dtype: int64

In [13]:
family_lookup = {0:"Ideal Participation",
1:"Good Participation",
2:"Average Participation",
3:"Below Average",
4:"Limited Participation",
np.nan:"Missing"}

data_4yo.family_inv.replace(family_lookup).value_counts()

Ideal Participation      688
Good Participation       599
Average Participation    548
Missing                  355
Below Average            196
Limited Participation     39
Name: family_inv, dtype: int64

In [14]:
data_4yo[["age_diag", "age_amp", "age_int", "age"]].describe()

Unnamed: 0,age_diag,age_amp,age_int,age
count,1861.0,1821.0,1727.0,2374.0
mean,10.595648,15.804503,17.006948,28.764111
std,13.625255,14.170196,14.497651,16.339011
min,0.0,0.0,0.0,0.0
25%,1.0,4.0,4.0,15.0
50%,3.0,11.0,12.0,30.0
75%,18.0,25.0,26.0,41.0
max,59.0,65.0,60.0,88.0


In [15]:
data_4yo.otherserv.replace({0: "OPTION + outside", 1: "OPTION only", 2: "Missing"}).value_counts()

OPTION only         1149
OPTION + outside     351
Missing              318
Name: otherserv, dtype: int64

## Hearing loss

In [16]:
data_4yo.degree_hl_ad.value_counts()

6.0    963
4.0    319
3.0    293
5.0    287
2.0    146
0.0     86
1.0     50
Name: degree_hl_ad, dtype: int64

In [17]:
data_4yo.degree_hl_as.value_counts()

6.0    967
3.0    312
4.0    305
5.0    280
2.0    133
0.0     94
1.0     51
Name: degree_hl_as, dtype: int64

In [18]:
hl_data = data_4yo[['bilateral_snhl',
                     'bilateral_ansd',
                     'bilateral_mixed',
                     'bilateral_cond',
                     'bilateral_normal',
                     'bilateral_unk',
                     'unilateral_snhl',
                     'unilateral_ansd',
                     'unilateral_mixed',
                     'unilateral_cond',
                     'unilateral_unk',
                     'assymetrical']]

In [19]:
hl_data.sum().astype(int).sort_values(ascending=False)

bilateral_snhl      1692
bilateral_cond        82
unilateral_snhl       72
bilateral_ansd        66
bilateral_mixed       62
assymetrical          57
unilateral_cond       41
bilateral_normal      36
unilateral_mixed       5
unilateral_ansd        3
unilateral_unk         0
bilateral_unk          0
dtype: int64

In [20]:
hl_data.mean().round(2).sort_values(ascending=False)

bilateral_snhl      0.70
unilateral_snhl     0.03
bilateral_cond      0.03
bilateral_mixed     0.03
bilateral_ansd      0.03
assymetrical        0.02
unilateral_cond     0.02
bilateral_normal    0.01
unilateral_unk      0.00
unilateral_mixed    0.00
unilateral_ansd     0.00
bilateral_unk       0.00
dtype: float64

In [21]:
tech_data = data_4yo[['bilateral_ha',
                     'bilateral_ci',
                     'bimodal',
                     'bilateral_other',
                     'unilateral_ha',
                     'unilateral_ci',
                     'unilateral_other']]

In [22]:
tech_data.sum().astype(int).sort_values(ascending=False)

bilateral_ci        1735
bilateral_ha         398
bimodal              148
unilateral_ci         51
unilateral_ha         44
bilateral_other       10
unilateral_other       3
dtype: int64

In [23]:
tech_data.mean().round(2).sort_values(ascending=False)

bilateral_ci        0.72
bilateral_ha        0.16
bimodal             0.06
unilateral_ci       0.02
unilateral_ha       0.02
unilateral_other    0.00
bilateral_other     0.00
dtype: float64