In [1]:
# Import modules and set options
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

sns.set(context='notebook', style='ticks')

In [2]:
lsl_dr = (pd.read_csv('../data/clean/lsl_dr.csv', index_col=0, low_memory=False)
                  .rename({'onset_1':'identify_mo'}, axis=1))

In [3]:
lsl_dr.shape

(59580, 231)

Exclusions

In [4]:
other_etiology = (lsl_dr[['etiology_3___2', 'etiology_3___4', 'etiology_3___5', 'etiology_3___6', 'etiology_3___9',
       'etiology_oth___1', 'etiology_oth___3', 'etiology_oth___4', 'etiology_oth___8', 'etiology_oth___9']]
                      .sum(1).astype(bool))

In [5]:
inclusion_mask = (~lsl_dr.non_english.astype(bool) 
                  & (lsl_dr.hl==0)
                  & ((lsl_dr.etiology_2==0)
                  | (lsl_dr.etiology_2.isnull() & ~other_etiology)))

In [6]:
inclusions = lsl_dr[inclusion_mask]
inclusions.drop_duplicates(subset='study_id').shape

(1281, 231)

In [7]:
age_mask = (lsl_dr.age_test>=48) & (lsl_dr.age_test<60)

In [8]:
data_4yo = inclusions[age_mask].drop_duplicates(subset='study_id').copy()

  """Entry point for launching an IPython kernel.


In [9]:
data_4yo.shape

(546, 231)

## Demographics

In [10]:
data_4yo.male.replace({1: 'Male', 0: 'Female', np.nan: 'Missing'}).value_counts()

Male      273
Female    273
Name: male, dtype: int64

In [11]:
data_4yo.race.replace({0:'White', 1:'Black', 2:'Hispanic', 3:'Asian', 4:'Other', np.nan:'Missing'}).value_counts()

White       359
Black        61
Hispanic     47
Other        43
Asian        27
Missing       9
Name: race, dtype: int64

In [12]:
(data_4yo.premature_age.replace({9:None})==8).replace({True: '>=36 weeks', False: '<36 weeks', 
                                                            np.nan: 'Unknown'}).value_counts()

>=36 weeks    405
<36 weeks     141
Name: premature_age, dtype: int64

In [13]:
data_4yo.sib.replace({0:'1', 1:'2', 2:'3', 3:'4+', np.nan:'Missing'}).value_counts().sort_index()

1          122
2          222
3          113
4+          66
Missing     23
Name: sib, dtype: int64

In [14]:
ed_lookup = {0:"8th grade or less",
1:"Some high school",
2:"High school diploma/GED",
3:"Some college",
4:"Bachelor's degree",
5:"Post graduate degree",
6:"Unknown",
np.nan:"Unknown"}

data_4yo.mother_ed.replace(ed_lookup).value_counts()

High school diploma/GED    164
Some high school           146
Unknown                    135
Some college                89
8th grade or less           12
Name: mother_ed, dtype: int64

In [15]:
data_4yo.father_ed.replace(ed_lookup).value_counts()

Unknown                    165
Bachelor's degree          128
Some college                96
Post graduate degree        81
High school diploma/GED     65
Some high school            11
Name: father_ed, dtype: int64

In [16]:
family_lookup = {0:"Ideal Participation",
1:"Good Participation",
2:"Average Participation",
3:"Below Average",
4:"Limited Participation",
np.nan:"Missing"}

data_4yo.family_inv.replace(family_lookup).value_counts()

Ideal Participation      176
Good Participation       117
Average Participation    111
Missing                   90
Below Average             42
Limited Participation     10
Name: family_inv, dtype: int64

In [17]:
data_4yo[["age_diag", "age_amp", "age_int", "age"]].describe()

Unnamed: 0,age_diag,age_amp,age_int,age
count,430.0,425.0,385.0,539.0
mean,9.463953,14.122353,15.119481,24.348794
std,12.918581,13.80983,13.577373,15.860593
min,0.0,0.0,0.0,0.0
25%,1.0,3.0,4.0,10.0
50%,3.0,8.0,12.0,24.0
75%,17.0,22.0,24.0,36.0
max,55.0,65.0,60.0,63.0


In [18]:
data_4yo.otherserv.replace({0: "OPTION + outside", 1: "OPTION only", 2: "Missing"}).value_counts()

OPTION only         214
Missing              66
OPTION + outside     60
Name: otherserv, dtype: int64

## Hearing loss

In [19]:
data_4yo.degree_hl_ad.isnull().sum()

26

In [20]:
data_4yo.degree_hl_as.isnull().sum()

28

In [21]:
data_4yo.degree_hl_ad.value_counts()

6.0    230
4.0     83
5.0     76
3.0     76
2.0     37
0.0      9
1.0      9
Name: degree_hl_ad, dtype: int64

In [22]:
data_4yo.degree_hl_as.value_counts()

6.0    231
4.0     90
3.0     76
5.0     67
2.0     28
0.0     15
1.0     11
Name: degree_hl_as, dtype: int64

In [23]:
hl_data = data_4yo.set_index('study_id')[['bilateral_snhl',
                     'bilateral_ansd',
                     'bilateral_mixed',
                     'bilateral_cond',
                     'bilateral_normal',
                     'bilateral_unk',
                     'unilateral_snhl',
                     'unilateral_ansd',
                     'unilateral_mixed',
                     'unilateral_cond',
                     'unilateral_unk',
                     'assymetrical']]

One or both parents with hearing loss

In [33]:
data_4yo.one_or_both_parent_hl.value_counts()

0.0    377
1.0     30
Name: one_or_both_parent_hl, dtype: int64

In [35]:
data_4yo.one_or_both_parent_hl.isnull().sum()

139

Individuals with no hearing loss type

In [24]:
hl_data[hl_data.sum(1)==0].index.values

array(['0416-2012-8108', '1045-2010-0056', '1046-1997-0050',
       '1043-2010-0028', '1149-2008-0008', '1149-2008-0019',
       '0938-2014-0019', '1149-2010-0046', '1149-2010-0052',
       '1149-2010-0034', '1149-2009-0038', '0938-2010-0017',
       '0938-2009-0013', '0205-2015-0005', '0205-2014-0006',
       '0836-2012-0021', '0836-2011-0017', '0208-2010-0025',
       '1046-2003-0059', '1046-2010-0039', '0625-2014-0094',
       '0416-2007-6094', '1046-1998-0048', '0101-2013-0101',
       '0836-2011-0019', '0836-2011-0016', '0836-2010-0018',
       '0836-2012-0024', '0836-2009-0008', '0417-2007-0014',
       '1046-2010-0041', '0205-2015-0008', '0519-2014-0075',
       '0624-2010-0052', '1149-2008-0043', '0416-2005-5255',
       '1149-2008-0012'], dtype=object)

In [25]:
hl_data.sum().astype(int).sort_values(ascending=False)

bilateral_snhl      400
bilateral_cond       30
assymetrical         19
bilateral_ansd       19
bilateral_mixed      15
unilateral_snhl      14
unilateral_cond      10
unilateral_mixed      1
unilateral_ansd       1
unilateral_unk        0
bilateral_unk         0
bilateral_normal      0
dtype: int64

In [26]:
hl_data.mean().round(2).sort_values(ascending=False)

bilateral_snhl      0.73
bilateral_cond      0.05
assymetrical        0.03
unilateral_snhl     0.03
bilateral_mixed     0.03
bilateral_ansd      0.03
unilateral_cond     0.02
unilateral_unk      0.00
unilateral_mixed    0.00
unilateral_ansd     0.00
bilateral_unk       0.00
bilateral_normal    0.00
dtype: float64

In [27]:
tech_data = data_4yo.set_index('study_id')[['bilateral_ha',
                     'bilateral_ci',
                     'bimodal',
                     'bilateral_other',
                     'unilateral_ha',
                     'unilateral_ci',
                     'unilateral_other']]

In [28]:
tech_data.sum().astype(int).sort_values(ascending=False)

bilateral_ci        377
bilateral_ha        107
bimodal              39
unilateral_ha        10
unilateral_ci         9
bilateral_other       1
unilateral_other      0
dtype: int64

In [29]:
tech_data.mean().round(2).sort_values(ascending=False)

bilateral_ci        0.69
bilateral_ha        0.20
bimodal             0.07
unilateral_ci       0.02
unilateral_ha       0.02
unilateral_other    0.00
bilateral_other     0.00
dtype: float64

Individuals with no technology type

In [30]:
tech_data[tech_data.sum(1)==0].index.values

array(['0523-2015-0002', '0412-2002-0068', '0412-2003-0060'], dtype=object)

## Summary of scores by domain

In [36]:
data_4yo.head()

Unnamed: 0,redcap_event_name,academic_year_rv,hl,male,_race,prim_lang,sib,_mother_ed,father_ed,par1_ed,...,gf3_sis_ss,gf3_siw_ss,gf_version,ppvt_f,ppvt_ss,rowpvt_ss,school,score,test_name,test_type
39,initial_assessment_arm_1,2012.0,0.0,1.0,2.0,0.0,1.0,3.0,2.0,,...,,,,,,,626,71.0,,EVT
362,initial_assessment_arm_1,2009.0,0.0,1.0,2.0,0.0,3.0,6.0,6.0,,...,,,2.0,,,,628,74.0,,Goldman
538,initial_assessment_arm_1,2009.0,0.0,0.0,2.0,0.0,0.0,6.0,6.0,,...,,,2.0,,,,628,92.0,,Goldman
567,initial_assessment_arm_1,2015.0,0.0,0.0,0.0,0.0,1.0,6.0,6.0,,...,,,,,,,626,75.0,,Arizonia
761,initial_assessment_arm_1,2016.0,0.0,1.0,1.0,0.0,0.0,3.0,2.0,,...,,,,,,,625,66.0,PLS,receptive


In [40]:
data_4yo.loc[(data_4yo.domain=='Language') & (data_4yo.test_type=='receptive'), 
           'domain'] = 'Receptive Language'
data_4yo.loc[(data_4yo.domain=='Language') & (data_4yo.test_type=='expressive'), 
           'domain'] = 'Expressive Language'

In [41]:
data_4yo.groupby('domain').score.describe().round(1)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
domain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Articulation,342.0,85.2,20.3,0.0,72.0,87.0,101.0,121.0
Expressive Language,2.0,56.0,4.2,53.0,54.5,56.0,57.5,59.0
Expressive Vocabulary,146.0,93.5,24.0,0.0,78.5,96.5,111.0,138.0
Receptive Language,47.0,84.2,25.0,50.0,63.0,85.0,103.0,145.0
Receptive Vocabulary,9.0,96.1,23.2,54.0,81.0,99.0,119.0,123.0
