In [1]:
# Import modules and set options
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

sns.set(context='notebook', style='ticks')

In [2]:
lsl_dr = (pd.read_csv('../data/clean/lsl_dr.csv', index_col=0, low_memory=False)
                  .rename({'onset_1':'identify_mo'}, axis=1))

In [3]:
lsl_dr.shape

(59537, 246)

Exclusions

In [4]:
other_etiology = (lsl_dr[['etiology_3___2', 'etiology_3___4', 'etiology_3___5', 'etiology_3___6', 'etiology_3___9',
       'etiology_oth___1', 'etiology_oth___3', 'etiology_oth___4', 'etiology_oth___8', 'etiology_oth___9']]
                      .sum(1).astype(bool))

In [5]:
inclusion_mask = (~lsl_dr.non_english.astype(bool) 
                  & (lsl_dr.hl==0)
                  & ((lsl_dr.etiology_2==0)
                  | (lsl_dr.etiology_2.isnull() & ~other_etiology)))

In [6]:
inclusions = lsl_dr[inclusion_mask]
inclusions.drop_duplicates(subset='study_id').shape

(1261, 246)

In [7]:
age_mask = (lsl_dr.age_test>=48) & (lsl_dr.age_test<60)

In [8]:
data_4yo = inclusions[age_mask].drop_duplicates(subset='study_id').copy()

  """Entry point for launching an IPython kernel.


In [9]:
data_4yo.shape

(532, 246)

## Demographics

In [10]:
data_4yo.male.replace({1: 'Male', 0: 'Female', np.nan: 'Missing'}).value_counts()

Male      267
Female    265
Name: male, dtype: int64

In [11]:
data_4yo.race.replace({0:'White', 1:'Black', 2:'Hispanic', 3:'Asian', 4:'Other', np.nan:'Missing'}).value_counts()

White       348
Black        61
Hispanic     47
Other        43
Asian        27
Missing       6
Name: race, dtype: int64

In [12]:
(data_4yo.premature_age.replace({9:None})==8).replace({True: '>=36 weeks', False: '<36 weeks', 
                                                            np.nan: 'Unknown'}).value_counts()

>=36 weeks    394
<36 weeks     138
Name: premature_age, dtype: int64

In [13]:
data_4yo.sib.replace({0:'1', 1:'2', 2:'3', 3:'4+', np.nan:'Missing'}).value_counts().sort_index()

1          122
2          217
3          109
4+          64
Missing     20
Name: sib, dtype: int64

In [14]:
ed_lookup = {0:"8th grade or less",
1:"Some high school",
2:"High school diploma/GED",
3:"Some college",
4:"Bachelor's degree",
5:"Post graduate degree",
6:"Unknown",
np.nan:"Unknown"}

data_4yo.mother_ed.replace(ed_lookup).value_counts()

High school diploma/GED    161
Some high school           139
Unknown                    131
Some college                89
8th grade or less           12
Name: mother_ed, dtype: int64

In [15]:
data_4yo.father_ed.replace(ed_lookup).value_counts()

Unknown                    161
Bachelor's degree          127
Some college                91
Post graduate degree        80
High school diploma/GED     63
Some high school            10
Name: father_ed, dtype: int64

In [16]:
family_lookup = {0:"Ideal Participation",
1:"Good Participation",
2:"Average Participation",
3:"Below Average",
4:"Limited Participation",
np.nan:"Missing"}

data_4yo.family_inv.replace(family_lookup).value_counts()

Ideal Participation      177
Good Participation       113
Average Participation    110
Missing                   80
Below Average             42
Limited Participation     10
Name: family_inv, dtype: int64

In [17]:
data_4yo[["age_diag", "age_amp", "age_int", "age"]].describe()

Unnamed: 0,age_diag,age_amp,age_int,age
count,425.0,419.0,381.0,527.0
mean,9.577647,14.093079,15.238845,24.250474
std,13.098721,13.853244,13.790659,16.015376
min,0.0,0.0,0.0,0.0
25%,1.0,3.0,4.0,9.5
50%,3.0,8.0,11.0,24.0
75%,17.0,22.0,24.0,36.0
max,55.0,65.0,60.0,73.0


In [18]:
data_4yo.otherserv.replace({0: "OPTION + outside", 1: "OPTION only", 2: "Missing"}).value_counts()

OPTION only         214
Missing              65
OPTION + outside     59
Name: otherserv, dtype: int64

## Hearing loss

In [19]:
data_4yo.degree_hl_ad.isnull().sum()

14

In [20]:
data_4yo.degree_hl_as.isnull().sum()

16

In [21]:
data_4yo.degree_hl_ad.value_counts()

6.0    229
4.0     83
5.0     75
3.0     74
2.0     37
0.0     11
1.0      9
Name: degree_hl_ad, dtype: int64

In [22]:
data_4yo.degree_hl_as.value_counts()

6.0    229
4.0     90
3.0     76
5.0     68
2.0     27
0.0     15
1.0     11
Name: degree_hl_as, dtype: int64

In [23]:
hl_data = data_4yo.set_index('study_id')[['bilateral_snhl',
                     'bilateral_ansd',
                     'bilateral_mixed',
                     'bilateral_cond',
                     'bilateral_normal',
                     'bilateral_unk',
                     'unilateral_snhl',
                     'unilateral_ansd',
                     'unilateral_mixed',
                     'unilateral_cond',
                     'unilateral_unk',
                     'assymetrical']]

One or both parents with hearing loss

In [24]:
data_4yo.one_or_both_parent_hl.value_counts()

0.0    371
1.0     29
Name: one_or_both_parent_hl, dtype: int64

In [25]:
data_4yo.one_or_both_parent_hl.isnull().sum()

132

Individuals with no hearing loss type

In [26]:
hl_data[hl_data.sum(1)==0].index.values

array(['1045-2010-0056', '1149-2008-0008', '1149-2008-0019',
       '0938-2014-0019', '0205-2015-0005', '0205-2014-0006',
       '1149-2010-0052', '1149-2010-0046', '1149-2010-0034',
       '1149-2009-0038', '0938-2010-0017', '0938-2009-0013',
       '0101-2013-0101', '0205-2015-0008', '1149-2008-0043',
       '1149-2008-0012'], dtype=object)

In [27]:
hl_data.sum().astype(int).sort_values(ascending=False)

bilateral_snhl      403
bilateral_cond       30
assymetrical         19
bilateral_ansd       19
bilateral_mixed      15
unilateral_snhl      14
unilateral_cond      11
unilateral_mixed      2
unilateral_ansd       2
bilateral_normal      1
unilateral_unk        0
bilateral_unk         0
dtype: int64

In [28]:
hl_data.mean().round(2).sort_values(ascending=False)

bilateral_snhl      0.76
bilateral_cond      0.06
assymetrical        0.04
bilateral_ansd      0.04
unilateral_snhl     0.03
bilateral_mixed     0.03
unilateral_cond     0.02
unilateral_unk      0.00
unilateral_mixed    0.00
unilateral_ansd     0.00
bilateral_unk       0.00
bilateral_normal    0.00
dtype: float64

In [29]:
tech_data = data_4yo.set_index('study_id')[['bilateral_ha',
                     'bilateral_ci',
                     'bimodal',
                     'bilateral_other',
                     'unilateral_ha',
                     'unilateral_ci',
                     'unilateral_other']]

In [30]:
tech_data.sum().astype(int).sort_values(ascending=False)

bilateral_ci        362
bilateral_ha        107
bimodal              40
unilateral_ha        11
unilateral_ci         8
bilateral_other       1
unilateral_other      0
dtype: int64

In [31]:
tech_data.mean().round(2).sort_values(ascending=False)

bilateral_ci        0.68
bilateral_ha        0.20
bimodal             0.08
unilateral_ci       0.02
unilateral_ha       0.02
unilateral_other    0.00
bilateral_other     0.00
dtype: float64

Individuals with no technology type

In [32]:
tech_data[tech_data.sum(1)==0].index.values

array(['0523-2015-0002', '0412-2002-0068', '0412-2003-0060'], dtype=object)

## Hearing Loss

In [36]:
data_4yo.columns[data_4yo.columns.str.contains('_hl_')]

Index(['degree_hl_ad', 'type_hl_ad', 'degree_hl_as', 'type_hl_as',
       'bilateral_hl_normal', 'bilateral_hl_slight', 'bilateral_hl_mild',
       'bilateral_hl_moderate', 'bilateral_hl_modsev', 'bilateral_hl_severe',
       'bilateral_hl_profound', 'unilateral_hl_slight', 'unilateral_hl_mild',
       'unilateral_hl_moderate', 'unilateral_hl_modsev',
       'unilateral_hl_severe', 'unilateral_hl_profound'],
      dtype='object')

In [37]:
hl_data = data_4yo.set_index('study_id')[['bilateral_hl_normal', 'bilateral_hl_slight', 'bilateral_hl_mild',
       'bilateral_hl_moderate', 'bilateral_hl_modsev', 'bilateral_hl_severe',
       'bilateral_hl_profound', 'unilateral_hl_slight', 'unilateral_hl_mild',
       'unilateral_hl_moderate', 'unilateral_hl_modsev',
       'unilateral_hl_severe', 'unilateral_hl_profound']]

In [38]:
hl_data.sum().astype(int).sort_values(ascending=False)

bilateral_hl_profound     195
bilateral_hl_modsev        56
bilateral_hl_moderate      50
bilateral_hl_severe        38
bilateral_hl_mild          18
unilateral_hl_profound      7
unilateral_hl_modsev        7
unilateral_hl_severe        5
unilateral_hl_mild          5
bilateral_hl_slight         4
unilateral_hl_moderate      1
unilateral_hl_slight        1
bilateral_hl_normal         0
dtype: int64

In [39]:
hl_data.mean().round(2).sort_values(ascending=False)

bilateral_hl_profound     0.37
bilateral_hl_modsev       0.11
bilateral_hl_moderate     0.09
bilateral_hl_severe       0.07
bilateral_hl_mild         0.03
unilateral_hl_profound    0.01
unilateral_hl_severe      0.01
unilateral_hl_modsev      0.01
unilateral_hl_mild        0.01
bilateral_hl_slight       0.01
unilateral_hl_moderate    0.00
unilateral_hl_slight      0.00
bilateral_hl_normal       0.00
dtype: float64

Missing one or both hearing loss values (left/right)

In [45]:
(data_4yo[['degree_hl_ad', 'degree_hl_as']].isnull().sum(1) > 0).sum()

16

## Summary of scores by domain

In [46]:
test_scores = inclusions[age_mask].copy()

  """Entry point for launching an IPython kernel.


In [47]:
test_scores.loc[(test_scores.domain=='Language') & (test_scores.test_type=='receptive'), 
           'domain'] = 'Receptive Language'
test_scores.loc[(test_scores.domain=='Language') & (test_scores.test_type=='expressive'), 
           'domain'] = 'Expressive Language'

In [48]:
test_scores.groupby('domain').score.describe().round(1)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
domain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Articulation,355.0,85.6,20.3,0.0,72.0,88.0,101.0,121.0
Expressive Language,483.0,86.8,20.2,45.0,73.0,86.0,102.0,136.0
Expressive Vocabulary,469.0,95.7,21.4,0.0,83.0,97.0,111.0,140.0
Language,492.0,88.7,21.2,48.0,73.0,88.0,104.0,163.0
Receptive Language,482.0,90.5,19.6,50.0,76.0,90.0,106.0,145.0
Receptive Vocabulary,473.0,94.4,18.8,0.0,82.0,96.0,109.0,135.0
