In [1]:
# Import modules and set options
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import pandas_profiling

sns.set(context='notebook', style='ticks')

In [2]:
lsl_dr = (pd.read_csv('../data/clean/lsl_dr.csv', index_col=0, low_memory=False)
                  .rename({'onset_1':'identify_mo'}, axis=1))

In [3]:
lsl_dr.shape

(59537, 246)

Exclusions

In [4]:
other_etiology = (lsl_dr[['etiology_3___2', 'etiology_3___4', 'etiology_3___5', 'etiology_3___6', 'etiology_3___9',
       'etiology_oth___1', 'etiology_oth___3', 'etiology_oth___4', 'etiology_oth___8', 'etiology_oth___9']]
                      .sum(1).astype(bool))

In [5]:
inclusion_mask = (~lsl_dr.non_english.astype(bool) 
                  & (lsl_dr.hl==0)
                  & ((lsl_dr.autism==0)))

In [6]:
autism_subset = lsl_dr[inclusion_mask]
autism_subset.drop_duplicates(subset='study_id').shape

(4074, 246)

## Demographics

In [7]:
autism_subset.male.replace({1: 'Male', 0: 'Female', np.nan: 'Missing'}).value_counts()

Male      22736
Female    21793
Name: male, dtype: int64

In [8]:
autism_subset.race.replace({0:'White', 1:'Black', 2:'Hispanic', 3:'Asian', 4:'Other', np.nan:'Missing'}).value_counts()

White       27064
Black        5272
Hispanic     4786
Other        4305
Asian        2431
Missing       671
Name: race, dtype: int64

In [9]:
(autism_subset.premature_age.replace({9:None})==8).replace({True: '>=36 weeks', False: '<36 weeks', 
                                                            np.nan: 'Unknown'}).value_counts()

>=36 weeks    32789
<36 weeks     11740
Name: premature_age, dtype: int64

In [10]:
autism_subset.sib.replace({0:'1', 1:'2', 2:'3', 3:'4+', np.nan:'Missing'}).value_counts().sort_index()

1          10759
2          17384
3           8762
4+          5101
Missing     2523
Name: sib, dtype: int64

In [11]:
ed_lookup = {0:"8th grade or less",
1:"Some high school",
2:"High school diploma/GED",
3:"Some college",
4:"Bachelor's degree",
5:"Post graduate degree",
6:"Unknown",
np.nan:"Unknown"}

autism_subset.mother_ed.replace(ed_lookup).value_counts()

Some high school           13365
High school diploma/GED    12036
Unknown                    11624
Some college                6365
8th grade or less           1139
Name: mother_ed, dtype: int64

In [12]:
autism_subset.father_ed.replace(ed_lookup).value_counts()

Unknown                    14139
Bachelor's degree          10002
High school diploma/GED     6634
Some college                6479
Post graduate degree        6050
Some high school            1063
8th grade or less            162
Name: father_ed, dtype: int64

In [13]:
family_lookup = {0:"Ideal Participation",
1:"Good Participation",
2:"Average Participation",
3:"Below Average",
4:"Limited Participation",
np.nan:"Missing"}

autism_subset.family_inv.replace(family_lookup).value_counts()

Ideal Participation      14192
Good Participation       11102
Average Participation     9837
Missing                   5512
Below Average             3022
Limited Participation      864
Name: family_inv, dtype: int64

In [14]:
autism_subset[["age_diag", "age_amp", "age_int", "age"]].describe()

Unnamed: 0,age_diag,age_amp,age_int,age
count,39572.0,37920.0,35954.0,44218.0
mean,11.620931,16.687737,17.423889,29.833405
std,16.491708,17.335948,17.735059,24.748746
min,0.0,0.0,0.0,0.0
25%,1.0,4.0,4.0,10.0
50%,3.0,11.0,12.0,26.0
75%,18.0,24.0,25.0,40.0
max,160.0,173.0,200.0,227.0


In [15]:
autism_subset.otherserv.replace({0: "OPTION + outside", 1: "OPTION only", 2: "Missing"}).value_counts()

OPTION only         22140
OPTION + outside     7333
Missing              5681
Name: otherserv, dtype: int64

## Hearing loss

In [16]:
autism_subset.degree_hl_ad.isnull().sum()

958

In [17]:
autism_subset.degree_hl_as.isnull().sum()

1004

In [18]:
autism_subset.degree_hl_ad.value_counts()

6.0    19643
4.0     6254
3.0     6232
5.0     5686
2.0     3423
0.0     1274
1.0     1059
Name: degree_hl_ad, dtype: int64

In [19]:
autism_subset.degree_hl_as.value_counts()

6.0    19605
3.0     6557
4.0     5813
5.0     5520
2.0     3304
0.0     1548
1.0     1178
Name: degree_hl_as, dtype: int64

In [20]:
hl_data = autism_subset.set_index('study_id')[['bilateral_snhl',
                     'bilateral_ansd',
                     'bilateral_mixed',
                     'bilateral_cond',
                     'bilateral_normal',
                     'bilateral_unk',
                     'unilateral_snhl',
                     'unilateral_ansd',
                     'unilateral_mixed',
                     'unilateral_cond',
                     'unilateral_unk',
                     'assymetrical']]

One or both parents with hearing loss

In [21]:
autism_subset.one_or_both_parent_hl.value_counts()

0.0    34396
1.0     2194
Name: one_or_both_parent_hl, dtype: int64

In [22]:
autism_subset.one_or_both_parent_hl.isnull().sum()

7939

Individuals with no hearing loss type

In [23]:
hl_data[hl_data.sum(1)==0].index.values

array(['0625-2018-0014', '0625-2018-0014', '0625-2018-0014', ...,
       '0102-2010-3021', '0102-2010-3021', '0102-2010-3021'], dtype=object)

In [24]:
hl_data.sum().astype(int).sort_values(ascending=False)

bilateral_snhl      34566
unilateral_snhl      1734
bilateral_cond       1692
bilateral_ansd       1509
assymetrical         1196
bilateral_mixed      1098
unilateral_cond       930
unilateral_mixed      150
unilateral_ansd       130
bilateral_normal       55
unilateral_unk          0
bilateral_unk           0
dtype: int64

In [25]:
hl_data.mean().round(2).sort_values(ascending=False)

bilateral_snhl      0.78
unilateral_snhl     0.04
bilateral_cond      0.04
assymetrical        0.03
bilateral_ansd      0.03
unilateral_cond     0.02
bilateral_mixed     0.02
unilateral_unk      0.00
unilateral_mixed    0.00
unilateral_ansd     0.00
bilateral_unk       0.00
bilateral_normal    0.00
dtype: float64

In [26]:
tech_data = autism_subset.set_index('study_id')[['bilateral_ha',
                     'bilateral_ci',
                     'bimodal',
                     'bilateral_other',
                     'unilateral_ha',
                     'unilateral_ci',
                     'unilateral_other']]

In [27]:
tech_data.sum().astype(int).sort_values(ascending=False)

bilateral_ci        29190
bilateral_ha         9135
bimodal              2800
unilateral_ha        1227
unilateral_ci         974
bilateral_other       153
unilateral_other       69
dtype: int64

In [28]:
tech_data.mean().round(2).sort_values(ascending=False)

bilateral_ci        0.66
bilateral_ha        0.21
bimodal             0.06
unilateral_ha       0.03
unilateral_ci       0.02
unilateral_other    0.00
bilateral_other     0.00
dtype: float64

## Hearing Loss

In [30]:
autism_subset.columns[autism_subset.columns.str.contains('_hl_')]

Index(['degree_hl_ad', 'type_hl_ad', 'degree_hl_as', 'type_hl_as',
       'bilateral_hl_normal', 'bilateral_hl_slight', 'bilateral_hl_mild',
       'bilateral_hl_moderate', 'bilateral_hl_modsev', 'bilateral_hl_severe',
       'bilateral_hl_profound', 'unilateral_hl_slight', 'unilateral_hl_mild',
       'unilateral_hl_moderate', 'unilateral_hl_modsev',
       'unilateral_hl_severe', 'unilateral_hl_profound'],
      dtype='object')

In [31]:
hl_data = autism_subset.set_index('study_id')[['bilateral_hl_normal', 'bilateral_hl_slight', 'bilateral_hl_mild',
       'bilateral_hl_moderate', 'bilateral_hl_modsev', 'bilateral_hl_severe',
       'bilateral_hl_profound', 'unilateral_hl_slight', 'unilateral_hl_mild',
       'unilateral_hl_moderate', 'unilateral_hl_modsev',
       'unilateral_hl_severe', 'unilateral_hl_profound']]

In [32]:
hl_data.sum().astype(int).sort_values(ascending=False)

bilateral_hl_profound     16765
bilateral_hl_moderate      4215
bilateral_hl_modsev        3288
bilateral_hl_severe        2806
bilateral_hl_mild          1990
unilateral_hl_profound      659
unilateral_hl_modsev        656
unilateral_hl_severe        538
unilateral_hl_moderate      445
bilateral_hl_slight         430
unilateral_hl_mild          298
unilateral_hl_slight        131
bilateral_hl_normal          40
dtype: int64

In [33]:
hl_data.mean().round(2).sort_values(ascending=False)

bilateral_hl_profound     0.38
bilateral_hl_moderate     0.09
bilateral_hl_modsev       0.07
bilateral_hl_severe       0.06
bilateral_hl_mild         0.04
unilateral_hl_profound    0.01
unilateral_hl_severe      0.01
unilateral_hl_modsev      0.01
unilateral_hl_moderate    0.01
unilateral_hl_mild        0.01
bilateral_hl_slight       0.01
unilateral_hl_slight      0.00
bilateral_hl_normal       0.00
dtype: float64

Missing one or both hearing loss values (left/right)

In [34]:
(autism_subset[['degree_hl_ad', 'degree_hl_as']].isnull().sum(1) > 0).sum()

1042

## Summary of scores by domain

In [35]:
test_scores = lsl_dr[inclusion_mask].copy()

In [36]:
test_scores.loc[(test_scores.domain=='Language') & (test_scores.test_type=='receptive'), 
           'domain'] = 'Receptive Language'
test_scores.loc[(test_scores.domain=='Language') & (test_scores.test_type=='expressive'), 
           'domain'] = 'Expressive Language'

In [37]:
test_scores.groupby('domain').score.describe().round(1)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
domain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Articulation,4941.0,82.0,19.8,0.0,69.0,84.0,98.0,128.0
Expressive Language,8585.0,84.1,20.6,0.0,69.0,85.0,99.0,150.0
Expressive Vocabulary,6780.0,91.6,19.6,0.0,79.0,93.0,105.0,150.0
Language,8733.0,84.3,21.6,0.0,68.0,84.0,100.0,250.0
Receptive Language,8511.0,86.6,20.1,0.0,73.0,87.0,101.0,150.0
Receptive Vocabulary,6979.0,89.3,18.4,0.0,77.0,90.0,102.0,160.0


In [38]:
test_scores.domain.unique()

array(['Expressive Vocabulary', 'Receptive Language',
       'Expressive Language', 'Language', 'Articulation',
       'Receptive Vocabulary'], dtype=object)

In [39]:
test_scores['domain_labels'] = test_scores.domain.replace({'Expressive Vocabulary': 'Expressive\nVocabulary', 
                                                           'Receptive Vocabulary': 'Receptive\nVocabulary',
                                                           'Receptive Language': 'Receptive\nLanguage', 
                                                           'Expressive Language': 'Expressive\nLanguage', 
                                                           'Language': 'Total\nLanguage'})

In [None]:
sns.catplot("domain_labels", y="score", data=test_scores, kind="box", color='white', fliersize=0)
sns.swarmplot("domain_labels", y="score", data=test_scores, color='grey', alpha=0.5)
fig = plt.gcf()
fig.set_size_inches(12, 5)
fig.axes[0].set_xlabel('');

## Domain-specific summaries

Age of amplification greater than 6mo.

In [41]:
(test_scores.assign(age_amp_greater_6=test_scores.age_amp>6)
         .groupby(['domain', 'age_amp_greater_6'])
         .score.describe())

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
domain,age_amp_greater_6,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Articulation,False,2260.0,84.341593,19.095625,39.0,72.0,87.0,100.0,128.0
Articulation,True,2681.0,80.031332,20.138896,0.0,66.0,82.0,97.0,122.0
Expressive Language,False,4334.0,88.29234,20.498308,0.0,74.0,89.0,103.0,150.0
Expressive Language,True,4251.0,79.918843,19.748376,9.0,65.0,79.0,94.0,150.0
Expressive Vocabulary,False,3182.0,93.758014,20.00028,0.0,80.0,95.0,108.0,146.0
Expressive Vocabulary,True,3598.0,89.757921,19.105142,0.0,78.0,90.0,102.0,150.0
Language,False,4366.0,88.308978,21.725678,0.0,74.0,89.0,103.0,250.0
Language,True,4367.0,80.219831,20.76927,40.0,64.0,79.0,95.0,186.0
Receptive Language,False,4313.0,90.11106,20.035901,0.0,76.0,91.0,104.0,150.0
Receptive Language,True,4198.0,83.028109,19.623394,11.0,68.0,83.0,98.0,150.0


Age of enrollment greater than 6mo.

In [42]:
(test_scores.assign(age_enroll_greater_6=test_scores.age_int>6)
         .groupby(['domain', 'age_enroll_greater_6'])
         .score.describe())

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
domain,age_enroll_greater_6,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Articulation,False,2356.0,83.515705,19.362016,0.0,71.0,86.0,99.0,128.0
Articulation,True,2585.0,80.623985,20.064568,3.0,67.0,83.0,97.0,122.0
Expressive Language,False,4306.0,87.093126,20.593864,26.0,73.0,87.0,102.0,150.0
Expressive Language,True,4279.0,81.180416,20.096731,0.0,67.0,81.0,95.0,150.0
Expressive Vocabulary,False,3278.0,92.895973,20.074317,0.0,79.0,94.0,107.0,146.0
Expressive Vocabulary,True,3502.0,90.455168,19.133892,0.0,78.0,91.0,103.0,150.0
Language,False,4358.0,87.104865,21.94684,19.0,72.0,87.0,102.0,250.0
Language,True,4375.0,81.434057,20.93861,0.0,66.0,81.0,96.0,186.0
Receptive Language,False,4296.0,89.185754,20.097943,27.0,75.0,90.0,103.0,150.0
Receptive Language,True,4215.0,83.999763,19.858344,0.0,69.0,84.0,98.0,150.0


(Non-)Profound hearing loss

In [43]:
(test_scores.assign(profound_hl=test_scores.degree_hl==6)
         .groupby(['domain', 'profound_hl'])
         .score.describe())

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
domain,profound_hl,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Articulation,False,2381.0,84.438891,19.422745,3.0,72.0,87.0,100.0,126.0
Articulation,True,2560.0,79.737109,19.850379,0.0,67.0,82.0,96.0,128.0
Expressive Language,False,4359.0,89.585685,19.668943,9.0,75.5,90.0,104.0,150.0
Expressive Language,True,4226.0,78.535258,19.944415,0.0,63.0,77.0,92.0,146.0
Expressive Vocabulary,False,3272.0,95.301345,18.699942,0.0,84.0,96.0,108.0,147.0
Expressive Vocabulary,True,3508.0,88.215792,19.86237,0.0,76.0,88.0,101.0,150.0
Language,False,4458.0,89.930013,20.793633,40.0,76.0,90.0,104.0,250.0
Language,True,4275.0,78.355322,20.900556,0.0,62.0,78.0,93.0,186.0
Receptive Language,False,4299.0,91.34729,19.470674,11.0,78.0,92.0,105.0,150.0
Receptive Language,True,4212.0,81.789886,19.675321,0.0,67.0,81.0,96.0,150.0


Parent with(out) hearing loss

In [44]:
test_scores.loc[test_scores.parent_hearing_loss=='Unknown', 'parent_hearing_loss'] = np.nan
(test_scores.assign(parent_with_hl=test_scores.parent_hearing_loss.isin(['Both parents have hearing loss',
                                                                       'Father has hearing loss',
                                                                       'Mother has hearing loss']))
         .groupby(['domain', 'parent_with_hl'])
         .score.describe())

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
domain,parent_with_hl,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Articulation,False,4686.0,81.969484,19.806053,0.0,69.0,84.0,98.0,128.0
Articulation,True,255.0,82.615686,19.392378,39.0,69.5,84.0,99.0,117.0
Expressive Language,False,8178.0,84.030203,20.594602,0.0,69.0,85.0,99.0,150.0
Expressive Language,True,407.0,86.474201,19.734536,45.0,73.0,87.0,101.0,144.0
Expressive Vocabulary,False,6424.0,91.513543,19.637136,0.0,79.0,92.0,105.0,150.0
Expressive Vocabulary,True,356.0,93.831461,19.410458,0.0,82.0,96.0,107.0,138.0
Language,False,8319.0,84.124053,21.571963,0.0,68.0,84.0,100.0,250.0
Language,True,414.0,87.074879,22.671391,40.0,71.0,87.0,100.0,195.0
Receptive Language,False,8105.0,86.556817,20.190712,0.0,72.0,87.0,101.0,150.0
Receptive Language,True,406.0,87.827586,19.216372,47.0,74.25,88.0,100.75,139.0


Mother with(out) college education

In [45]:
test_scores.groupby(['domain', 'mother_college']).score.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
domain,mother_college,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Articulation,0,2961.0,80.165147,20.031447,0.0,67.0,82.0,97.0,121.0
Articulation,1,1980.0,84.75101,19.084295,0.0,73.0,87.0,100.0,128.0
Expressive Language,0,4998.0,80.92517,19.358514,0.0,67.0,81.0,94.0,150.0
Expressive Language,1,3587.0,88.633956,21.332326,0.0,73.0,90.0,104.0,150.0
Expressive Vocabulary,0,4000.0,87.8195,18.939934,0.0,76.0,89.0,100.0,146.0
Expressive Vocabulary,1,2780.0,97.12554,19.30624,0.0,84.0,98.0,110.0,150.0
Language,0,5099.0,80.658168,20.484267,0.0,65.0,81.0,95.0,209.0
Language,1,3634.0,89.323335,22.18474,0.0,73.25,90.0,105.0,250.0
Receptive Language,0,4967.0,83.281659,19.144463,11.0,69.0,83.0,97.0,145.0
Receptive Language,1,3544.0,91.292607,20.583306,0.0,77.0,92.0,106.0,150.0
