In [1]:
# Import modules and set options
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

sns.set(context='notebook', style='ticks')

In [2]:
lsl_dr = (pd.read_csv('../data/clean/lsl_dr.csv', index_col=0, low_memory=False)
                  .rename({'onset_1':'identify_mo'}, axis=1))

In [3]:
lsl_dr.shape

(59512, 231)

Exclusions

In [4]:
other_etiology = (lsl_dr[['etiology_3___2', 'etiology_3___4', 'etiology_3___5', 'etiology_3___6', 'etiology_3___9',
       'etiology_oth___1', 'etiology_oth___3', 'etiology_oth___4', 'etiology_oth___8', 'etiology_oth___9']]
                      .sum(1).astype(bool))

In [5]:
inclusion_mask = (~lsl_dr.non_english.astype(bool) 
                  & (lsl_dr.degree_hl!=0)
                  & ((lsl_dr.etiology_2==0)
                  | (lsl_dr.etiology_2.isnull() & ~other_etiology)))

In [6]:
inclusions = lsl_dr[inclusion_mask]
inclusions.drop_duplicates(subset='study_id').shape

(1326, 231)

In [7]:
age_mask = (lsl_dr.age_test>=48) & (lsl_dr.age_test<60)

In [8]:
data_4yo = inclusions[age_mask].drop_duplicates(subset='study_id').copy()

  """Entry point for launching an IPython kernel.


In [9]:
data_4yo.shape

(563, 231)

## Demographics

In [10]:
data_4yo.male.replace({1: 'Male', 0: 'Female', np.nan: 'Missing'}).value_counts()

Male      286
Female    277
Name: male, dtype: int64

In [11]:
data_4yo.race.replace({0:'White', 1:'Black', 2:'Hispanic', 3:'Asian', 4:'Other', np.nan:'Missing'}).value_counts()

White       372
Black        63
Hispanic     47
Other        45
Asian        27
Missing       9
Name: race, dtype: int64

In [12]:
(data_4yo.premature_age.replace({9:None})==8).replace({True: '>=36 weeks', False: '<36 weeks', 
                                                            np.nan: 'Unknown'}).value_counts()

>=36 weeks    412
<36 weeks     151
Name: premature_age, dtype: int64

In [13]:
data_4yo.sib.replace({0:'1', 1:'2', 2:'3', 3:'4+', np.nan:'Missing'}).value_counts().sort_index()

1          126
2          228
3          116
4+          68
Missing     25
Name: sib, dtype: int64

In [14]:
ed_lookup = {0:"8th grade or less",
1:"Some high school",
2:"High school diploma/GED",
3:"Some college",
4:"Bachelor's degree",
5:"Post graduate degree",
6:"Unknown",
np.nan:"Unknown"}

data_4yo.mother_ed.replace(ed_lookup).value_counts()

High school diploma/GED    168
Some high school           146
Unknown                    141
Some college                96
8th grade or less           12
Name: mother_ed, dtype: int64

In [15]:
data_4yo.father_ed.replace(ed_lookup).value_counts()

Unknown                    175
Bachelor's degree          134
Some college                97
Post graduate degree        81
High school diploma/GED     65
Some high school            11
Name: father_ed, dtype: int64

In [16]:
family_lookup = {0:"Ideal Participation",
1:"Good Participation",
2:"Average Participation",
3:"Below Average",
4:"Limited Participation",
np.nan:"Missing"}

data_4yo.family_inv.replace(family_lookup).value_counts()

Ideal Participation      184
Good Participation       123
Average Participation    114
Missing                   90
Below Average             42
Limited Participation     10
Name: family_inv, dtype: int64

In [17]:
data_4yo[["age_diag", "age_amp", "age_int", "age"]].describe()

Unnamed: 0,age_diag,age_amp,age_int,age
count,430.0,425.0,385.0,556.0
mean,9.463953,14.122353,15.119481,25.021583
std,12.918581,13.80983,13.577373,16.267125
min,0.0,0.0,0.0,0.0
25%,1.0,3.0,4.0,10.0
50%,3.0,8.0,12.0,24.0
75%,17.0,22.0,24.0,36.0
max,55.0,65.0,60.0,88.0


In [18]:
data_4yo.otherserv.replace({0: "OPTION + outside", 1: "OPTION only", 2: "Missing"}).value_counts()

OPTION only         224
Missing              70
OPTION + outside     62
Name: otherserv, dtype: int64

## Hearing loss

In [28]:
data_4yo.degree_hl_ad.isnull().sum()

43

In [29]:
data_4yo.degree_hl_as.isnull().sum()

43

In [19]:
data_4yo.degree_hl_ad.value_counts()

6.0    230
4.0     83
5.0     76
3.0     76
2.0     37
0.0      9
1.0      9
Name: degree_hl_ad, dtype: int64

In [20]:
data_4yo.degree_hl_as.value_counts()

6.0    230
4.0     90
3.0     76
5.0     68
2.0     28
0.0     17
1.0     11
Name: degree_hl_as, dtype: int64

In [21]:
hl_data = data_4yo[['bilateral_snhl',
                     'bilateral_ansd',
                     'bilateral_mixed',
                     'bilateral_cond',
                     'bilateral_normal',
                     'bilateral_unk',
                     'unilateral_snhl',
                     'unilateral_ansd',
                     'unilateral_mixed',
                     'unilateral_cond',
                     'unilateral_unk',
                     'assymetrical']]

Individuals with no hearing loss type

In [54]:
hl_data[hl_data.sum(1)==0].index

Int64Index([ 3949,  6604,  6659,  7021,  7179,  7183,  8650,  9675,  9836,
            10004, 10548, 10647, 10667, 10855, 10935, 12420, 13883, 14059,
            16393, 17142, 17352, 17457, 18019, 18024, 18040, 18080, 18679,
            19644, 19909, 20156, 21459, 21470, 22366, 25381, 25658, 28134,
            29697, 29944, 30472, 30489, 30530, 30551, 30711, 31754, 32474,
            32665, 33389, 33395, 43057, 43215, 47019, 49610],
           dtype='int64')

In [22]:
hl_data.sum().astype(int).sort_values(ascending=False)

bilateral_snhl      400
bilateral_cond       30
assymetrical         19
bilateral_ansd       19
bilateral_mixed      15
unilateral_snhl      14
unilateral_cond      10
bilateral_normal      2
unilateral_mixed      1
unilateral_ansd       1
unilateral_unk        0
bilateral_unk         0
dtype: int64

In [23]:
hl_data.mean().round(2).sort_values(ascending=False)

bilateral_snhl      0.71
bilateral_cond      0.05
assymetrical        0.03
bilateral_mixed     0.03
bilateral_ansd      0.03
unilateral_cond     0.02
unilateral_snhl     0.02
unilateral_unk      0.00
unilateral_mixed    0.00
unilateral_ansd     0.00
bilateral_unk       0.00
bilateral_normal    0.00
dtype: float64

In [24]:
tech_data = data_4yo[['bilateral_ha',
                     'bilateral_ci',
                     'bimodal',
                     'bilateral_other',
                     'unilateral_ha',
                     'unilateral_ci',
                     'unilateral_other']]

In [25]:
tech_data.sum().astype(int).sort_values(ascending=False)

bilateral_ci        393
bilateral_ha        107
bimodal              39
unilateral_ha        10
unilateral_ci         9
bilateral_other       1
unilateral_other      0
dtype: int64

In [26]:
tech_data.mean().round(2).sort_values(ascending=False)

bilateral_ci        0.70
bilateral_ha        0.19
bimodal             0.07
unilateral_ci       0.02
unilateral_ha       0.02
unilateral_other    0.00
bilateral_other     0.00
dtype: float64

Individuals with no technology type

In [58]:
tech_data[tech_data.sum(1)==0].index

Int64Index([6351, 11404, 13688, 15107], dtype='int64')