In [None]:
# Make the project's `src` directory importable (path is relative to the notebook's working directory).
import sys
sys.path.append('../src/')
# NOTE(review): `pd`, `np` and `get_database_connection` are used below but never imported
# explicitly in this notebook -- presumably they are all provided by this star import.
# Confirm, and consider importing them explicitly so their origin is visible.
from helpers.helpers import *

In [None]:
# Database connection handed to `pd.read_sql` below.
# NOTE(review): the connection is not closed anywhere in the visible notebook -- confirm
# whether the helper manages its lifetime.
conn = get_database_connection()

# Overview and summary
Brief exploration of the LMH data in Douglas County, looking into the frequency of various primary and secondary diagnoses.

In [None]:
# Read the whole diagnosis table into a DataFrame and preview its first rows.
query = "select * from clean.joco110hsccclientmisc2eadiagnosis"
df = pd.read_sql(sql=query, con=conn)
df.head()

In [None]:
# (number of rows, number of columns) of the loaded table.
df.shape

In [None]:
# Derive the admission year as a real column.
# Bracket assignment is required: `df.admission_year = ...` on a column that does not exist yet
# only sets a Python attribute on the DataFrame object (pandas emits a UserWarning) and never
# adds the column. `.dt.year` returns a Series aligned with `df`'s index.
df["admission_year"] = pd.to_datetime(df["admission_date"]).dt.year

# Bar chart of the number of rows per admission year (trailing `;` hides the Axes repr).
df.groupby("admission_year").size().plot.bar();

In [None]:
# Count the rows whose derived admission year is 1990.
# NOTE(review): the original remark suggests these rows carry a wrong admission date
# (the year looks misclassified) -- confirm against the source data.
(df.admission_year == 1990).sum() # One year seems to have been misclassified

# Primary and secondary diagnosis

In [None]:
# Frequency table per classification column: one row per class, with the row count in column `n`.
primary_classification = (
    df["primary_diagnosis_classification"]
    .value_counts()
    .reset_index(name="n")
)
secondary_classification = (
    df["secondary_diagnosis_classification"]
    .value_counts()
    .reset_index(name="n")
)

In [None]:
# Counts per primary diagnosis classification.
# Observation: the vast majority is not mental health / substance related.
primary_classification

In [None]:
# Counts per secondary diagnosis classification.
# Observation: the vast majority is not mental health / substance related.
secondary_classification

In [None]:
# Share of rows where the primary and secondary classification are equal, rounded to two decimals.
# Observation: most of the time the two classifications are the same.
np.round(
    df["primary_diagnosis_classification"]
    .eq(df["secondary_diagnosis_classification"])
    .mean(),
    2,
)

In [None]:
# Cases where the primary classification is 'OTHER' but the secondary one is a real,
# non-'OTHER' value -- i.e. the secondary classification potentially adds information.
# Missing secondary classifications are excluded explicitly: NaN compares unequal to
# everything, so without the notna() guard every row lacking a secondary classification
# would be counted as "different and potentially valuable".
condition_prim = ((df.primary_diagnosis_classification == 'OTHER') &
                  (df.secondary_diagnosis_classification != 'OTHER') &
                  df.secondary_diagnosis_classification.notna())
# First element of the shape is the number of such rows.
df[condition_prim].shape

In [None]:
# Cases where the secondary classification is 'OTHER' but the primary one is a real,
# non-'OTHER' value -- i.e. the primary classification potentially adds information.
# Missing primary classifications are excluded explicitly: NaN compares unequal to
# everything, so without the notna() guard every row lacking a primary classification
# would be counted as "different and potentially valuable".
condition_sec = ((df.secondary_diagnosis_classification == 'OTHER') &
                 (df.primary_diagnosis_classification != 'OTHER') &
                 df.primary_diagnosis_classification.notna())
# First element of the shape is the number of such rows.
df[condition_sec].shape

In [None]:
# The above shows that there are more cases where the secondary classification might be relevant
# than the primary classification (given that they are different)
# *TODO*: Get clarity on what is more accurate [ask Matt]

# Primary and secondary diagnosis (more details)

In [None]:
# Frequency table per detailed diagnosis column: `n` is the row count of each value and
# `prop` is that count divided by the total number of rows in `df`.
primary_diagnosis = (
    df["primarydiagnosis"]
    .value_counts()
    .reset_index(name="n")
    .assign(prop=lambda counts: counts["n"] / len(df))
)

secondary_diagnosis = (
    df["secondarydiagnosis"]
    .value_counts()
    .reset_index(name="n")
    .assign(prop=lambda counts: counts["n"] / len(df))
)

In [None]:
# 20 most frequent primary diagnoses.
# Observation: the top 20 do not include any mental health related issue
# except major depressive disorder, the 6th most frequent one.
primary_diagnosis.head(20)

In [None]:
# 20 most frequent secondary diagnoses.
# NOTE(review): the original comment here was a verbatim copy of the one on the
# primary-diagnosis cell ("The top 20 primary impressions do not include any mental health
# related issue except major depressive disorder as the 6th most occurring issue") --
# confirm whether that observation also holds for the secondary diagnoses shown below.
secondary_diagnosis.head(20)

In [None]:
# Count every (primary, secondary) diagnosis pair and list the most frequent pairs first.
# Observation: interestingly, the most frequent combination is MDD + suicidal ideation.
(
    df.groupby(["primarydiagnosis", "secondarydiagnosis"])
    .size()
    .reset_index(name="n")
    .sort_values("n", ascending=False)
)