# Suicide and fatal overdoses data questions

This notebook takes a cursory look at fatal overdose and suicide data. Currently,
the presence of previous drug history and suicidality is determined solely from ambulance
data. A future iteration should aggregate this historical data more carefully
from additional data sources.

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import sys
sys.path.append('../src/')

In [None]:
import pandas as pd
import sqlalchemy
import json
from utils.helpers import get_database_connection, get_events
from dateutil.relativedelta import relativedelta
import seaborn as sns
import matplotlib.pyplot as plt
import csv
import os
import numpy as np

In [None]:
# Obtain the database connection object that the queries in this notebook run against
# (helper imported from utils.helpers).
db_conn = get_database_connection()

In [None]:
# TODO: take a join of the old and new matched joids, and recalculate num_client_rows for the first-time-interaction analysis.
# The SQL below is kept as a reference query only; it is not executed in this cell.
"""
-- nr of old joids that dont appear in new matches
with old_matches as 
(
select joid, matchdate 
from clean.jocojococlient j 
where matchdate = '2022-05-23'
),
new_matches as 
(
select joid, matchdate
from clean.jocojococlient j 
where matchdate = '2022-06-26'
),
joined as 
(
select coalesce(n.joid, o.joid),
o.matchdate as oldmatchdate,
n.matchdate as newmatchdate
from new_matches n
full outer join old_matches o on n.joid = o.joid
where (o.matchdate is not null) and (n.matchdate is null)
)
select count(*)
from joined
"""


In [None]:
# Order of items below: 1) county abbreviation, 2) medical examiner data matching id, 3) ambulance table name, 4) ambulance id, 5) client to ambulance matching id
jc_names = ('jc', 'id', 'jocomedactincidents', 'hash_rcdid', 'hash_sourceid')
dc_names = ('dc', 'casenum', 'joco110hsccclientmisc2eaimpression', 'clientid', 'sourceid')
dfs = []
# This variant joins the medical examiner tables on joid and unions both counties inside `mex`,
# so only the ambulance table differs between iterations; `county` and `table_id` are unpacked
# but not used by the query.
for county, table_id, ambulance_table, ambulance_sourceid, client_sourceid in [jc_names, dc_names]:
    # Both halves of the UNION must expose the same five columns and read from the
    # schema-qualified client table restricted to the same match date.
    q = f"""
    with mex as 
    (
    select 
        client.joid,
        jcmex.suicide,
        jcmex.overdosed,
        client.dob,
        jcmex.dateofdeath
    from clean.jocojococlient client
    join clean.jocojcmexoverdosessuicides jcmex
    on client.joid = jcmex.joid
    where client.matchdate = '2022-06-26'
    UNION
    select
        client.joid,
        dcmex.suicide,
        dcmex.overdosed,
        client.dob,
        dcmex.dateofdeath
    from clean.jocojococlient client
    join clean.jocodcmexoverdosessuicides dcmex
    on client.joid = dcmex.joid
    where client.matchdate = '2022-06-26'
    ),
    amb as 
    (
    select
        client.joid,
        ambulance.suicide_attempt_flag,
        ambulance.suicidal_flag,
        ambulance.drug_flag
    from clean.jocojococlient client
    join clean.{ambulance_table} ambulance
    on client.{client_sourceid} = ambulance.{ambulance_sourceid}
    ),
    num_rows as 
    (
    select
        joid,
        count(*) as num_client_rows
    from
        clean.jocojococlient client
    where
        client.matchdate = '2022-06-26'
    group by joid
    )

    select distinct 
        mex.joid,
        mex.suicide,
        mex.overdosed,
        mex.dob,
        mex.dateofdeath,
        amb.suicide_attempt_flag,
        amb.suicidal_flag,
        amb.drug_flag,
        num_client_rows
    from mex
    left join amb
    on mex.joid = amb.joid
    left join num_rows on mex.joid = num_rows.joid
    """

    # pd.read_sql (used by the other cells) returns a frame with named columns.
    dfs.append(pd.read_sql(q, db_conn))

### Start here

In [None]:
# Distinct (joid, dateofdeath) pairs for suicides and fatal overdoses from both
# medical examiner tables. AND binds tighter than OR in SQL, so the
# (suicide or overdosed) test is parenthesized in both halves of the UNION;
# otherwise the joid-is-not-null filter would not apply to suicide rows.
q = """
    -- joids in the medical examiner data 
    select distinct 
        jcmex.joid,
        dateofdeath
    from clean.jocojcmexoverdosessuicides jcmex
    where (suicide or overdosed)
    and joid is not null
    UNION
    select distinct
        dcmex.joid,
        dateofdeath
    from clean.jocodcmexoverdosessuicides dcmex
    where (suicide or overdosed)
    and joid is not null
"""
mex_df = pd.read_sql(q, db_conn)

In [None]:
# All joids present in the medical examiner data
# NOTE(review): mex_df is distinct on (joid, dateofdeath), so a joid with more than one
# recorded date of death would appear more than once in this list - confirm whether
# downstream counts should use unique joids.
mex_joids = list(mex_df['joid'].values)
# mex_suicide_joids = list(mex_df[mex_df['suicide'] == True]['joid'].values)

In [None]:
# Number of entries in mex_joids (shown as the cell output).
len(mex_joids)

In [None]:
# Quote every medical examiner joid for the SQL IN (...) list below.
mex_joids_str = ', '.join(f"'{mex_joid}'" for mex_joid in mex_joids)

db_conn = get_database_connection()
# For each medical examiner joid, count the client rows whose source is not a
# medical examiner table (sources such as JOCODCMEXOVERDOSESSUICIDES.CASENUM and
# JOCOJCMEXOVERDOSESSUICIDES.ID match the excluded pattern). joids with no such
# rows do not appear in the result at all.
q = f"""
select joid, count(source)
from clean.jocojococlient client
where joid in ({mex_joids_str})
and source not similar to '.+[JD]CMEX.+'
group by joid
order by count(source) asc
"""
query_df = pd.read_sql(q, db_conn)

In [None]:
# Number of joids with more than two client rows from non-medical-examiner sources.
sum(query_df['count'] > 2)

In [None]:
# Number of rows (one per joid) returned by the query above.
len(query_df.index)

In [None]:
# joids that have at least one client row from a source other than the medical examiner tables.
joids_multi_interaction = list(query_df['joid'].values)

In [None]:
# Number of joids with data outside the medical examiner tables (shown as the cell output).
len(joids_multi_interaction)

In [None]:
# Medical examiner joids that never show up with a non-medical-examiner source.
joids_single_interaction = set(mex_joids).difference(joids_multi_interaction)
# Every multi-interaction joid must itself come from the medical examiner data.
assert set(joids_multi_interaction).issubset(mex_joids)

In [None]:
# Display the medical examiner joids that have no client rows from any other source.
joids_single_interaction

In [None]:
# Share of unique medical examiner joids with no client rows from any other source.
# The denominator is the number of unique joids (joids_single_interaction is a subset
# of them), not that total plus the numerator.
num_mex_joids = len(set(mex_joids))
num_single_interaction = len(joids_single_interaction)
percent = num_single_interaction / num_mex_joids if num_mex_joids else float('nan')
print(f' joids in the medical examiner data, {num_single_interaction} out of {num_mex_joids} ({percent:.2%}) have no previous data.')

In [None]:
# Check which multi-interaction joids have no events recorded before their date of death

# num_failures = 0
# for joid in list(joids_multi_interaction):
#     q = f"""
    
#     -- joids in the medical examiner data 
#     with medex_joids as
#     (
#     select distinct 
#         jcmex.joid, dateofdeath
#     from clean.jocojcmexoverdosessuicides jcmex
#     where suicide or overdosed
#     and joid is not null
#     UNION
#     select distinct
#         dcmex.joid, dateofdeath
#     from clean.jocodcmexoverdosessuicides dcmex
#     where (suicide or overdosed)
#     and joid is not null
#     ),
#     events_before_death as 
#     (
#     select *, medex_joids.dateofdeath
#     from semantic.client_events ce
#     join medex_joids on medex_joids.joid = ce.joid
#     where ce.joid = '{joid}'
#     and event_date < dateofdeath
#     limit 5
#     )
#     select count(*)
#     from events_before_death
#     """
#     df = pd.read_sql(q, db_conn)
#     # df = df.sort_values(by='event_date')
#     # print('testing joid ', joid)
#     if not all(df['count'] > 0):
#         num_failures += 1

In [None]:
# Sanity check on the first ten multi interaction joids: each one must appear in the
# client table with a source other than the medical examiner data.
# Only the client-table half is checked here; membership in the medical examiner data
# is not re-queried by this loop.
db_conn = get_database_connection()
for joid in list(joids_multi_interaction)[:10]:
    q = f"""
    select distinct source
    from clean.jocojococlient client
    where source not similar to '.+[JD]CMEX.+'
    and joid = '{joid}'
    limit 5;
    """
    df = pd.read_sql(q, db_conn)
    # No rows means this joid has no non-medical-examiner source after all.
    if df.empty:
        print('failed on joid ', joid)

---
End here 

---

In [None]:
# Position of the first repeated joid in mex_df, or None when every joid is unique.
# np.argmax alone returns 0 for an all-False mask, which would wrongly report row 0
# as a duplicate.
duplicated_mask = mex_df['joid'].duplicated()
dup_joid = int(np.argmax(duplicated_mask)) if duplicated_mask.any() else None
print(dup_joid)
# Cell output: the repeated joid itself (None when there is no duplicate).
mex_df['joid'].iloc[dup_joid] if dup_joid is not None else None

In [None]:
# Order of items below: 1) county abbreviation, 2) medical examiner data matching id, 3) ambulance table name, 4) ambulance id, 5) client to ambulance matching id
jc_names = ('jc', 'id', 'jocomedactincidents', 'hash_rcdid', 'hash_sourceid')
dc_names = ('dc', 'casenum', 'joco110hsccclientmisc2eaimpression', 'clientid', 'sourceid')
dfs = []
# One query per county: medical examiner deaths (`mex`) left-joined to that county's
# ambulance flags (`amb`) and to the number of client rows per joid (`num_rows`).
# Frames are appended in the order Johnson, Douglas.
for county, table_id, ambulance_table, ambulance_sourceid, client_sourceid in [jc_names, dc_names]:
    q = f"""
    with mex as 
    (
    select 
        client.joid,
        j.suicide,
        j.overdosed,
        client.dob,
        j.dateofdeath
    from clean.jocojococlient client
    join clean.joco{county}mexoverdosessuicides j
    on client.sourceid = j.{table_id}
    where client.matchdate = '2022-06-26'
    ),
    amb as 
    (
    select
        client.joid,
        ambulance.suicide_attempt_flag,
        ambulance.suicidal_flag,
        ambulance.drug_flag
    from clean.jocojococlient client
    join clean.{ambulance_table} ambulance
    on client.{client_sourceid} = ambulance.{ambulance_sourceid}
    ),
    num_rows as 
    (
    select
        joid,
        count(*) as num_client_rows
    from
        clean.jocojococlient client
    where
        client.matchdate = '2022-06-26'
    group by joid
    )

    select distinct 
        mex.joid,
        mex.suicide,
        mex.overdosed,
        mex.dob,
        mex.dateofdeath,
        amb.suicide_attempt_flag,
        amb.suicidal_flag,
        amb.drug_flag,
        num_client_rows
    from mex
    left join amb
    on mex.joid = amb.joid
    left join num_rows on mex.joid = num_rows.joid
    """

    # pd.read_sql (used by the other cells) returns a frame with named columns,
    # which the later cells rely on when indexing by column name.
    dfs.append(pd.read_sql(q, db_conn))

In [None]:
# Tag the first frame as Johnson ('j') and the second as Douglas ('d'),
# then stack them into one frame with a fresh index.
for frame_position, letter in enumerate(('j', 'd')):
    county_frame = dfs[frame_position]
    county_frame['county'] = [letter] * len(county_frame.index)
both_df = pd.concat(dfs, ignore_index=True)

In [None]:
# Display the combined frame for both counties.
both_df

## Suicide and overdose data

In [None]:
def list_to_csv(filename, values, csv_dir='csv/'):
    """ Write values to a single-column csv file, one value per row.

    Args:
        filename: Name of the file to create inside ``csv_dir``.
        values: Iterable of values; each one becomes its own one-cell row.
        csv_dir: Directory the file is written to; created if it does not exist.
    """
    os.makedirs(csv_dir, exist_ok=True)
    path = os.path.join(csv_dir, filename)
    # newline='' hands line-ending control to the csv module, as its documentation
    # requires; without it, platforms that translate newlines get blank rows.
    with open(path, 'w', newline='') as f:
        csv.writer(f).writerows([value] for value in values)

### Suicides where a previous suicide attempt (or suicidality in general) is known

In [None]:
for county in ['johnson', 'douglas']:
    county_letter = county[0]
    county_df = both_df[both_df['county'] == county_letter]
    # both_df can hold several rows per person (one per distinct combination of
    # ambulance flags), so people are counted by unique joid rather than by row.
    suicides_df = county_df[county_df['suicide'] == True]
    tot_suicides = suicides_df['joid'].nunique()
    num_prev_attempts = suicides_df.loc[suicides_df['suicide_attempt_flag'] == True, 'joid'].nunique()
    # The below includes suicidal ideation, self-harm, etc.
    num_prev_suicidal = suicides_df.loc[suicides_df['suicidal_flag'] == True, 'joid'].nunique()
    # Shares are taken over all suicides in the county; nan when there are none.
    share_prev_attempts = num_prev_attempts / tot_suicides if tot_suicides else float('nan')
    share_prev_suicidal = num_prev_suicidal / tot_suicides if tot_suicides else float('nan')
    print(f'{num_prev_attempts} out of {tot_suicides} ({share_prev_attempts:.1%}) {county} county residents that committed suicide had a previous suicide attempt (tracked by ambulance data)')
    print(f'{num_prev_suicidal} out of {tot_suicides} ({share_prev_suicidal:.1%}) {county} county residents that committed suicide were previously suicidal (tracked by ambulance data)')
    print('-'*20)

### Drug overdoses and previous drug history

In [None]:
for county in ['johnson', 'douglas']:
    county_letter = county[0]
    county_df = both_df[both_df['county'] == county_letter]
    # both_df can hold several rows per person (one per distinct combination of
    # ambulance flags), so people are counted by unique joid rather than by row.
    overdoses_df = county_df[county_df['overdosed'] == True]
    tot_fatal_ods = overdoses_df['joid'].nunique()
    num_od_with_drug_history = overdoses_df.loc[overdoses_df['drug_flag'] == True, 'joid'].nunique()
    # Share is taken over all fatal overdoses in the county; nan when there are none.
    share_with_drug_history = num_od_with_drug_history / tot_fatal_ods if tot_fatal_ods else float('nan')
    print(f'{num_od_with_drug_history} out of {tot_fatal_ods} ({share_with_drug_history:.1%}) {county} county residents that fatally overdosed had previous drug history (as tracked by ambulance data)')
    print('-'*20)

### First-time interaction with the system is suicide 

In [None]:
for county in ['johnson', 'douglas']:
    county_letter = county[0]
    # Keep one row per person: both_df repeats a joid for every distinct combination of
    # ambulance flags, while num_client_rows is computed per joid and so is the same
    # on each of those rows.
    county_df = both_df[(both_df['county'] == county_letter) & (both_df['suicide'] == True)].drop_duplicates(subset='joid')
    know_only_suicide = county_df[county_df['num_client_rows'] == 1]

    # Export to csv
    list_to_csv(f'{county}_only_know_suicide.csv', know_only_suicide['joid'].tolist())

    num_only_know_suicide = len(know_only_suicide.index)
    num_know_more_than_just_suicide = len(county_df[county_df['num_client_rows'] > 1].index)
    num_suicides = num_only_know_suicide + num_know_more_than_just_suicide
    # nan instead of ZeroDivisionError when the county has no suicides.
    share_only_know_suicide = num_only_know_suicide / num_suicides if num_suicides else float('nan')
    print(f'In {county} county, {num_only_know_suicide} of {num_suicides} ({share_only_know_suicide:.1%}) total suicides have no prior interaction with the system')

### First-time interaction with the system is drug overdose 

In [None]:
for county in ['johnson', 'douglas']:
    county_letter = county[0]
    # Keep one row per person: both_df repeats a joid for every distinct combination of
    # ambulance flags, while num_client_rows is computed per joid and so is the same
    # on each of those rows.
    county_df = both_df[(both_df['county'] == county_letter) & (both_df['overdosed'] == True)].drop_duplicates(subset='joid')
    only_know_od_df = county_df[county_df['num_client_rows'] == 1]
    num_only_know_od = len(only_know_od_df.index)

    # Export to csv
    list_to_csv(f'{county}_only_know_od.csv', only_know_od_df['joid'].tolist())

    num_know_more_than_just_od = len(county_df[county_df['num_client_rows'] > 1].index)
    num_ods = num_only_know_od + num_know_more_than_just_od
    # nan instead of ZeroDivisionError when the county has no fatal overdoses.
    share_only_know_od = num_only_know_od / num_ods if num_ods else float('nan')
    print(f'In {county} county, {num_only_know_od} of {num_ods} ({share_only_know_od:.1%}) fatal overdoses have no prior interaction with the system (as tracked by ambulance data)')

### Suicide and overdoses by age

In [None]:
def get_age(death_date, dob):
    """ Compute a person's age in whole years on the day they died.

    Returns None when either date is None.
    """
    if dob is None or death_date is None:
        return None
    calendar_years = death_date.year - dob.year
    # Take one year off when the death falls before that year's birthday.
    birthday_not_reached = (death_date.month, death_date.day) < (dob.month, dob.day)
    return calendar_years - birthday_not_reached

In [None]:
# Age at death for every row, computed pairwise from date of death and date of birth.
both_df['age'] = list(map(get_age, both_df['dateofdeath'], both_df['dob']))

In [None]:
# True when the joid has exactly one client row, i.e. nothing beyond a single record is known.
both_df['single_event'] = both_df['num_client_rows'].eq(1)

In [None]:
# Apply seaborn's default theme to the plots drawn below.
sns.set()

#### We explore the age distribution of those who died by suicide or by fatal overdose, conditioned on (i) whether their death is their first interaction with the system and (ii) whether the death was through overdose.

In [None]:
# One age histogram per (death type, hue column, county) combination.
for death_type in ['suicide', 'overdosed']:
    for my_hue in ['single_event', 'overdosed']:
        for county in ['j', 'd']:
            print(f'Death type: {death_type}')
            if death_type == 'overdosed':
                print('(Not necessarily suicide)')
            county_name = 'Johnson County' if county == 'j' else 'Douglas County'
            # both_df can repeat a joid once per distinct combination of ambulance flags;
            # keep one row per person so the histogram counts people, not rows.
            data_df = both_df[(both_df['county'] == county) & (both_df[death_type] == True)].drop_duplicates(subset='joid')
            sns.histplot(data=data_df, x='age', stat='count', hue=my_hue, binwidth=5)
            plt.title(f'{county_name}')
            plt.show()

## Crosstab of features

In [None]:
# Display the combined frame; no crosstab is computed in this cell yet.
both_df