# Suicide and fatal overdoses data questions

This notebook takes a cursory look at fatal overdose and suicide data. Currently,
the presence of previous drug history and suicidality is determined solely from ambulance
data. A future iteration should aggregate this historical data more carefully,
drawing on additional data sources.

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import sys
sys.path.append('../src/')

In [None]:
import pandas as pd
import sqlalchemy
import json
from utils.helpers import get_database_connection, get_events
from dateutil.relativedelta import relativedelta
import seaborn as sns
import matplotlib.pyplot as plt
import csv
import os
import numpy as np

In [None]:
# Open the database connection (helper imported from utils.helpers) that is
# later passed to pd.read_sql to run the notebook's SQL query.
db_conn = get_database_connection()

In [None]:
# Event types of interest.
# NOTE(review): this list is not referenced by the query below; confirm whether
# it was meant to filter semantic.client_events.
event_types = ['AMBULANCE', 'MENTAL HEALTH', 'DEATH', 'HOSPITAL', 'ARREST']
# The query returns one row per (joid, event_type) with:
#   * lifetime_events: table_a collapses semantic.client_events to one row per
#     (joid, event_date, event_type); lifetime_events_table then counts those
#     rows, i.e. the number of distinct event dates per person and event type.
#   * suicide / overdosed / suic_or_od: taken from the two
#     clean.joco*mexoverdosessuicides tables via left joins; coalesce picks the
#     first non-null value (jc table first, then dc) and falls back to false
#     for people found in neither table.
# NOTE(review): because coalesce stops at the first non-null value, a joid
# present in both tables gets the jc flags even if the dc flags differ.
# NOTE(review): if either table can hold several rows per joid, the left joins
# repeat that person's rows; confirm joid is unique in both tables.
query = """
with table_a as 
(
select joid, event_date, event_type, count(*) as num_events
from semantic.client_events
group by 1, 2, 3
),
lifetime_events_table as
(
select joid, event_type, count(*) as lifetime_events
from table_a
group by joid, event_type
),
mex_jc as
(
select joid, suicide, overdosed, (suicide or overdosed) as suic_or_od
from clean.jocojcmexoverdosessuicides j
),
mex_dc as 
(
select joid, suicide, overdosed, (suicide or overdosed) as suic_or_od
from clean.jocodcmexoverdosessuicides j
)

select 
    lea.*,
    coalesce(mjc.suicide, mdc.suicide, false) as suicide,
    coalesce(mjc.overdosed, mdc.overdosed, false) as overdosed,
    coalesce(mjc.suic_or_od, mdc.suic_or_od, false) as suic_or_od
from lifetime_events_table lea
left join mex_jc mjc on lea.joid = mjc.joid
left join mex_dc mdc on lea.joid = mdc.joid
"""

# Run the query and load the result into a DataFrame (long format: one row per
# person and event type, plus the three label columns).
df = pd.read_sql(query, db_conn)

In [None]:
# Reshape the long (joid, event_type) table into one row per person with one
# column per event type, holding that person's lifetime_events count
# (0 where the person has no events of that type).
# Passing values= restricts the aggregation to lifetime_events instead of also
# summing the boolean label columns and discarding them afterwards; the string
# 'sum' selects pandas' own sum rather than the Python builtin.
pt = df.pivot_table(
    index='joid',
    columns='event_type',
    values='lifetime_events',
    aggfunc='sum',
    fill_value=0,
)

In [None]:
# One label row per person, indexed by joid.
# Grouping by joid guarantees a unique index even if a person's rows in df
# carry conflicting suic_or_od values (the person is then labelled True if any
# row is True). A plain drop_duplicates on (joid, suic_or_od) would keep both
# rows in that case and duplicate the person when joined onto the pivot table.
labels_dedup = df.groupby('joid')['suic_or_od'].any().to_frame()

In [None]:
# Inspect the label table: index is joid, single column suic_or_od.
labels_dedup

In [None]:
# Number of people (rows of pt) with at least one DEATH event.
# The vectorised Series.sum() counts the True values without iterating the
# Series element by element in Python.
(pt['DEATH'] > 0).sum()

In [None]:
# Number of rows in the long table whose event type is DEATH; compare with the
# count taken from the pivot table as a sanity check.
# The vectorised Series.sum() avoids a Python-level loop over the Series.
(df['event_type'] == 'DEATH').sum()

In [None]:
# Attach each person's suic_or_od label to their per-event-type counts.
# Left merge on the joid index of both frames, so every row of pt is kept.
joined_df = pt.merge(
    labels_dedup,
    how='left',
    left_index=True,
    right_index=True,
)

In [None]:
# Preview the per-person table: one count column per event type plus the
# suic_or_od label.
joined_df.head()

In [None]:
# Show the full row of the person with the largest ARREST count
# (argmax returns the first position when several rows tie).
max_arrest_pos = joined_df['ARREST'].argmax()
joined_df.iloc[max_arrest_pos]

In [None]:
# Number of rows of joined_df with at least one DEATH event; this should match
# the same count on pt if the join did not add or drop rows.
# The vectorised Series.sum() avoids a Python-level loop over the Series.
(joined_df['DEATH'] > 0).sum()

## Aggregate information for true and false labels (suicide or fatal overdose)

In [None]:
# Summary statistics of the per-person event counts, split by label.
# Counts are "1 event per day in which there was _some_ interaction": the SQL
# query collapses same-day events of one type into a single row before counting.
pd.set_option('display.float_format', '{:.2f}'.format)
labeled_dfs = []   # rows of joined_df per label, in the order [True, False]
display_dfs = []   # matching summary tables, same order
for label_val in [True, False]:
    # Filter once and reuse the subset for everything below.
    df_where_label = joined_df[joined_df['suic_or_od'] == label_val]
    labeled_dfs.append(df_where_label)
    print(f'num rows: {len(df_where_label.index)}; label={label_val}')
    # describe() runs once; '50%' is the median row of its output.
    display_df = df_where_label.describe().loc[['mean', 'std', 'min', 'max', '50%']]
    display_dfs.append(display_df)
    # Mean/median slice, taken from the summary computed above rather than
    # from a second describe() call.
    heatmap_df = display_df.loc[['mean', '50%']]
    display(display_df)
# df1 is the summary for label=True, df2 the summary for label=False.
df1, df2 = display_dfs

In [None]:
# Element-wise difference of the two summary tables: statistics of the
# label=True group minus those of the label=False group.
print('label true minus label false')
diff_df = df1 - df2
# Show the difference computed above (previously it was recomputed inline);
# the DEATH column is left out of the displayed table.
display(diff_df.drop(columns=['DEATH']))

In [None]:
# Work on a copy so the raw counts in joined_df stay untouched when the event
# columns are normalized in place.
normalized_df = joined_df.copy()

In [None]:
# Preview the copy; when cells are run in order this still shows the raw,
# un-normalized counts.
normalized_df.head()

In [None]:
# Z-score every event-count column (all columns except DEATH and the label)
# against the mean and std of the whole table, i.e. both label groups together.
event_counts = normalized_df.drop(columns=['DEATH', 'suic_or_od'])
normalized_df[event_counts.columns] = (
    (event_counts - event_counts.mean()) / event_counts.std()
)

In [None]:
# Spot-check a single column of normalized_df (AMBULANCE).
normalized_df['AMBULANCE']

## Normalize the number of events for easier comparison

We normalize each event count using the mean and std of the whole population (both labels combined). Since the class with false labels is much larger, this mean and std are very close to the mean and std of the false-label population.

In [None]:
# Preview normalized_df again, now that the event-count columns have been
# normalized (assuming cells are run in order).
normalized_df.head()

In [None]:
# Split the normalized table by label. Both comparisons are kept explicit so
# that each subset contains exactly the rows equal to True / equal to False.
is_label_true = normalized_df['suic_or_od'] == True
is_label_false = normalized_df['suic_or_od'] == False
normed_df_true = normalized_df[is_label_true]
normed_df_false = normalized_df[is_label_false]

In [None]:
# Preview the label=True subset of the normalized table.
normed_df_true.head()

In [None]:
# Mean of the HOSPITAL column within the label=False subset.
normed_df_false['HOSPITAL'].mean()

In [None]:
# Per-column mean for the label=False subset, excluding DEATH and the label
# column itself.
normed_df_false.drop(columns=['DEATH', 'suic_or_od']).mean()

In [None]:
# Per-column mean for the label=True subset, excluding DEATH and the label
# column itself; compare with the label=False means.
normed_df_true.drop(columns=['DEATH', 'suic_or_od']).mean()

In [None]:
# Long format for plotting: one row per (person, event type) of the label=True
# subset, with the column name in 'variable' and its value in 'value'.
true_event_counts = normed_df_true.drop(columns=['DEATH', 'suic_or_od'])
melt = true_event_counts.melt()

### Plot comments 

Since the population with label=false is much larger, it essentially dominates the mean, and its mean normalized values are all close to 0.
In the graph below we can observe that individuals with a true label are, on average, $0.1$–$0.3$ std above those with label false
for all fields except the number of hospital visits. There seem to be outliers with many hospital visits in the label=False population;
in all likelihood, this is why hospitalizations are lower for the true labels.

A follow-up plot might consider these populations without extreme outliers.

In [None]:
# Plot the normalized number of events for the label=True individuals; the
# normalization used the whole population (both label values).
# Since the population with label=false is much larger, it essentially dominates
# the mean, and its mean normalized values are close to 0.
# In the graph we can observe that individuals with a true label are, on average,
# .1-.3 std above those with label false for all fields except the number of
# hospital visits. There are a few outliers with many hospital visits in the
# label=False population.
fig, ax = plt.subplots()
# `melt` holds one row per (person, event type); seaborn draws one bar per
# event type at the mean of 'value' (its default estimator).
sns.barplot(data=melt, x='variable', y='value', ax=ax)
ax.set_title('Normalized number of events for individuals with true label')
# Replace melt's placeholder column names with readable axis labels.
ax.set_xlabel('Event type')
ax.set_ylabel('Normalized number of events (std from population mean)')
plt.show()