# Dataset exploration

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from data_loader import load_final
from data_analysis import cohen_d, top_v_bottom, summer_v_winter, suicides_by_month
sns.set_context("talk")
plt.style.use('ggplot')

## Read cleaned dataset from file

In [None]:
# Load the cleaned dataset through the project's load_final helper and preview
# the first rows. The path is relative to the notebook's working directory.
suicides = load_final('data/suicides_heat.csv')
suicides.head()

# Exploratory Data Analysis

There is a strong annual trend.

In [None]:
# Total deaths per calendar year. Select the Deaths column *before* summing so
# only that column is aggregated (summing the whole frame also "sums" every
# other column, including the string ones, only to throw the result away).
deaths_per_year = suicides.groupby('Year').Deaths.sum()

plt.plot(deaths_per_year)
plt.title('Suicides per Year in the US', size=24)
plt.xlabel('Year')
plt.ylabel('Suicides');

## Hypothesis 1 - winter gets more suicides than summer

In [None]:
# Compare summer and winter on the 'suicide_rate' column using the project's
# summer_v_winter helper (defined in data_analysis; its output is not shown here).
summer_v_winter(suicides, 'suicide_rate')

Looks like summer has more suicides than winter.

## Let's cancel out year effect

In [None]:
# Pick up edits made to data_analysis.py without restarting the kernel.
# Both `importlib` and the `data_analysis` module object must be imported
# explicitly: the first cell only does `from data_analysis import ...`, which
# does not bind the module name, so reload() would raise NameError otherwise.
import importlib
import data_analysis

importlib.reload(data_analysis)
# Names pulled in with `from ... import` are not refreshed by reload(); import
# the helper again so it is bound to the reloaded module's definition.
from data_analysis import data_by_month

In [None]:
# Build the per-month frame with the project's data_by_month helper and preview it.
# NOTE(review): this rebinds `suicides_by_month`, which the first cell imported
# from data_analysis; from here on the name refers to this result, not to the
# imported object.
suicides_by_month = data_by_month(suicides)
suicides_by_month.head()

In [None]:
# Repeat the summer-vs-winter comparison on the data_by_month result.
summer_v_winter(suicides_by_month, 'suicide_rate')

Paired t-test. Compare the sum of the months against the mean of the months

In [None]:
# Figure-wide settings have to be in place *before* the first artist is drawn;
# setting them after the plot calls only affects later figures.
sns.set_context("talk")
plt.style.use('ggplot')
plt.rcParams["figure.figsize"] = (10,6)

# April-August (5 months) vs. November-March (5 months).
summer_by_month = suicides_by_month.query('4 <= Month <= 8')
winter_by_month = suicides_by_month.query('Month <= 3 or Month >= 11')

# NOTE(review): sns.distplot is deprecated in recent seaborn releases; consider
# histplot/kdeplot once the minimum seaborn version is confirmed.
sns.distplot(summer_by_month.suicide_rate, label='April-August')
sns.distplot(winter_by_month.suicide_rate, label='November-March')
plt.legend()
plt.xlabel('suicide rate', size='x-large')
plt.title('Suicides Are Higher in Summer Than Winter',size=24)

# ttest_rel pairs observations by position, so both samples must have the same
# length and ordering.
# NOTE(review): confirm that row i of the summer slice is the intended partner
# of row i of the winter slice.
t_stat, p_value = stats.ttest_rel(summer_by_month.suicide_rate, winter_by_month.suicide_rate)
# The two-sided p-value is halved to report a one-sided test; that is only
# meaningful when the sign of t_stat agrees with the direction being claimed.
print(f"Paired t-test: t-statistic: {t_stat:.03f}; p-value: {p_value / 2}")
# NOTE(review): cohen_d receives (winter, summer) while the t-test receives
# (summer, winter) - check that the reported sign is the intended one.
print("Cohen's d: {:.03f}".format(cohen_d(winter_by_month.suicide_rate, summer_by_month.suicide_rate)))

Nope, in fact spring & summer are worse than winter!

## Extreme heat in summer

### High heat has no significant effect

In [None]:
# Pick up edits made to data_analysis.py without restarting the kernel.
# `importlib` and the `data_analysis` module object are imported here so this
# cell also works on a fresh kernel (the first cell only does
# `from data_analysis import ...`, which does not bind the module name).
import importlib
import data_analysis

importlib.reload(data_analysis)
# Re-import so `top_v_bottom` is bound to the reloaded definition rather than
# the one captured by the first cell.
from data_analysis import top_v_bottom

In [None]:
# Rows for months 4 through 9 inclusive; used by the heat analyses below.
# NOTE(review): other cells in this notebook use months 4-8
# ('4 <= Month <= 8', '3 < Month < 9') - confirm that including month 9 here
# is intentional.
summer = suicides.query('4 <= Month <= 9')

In [None]:
# Compare 'suicide_rate' between the extremes of 'avg_max_heat_index' within
# the summer subset, via the project's top_v_bottom helper.
# presumably .1 selects the top and bottom 10% - verify against data_analysis.
top_v_bottom(.1, summer, 'avg_max_heat_index', 'suicide_rate', var_name='heat index')
plt.title("Suicide Rate During\nWarm v. Cool Summer Months", size=24)

### Unusually high heat does have a significant effect

In [None]:
# Same comparison, but split on 'heat_index_diff' instead of the raw heat index.
# presumably heat_index_diff is the deviation from a typical value - verify
# against the data-preparation code.
top_v_bottom(.1, summer, 'heat_index_diff', 'suicide_rate')
plt.title("Suicide Rate During\nUnusually Warm v. Cool Summer Months", size=24)

### Particularly in the Southeast

In [None]:
# States treated as the Southeast for this comparison.
southeast = (
    "Alabama",
    "Florida",
    "Georgia",
    "Kentucky",
    "Mississippi",
    "North Carolina",
    "South Carolina",
    "Tennessee",
)
# Restrict the summer subset to those states, then rerun the
# heat_index_diff comparison.
top_v_bottom(
    .1,
    summer.loc[summer['State'].isin(southeast)],
    'heat_index_diff',
    'suicide_rate',
)
plt.title("Suicide Rate During Unusual Heat,\nSoutheastern U.S.", size=24)

### No significant effect in Northeast (for example)

In [None]:
# States treated as the Northeast for this comparison.
northeast = (
    "Connecticut",
    "Maine",
    "Massachusetts",
    "New Hampshire",
    "Rhode Island",
    "Vermont",
    "New Jersey",
    "New York",
    "Pennsylvania",
)
# Restrict the summer subset to those states, then rerun the
# heat_index_diff comparison.
top_v_bottom(
    .1,
    summer.loc[summer['State'].isin(northeast)],
    'heat_index_diff',
    'suicide_rate',
)
plt.title('Northeast', size='xx-large')

## It's possible we're seeing a year effect in our extreme-heat analysis

In [None]:
# Sanity check: split on 'Year' instead of a heat variable to see whether the
# year alone separates suicide rates in the summer subset.
top_v_bottom(.1, summer, 'Year', 'suicide_rate')

### Multivariate linear regression 

In [None]:
import statsmodels.api as sm

In [None]:
# Ordinary least squares of suicide_rate on year, month and the heat/temperature
# deviation columns, fitted on summer rows that have no missing value in any column.
predictors = ['Year','Month','heat_index_diff', 'min_t_diff', 'max_t_diff']
summer_complete = summer.dropna()

# add_constant supplies the intercept column, which sm.OLS does not add itself.
X = sm.add_constant(summer_complete[predictors].copy())
Y = summer_complete['suicide_rate']

model = sm.OLS(Y, X)
results = model.fit()
results.summary()

In [None]:
# t-statistics of the fitted OLS coefficients (constant included).
results.tvalues

## Are suicides by state independent of suicides by year? Chi-square analysis

In [None]:
# State x Year table of mean suicide_rate; states with a missing value in any
# year are dropped. The aggregation is given by name ('mean') because passing
# the NumPy callable is deprecated in recent pandas releases.
suicides_by_state_and_year = pd.pivot_table(
    suicides,
    index='State',
    columns='Year',
    values='suicide_rate',
    aggfunc='mean',
).dropna()

In [None]:
# Chi-square test on the State x Year table of mean suicide rates.
# NOTE(review): scipy.stats.chisquare is a one-way goodness-of-fit test; with
# default arguments it compares each column against equal expected frequencies
# along axis 0. A test of independence between two factors is
# stats.chi2_contingency, and it expects counts rather than rates - confirm
# this call supports the conclusion drawn from it.
stats.chisquare(suicides_by_state_and_year)

Conclusion: They are not independent of each other.

In [None]:
# Same test restricted to the Alabama and Mississippi rows of the table.
# NOTE(review): stats.chisquare is a goodness-of-fit test, not a test of
# independence - see scipy's chi2_contingency for the latter.
stats.chisquare(suicides_by_state_and_year.query('State in ("Alabama", "Mississippi")'))

## Mapping the data

Add two-letter state codes to the suicides DataFrame

In [None]:
# Download a plotly sample dataset; only its 'state' and 'code' columns are used
# afterwards, as a state-name -> two-letter-code lookup.
# NOTE(review): this needs network access on every run; consider caching the
# lookup locally.
df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/2011_us_ag_exports.csv')

In [None]:
# Attach the two-letter state code to every row.
# Any 'code' column left by a previous run of this cell is dropped first, so
# re-running the cell does not produce 'code_x'/'code_y' and break the cells
# below that group by 'code'.
# NOTE(review): this is an inner join - rows whose State has no match in the
# lookup are silently dropped; confirm that is acceptable.
suicides = (
    suicides
    .drop(columns='code', errors='ignore')
    .merge(df[['state', 'code']], left_on='State', right_on='state')
    .drop(columns='state')
)

In [None]:
import plotly.graph_objects as go

In [None]:
# Mean suicide rate per state code. Aggregating only the needed column avoids
# the TypeError that pandas >= 2.0 raises when .mean() meets string columns,
# and computing it once keeps `locations` and `z` aligned.
mean_rate_by_state = suicides.groupby('code').suicide_rate.mean()

fig = go.Figure(data=go.Choropleth(
    locations=mean_rate_by_state.index,  # two-letter state codes
    z=mean_rate_by_state,                # value that drives the colour
    locationmode='USA-states',           # interpret `locations` as US state codes
    colorscale='Reds',
    colorbar_title="suicide rate",
))
fig.update_layout(
    title_text='US Suicide Rates',
    geo_scope='usa',  # limit map scope to USA
)
fig.show(renderer="png", width=1000, height=800)

In [None]:
# Per-state difference between the mean suicide rate in April-August and in
# November-March. Only the needed column is aggregated (pandas >= 2.0 raises on
# .mean() over string columns), and `locations` is taken from the same Series
# as `z` so the two can never get out of step.
warm_rate = suicides.query('3 < Month < 9').groupby('code').suicide_rate.mean()
cold_rate = suicides.query('Month < 4 or Month > 10').groupby('code').suicide_rate.mean()
seasonal_gap = warm_rate - cold_rate

fig = go.Figure(data=go.Choropleth(
    locations=seasonal_gap.index,  # two-letter state codes
    z=seasonal_gap,                # value that drives the colour
    locationmode='USA-states',     # interpret `locations` as US state codes
    colorscale='Reds',
))
fig.update_layout(
    title_text='Suicide Rate: April-August minus November-March',
    geo_scope='usa',  # limit map scope to USA
)
fig.show(renderer="png", width=1000, height=800)

In [None]:
# Relative change in each state's mean suicide rate between 1999 and 2011.
# Each yearly mean is computed once, on the single column that is needed
# (pandas >= 2.0 raises on .mean() over string columns), and `locations` comes
# from the same Series as `z`.
rate_1999 = suicides.query('Year == 1999').groupby('code').suicide_rate.mean()
rate_2011 = suicides.query('Year == 2011').groupby('code').suicide_rate.mean()
relative_change = (rate_2011 - rate_1999) / rate_1999

fig = go.Figure(data=go.Choropleth(
    locations=relative_change.index,  # two-letter state codes
    z=relative_change,                # value that drives the colour
    locationmode='USA-states',        # interpret `locations` as US state codes
    colorscale='RdBu',
    reversescale=True,
    zmid=0,  # centre the diverging colour scale on "no change"
))
fig.update_layout(
    title_text="Suicide Rate Change from 1999 to 2011",
    geo_scope='usa',  # limit map scope to USA
)
fig.show(renderer="png", width=1000, height=800)

In [None]:
# Mean of avg_max_heat_index per state. Only the needed column is aggregated
# (pandas >= 2.0 raises on .mean() over string columns), once.
mean_heat_index_by_state = suicides.groupby('code').avg_max_heat_index.mean()

fig = go.Figure(data=go.Choropleth(
    locations=mean_heat_index_by_state.index,  # two-letter state codes
    z=mean_heat_index_by_state,                # value that drives the colour
    locationmode='USA-states',                 # interpret `locations` as US state codes
    colorscale='Reds',
))
fig.update_layout(
    title_text='Mean avg_max_heat_index by State',
    geo_scope='usa',  # limit map scope to USA
)
fig.show(renderer="png", width=1000, height=800)

In [None]:
# Mean of avg_max_t per state. Only the needed column is aggregated
# (pandas >= 2.0 raises on .mean() over string columns), once.
mean_max_t_by_state = suicides.groupby('code').avg_max_t.mean()

fig = go.Figure(data=go.Choropleth(
    locations=mean_max_t_by_state.index,  # two-letter state codes
    z=mean_max_t_by_state,                # value that drives the colour
    locationmode='USA-states',            # interpret `locations` as US state codes
    colorscale='Reds',
))
fig.update_layout(
    title_text='Mean avg_max_t by State',
    geo_scope='usa',  # limit map scope to USA
)
fig.show()

In [None]:
# Standard deviation of avg_max_heat_index per state over April-August.
# Only the needed column is aggregated (pandas >= 2.0 raises on .std() over
# string columns), and `locations` comes from the same Series as `z`.
heat_index_std_by_state = (
    suicides.query('3 < Month < 9').groupby('code').avg_max_heat_index.std()
)

fig = go.Figure(data=go.Choropleth(
    locations=heat_index_std_by_state.index,  # two-letter state codes
    z=heat_index_std_by_state,                # value that drives the colour
    locationmode='USA-states',                # interpret `locations` as US state codes
    colorscale='Reds',
))
fig.update_layout(
    title_text='Std. Dev. of avg_max_heat_index, April-August',
    geo_scope='usa',  # limit map scope to USA
)
fig.show()

In [None]:
# NOTE(review): this cell draws the same map as the preceding cell (standard
# deviation of avg_max_heat_index per state over April-August); it looks like
# an accidental copy and can probably be removed.
# Only the needed column is aggregated (pandas >= 2.0 raises on .std() over
# string columns), and `locations` comes from the same Series as `z`.
heat_index_std_by_state = (
    suicides.query('3 < Month < 9').groupby('code').avg_max_heat_index.std()
)

fig = go.Figure(data=go.Choropleth(
    locations=heat_index_std_by_state.index,  # two-letter state codes
    z=heat_index_std_by_state,                # value that drives the colour
    locationmode='USA-states',                # interpret `locations` as US state codes
    colorscale='Reds',
))
fig.update_layout(
    title_text='Std. Dev. of avg_max_heat_index, April-August',
    geo_scope='usa',  # limit map scope to USA
)
fig.show()

In [None]:
fig = go.Figure(data=go.Choropleth(
    locations=suicides.query(f'3 < Month < 9 & heat_index_diff >= {q_diff_90}').groupby('code').count().index, # Spatial coordinates
    z = suicides.query(f'3 < Month < 9 & heat_index_diff >= {q_diff_90}').groupby('code').count().heat_index_diff, # Data to be color-coded
    locationmode = 'USA-states', # set of locations match entries in `locations`
    colorscale = 'Reds',
#    colorbar_title = "Millions USD",
))
fig.update_layout(
#    title_text = '2011 US Agriculture Exports by State',
    geo_scope='usa', # limit map scope to USA
)
fig.show(renderer="png", width=1000, height=800)

In [None]:
suicides.query(f'3 < Month < 9 & heat_index_diff >= {q_diff_90}').groupby('code').count().index

In [None]:
plt.plot(suicides.query(f'3 < Month < 9 & heat_index_diff >= {q_diff_90}').groupby('Year').count().State)

In [None]:
suicides.query(f'3 < Month < 9 & heat_index_diff >= {q_diff_90}')

## Detrend data experiment

In [None]:
# Total deaths per 'Month Code'. Select Deaths before summing so only that
# column is aggregated (a whole-frame sum also processes the string columns).
plt.plot(suicides.groupby('Month Code').Deaths.sum())

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a straight line through Missouri's suicide rate, using the row position
# (0, 1, 2, ...) as the time axis.
# NOTE(review): this assumes the Missouri rows are already in chronological
# order - confirm against the loader.
series = suicides.query('State == "Missouri"').copy()
X = np.arange(len(series)).reshape(-1, 1)  # single-feature design matrix
y = series.suicide_rate.reset_index(drop=True)

model = LinearRegression()
model.fit(X, y)

# Fitted values along the same time axis give the linear trend.
trend = model.predict(X)

# Observed rate with the fitted trend on top.
plt.plot(y)
plt.plot(trend)
plt.title("Missouri suicide rate 1999-2011", size=24)
plt.xlabel("months since Jan 1999")

In [None]:
# Report slope and intercept of the fitted LinearRegression.
print(f"best fit: y = {model.coef_[0]:.04f} x + {model.intercept_:.04f}")

In [None]:
# detrend
# Remove the fitted linear trend and re-centre on the overall mean, so the
# detrended values stay on the same scale as the original rates.
# Done as one vectorised expression (the mean is computed once rather than once
# per element) and converted to a plain array so a later positional assignment
# into `series` is not re-aligned on y's 0..n-1 index.
detrended = (y - trend + y.mean()).to_numpy()

# plot detrended
plt.plot(detrended)
plt.title("Missouri detrended suicide rate 1999-2011", size=24)
plt.xlabel("months since Jan 1999")

In [None]:
# Store the detrended values as a new column on the Missouri frame.
# `detrended` has no index of its own, so it is assigned by position.
series['detrended_suicide_rate'] = detrended

In [None]:
# Display the Missouri frame, including the new detrended column.
# NOTE(review): consider series.head() to keep the saved output small.
series

In [None]:
# Overlay the distributions of the raw and the detrended Missouri rates.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; consider
# histplot/kdeplot once the minimum seaborn version is confirmed.
sns.distplot(series.suicide_rate, label='rate')
sns.distplot(series.detrended_suicide_rate, label='detrended')
plt.legend()

In [None]:
# Heat comparison for Missouri, April-August, on the raw suicide rate.
# The call now uses the same positional form as every other top_v_bottom call
# in this notebook (fraction, data, split column, outcome column); the previous
# form passed two fractions before the data.
# NOTE(review): confirm against the current signature in data_analysis.
top_v_bottom(.1, series.query('3 < Month < 9'), 'heat_index_diff', 'suicide_rate')
plt.title("Missouri",size='xx-large')

In [None]:
# Heat comparison for Missouri, April-August, on the detrended suicide rate.
# The call now uses the same positional form as every other top_v_bottom call
# in this notebook (fraction, data, split column, outcome column); the previous
# form passed two fractions before the data.
# NOTE(review): confirm against the current signature in data_analysis.
top_v_bottom(.1, series.query('3 < Month < 9'), 'heat_index_diff', 'detrended_suicide_rate')
plt.title("Missouri detrended",size='xx-large')

In [None]:
# heat_index_diff for the Missouri rows, plotted against the frame's index.
plt.plot(series.heat_index_diff)