In [None]:
import pandas as pd
import scipy.stats as st
import matplotlib.pyplot as plt
from scipy.stats import linregress
import numpy as np

# Suicide data import and cleanup

In [None]:
# Read the state-level suicide-rate CSV into a DataFrame and display it
sui_state = "Resources/state_suicide_CDC.csv"
sui_state_df = pd.read_csv(filepath_or_buffer=sui_state)
sui_state_df

In [None]:
# Relabel the "Crude Rate" column as "Suicide Rate" (the name used by the plots below)
sui_state_df = sui_state_df.rename({"Crude Rate": "Suicide Rate"}, axis="columns")
sui_state_df

In [None]:
# Remove the "Unnamed: 0" column
# (presumably a leftover index column from the CSV export — TODO confirm)
clean_sui_state_df = sui_state_df.drop(columns="Unnamed: 0")
clean_sui_state_df

In [None]:
# Inspect the column dtypes of the cleaned suicide frame; the later merge joins on "State" and "Year"
clean_sui_state_df.dtypes

# Unemployment data import and cleanup

In [None]:
# Read the state unemployment CSV into a DataFrame and display it
unemployed = "Resources/state_unemployment_USBLS.csv"
unemployed_df = pd.read_csv(filepath_or_buffer=unemployed)
unemployed_df

In [None]:
# Drop every row that contains at least one missing value
clean_unemploy_df = unemployed_df.dropna(axis="index", how="any")
clean_unemploy_df

In [None]:
# Rename "Area" to "State" so it matches the key used when merging with the suicide data
clean_unemploy_df = clean_unemploy_df.rename({"Area": "State"}, axis="columns")
clean_unemploy_df.head()

In [None]:
# Drop the unnecessary "Fips" column, then remove the "United States" and
# "District of Columbia" rows, as they are not states.
clean_unemploy_df = clean_unemploy_df.drop(columns=["Fips"])

# Filter by name rather than by hard-coded index labels (0 and 9), so the cell
# cannot silently remove the wrong rows if the CSV row order ever changes.
# NOTE(review): assumes the "State" column spells these areas exactly as below — confirm against the CSV.
non_state_areas = ["United States", "District of Columbia"]
clean_unemploy_df = clean_unemploy_df.loc[~clean_unemploy_df["State"].isin(non_state_areas)]
clean_unemploy_df

In [None]:
# Reshape from wide (one column per year) to long format: one row per
# State/Year pair, with the rate held in a single "Unemployment Rate (%)" column.
clean_unemploy_melt_df = pd.melt(
    clean_unemploy_df,
    id_vars=["State"],
    # Year columns "1980" through "2018", as strings
    value_vars=[str(year) for year in range(1980, 2019)],
    var_name="Year",
    value_name="Unemployment Rate (%)",
)
clean_unemploy_melt_df

In [None]:
# Keep only the years that overlap with the suicide CSV (after 1998).
# "Year" still holds strings at this point, so this is a string comparison
# on four-digit year labels.
clean_unemploy_melt_df = clean_unemploy_melt_df.loc[
    clean_unemploy_melt_df["Year"].gt("1998")
]

clean_unemploy_melt_df

In [None]:
# Inspect the column dtypes of the melted unemployment frame before converting "Year"
clean_unemploy_melt_df.dtypes

In [None]:
#Change the "Year" from an object to an integer
# assign() builds a new frame instead of setting a column on the filtered
# slice, which avoids pandas' SettingWithCopyWarning.
clean_unemploy_melt_df = clean_unemploy_melt_df.assign(
    Year=clean_unemploy_melt_df["Year"].astype(int)
)

# Merged

In [None]:
# Inner-join the suicide and unemployment frames on State and Year so each row
# carries both rates, ready for graphing
Merge_Unemp_Sui_df = clean_sui_state_df.merge(
    clean_unemploy_melt_df,
    how="inner",
    on=["State", "Year"],
)
Merge_Unemp_Sui_df

In [None]:
#Check dataframe types
# The columns plotted and regressed below are "Unemployment Rate (%)" and "Suicide Rate"
Merge_Unemp_Sui_df.dtypes

# Graphing

### Graph Unemployment Rate vs Suicide Rate for all years

In [None]:
# Scatter plot of unemployment rate (x) against suicide rate (y) for every merged row
fig, ax = plt.subplots()
ax.scatter(
    Merge_Unemp_Sui_df["Unemployment Rate (%)"],
    Merge_Unemp_Sui_df["Suicide Rate"],
)
ax.set_xlabel("Unemployment Rate (%)")
ax.set_ylabel("Suicide Rate")
ax.set_title("State Unemployment Rate vs Suicide Rate 1999-2018")
plt.show()

In [None]:
#Perform a linear regression
# Least-squares fit of Suicide Rate (y) on Unemployment Rate (%) (x) over all merged rows
x_values = Merge_Unemp_Sui_df["Unemployment Rate (%)"]
y_values = Merge_Unemp_Sui_df["Suicide Rate"]

(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = x_values * slope + intercept

# Build the equation label with an explicit sign so that a negative intercept
# renders as "- 1.23" instead of "+-1.23"
intercept_sign = "-" if intercept < 0 else "+"
line_eq = f"y = {round(slope, 2)}x {intercept_sign} {abs(round(intercept, 2))}"

# (8, 25) is given in data coordinates; with matplotlib's default clipping the
# label is not drawn if that point falls outside the axes limits
plt.annotate(line_eq, (8, 25), fontsize = 15, color = "red")

plt.scatter(x_values, y_values)
plt.plot(x_values, regress_values, "r-")

plt.xlabel("Unemployment Rate (%)")
plt.ylabel("Suicide Rate")
plt.title("State Unemployment Rate vs Suicide Rate 1999-2018")
plt.show()
print(f"The r-value = {rvalue}. The p-value = {pvalue}")
print(f"The r-squared value is {rvalue**2}")

Since the p-value is greater than 0.05, we fail to reject the null hypothesis that the slope of this regression line
is 0. Therefore, across all states and years (1999-2018), we do not find a statistically significant linear relationship between unemployment rate and suicide rate.

In [None]:
# Restrict the merged data to a single year (2012) for a per-year regression
year_regression_test = Merge_Unemp_Sui_df.loc[Merge_Unemp_Sui_df["Year"].eq(2012)]
year_regression_test

In [None]:
# Scatter plot of unemployment rate (x) against suicide rate (y) for the 2012 rows
fig, ax = plt.subplots()
ax.scatter(
    year_regression_test["Unemployment Rate (%)"],
    year_regression_test["Suicide Rate"],
)
ax.set_xlabel("Unemployment Rate (%)")
ax.set_ylabel("Suicide Rate")
ax.set_title("State Unemployment Rate vs Suicide Rate in 2012")
plt.show()

In [None]:
# Least-squares fit of Suicide Rate (y) on Unemployment Rate (%) (x) for the 2012 rows
x_values = year_regression_test["Unemployment Rate (%)"]
y_values = year_regression_test["Suicide Rate"]

(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = x_values * slope + intercept

# Build the equation label with an explicit sign so that a negative intercept
# renders as "- 1.23" instead of "+-1.23"
intercept_sign = "-" if intercept < 0 else "+"
line_eq = f"y = {round(slope, 2)}x {intercept_sign} {abs(round(intercept, 2))}"

# (8, 25) is given in data coordinates; with matplotlib's default clipping the
# label is not drawn if that point falls outside the axes limits
plt.annotate(line_eq, (8, 25), fontsize = 15, color = "red")

plt.scatter(x_values, y_values)
plt.plot(x_values, regress_values, "r-")

plt.xlabel("Unemployment Rate (%)")
plt.ylabel("Suicide Rate")
plt.title("State Unemployment Rate vs Suicide Rate in 2012")
plt.show()
print(f"The r-value = {rvalue}. The p-value = {pvalue}")
print(f"The r-squared value is {rvalue**2}")

Since the p-value is greater than 0.05, we fail to reject the null hypothesis that the slope of this regression line is 0. Therefore, for 2012, we do not find a statistically significant linear relationship between state unemployment rate and suicide rate.

In [None]:
# Restrict the merged data to a single state (Minnesota) for a per-state regression
state_regression_test = Merge_Unemp_Sui_df.loc[Merge_Unemp_Sui_df["State"].eq("Minnesota")]

In [None]:
# Least-squares fit of Suicide Rate (y) on Unemployment Rate (%) (x) for the Minnesota rows
x_values = state_regression_test["Unemployment Rate (%)"]
y_values = state_regression_test["Suicide Rate"]

(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = x_values * slope + intercept

# Build the equation label with an explicit sign so that a negative intercept
# renders as "- 1.23" instead of "+-1.23"
intercept_sign = "-" if intercept < 0 else "+"
line_eq = f"y = {round(slope, 2)}x {intercept_sign} {abs(round(intercept, 2))}"

# (5, 13) is given in data coordinates; with matplotlib's default clipping the
# label is not drawn if that point falls outside the axes limits
plt.annotate(line_eq, (5, 13), fontsize = 15, color = "red")

plt.scatter(x_values, y_values)
plt.plot(x_values, regress_values, "r-")

plt.xlabel("Unemployment Rate (%)")
plt.ylabel("Suicide Rate")
plt.title("State Unemployment Rate vs Suicide Rate for Minnesota 1999-2018")
plt.show()
print(f"The r-value = {rvalue}. The p-value = {pvalue}")
print(f"The r-squared value is {rvalue**2}")

Since the p-value is greater than 0.05, we fail to reject the null hypothesis that the slope of this regression line is 0. Therefore, for Minnesota (1999-2018), we do not find a statistically significant linear relationship between unemployment rate and suicide rate.