# Suicide vs GDP at the State Level (United States of America)

In [None]:
# dependencies
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as st

## Data exploration and Cleanup

### Read CSVs

In [None]:
# Data source: https://apps.bea.gov/regional/downloadzip.cfm
# Load the multi-year GDP CSV into a DataFrame and preview its first rows.
gdp_file = "BEA_GDP_DATA_mulitple_years.csv"
df_state_gdp = pd.read_csv(filepath_or_buffer=gdp_file)
df_state_gdp.head(5)

In [None]:
# Load the state-level suicide rate CSV into a DataFrame and preview its first rows.
sui_file = "suicide_rate_state.csv"
sui_df = pd.read_csv(filepath_or_buffer=sui_file)
sui_df.head(5)

### Perform cleanup on GDP DataFrame

In [None]:
# Discard rows containing any missing value, then keep only the rows whose
# LineCode equals 1.0 (the REAL GDP line, per the original author's note).
df_state_gdp = df_state_gdp.dropna().loc[lambda frame: frame["LineCode"] == 1.0]
df_state_gdp.head(40)

In [None]:
# Remove the identifier/metadata columns that the rest of the notebook does not use.
unused_gdp_columns = ["LineCode", "TableName", "Region", "GeoFIPS"]
df_state_gdp = df_state_gdp.drop(columns=unused_gdp_columns)

In [None]:
# Reshape from wide (one column per year) to long (one row per GeoName and year).
year_columns = [str(year) for year in range(1997, 2022)]  # "1997" through "2021"
state_gdp_melt = pd.melt(
    df_state_gdp,
    id_vars=["GeoName", "Description", "Unit"],
    value_vars=year_columns,
    var_name="Year",
    value_name="GDP",
)
state_gdp_melt

In [None]:
# The Description and Unit columns are not needed after the melt.
# NOTE: the GDP values are expressed in millions.
descriptive_columns = ["Description", "Unit"]
state_gdp_melt = state_gdp_melt.drop(columns=descriptive_columns)

In [None]:
# Years present in the GDP data that the suicide CSV does not contain.
years_without_suicide_data = ["1997", "1998", "2021"]

# GeoName values that are not individual states (national total and regional groupings).
non_state_areas = [
    "United States",
    "New England",
    "Mideast",
    "Great Lakes",
    "Plains",
    "Southeast",
    "Southwest",
    "Rocky Mountain",
    "Far West",
]

# 1) drop the unmatched years
keep_year = ~state_gdp_melt["Year"].isin(years_without_suicide_data)
state_gdp_melt = state_gdp_melt.loc[keep_year]

# 2) rename the GeoName and GDP columns
state_gdp_melt = state_gdp_melt.rename(columns={"GeoName": "State", "GDP": "GDP (in millions)"})

# 3) drop the rows that are not states
keep_state = ~state_gdp_melt["State"].isin(non_state_areas)
state_gdp_melt = state_gdp_melt.loc[keep_state]
state_gdp_melt

In [None]:
# Convert the Year column to integers.
# NOTE(review): presumably this is so the merge key matches the Year dtype in the
# suicide data - confirm against that CSV.
#
# assign() returns a new DataFrame instead of writing a column into the frame
# produced by the earlier boolean .loc filters; assigning into that filtered
# frame directly makes pandas emit SettingWithCopyWarning.
state_gdp_melt = state_gdp_melt.assign(Year=state_gdp_melt["Year"].astype(int))

### Perform Cleanup on Suicide Rate DataFrame

In [None]:
# Remove the "Unnamed: 0" column, which the analysis does not use.
unused_sui_columns = ["Unnamed: 0"]
sui_df = sui_df.drop(columns=unused_sui_columns)

In [None]:
# Give the crude-rate column a more descriptive name, then preview the frame.
column_renames = {"Crude Rate": "Suicide Rate"}
sui_df = sui_df.rename(columns=column_renames)
sui_df.head(5)

### Merge DataFrames

In [None]:
# Inner-join the GDP and suicide DataFrames on State and Year, keeping only
# the State/Year pairs that appear in both.
merge_keys = ["State", "Year"]
gdp_sui_state = state_gdp_melt.merge(sui_df, how="inner", on=merge_keys)
gdp_sui_state

In [None]:
# Cast the GDP column to float; every other column keeps its dtype.
gdp_sui_state = gdp_sui_state.astype({"GDP (in millions)": float})

### Add GDP Per Capita to DataFrame 

In [None]:
# GDP is stored in millions, so scale it by 1,000,000 before dividing by
# Population; the result is rounded to two decimals.
gdp_unscaled = gdp_sui_state["GDP (in millions)"] * 1000000
gdp_sui_state["GDP Per Capita"] = (gdp_unscaled / gdp_sui_state["Population"]).round(2)
gdp_sui_state

## Merged DataFrame Analysis

### Perform Multiple Scatter Plots in search of Observable Trends

SCATTER PLOT of GDP vs SUICIDE RATE (years = 1999 - 2020)

In [None]:
# Suicide rate against total GDP for every merged State/Year row.
fig, ax = plt.subplots()
ax.scatter(gdp_sui_state["GDP (in millions)"], gdp_sui_state["Suicide Rate"])
ax.set_xlabel("GDP (in Millions)")
ax.set_ylabel("Suicide Rate (per 100,000)")
ax.set_title("Suicide Rate vs GDP")
plt.show()

In [None]:
# Keep the rows whose GDP is below 1,000,000 million (i.e. under $1 trillion).
is_below_trillion = gdp_sui_state["GDP (in millions)"] < 1000000
gdp_sui_state_lower = gdp_sui_state.loc[is_below_trillion]

SCATTER PLOT of GDP vs SUICIDE RATE for States with less than $1 Trillion GDP (years = 1999 - 2020)

In [None]:
# Suicide rate vs total GDP for rows with GDP < $1 trillion (all years),
# with a least-squares regression line drawn over the scatter.
x_values = gdp_sui_state_lower["GDP (in millions)"]
y_values = gdp_sui_state_lower["Suicide Rate"]

# Fit the regression line.
(slope, intercept, rvalue, pvalue, stderr) = st.linregress(x_values, y_values)
regress_values = x_values * slope + intercept
# The slope is expressed per million of GDP and can be very small, so show it
# with significant figures; rounding to five decimals can display it as 0.
line_eq = f"y = {slope:.3g}x + {intercept:.2f}"

fig, ax = plt.subplots()
ax.scatter(x_values, y_values)
ax.plot(x_values, regress_values, "r-")
# Place the equation in axes-fraction coordinates so it is drawn inside the
# plot regardless of the data range.
ax.annotate(line_eq, xy=(0.05, 0.9), xycoords="axes fraction", fontsize=15, color="red")
ax.set_xlabel("GDP (in millions)")
ax.set_ylabel("Suicide Rate")
# The x axis is total GDP, not GDP per capita, so the title says so.
ax.set_title("State Suicide vs GDP")
plt.show()
print(pvalue, rvalue)

In [None]:
# Subset of rows whose GDP exceeds 1,000,000 million (i.e. over $1 trillion).
is_above_trillion = gdp_sui_state["GDP (in millions)"] > 1000000
big_gdp_sui_state = gdp_sui_state.loc[is_above_trillion]
big_gdp_sui_state.head(5)

SCATTER PLOT of GDP vs SUICIDE RATE for States with more than $1 Trillion GDP (years = 1999 - 2020)

In [None]:
# Suicide rate vs total GDP for rows with GDP > $1 trillion, with a
# least-squares regression line drawn over the scatter.
x_values = big_gdp_sui_state["GDP (in millions)"]
y_values = big_gdp_sui_state["Suicide Rate"]

# Fit the regression line.
(slope, intercept, rvalue, pvalue, stderr) = st.linregress(x_values, y_values)
regress_values = x_values * slope + intercept
# The slope is expressed per million of GDP and can be very small, so show it
# with significant figures; rounding to five decimals can display it as 0.
line_eq = f"y = {slope:.3g}x + {intercept:.2f}"

fig, ax = plt.subplots()
ax.scatter(x_values, y_values)
ax.plot(x_values, regress_values, "r-")
# Every x value here is above 1,000,000, so a label anchored at the data point
# (50000, 30) falls outside the axes and is not drawn. Axes-fraction
# coordinates keep the equation visible whatever the data range.
ax.annotate(line_eq, xy=(0.05, 0.9), xycoords="axes fraction", fontsize=15, color="red")
ax.set_xlabel("GDP (in millions)")
ax.set_ylabel("Suicide Rate")
# The x axis is total GDP, not GDP per capita, so the title says so.
ax.set_title("State Suicide vs GDP")
plt.show()
print(pvalue, rvalue)

NOTE: Plot just shows the results for New York, California, and Texas

In [None]:
# Restrict the merged data to a single year (2020).
is_2020 = gdp_sui_state["Year"] == 2020
gdp_sui_state_year = gdp_sui_state.loc[is_2020]

SCATTER PLOT of GDP vs SUICIDE RATE (year = 2020)

In [None]:
# Suicide rate vs total GDP for the 2020 rows only, with a least-squares
# regression line drawn over the scatter.
x_values = gdp_sui_state_year["GDP (in millions)"]
y_values = gdp_sui_state_year["Suicide Rate"]

# Fit the regression line.
(slope, intercept, rvalue, pvalue, stderr) = st.linregress(x_values, y_values)
regress_values = x_values * slope + intercept
# The slope is expressed per million of GDP and can be very small, so show it
# with significant figures; rounding to five decimals can display it as 0.
line_eq = f"y = {slope:.3g}x + {intercept:.2f}"

fig, ax = plt.subplots()
ax.scatter(x_values, y_values)
ax.plot(x_values, regress_values, "r-")
# Place the equation in axes-fraction coordinates so it is drawn inside the
# plot regardless of the data range.
ax.annotate(line_eq, xy=(0.05, 0.9), xycoords="axes fraction", fontsize=15, color="red")
ax.set_xlabel("GDP (in millions)")
ax.set_ylabel("Suicide Rate")
ax.set_title("State Suicide vs GDP")
plt.show()
print(pvalue, rvalue)

SCATTER PLOT of GDP PER CAPITA vs SUICIDE RATE (years = 1999 - 2020)

In [None]:
# Repeat the exploration using GDP Per Capita instead of total GDP.
# Scatter of GDP Per Capita vs suicide rate for every merged State/Year row.
fig, ax = plt.subplots()
ax.scatter(gdp_sui_state["GDP Per Capita"], gdp_sui_state["Suicide Rate"])
ax.set_xlabel("GDP Per Capita")
ax.set_ylabel("Suicide Rate (per 100,000)")
# Titled like the total-GDP scatter so the figure can be read on its own.
ax.set_title("Suicide Rate vs GDP Per Capita")
plt.show()

In [None]:
# Build the frame used for the per-capita regression: every row except those
# for the District of Columbia.
not_dc = gdp_sui_state["State"] != "District of Columbia"
gdp_sui_stat_DC = gdp_sui_state.loc[not_dc]
gdp_sui_stat_DC.head(5)

SCATTER PLOT of GDP PER CAPITA vs SUICIDE RATE for GDP Per Capita < 100,000 (District of Columbia excluded; years = 1999 - 2020)

In [None]:
# Suicide rate vs GDP Per Capita with the District of Columbia rows excluded,
# with a least-squares regression line drawn over the scatter.
x_values = gdp_sui_stat_DC["GDP Per Capita"]
y_values = gdp_sui_stat_DC["Suicide Rate"]

# Fit the regression line.
(slope, intercept, rvalue, pvalue, stderr) = st.linregress(x_values, y_values)
regress_values = x_values * slope + intercept
# Significant-figure formatting keeps a small slope readable.
line_eq = f"y = {slope:.3g}x + {intercept:.2f}"

fig, ax = plt.subplots()
ax.scatter(x_values, y_values)
ax.plot(x_values, regress_values, "r-")
# A label anchored at a fixed data point is only drawn when that point lies
# inside the axes; axes-fraction coordinates do not depend on the data range.
ax.annotate(line_eq, xy=(0.05, 0.9), xycoords="axes fraction", fontsize=15, color="red")
ax.set_xlabel("GDP Per Capita")
ax.set_ylabel("Suicide Rate")
ax.set_title("State Suicide vs GDP Per Capita")
plt.show()
print(pvalue, rvalue)

In [None]:
# Suicide rate vs GDP Per Capita for the 2020 rows only, with a least-squares
# regression line drawn over the scatter.
x_values = gdp_sui_state_year["GDP Per Capita"]
y_values = gdp_sui_state_year["Suicide Rate"]

# Fit the regression line.
(slope, intercept, rvalue, pvalue, stderr) = st.linregress(x_values, y_values)
regress_values = x_values * slope + intercept
# Significant-figure formatting keeps a small slope readable.
line_eq = f"y = {slope:.3g}x + {intercept:.2f}"

fig, ax = plt.subplots()
ax.scatter(x_values, y_values)
ax.plot(x_values, regress_values, "r-")
# A label anchored at a fixed data point is only drawn when that point lies
# inside the axes; axes-fraction coordinates do not depend on the data range.
ax.annotate(line_eq, xy=(0.05, 0.9), xycoords="axes fraction", fontsize=15, color="red")
ax.set_xlabel("GDP Per Capita")
ax.set_ylabel("Suicide Rate")
ax.set_title("State Suicide vs GDP Per Capita")
plt.show()
print(pvalue, rvalue)

In [None]:
# Restrict the merged data to a single state (Minnesota).
is_minnesota = gdp_sui_state["State"] == "Minnesota"
gdp_sui_state_ST = gdp_sui_state.loc[is_minnesota]
gdp_sui_state_ST

In [None]:
# Scatter of GDP Per Capita vs suicide rate for the single-state frame
# (filtered to Minnesota in the previous cell).
fig, ax = plt.subplots()
ax.scatter(gdp_sui_state_ST["GDP Per Capita"], gdp_sui_state_ST["Suicide Rate"])
ax.set_xlabel("GDP Per Capita")
ax.set_ylabel("Suicide Rate (per 100,000)")
# Added a title so the figure can be read on its own.
ax.set_title("Minnesota: Suicide Rate vs GDP Per Capita")
plt.show()