In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn.functional as F
from torch import nn
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt 
import math

In [2]:
# Load in the data
ca_demData = pd.read_csv("CA_CensusData.csv")
ca_suicData = pd.read_csv("ca_county_suicide_2015.csv")

In [3]:
# Rename column names for merge
ca_demData = ca_demData.rename(columns = {'County' : 'County_Names'})

# Merge data
all_data = pd.merge(ca_demData, ca_suicData, on='County_Names', how='outer')

In [4]:
# Renaming some columns for use
all_data = all_data.rename(columns = {'Population, Census, April 1, 2010' : 'Population_2010', 'Persons per household, 2015-2019' : 'Avg_Household_Size', 'Veterans, 2015-2019' : 'Vet_Population', 'High school graduate or higher, percent of persons age 25 years+, 2015-2019' : 'HS_Grad_Percent', 'Bachelor\'s degree or higher, percent of persons age 25 years+, 2015-2019' : 'College_Grad_Percent', 'With a disability, under age 65 years, percent, 2015-2019' : 'Disability_percent', 'Persons  without health insurance, under age 65 years, percent' : 'Lacking_Health_Insurance_Percent', 'In civilian labor force, total, percent of population age 16 years+, 2015-2019' : 'Employed_16+_Percent', 'Total retail sales, 2012 ($1,000)' : 'Retail_Sales_by1000', 'Median household income (in 2019 dollars), 2015-2019' : 'Household_Income', 'Persons in poverty, percent' : 'Poverty_Percentage', 'All firms, 2012' : 'Firms_Num', 'Population per square mile, 2010' : 'Population_Density'})

# Dropping Alpine County as no data for suicide count
all_data = all_data.drop([1])
all_data.reset_index(drop=True, inplace=True)
all_data.head()

Unnamed: 0,County_Names,"Population estimates, July 1, 2019, (V2019)","Population estimates base, April 1, 2010, (V2019)","Population, percent change - April 1, 2010 (estimates base) to July 1, 2019, (V2019)",Population_2010,"Persons under 5 years, percent","Persons under 18 years, percent","Persons 65 years and over, percent","Female persons, percent","White alone, percent",...,"Women-owned firms, 2012","Minority-owned firms, 2012","Nonminority-owned firms, 2012","Veteran-owned firms, 2012","Nonveteran-owned firms, 2012",Population_Density,"Land area in square miles, 2010",FIPS_County_Code,Suicide_Counts,County_of_occurrence
0,Alameda County,1671329.0,1510258.0,10.7,1510271.0,5.7,20.3,14.3,50.7,49.3,...,57305.0,75026.0,69300.0,10213.0,134032.0,2043.6,739.02,1,138.0,1.0
1,Amador County,39752.0,38091.0,4.4,38091.0,4.1,15.1,27.5,45.6,89.6,...,921.0,259.0,2453.0,258.0,2363.0,64.1,594.58,5,7.0,5.0
2,Butte County,219186.0,220005.0,-0.4,220000.0,5.5,20.3,18.4,50.5,85.7,...,5341.0,1858.0,13644.0,1766.0,13509.0,134.4,1636.46,7,39.0,7.0
3,Calaveras County,45905.0,45578.0,0.7,45578.0,4.5,17.1,28.0,50.2,91.0,...,1008.0,291.0,2983.0,392.0,2799.0,44.7,1020.01,9,20.0,9.0
4,Colusa County,21547.0,21407.0,0.7,21419.0,6.9,27.1,14.9,49.1,91.1,...,306.0,397.0,656.0,55.0,944.0,18.6,1150.73,11,2.0,11.0


In [5]:
# Adding risk type column
all_data['Risk_Type'] = pd.Series(dtype='int')

for i in range(57):
    # Suicide rate per 100,000 people
    # High risk county if suicide rate greater than the national average 13.4
    if (all_data['Suicide_Counts'][i]/all_data['Population_2010'][i] * 100000) > 13.4:
        # high risk = 1
        all_data['Risk_Type'][i] = 1
    else:
        # low risk = 0
        all_data['Risk_Type'][i] = 0
        
all_data.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_data['Risk_Type'][i] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_data['Risk_Type'][i] = 1


Unnamed: 0,County_Names,"Population estimates, July 1, 2019, (V2019)","Population estimates base, April 1, 2010, (V2019)","Population, percent change - April 1, 2010 (estimates base) to July 1, 2019, (V2019)",Population_2010,"Persons under 5 years, percent","Persons under 18 years, percent","Persons 65 years and over, percent","Female persons, percent","White alone, percent",...,"Minority-owned firms, 2012","Nonminority-owned firms, 2012","Veteran-owned firms, 2012","Nonveteran-owned firms, 2012",Population_Density,"Land area in square miles, 2010",FIPS_County_Code,Suicide_Counts,County_of_occurrence,Risk_Type
0,Alameda County,1671329.0,1510258.0,10.7,1510271.0,5.7,20.3,14.3,50.7,49.3,...,75026.0,69300.0,10213.0,134032.0,2043.6,739.02,1,138.0,1.0,0.0
1,Amador County,39752.0,38091.0,4.4,38091.0,4.1,15.1,27.5,45.6,89.6,...,259.0,2453.0,258.0,2363.0,64.1,594.58,5,7.0,5.0,1.0
2,Butte County,219186.0,220005.0,-0.4,220000.0,5.5,20.3,18.4,50.5,85.7,...,1858.0,13644.0,1766.0,13509.0,134.4,1636.46,7,39.0,7.0,1.0
3,Calaveras County,45905.0,45578.0,0.7,45578.0,4.5,17.1,28.0,50.2,91.0,...,291.0,2983.0,392.0,2799.0,44.7,1020.01,9,20.0,9.0,1.0
4,Colusa County,21547.0,21407.0,0.7,21419.0,6.9,27.1,14.9,49.1,91.1,...,397.0,656.0,55.0,944.0,18.6,1150.73,11,2.0,11.0,0.0


In [6]:
# Splitting data up
x_data = all_data[['Avg_Household_Size', 'HS_Grad_Percent', 'College_Grad_Percent', 'Disability_percent', 'Vet_Population', 'Employed_16+_Percent', 'Lacking_Health_Insurance_Percent', 'Household_Income', 'Poverty_Percentage', 'Population_Density']]
y_data = all_data['Risk_Type']

In [7]:
# Splitting data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data)

In [8]:
# Creating tensors for NN
x, x_t = torch.from_numpy(pd.DataFrame.to_numpy(x_train)), torch.from_numpy(pd.DataFrame.to_numpy(x_test))
y, y_t = torch.from_numpy(pd.DataFrame.to_numpy(y_train)), torch.from_numpy(pd.DataFrame.to_numpy(y_test))

In [14]:
class MyNetwork(nn.Module):
    def __init__(self):
        super(MyNetwork, self).__init__()
        self.lin1 = nn.Linear(10, 5)
        self.lin2 = nn.Linear(5, 1)
        
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.lin1(x)
        x = self.lin2(x)
        x = self.sigmoid(x)
        return x

net = MyNetwork().double()

In [15]:
# Construct our loss function and an Optimizer. The call to model.parameters()
# in the SGD constructor will contain the learnable parameters of the nn.Linear
# module which is members of the model.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(net.parameters(), lr=0.0000000001)

In [16]:
for epoch in range(2):
    
    for t in range(42):     
        # Zero the gradients
        optimizer.zero_grad()
        
        # Forward pass: Compute predicted y by passing x to the model
        y_pred = net(x[t])

        # Compute and print loss
        loss = nn.L1Loss()
        loss_val = loss(y_pred, y)
        # print every 10 mini-batches
        if t % 5 == 0:    
            print(t, loss_val.item())

        # Perform a backward pass, and update the weights.
        loss_val.backward()
        optimizer.step()

print("Finished Training")

0 0.5714285278493566
5 0.42857142857142855
10 0.42857142857142855
15 0.5714285714285714
20 0.5714285714285714
25 0.5714285714285714
30 0.4285714285714783
35 0.4445339230094961
40 0.5661058242389064
0 0.571428527765756
5 0.42857142857142855
10 0.42857142857142855
15 0.5714285714285714
20 0.5714285714285714
25 0.5714285714285714
30 0.42857142857147806
35 0.44451459194748033
40 0.5660987480344805
Finished Training


In [17]:
# Save model
torch.save(net.state_dict(), 'CA_PNN')

In [18]:
# Load model for testing
net.load_state_dict(torch.load('CA_PNN'))
outputs = net(x_t)
for i in range(15):
    print(str(x_t[i]) + "\n predicted suicide:" + str(outputs[i].item()) + "\n acc suicide:" + str(y_t[i].item()))

tensor([2.4700e+00, 8.0100e+01, 1.4800e+01, 1.4600e+01, 2.0480e+03, 4.4100e+01,
        7.3000e+00, 4.5283e+04, 1.7900e+01, 2.8400e+01], dtype=torch.float64)
 predicted suicide:1.0
 acc suicide:1.0
tensor([2.5700e+00, 8.9200e+01, 2.7200e+01, 1.2600e+01, 1.4209e+04, 5.6300e+01,
        7.7000e+00, 5.2537e+04, 1.6100e+01, 1.3440e+02], dtype=torch.float64)
 predicted suicide:6.627261687205883e-165
 acc suicide:1.0
tensor([2.7200e+00, 8.6300e+01, 4.0800e+01, 8.8000e+00, 1.0208e+04, 6.3700e+01,
        7.3000e+00, 8.2234e+04, 1.0600e+01, 5.8940e+02], dtype=torch.float64)
 predicted suicide:0.20501007288947767
 acc suicide:1.0
tensor([3.2700e+00, 7.1500e+01, 2.4700e+01, 5.5000e+00, 1.6874e+04, 5.9500e+01,
        1.1900e+01, 7.1015e+04, 1.2600e+01, 1.2650e+02], dtype=torch.float64)
 predicted suicide:9.533729455661491e-176
 acc suicide:1.0
tensor([2.6300e+00, 9.3400e+01, 3.4300e+01, 8.5000e+00, 1.3604e+04, 5.7900e+01,
        5.7000e+00, 8.3377e+04, 8.7000e+00, 1.0600e+02], dtype=torch.float