In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
import statsmodels.api as sm
import matplotlib.pyplot as plt

In [3]:
gunData = pd.read_excel('firearm_data_cleaned_new.xlsx')
gunData.head()

Unnamed: 0,year,state,rate,deaths,state_name,law_strength_score,restrictive_laws,permissive_laws,total_law_changes,rate_change,law_strength_change,unique_law_classes,strength_background_checks,strength_carrying_a_concealed_weapon_ccw,strength_castle_doctrine,strength_dealer_license,strength_firearm_sales_restrictions,strength_local_laws_preempted_by_state,strength_minimum_age,strength_prohibited_possessor,strength_registration,strength_waiting_period,strength_firearm_removal_at_scene_of_domestic_violence,strength_firearms_in_college_university,strength_child_access_laws,strength_gun_trafficking,strength_open_carry,strength_required_reporting_of_lost_or_stolen_firearms,strength_safety_training_required,strength_untraceable_firearms,strength_permit_to_purchase,strength_firearms_in_k_12_educational_settings
0,2014,AK,19.2,145,Alaska,11,18,7,25,,,9,2,-1,-4,0,3,-1,7,2,0,2,1,0,0,0,0,0,0,0,0,0
1,2015,AK,23.4,177,Alaska,11,18,7,25,4.2,0.0,9,2,-1,-4,0,3,-1,7,2,0,2,1,0,0,0,0,0,0,0,0,0
2,2016,AK,23.3,177,Alaska,11,18,7,25,-0.1,0.0,9,2,-1,-4,0,3,-1,7,2,0,2,1,0,0,0,0,0,0,0,0,0
3,2017,AK,24.5,180,Alaska,11,18,7,25,1.2,0.0,9,2,-1,-4,0,3,-1,7,2,0,2,1,0,0,0,0,0,0,0,0,0
4,2018,AK,21.0,155,Alaska,11,18,7,25,-3.5,0.0,9,2,-1,-4,0,3,-1,7,2,0,2,1,0,0,0,0,0,0,0,0,0


In [4]:
gunData.shape

(502, 32)

In [5]:
""""
502 rows by 32 columns:

'year', 
'state', 
'rate', 
'deaths', 
'state_name', 
'law_strength_score',
'restrictive_laws', 
'permissive_laws', 
'total_law_changes',
'rate_change',                              # 10
'law_strength_change', 
'unique_law_classes',
'strength_background_checks',
'strength_carrying_a_concealed_weapon_ccw', 
'strength_castle_doctrine',
'strength_dealer_license', 
'strength_firearm_sales_restrictions',
'strength_local_laws_preempted_by_state', 
'strength_minimum_age',
'strength_prohibited_possessor',            # 20
'strength_registration',
'strength_waiting_period',
'strength_firearm_removal_at_scene_of_domestic_violence',
'strength_firearms_in_college_university', 
'strength_child_access_laws',
'strength_gun_trafficking', 
'strength_open_carry',
'strength_required_reporting_of_lost_or_stolen_firearms',
'strength_safety_training_required', 
'strength_untraceable_firearms',            # 30
'strength_permit_to_purchase',
'strength_firearms_in_k_12_educational_settings'
"""

'"\n502 rows by 32 columns:\n\n\'year\', \n\'state\', \n\'rate\', \n\'deaths\', \n\'state_name\', \n\'law_strength_score\',\n\'restrictive_laws\', \n\'permissive_laws\', \n\'total_law_changes\',\n\'rate_change\',                              # 10\n\'law_strength_change\', \n\'unique_law_classes\',\n\'strength_background_checks\',\n\'strength_carrying_a_concealed_weapon_ccw\', \n\'strength_castle_doctrine\',\n\'strength_dealer_license\', \n\'strength_firearm_sales_restrictions\',\n\'strength_local_laws_preempted_by_state\', \n\'strength_minimum_age\',\n\'strength_prohibited_possessor\',            # 20\n\'strength_registration\',\n\'strength_waiting_period\',\n\'strength_firearm_removal_at_scene_of_domestic_violence\',\n\'strength_firearms_in_college_university\', \n\'strength_child_access_laws\',\n\'strength_gun_trafficking\', \n\'strength_open_carry\',\n\'strength_required_reporting_of_lost_or_stolen_firearms\',\n\'strength_safety_training_required\', \n\'strength_untraceable_firearms

## RESEARCH QUESTION 1:

_Can we predict firearm death rates based on gun law characteristics?_

In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import RidgeCV

In [7]:
# kitchen sink models

data1 = gunData[['year', 'state', 'rate', 'deaths', 'law_strength_score',
                    'restrictive_laws', 'permissive_laws', 'total_law_changes',                             
                    'law_strength_change', 'unique_law_classes']].dropna()
    # state is enough, omitting state_name
    # omitting rate_change
data1 = data1.query("state!='District of Columbia'")
    # there is only one datapoint for District of Columbia, flummoxes train_test_split. dropping.


data2 = gunData[['rate', 'state', 'strength_background_checks', 'strength_carrying_a_concealed_weapon_ccw', 
                    'strength_castle_doctrine', 'strength_dealer_license', 'strength_firearm_sales_restrictions',
                    'strength_local_laws_preempted_by_state', 'strength_minimum_age','strength_prohibited_possessor',            
                    'strength_registration', 'strength_waiting_period', 'strength_firearm_removal_at_scene_of_domestic_violence',
                    'strength_firearms_in_college_university', 'strength_child_access_laws', 'strength_gun_trafficking', 
                    'strength_open_carry', 'strength_required_reporting_of_lost_or_stolen_firearms',
                    'strength_safety_training_required', 'strength_untraceable_firearms', 'strength_permit_to_purchase',
                    'strength_firearms_in_k_12_educational_settings']].dropna()
data2 = data2.query("state!='District of Columbia'")

## 4x models:
* Multiple Regression (sensible)
* Logit (Does not make sense, we are not looking for a binary outcome)
* KNN >> evaluate to see if rate breaks into distinct clusters? >> can we do multiple predictors in this?
* K Means >> evaluate to see if rate breaks into distinct clusters? (on its own)

In [8]:
bigX_1 = data1[['year', 'state', 'deaths', 'law_strength_score',
                    'restrictive_laws', 'permissive_laws', 'total_law_changes',                             
                    'law_strength_change', 'unique_law_classes']]
    # state is enough, omitting state_name
    # omitting rate_change

bigX_2 = data2[['state', 'strength_background_checks', 'strength_carrying_a_concealed_weapon_ccw', 
                    'strength_castle_doctrine', 'strength_dealer_license', 'strength_firearm_sales_restrictions',
                    'strength_local_laws_preempted_by_state', 'strength_minimum_age','strength_prohibited_possessor',            
                    'strength_registration', 'strength_waiting_period', 'strength_firearm_removal_at_scene_of_domestic_violence',
                    'strength_firearms_in_college_university', 'strength_child_access_laws', 'strength_gun_trafficking', 
                    'strength_open_carry', 'strength_required_reporting_of_lost_or_stolen_firearms',
                    'strength_safety_training_required', 'strength_untraceable_firearms', 'strength_permit_to_purchase',
                    'strength_firearms_in_k_12_educational_settings']]
bigY1 = data1['rate']
bigY2 = data2['rate']

categories1 = ['year', 'state']
numers1 = ['deaths', 'law_strength_score', 'restrictive_laws', 'permissive_laws', 'total_law_changes',                           
                    'law_strength_change', 'unique_law_classes']

categories2 = ['state']
numers2 = ['strength_background_checks', 'strength_carrying_a_concealed_weapon_ccw', 
                    'strength_castle_doctrine', 'strength_dealer_license', 'strength_firearm_sales_restrictions',
                    'strength_local_laws_preempted_by_state', 'strength_minimum_age','strength_prohibited_possessor',            
                    'strength_registration', 'strength_waiting_period', 'strength_firearm_removal_at_scene_of_domestic_violence',
                    'strength_firearms_in_college_university', 'strength_child_access_laws', 'strength_gun_trafficking', 
                    'strength_open_carry', 'strength_required_reporting_of_lost_or_stolen_firearms',
                    'strength_safety_training_required', 'strength_untraceable_firearms', 'strength_permit_to_purchase',
                    'strength_firearms_in_k_12_educational_settings']

In [9]:
x_train1, x_test1, y_train1, y_test1 = train_test_split(bigX_1, bigY1, test_size = 0.2, random_state = 123)
x_train2, x_test2, y_train2, y_test2 = train_test_split(bigX_2, bigY2, test_size = 0.2, random_state = 123)

In [10]:
xform1 = ColumnTransformer(transformers = [("encoder1", OneHotEncoder(drop='first'), categories1),
                                           ("numeric1", "passthrough", numers1)])
xform2 = ColumnTransformer(transformers = [("encoder2", OneHotEncoder(drop='first'), categories2),
                                           ("numeric2", "passthrough", numers2)])

sinkMod1 = Pipeline(steps = [("transformer1", xform1), ("model1", LinearRegression())])
sinkMod2 = Pipeline(steps = [("transformer2", xform2), ("model2", LinearRegression())])

sinkMod1.fit(x_train1, y_train1)
sinkMod2.fit(x_train2, y_train2)

0,1,2
,steps,"[('transformer2', ...), ('model2', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('encoder2', ...), ('numeric2', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [11]:
predictions1 = sinkMod1.predict(x_test1)
mse_calc1 = mean_squared_error(y_test1, predictions1)
root_mse1 = mse_calc1 ** 0.5
root_mse1

1.2767631486908197

In [12]:
predictions2 = sinkMod2.predict(x_test2)
mse_calc2 = mean_squared_error(y_test2, predictions2)
root_mse2 = mse_calc2 ** 0.5
root_mse2

1.5975319060496322

In [13]:
"""
the following code (calculating adjusted r^2) was produced by generative AI (Claude by Anthropic)) 
in response to a direct request of how to calculate adjusted r^2 for a multi-linear model built with
scikitlearn's Pipeline.
"""

r2_1 = sinkMod1.score(x_test1, y_test1)
n_1 = x_test1.shape[0]
p_1 = x_test1.shape[1]
adj_r2_1 = 1 - (1 - r2_1) * (n_1-1) / (n_1 - p_1 - 1)
adj_r2_1

0.9462662882466966

In [14]:
r2_2 = sinkMod2.score(x_test2, y_test2)
n_2 = x_test2.shape[0]
p_2 = x_test2.shape[1]
adj_r2_2 = 1 - (1 - r2_2) * (n_2 - 1) / (n_2 - p_2 -1)
adj_r2_2

0.903951369034542

In [None]:
"""
the following code (capturing coefficients from the model and putting them into a dataframe, the latter several cells down) was 
produced by generative AI (Claude by Anthropic)) in response to a direct request of how to determine coefficients for 
a multi-linear model built with scikitlearn's OneHotEncoder.
"""

first_set = xform1.get_feature_names_out()
second_set = xform2.get_feature_names_out()

firstMod = sinkMod1.named_steps['model1']
#print(f"Intercept for first model: {firstMod.intercept_}")
#rint(f"Coefficients for first model: {firstMod.coef_}")
secondMod = sinkMod2.named_steps['model2']
#print(f"Intercept for second model: {secondMod.intercept_}")
#print(f"Coefficients for second model: {secondMod.coef_}")

coef_df_1 = pd.DataFrame({'feature': first_set, 'coefficient': firstMod.coef_})
#coef_df_1
coef_df_2 = pd.DataFrame({'feature': second_set, 'coefficient': secondMod.coef_})
#coef_df_2

In [None]:
plotData1 = pd.DataFrame({"actual": y_test1, "predictions": predictions1})
plotData2 = pd.DataFrame({"actual": y_test2, "predictions": predictions2})

In [19]:
px.scatter(plotData1, x="actual", y="predictions", trendline='ols')


In [20]:
px.scatter(plotData2, x="actual", y="predictions", trendline='ols')

In [None]:
"""
Apply Ridge regression (IAW notebook 12 and guidance from genAI)

1.  Make a set of lambdas
2.  Re-preprocess data >> StandardScaler for numerical.
3.  Use RidgeCV object to train model (via Pipeline)
4.  Test model
5.  Evaluate performance

NOTE: doing Ridge but NOT Lasso because Lasso is strong when only a few predictors matter;
This project attempts to model a complex phenomenon, so this possiblity is dismissed.

"""

# Ridge regression with multiple alpha (lambda) values
lambdas = np.logspace(-2, 6, 100)

# re-preprocess data
xform1_1 = ColumnTransformer([('nums', StandardScaler(), numers1),
                              ('cats', OneHotEncoder(drop='first'), categories1)])
xform2_2 = ColumnTransformer([("nums2", StandardScaler(), numers2),
                              ("cats2", OneHotEncoder(drop="first"), categories2)])

# Pipeline >> train model
updated_model1 = Pipeline(steps=[('transformer1_1', xform1_1), 
                                ("model1_1", RidgeCV(alphas=lambdas, cv=10, scoring='neg_mean_squared_error'))])
updated_model2 = Pipeline(steps=[("transformer2_2", xform2_2), 
                                 ("model2_2", RidgeCV(alphas=lambdas, cv=10, scoring='neg_mean_squared_error'))])

updated_model1.fit(x_train1, y_train1)
updated_model2.fit(x_train2, y_train2)

# Test Model
predictions1_1 = updated_model1.predict(x_test1)
predictions2_2 = updated_model2.predict(x_test2)


In [24]:
# Evaluate performance

plotData1_1 = pd.DataFrame({"actual": y_test1, "predicted": predictions1_1})
plotData2_2 = pd.DataFrame({"actual": y_test2, "predicted": predictions2_2})

px.scatter(plotData1_1, x='predicted', y='actual', trendline='ols')

In [25]:
px.scatter(plotData2_2, x='predicted', y='actual', trendline='ols')

In [None]:
# and the metrics

# inputs to .score are both x_test and y_test
r2_1_1 = updated_model1.score(x_test1, y_test1)
n = x_test2.shape[0]
p = x_test2.shape[1]
adj_r2_1_1 = 1 - (1 - r2_1_1) * (n - 1) / (n - p -1)
adj_r2_1_1

0.9383386357349911

In [28]:
r2_2_2 = updated_model2.score(x_test2, y_test2)
n = x_test2.shape[0]
p = x_test2.shape[1]
adj_r2_2_2 = 1 - (1 - r2_2_2) * (n - 1) / (n - p -1)
adj_r2_2_2

0.8926107464701822

In [29]:
mse_calc1_1 = mean_squared_error(y_test1, predictions1_1)
root_mse1_1 = mse_calc1_1 ** 0.5
root_mse1_1

1.2804811718565812

In [30]:
mse_calc2_2 = mean_squared_error(y_test2, predictions2_2)
root_mse2_2 = mse_calc2_2 ** 0.5
root_mse2_2

1.6892127974491857