In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.cluster import KMeans
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [6]:
# --- 1. Data Loading and Preparation ---

# Read the cleaned firearm-law dataset from its relative path on disk.
df = pd.read_csv("../Data/processed/firearm_data_cleaned_new.csv")

# Keep only the rows for the most recent year present in the 'year' column.
# .copy() gives an independent frame so later column assignments do not
# touch (or warn about) the full dataset.
max_year = df['year'].max()
is_latest_year = df['year'] == max_year
df_latest = df.loc[is_latest_year].copy()
print(f"--- Data filtered for the year {max_year} ---")

# Predictors: every per-category 'strength_*' column, followed by the
# overall 'law_strength_score' column.
strength_columns = [name for name in df.columns if name.startswith('strength_')]
law_features = strength_columns + ['law_strength_score']

# Feature matrix (X), restricted to the latest year.
X_features = df_latest[law_features]

--- Data filtered for the year 2023 ---


In [7]:
# --- 2. Generating Cluster Labels (Target variable Y) ---

# Number of clusters requested from K-Means; the resulting labels become
# the target for the supervised model trained later.
K = 3

# random_state and n_init are fixed so repeated runs use the same settings.
kmeans = KMeans(
    n_clusters=K,
    n_init=10,
    random_state=42,
)

# Fit on the unscaled feature matrix (kept unscaled for consistency with the
# previous step) and take the per-row cluster assignment as the target Y.
y_clusters = kmeans.fit_predict(X_features)

# Store the assignment next to the latest-year rows it was derived from.
df_latest['kmeans_cluster'] = y_clusters
print(f"Cluster labels (0, 1, 2) generated using K-Means with K={K}.")

Cluster labels (0, 1, 2) generated using K-Means with K=3.


In [8]:
# --- 3. Preprocessing Pipeline ---

# All predictors are numeric, so the only transformation needed is
# standardisation of the law feature columns. Any column not listed in
# law_features is passed through unchanged.
scaling_step = ('scaler', StandardScaler(), law_features)
preprocessor = ColumnTransformer(
    transformers=[scaling_step],
    remainder='passthrough',
)

In [9]:
# --- 4. Splitting Data for Training and Testing ---

# Hold out 30% of the rows for testing. The split is seeded and stratified
# on the cluster labels.
split_options = dict(test_size=0.3, random_state=42, stratify=y_clusters)
X_train, X_test, y_train, y_test = train_test_split(
    X_features, y_clusters, **split_options
)

# Report how many rows (states) ended up in each partition.
for split_name, split_frame in (("Training", X_train), ("Testing", X_test)):
    print(f"{split_name} set size: {len(split_frame)} states")

Training set size: 35 states
Testing set size: 16 states


In [10]:
# --- 5. Defining and Training the MLP Classifier ---

# Hyperparameters for the multi-layer perceptron:
#   hidden_layer_sizes -> two hidden layers with 100 and 50 neurons
#   activation         -> ReLU (rectified linear unit)
#   solver             -> 'adam', a stochastic gradient-based optimizer
#   max_iter           -> upper bound on training iterations
#   random_state       -> fixed seed for reproducibility
mlp_params = {
    'hidden_layer_sizes': (100, 50),
    'activation': 'relu',
    'solver': 'adam',
    'max_iter': 500,
    'random_state': 42,
}
mlp = MLPClassifier(**mlp_params)

# Chain scaling and classification so the scaler is fitted on exactly the
# data passed to fit() and reapplied automatically at prediction time.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('classifier', mlp),
]
model_pipeline = Pipeline(steps=pipeline_steps)

# Fit the whole pipeline on the training partition.
model_pipeline.fit(X_train, y_train)
print("\n--- MLP Classifier Model Trained Successfully ---")


--- MLP Classifier Model Trained Successfully ---


In [11]:
# --- 6. Evaluating the Model ---

# Predict cluster labels for the held-out test rows.
y_pred = model_pipeline.predict(X_test)

# Overall accuracy on the test partition.
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy on Test Set: {accuracy:.4f}")

print("\nClassification Report:")
# K-Means label ids are arbitrary. The names below come from the previous
# cluster analysis and must be revisited if the clustering changes:
#   Cluster 0: Least Restrictive
#   Cluster 1: Most Restrictive
#   Cluster 2: Moderately Restrictive
cluster_names = {
    0: 'Least Restrictive (0)',
    1: 'Most Restrictive (1)',
    2: 'Moderately Restrictive (2)',
}

# Pass `labels` explicitly so every display name is tied to its cluster id.
# Without it, target_names are matched positionally to whichever classes
# happen to occur in y_test / y_pred, and the call raises a ValueError when
# a cluster is missing from the (small) test split.
print(classification_report(
    y_test,
    y_pred,
    labels=list(cluster_names),
    target_names=list(cluster_names.values()),
    zero_division=0,
))



Model Accuracy on Test Set: 0.8750

Classification Report:
                            precision    recall  f1-score   support

     Least Restrictive (0)       0.67      1.00      0.80         2
      Most Restrictive (1)       0.91      1.00      0.95        10
Moderately Restrictive (2)       1.00      0.50      0.67         4

                  accuracy                           0.88        16
                 macro avg       0.86      0.83      0.81        16
              weighted avg       0.90      0.88      0.86        16



In [12]:
# --- 7. Demonstration: Predicting a Hypothetical New State ---

# Overall score given to the hypothetical state (moderate). Defined once so
# the feature row and the printed messages cannot drift apart.
hypothetical_score = 35

# One-row frame with a value for each law feature column the pipeline uses.
hypothetical_state_data = pd.DataFrame({
    'strength_background_checks': [7],
    'strength_carrying_a_concealed_weapon_ccw': [-1],
    'strength_castle_doctrine': [-2],
    'strength_dealer_license': [1],
    'strength_firearm_sales_restrictions': [4],
    'strength_local_laws_preempted_by_state': [0],
    'strength_minimum_age': [6],
    'strength_prohibited_possessor': [5],
    'strength_registration': [0],
    'strength_waiting_period': [2],
    'strength_firearm_removal_at_scene_of_domestic_violence': [1],
    'strength_firearms_in_college_university': [0],
    'strength_child_access_laws': [1],
    'strength_gun_trafficking': [1],
    'strength_open_carry': [0],
    'strength_required_reporting_of_lost_or_stolen_firearms': [1],
    'strength_safety_training_required': [1],
    'strength_untraceable_firearms': [1],
    'strength_permit_to_purchase': [1],
    'strength_firearms_in_k_12_educational_settings': [0],
    'law_strength_score': [hypothetical_score]
})

# Predict the cluster and the per-class probabilities with the fitted pipeline
# (scaling is applied inside the pipeline).
predicted_cluster = model_pipeline.predict(hypothetical_state_data)
predicted_proba = model_pipeline.predict_proba(hypothetical_state_data)[0]

# predict_proba columns follow the classifier's classes_ order, so build the
# header from classes_ instead of assuming the labels are 0, 1, 2.
class_labels = ", ".join(str(label) for label in model_pipeline.classes_)

print("\n--- Hypothetical State Prediction ---")
print(f"Input Features: A synthesized score of {hypothetical_score}.")
print(f"Predicted Cluster: {predicted_cluster[0]}")
print(f"Prediction Probabilities ({class_labels}): {np.round(predicted_proba, 2)}")
# Cluster descriptions below are carried over from the earlier K-Means results.
print("\nInterpretation (from K-Means results):")
print("Cluster 0: Least Restrictive (~12.9 score)")
print("Cluster 1: Most Restrictive (~49.5 score)")
print("Cluster 2: Moderately Restrictive (~24.5 score)")
print(f"\nThe MLP classifies the hypothetical state (Score={hypothetical_score}) into Cluster {predicted_cluster[0]}.")


--- Hypothetical State Prediction ---
Input Features: A synthesized score of 35.
Predicted Cluster: 2
Prediction Probabilities (0, 1, 2): [0.08 0.18 0.73]

Interpretation (from K-Means results):
Cluster 0: Least Restrictive (~12.9 score)
Cluster 1: Most Restrictive (~49.5 score)
Cluster 2: Moderately Restrictive (~24.5 score)

The MLP classifies the hypothetical state (Score=35) into Cluster 2.
