In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
from shapely import wkt
from shapely.geometry import Point

In [7]:
# Load the three project CSV extracts from the local data/ directory.
# NOTE(review): gun_violence_db is not referenced again in the visible cells —
# confirm it is still needed before keeping this read.
gun_violence_db = pd.read_csv('data/gun_violence_db.csv')
merge_geo=pd.read_csv('data/merge_geo.csv')
counties_db = pd.read_csv('data/counties_db.csv')

## Modélisation

Dans cette partie, nous proposons une tentative de modélisation de la fréquence d'incidents, du nombre de tués et du nombre de blessés au total par an, par comté.

In [6]:
# Number of recorded incidents per state (one row per state).
state_incident_counts = merge_geo.groupby('state')['incident_id'].count().reset_index()
state_incident_counts.columns = ['state', 'sum_incident']

# Resident population (2015) per state, summed over the rows of counties_db.
# NOTE(review): the recorded run of this cell failed with
# "KeyError: 'Column not found: resident_pop_year_2015'", so counties_db
# apparently lacks this column — confirm which frame holds the population.
state_population = counties_db.groupby('Etat')['resident_pop_year_2015'].sum().reset_index()
state_population.columns= ['state', 'pop']

# Look the population up by state label instead of dividing the two frames
# row by row: a plain Series division aligns on the positional index, which
# silently pairs the wrong states as soon as one source has a state the other
# lacks. States without a population entry get NaN.
# NOTE(review): assumes merge_geo['state'] and counties_db['Etat'] use the
# same labels — confirm.
pop_by_state = state_population.set_index('state')['pop']
state_incident_counts['incident_per_1K'] = (
    state_incident_counts['sum_incident']
    / state_incident_counts['state'].map(pop_by_state)
)
# NOTE(review): despite the column name, no x1000 factor is applied; the value
# is incidents per resident.

KeyError: 'Column not found: resident_pop_year_2015'

In [None]:
# Mean 'score_legis' per 'Etat', with the grouping key exposed as 'state'.
score_by_state = (
    total_db
    .groupby('Etat')['score_legis']
    .mean()
    .reset_index()
    .rename(columns={'Etat': 'state'})
)

In [None]:
# Inner join of the per-state incident figures with the per-state scores.
merged_data = state_incident_counts.merge(score_by_state, on='state')

In [None]:
# Show the five rows with the lowest 'incident_per_1K' (ascending sort).
merged_data.sort_values(by='incident_per_1K').head()

In [None]:
# Scatter of 'score_legis' (x) against 'incident_per_1K' (y), one point per row
# of merged_data.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(merged_data['score_legis'], merged_data['incident_per_1K'])
ax.set(
    title='Legislation Score vs. Incidents per Capita by State',
    xlabel='Legislation Score',
    ylabel='Incident Rate per Capita',
)
plt.show()

In [None]:
# Mean of 'per_dem' per 'Etat'.
# The column is selected before aggregating: mean('per_dem') did not select a
# column, it passed the string as mean()'s first positional parameter and
# averaged every column of total_db.
dem_state = total_db.groupby('Etat')['per_dem'].mean().reset_index()
dem_state.columns = ['state', 'per_dem']

# Drop any 'per_dem' left by a previous run of this cell so that re-running it
# does not produce 'per_dem_x' / 'per_dem_y' duplicates.
merged_data = pd.merge(
    left=merged_data.drop(columns='per_dem', errors='ignore'),
    right=dem_state,
    on='state',
)

In [None]:
# Correlation between 'per_dem' and 'incident_per_1K' (Series.corr with its
# default method).
merged_data['per_dem'].corr(merged_data['incident_per_1K'])

In [None]:
from scipy.stats import linregress

# Keep only the rows where both regression variables are present.
merged_data = merged_data.dropna(subset=['incident_per_1K', 'score_legis'])

# Simple linear regression of the incident rate (response) on the legislation
# score (predictor), matching the axes of the scatter plot. x and y are passed
# explicitly: handing linregress a single two-column frame leaves the roles
# implicit and treated 'incident_per_1K' as x, i.e. the reverse regression.
# rvalue and pvalue are the same either way; slope and intercept are not.
result = linregress(
    x=merged_data['score_legis'],
    y=merged_data['incident_per_1K'],
)

# Unpack the fitted statistics.
slope = result.slope
intercept = result.intercept
correlation_coefficient = result.rvalue
p_value = result.pvalue
standard_error = result.stderr

# Print the results
print(f"Slope: {slope}")
print(f"Intercept: {intercept}")
print(f"Correlation Coefficient: {correlation_coefficient}")
print(f"P-value: {p_value}")
print(f"Standard Error: {standard_error}")

In [None]:
# Squared correlation coefficient (R^2) of the linregress result.
result.rvalue**2

Ici, nous tentons de construire une carte de chaleur (heatmap) des corrélations entre les variables qui nous semblent intéressantes.

In [None]:
# Subset of total_db columns used for the correlation / regression work.
# .copy() makes this an independent frame: without it, the column assignments
# made on counties_data write to a slice of total_db and pandas emits
# SettingWithCopyWarning.
counties_data = total_db[[
    'fips',
    'resident_pop_year_2015',
    'med_h_income_year_2015',
    'unemp_rate_year_2015',
    'snap_beneficiaries_year_2015',
    'bachelors_deg_year_2015',
    'bchecks_2015',
    'score_legis',
    'per_dem',
]].copy()

# Residents per unit of geometry area.
# NOTE(review): .area needs 'geometry' to hold geometry objects (a GeoSeries);
# the following cell reads areas from total_db_geo instead — confirm that
# total_db is a GeoDataFrame.
counties_data['pop_density'] = total_db['resident_pop_year_2015']/total_db['geometry'].area

In [None]:
# Attach each row's geometry area so it can be averaged with the other columns.
counties_data['area'] = total_db_geo['geometry'].area

# Columns averaged per 'fips' code, listed in the order they appear in the result.
averaged_columns = [
    'med_h_income_year_2015',
    'unemp_rate_year_2015',
    'snap_beneficiaries_year_2015',
    'bachelors_deg_year_2015',
    'bchecks_2015',
    'score_legis',
    'resident_pop_year_2015',
    'area',
    'per_dem',
]
fips_means = counties_data.groupby('fips')[averaged_columns].mean()

# Derive population density, express 'bchecks_2015' per resident, then discard
# the helper 'area' column.
# NOTE(review): the frame is named states_data but is indexed by 'fips' —
# confirm the intended aggregation level.
states_data = (
    fips_means
    .assign(
        pop_density=lambda means: means['resident_pop_year_2015'] / means['area'],
        bchecks_2015=lambda means: means['bchecks_2015'] / means['resident_pop_year_2015'],
    )
    .drop(columns='area')
)


In [None]:
# Preview the first row of the aggregated frame.
states_data.head(1)

In [None]:
# Rows whose 'date' string starts with '2015'. na=False counts a missing date
# as "not 2015"; without it the mask contains NaN and boolean indexing fails.
is_2015 = merge_geo['date'].str.startswith('2015', na=False)

# Per-'fips' totals for those rows: killed, injured, and number of incidents.
incident_data = merge_geo.loc[is_2015].groupby('fips').agg({
    'n_killed': 'sum',
    'n_injured': 'sum',
    'incident_id': 'count',
})
# These are raw counts. A per-resident normalisation that had been disabled by
# wrapping it in a string literal was removed here as dead code.
incident_data.head(1)

In [None]:
# Inner join of the aggregated variables with the incident totals on 'fips'.
heat_data = states_data.merge(incident_data, on='fips')
heat_data.head()

In [None]:
import seaborn as sns

# Annotated heatmap of the pairwise correlations between heat_data's columns.
correlations = heat_data.corr()
g1 = sns.heatmap(correlations, annot=True, fmt=".2f", cmap='seismic')

In [None]:
# List the columns available in heat_data.
heat_data.columns

In [None]:
import statsmodels.api as sm
import pandas as pd

# Complete-case analysis: discard every row with a missing value.
heat_data = heat_data.dropna()

# Explanatory variables of the regression.
ols_regressors = [
    'med_h_income_year_2015',
    'unemp_rate_year_2015',
    'snap_beneficiaries_year_2015',
    'bachelors_deg_year_2015',
    'bchecks_2015',
    'score_legis',
    'pop_density',
]

# Design matrix with an added constant column for the intercept.
X = sm.add_constant(heat_data[ols_regressors])

# OLS of 'n_killed' on the regressors, with the HC3 covariance estimator.
model = sm.OLS(heat_data['n_killed'], X).fit(cov_type='HC3')

# Print the summary
print(model.summary())

In [None]:
# List heat_data's columns again (same expression as the earlier columns cell).
heat_data.columns

In [None]:
from sklearn.linear_model import Lasso
from sklearn.pipeline import Pipeline

def extract_features_selected(lasso: Pipeline, preprocessing_step_name: str = 'preprocess') -> pd.Series:
    """
    Return the names of the features a fitted Lasso pipeline kept.

    A feature is "kept" when its coefficient in the final Lasso step is
    non-zero.

    Parameters:
    - lasso (Pipeline): scikit-learn pipeline whose last step is a fitted
      Lasso model.
    - preprocessing_step_name (str): name of the pipeline step that provides
      `get_feature_names_out()`. Default is 'preprocess'.

    Returns:
    - pd.Series: names of the features with a non-zero coefficient, in the
      order produced by the preprocessing step.

    Raises:
    - ValueError: if `lasso` is not a Pipeline, if its last step is not a
      Lasso model, or if that model has no `coef_` attribute (not fitted).
    """
    # Lasso is imported at the top of this cell: the isinstance check below
    # needs the name at call time, and the only other import of it sits in a
    # later cell than the first call to this function.
    if not isinstance(lasso, Pipeline):
        raise ValueError("The provided lasso object is not a scikit-learn pipeline.")

    # The estimator is the last step of the pipeline.
    lasso_model = lasso[-1]

    if not isinstance(lasso_model, Lasso):
        raise ValueError("The final step of the pipeline is not a Lasso regression model.")

    # coef_ only exists once the model has been fitted.
    if not hasattr(lasso_model, 'coef_'):
        raise ValueError("The provided Lasso regression model does not have 'coef_' attribute. "
                         "Make sure it is a trained Lasso regression model.")

    # Output feature names of the preprocessing step, aligned with coef_.
    features_preprocessing = lasso[preprocessing_step_name].get_feature_names_out()

    # Boolean mask of the coefficients the L1 penalty did not shrink to zero.
    kept = np.abs(lasso_model.coef_) > 0
    features_selec = pd.Series(features_preprocessing[kept])

    return features_selec

In [None]:
# Imported here because the notebook's other import of train_test_split sits
# in a later cell, which breaks a top-to-bottom run.
from sklearn.model_selection import train_test_split

# 2. Training sample and test sample
# Features: every column except the two casualty counts.
# Target: 'n_killed' multiplied by 100, kept as a one-column DataFrame.
# NOTE(review): 'incident_id' (the per-fips incident count) stays among the
# features — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    heat_data.drop(["n_killed", 'n_injured'], axis = 1),
    100*heat_data[['n_killed']], test_size=0.2, random_state=0
)

In [None]:
# StandardScaler is imported here as well: the notebook's other import of it
# sits in a later cell, which breaks a top-to-bottom run.
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer, ColumnTransformer

# Split the training columns by dtype.
numerical_features = X_train.select_dtypes(include='number').columns.tolist()
categorical_features = X_train.select_dtypes(exclude='number').columns.tolist()


# Numeric columns: mean imputation, then standardisation.
numeric_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', StandardScaler())
])

# Dense one-hot output. The keyword was renamed from `sparse` to
# `sparse_output` in recent scikit-learn releases; try the new name first and
# fall back to the old one so the cell runs on either side of the rename.
try:
    one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Non-numeric columns: most-frequent imputation, then one-hot encoding.
categorical_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('one-hot', one_hot_encoder)
])


# The transformer names ('number', 'category') become the prefixes of the
# output feature names.
preprocessor = ColumnTransformer(transformers=[
    ('number', numeric_pipeline, numerical_features),
    ('category', categorical_pipeline, categorical_features)
])

In [None]:
# Imported here because the notebook's other import of Lasso sits in a later
# cell, which breaks a top-to-bottom run.
from sklearn.linear_model import Lasso

# Lasso regression with a fixed penalty strength.
# NOTE(review): `model` is also the name used for the statsmodels OLS result
# in an earlier cell; this assignment rebinds it.
model = Lasso(fit_intercept=True, alpha = 0.1)

# Preprocessing followed by the Lasso, fitted on the training sample.
lasso_pipeline = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('model', model)
])
lasso_pipeline.fit(X_train, y_train)

In [None]:
# Question 5
# Names of the features kept by the fitted pipeline, plus a handle on its
# final 'model' step for inspection.
features_selec = extract_features_selected(lasso_pipeline)
lasso1 = lasso_pipeline.named_steps['model']

In [None]:
# Remove the 'number__' / 'category__' substrings (the ColumnTransformer name
# prefixes) from the selected feature names.
features_selec.str.replace("(number__|category__)", "", regex = True)

In [None]:
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
import sklearn.metrics
from sklearn.linear_model import LinearRegression
import matplotlib.cm as cm
import matplotlib.pyplot as plt
from sklearn.linear_model import lasso_path
import seaborn as sns

In [None]:
# 6. Using lasso_path
# Numeric training features passed through the numeric pipeline; the
# categorical columns are dropped here rather than encoded.
preprocessed_features = pd.DataFrame(
    numeric_pipeline.fit_transform(
        X_train.drop(columns=categorical_features)
    )
)
my_alphas = np.array([0.001,0.01,0.02,0.025,0.05,0.1,0.25,0.5,0.8,1.0])

alpha_for_path, coefs_lasso, _ = lasso_path(
    preprocessed_features,
    y_train,
    alphas=my_alphas)

# Number of non-zero coefficients for each alpha. The last axis of coefs_lasso
# indexes the alphas; every leading axis is flattened onto a single axis before
# counting, so the count is per alpha whether the array is
# (n_features, n_alphas) or (n_targets, n_features, n_alphas). The previous
# apply_along_axis + sum only gave a per-alpha count for the 3-D layout and
# collapsed to one overall total for the 2-D layout.
coefs_by_alpha = coefs_lasso.reshape(-1, coefs_lasso.shape[-1])
nb_non_zero = pd.Series(np.count_nonzero(coefs_by_alpha, axis=0))

## plot

sns.set_style("whitegrid")
plt.figure()
p = sns.lineplot(y=nb_non_zero, x=alpha_for_path)
p.set(title = r"Number variables and regularization parameter ($\alpha$)", xlabel=r'$\alpha$', ylabel='Nb. de variables')
# Display the figure holding the axes as the cell output.
p.figure