# Traitement des données

Dans cette partie, l'objectif est d'importer nos données, les convertir à un format convenable pour la production de statistiques descriptives, et exporter le résultat pour que ces bases puissent être utilisées par les autres programmes

Les bases en question sont :
- la base listant tous les incidents de violence par armes à feu aux USA entre 2013 et 2018
- la base listant les caractéristiques économiques générales des comtés et de ses habitants.
- la base gun_legislation proposant une synthèse des différences de législation entre Etats
- la base bg_checks recensant tous les *background checks* du FBI pour l'année 2018 (documentation fournie plus bas)

In [21]:
#Pour le traitement classique
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely.geometry import MultiPolygon
from tqdm import tqdm

#Pour l'interaction avec l'API 
import requests
from statistics import mean
import time
from collections import defaultdict

## Traitement de la base d'incidents armes à feu

In [2]:
#Base incidents armes à feu
url="https://drive.google.com/file/d/1GGOLMc_Ow9yZC9sICegPegDggQuHOD3t/view?usp=drive_link"
url="https://drive.google.com/uc?export=download&confirm=1&id=" + url.split("/")[-2]
gun_violence_db = pd.read_csv(url)
gun_violence_db.sample(5)

Unnamed: 0,incident_id,date,state,city_or_county,address,n_killed,n_injured,incident_url,source_url,incident_url_fields_missing,...,participant_age,participant_age_group,participant_gender,participant_name,participant_relationship,participant_status,participant_type,sources,state_house_district,state_senate_district
154643,692058,2016-11-03,Florida,Jupiter,1694 West Indiantown Road,0,1,http://www.gunviolencearchive.org/incident/692058,http://www.wptv.com/news/region-c-palm-beach-c...,False,...,0::36||2::17,0::Adult 18+||1::Adult 18+||2::Teen 12-17,0::Male||1::Male||2::Male,0::Jesus Manuel Perea Ortiz||2::Travious Terre...,2::Armed Robbery,"0::Injured||1::Unharmed||2::Unharmed, Arrested",0::Victim||1::Victim||2::Subject-Suspect||3::S...,http://www.palmbeachpost.com/news/crime--law/j...,,
71878,347300,2015-05-25,District of Columbia,Washington,Seventh and O Streets NW,1,0,http://www.gunviolencearchive.org/incident/347300,http://www.washingtonpost.com/local/crime/woma...,False,...,0::31,0::Adult 18+,0::Female,0::Tamara Gliss,,0::Killed,0::Victim,http://www.washingtonpost.com/local/crime/woma...,,
27973,162862,2014-07-24,Ohio,Toledo,2957 Collingwood Dr.,1,0,http://www.gunviolencearchive.org/incident/162862,http://www.nbc24.com/news/story.aspx,False,...,0::16,0::Teen 12-17,0::Male,0::Tyler McIntoush,,0::Killed,0::Victim,http://www.nbc24.com/news/story.aspx,44.0,11.0
159484,717586,2016-12-01,Illinois,Danville,Jackson and Fletcher,0,2,http://www.gunviolencearchive.org/incident/717586,http://www.wandtv.com/story/33855778/2-arreste...,False,...,1::17||2::18,0::Teen 12-17||1::Teen 12-17||2::Adult 18+,0::Male||1::Male||2::Male,,2::Armed Robbery,"0::Injured||1::Injured, Arrested||2::Unharmed,...",0::Victim||1::Victim||2::Subject-Suspect,http://www.illinoishomepage.net/news/danville-...,,
115381,519028,2016-03-08,Illinois,Chicago,8800 block of South Wood Street,0,1,http://www.gunviolencearchive.org/incident/519028,https://www.dnainfo.com/chicago/20160309/aubur...,False,...,0::42,0::Adult 18+,0::Male||1::Male,,,0::Injured||1::Unharmed,0::Victim||1::Subject-Suspect,https://www.dnainfo.com/chicago/20160309/aubur...,35.0,18.0


Conformément à la documentation de la base, certaines colonnes sont codées de façon à pouvoir les reconvertir en dictionnaire :

In [3]:
def convert_to_dict(value):
    if pd.isna(value):
        return value

    pairs = value.split('||')
    result_dict = {}
    for pair in pairs:
        #Some are corrupted : 1: instead of ::
        if '::' in pair:
            key, val = pair.split('::', 1)
            result_dict[int(key)] = val
        else:
            key, val = pair.split(':', 1)
            result_dict[int(key)] = val
    return result_dict

list_of_dict_columns = ['gun_stolen', 'gun_type', 'participant_age', 'participant_age_group', 'participant_gender', 'participant_name', 'participant_relationship', 'participant_status', 'participant_type']
gun_violence_db[list_of_dict_columns] = gun_violence_db[list_of_dict_columns].applymap(convert_to_dict)
gun_violence_db.head()



  gun_violence_db[list_of_dict_columns] = gun_violence_db[list_of_dict_columns].applymap(convert_to_dict)


Unnamed: 0,incident_id,date,state,city_or_county,address,n_killed,n_injured,incident_url,source_url,incident_url_fields_missing,...,participant_age,participant_age_group,participant_gender,participant_name,participant_relationship,participant_status,participant_type,sources,state_house_district,state_senate_district
0,461105,2013-01-01,Pennsylvania,Mckeesport,1506 Versailles Avenue and Coursin Street,0,4,http://www.gunviolencearchive.org/incident/461105,http://www.post-gazette.com/local/south/2013/0...,False,...,{0: '20'},"{0: 'Adult 18+', 1: 'Adult 18+', 2: 'Adult 18+...","{0: 'Male', 1: 'Male', 3: 'Male', 4: 'Female'}",{0: 'Julian Sims'},,"{0: 'Arrested', 1: 'Injured', 2: 'Injured', 3:...","{0: 'Victim', 1: 'Victim', 2: 'Victim', 3: 'Vi...",http://pittsburgh.cbslocal.com/2013/01/01/4-pe...,,
1,460726,2013-01-01,California,Hawthorne,13500 block of Cerise Avenue,1,3,http://www.gunviolencearchive.org/incident/460726,http://www.dailybulletin.com/article/zz/201301...,False,...,{0: '20'},"{0: 'Adult 18+', 1: 'Adult 18+', 2: 'Adult 18+...",{0: 'Male'},{0: 'Bernard Gillis'},,"{0: 'Killed', 1: 'Injured', 2: 'Injured', 3: '...","{0: 'Victim', 1: 'Victim', 2: 'Victim', 3: 'Vi...",http://losangeles.cbslocal.com/2013/01/01/man-...,62.0,35.0
2,478855,2013-01-01,Ohio,Lorain,1776 East 28th Street,1,3,http://www.gunviolencearchive.org/incident/478855,http://chronicle.northcoastnow.com/2013/02/14/...,False,...,"{0: '25', 1: '31', 2: '33', 3: '34', 4: '33'}","{0: 'Adult 18+', 1: 'Adult 18+', 2: 'Adult 18+...","{0: 'Male', 1: 'Male', 2: 'Male', 3: 'Male', 4...","{0: 'Damien Bell', 1: 'Desmen Noble', 2: 'Herm...",,"{0: 'Injured, Unharmed, Arrested', 1: 'Unharme...","{0: 'Subject-Suspect', 1: 'Subject-Suspect', 2...",http://www.morningjournal.com/general-news/201...,56.0,13.0
3,478925,2013-01-05,Colorado,Aurora,16000 block of East Ithaca Place,4,0,http://www.gunviolencearchive.org/incident/478925,http://www.dailydemocrat.com/20130106/aurora-s...,False,...,"{0: '29', 1: '33', 2: '56', 3: '33'}","{0: 'Adult 18+', 1: 'Adult 18+', 2: 'Adult 18+...","{0: 'Female', 1: 'Male', 2: 'Male', 3: 'Male'}","{0: 'Stacie Philbrook', 1: 'Christopher Ratlif...",,"{0: 'Killed', 1: 'Killed', 2: 'Killed', 3: 'Ki...","{0: 'Victim', 1: 'Victim', 2: 'Victim', 3: 'Su...",http://denver.cbslocal.com/2013/01/06/officer-...,40.0,28.0
4,478959,2013-01-07,North Carolina,Greensboro,307 Mourning Dove Terrace,2,2,http://www.gunviolencearchive.org/incident/478959,http://www.journalnow.com/news/local/article_d...,False,...,"{0: '18', 1: '46', 2: '14', 3: '47'}","{0: 'Adult 18+', 1: 'Adult 18+', 2: 'Teen 12-1...","{0: 'Female', 1: 'Male', 2: 'Male', 3: 'Female'}","{0: 'Danielle Imani Jameison', 1: 'Maurice Eug...",{3: 'Family'},"{0: 'Injured', 1: 'Injured', 2: 'Killed', 3: '...","{0: 'Victim', 1: 'Victim', 2: 'Victim', 3: 'Su...",http://myfox8.com/2013/01/08/update-mother-sho...,62.0,27.0


In [4]:
gun_violence_db.to_csv("data/gun_violence_db.csv", index=False)

## Traitements des informations par comté

Ici, nous récupérons plusieurs informations nous permettant d'enrichir notre analyse des incidents par armes à feu. Nous utilisons l'API Fred, nous permettant de récupérer les valeurs de plusieurs séries temporelles de variables économiques par comté.
Puis, nous utilisons encore Fred pour récupérer la géographie des comtés, nécessaire pour effectuer de la cartographie.
La documentation précisant le mode d'interaction avec l'API de StLouisFed se trouve à la page https://fred.stlouisfed.org/docs/api/fred/#API.

### Préparation de l'extraction de données

Chaque objet dans l'API peut être associé avec son ID. Nous récupérons ici la liste des comtés et les ID associés.

In [5]:
#Each request is categorized with an url and an id
#The gist here is to recover the proper id to retrieve data
api_key = "180de2e6a1d1e953d270ebf38341cd44"
param = {"api_key" : api_key, "file_type" : "json", "category_id" : "27281"}
url = "https://api.stlouisfed.org/fred/category/children?"

In [6]:
def request_db(index):
    #this function requests to the API the database associated with the category id index
    param["category_id"] = index #on ajuste les paramètres de la request pour demander la bonne catégorie
    response = requests.get(url, params = param)
    data = response.json()
    return data

In [7]:
def check_response(url, param):
    while True:
        response = requests.get(url, params=param).json()

        if not response.get('error_message') or response['error_message'] != 'Too Many Requests.  Exceeded Rate Limit':
            return response
        time.sleep(5)

In [8]:
#Some names are ambiguous between dframes
def simplify_name(name):
    if name.endswith("County"):
        return name.rsplit("County", 1)[0].strip()
    if name.endswith("Parish"):
        return name.rsplit("Parish", 1)[0].strip()
    if name.endswith("Census Area"):
        return name.replace("Census Area", "CA")
    if name.endswith("Borough/city"):
        return name.replace("Borough/city", "Cty&Bor")      
    if name.endswith("Municipality"):
        return name.replace("Municipality", "Muny")
    if name.endswith("Borough/municipality"):
        return name.replace("Borough/municipality", "Muny")    
    if name.endswith("County/city"):
        return name.rsplit("County/city", 1)[0].strip() 

        
    return name

In [9]:
us_data = request_db(27281)['categories']
#We create our dframe by creating a list of dicts, each element is a new row
database = list()
for state in tqdm(us_data):
    id_state = state['id']
    state_name = state['name']
    
    #Request to recover id in order to extract counties
    state_info = request_db(id_state)["categories"]
    if state_info != []: #One exception : which one ?
        id_list_of_state_counties = state_info[0]['id']
        list_of_state_counties = request_db(id_list_of_state_counties)["categories"]
        for county in list_of_state_counties:
            dict_county = dict()
            id_county = county['id']
            
            parts = county['name'].split(', ')
            county_name, state_code = parts[0], parts[-1]
            
            dict_county = {
                'Nom': simplify_name(county_name),
                'Etat': state_name,
                'Code_Etat': state_code,
                'id_Etat': id_state,
                'id_county': id_county
            }
            
            if '+' in county_name:
                county_name, county_bis = county_name.split(' + ')
                dict_county['Nom'] = simplify_name(county_bis)
                database.append(dict_county.copy())  # Append a copy to avoid modifying the original
                dict_county['Nom'] = simplify_name(county_name)
            database.append(dict_county)

counties_db = pd.DataFrame(database)

100%|██████████| 53/53 [00:23<00:00,  2.22it/s]


In [10]:
#Gestion des exceptions pour le merging(obligé de faire du cas par cas car merging sur le code_Etat + nom)

counties_db.loc[(counties_db['Code_Etat'] == 'Aleutian Islands Census Area'), 'Code_Etat'] = 'AK'
counties_db.loc[(counties_db['Code_Etat'] == 'District of Columbia'), 'Code_Etat'] = 'DC'
counties_db.loc[(counties_db['Nom'] == 'De Soto'), 'Nom'] = 'DeSoto'
counties_db.loc[(counties_db['Nom'] == 'DeSoto') & (counties_db['Code_Etat'] == 'LA'), 'Nom'] = 'De Soto'
counties_db.loc[(counties_db['Nom'] == 'De Kalb'), 'Nom'] = 'DeKalb'
counties_db.loc[(counties_db['Nom'] == 'Du Page'), 'Nom'] = 'DuPage'
counties_db.loc[(counties_db['Nom'] == 'La Salle'), 'Nom'] = 'LaSalle'
counties_db.loc[(counties_db['Nom'] == 'La Porte'), 'Nom'] = 'LaPorte'
counties_db.loc[(counties_db['Nom'] == 'Lagrange'), 'Nom'] = 'LaGrange'
counties_db.loc[(counties_db['Nom'] == 'LaFourche'), 'Nom'] = 'Lafourche'
counties_db.loc[(counties_db['Nom'] == 'Lac Qui Parle'), 'Nom'] = 'Lac qui Parle'
counties_db.loc[(counties_db['Nom'] == 'Dona Ana'), 'Nom'] = 'Doña Ana'
counties_db.loc[(counties_db['Nom'] == 'La Moure'), 'Nom'] = 'LaMoure'
counties_db.loc[(counties_db['Nom'] == 'De Witt'), 'Nom'] = 'DeWitt'
counties_db.loc[(counties_db['Nom'] == 'DeWitt') & (counties_db['Code_Etat'] == 'IL'), 'Nom'] = 'De Witt'
counties_db.loc[(counties_db['Nom'] == 'LaSalle') & (counties_db['Code_Etat'] == 'TX'), 'Nom'] = 'La Salle'
counties_db.loc[(counties_db['Code_Etat'] == 'WI (includes Menominee)'), 'Code_Etat'] = 'WI'
counties_db.loc[(counties_db['Nom'] == 'Fond Du Lac'), 'Nom'] = 'Fond du Lac'

On a maintenant un premier dframe recensant tous les comtés des USA ainsi que les ID permettant de les retrouver dans l'API. On peut désormais extraire pour chaque comté les informations socio-démographiques nous permettant de produire nos statistiques descriptives et notre modèle.

Note : id_county fait office de clé primaire dans cette base (au sein de l'API FRED)

In [11]:
counties_db.to_csv("data/counties_db.csv", index=False)

### Récupération des données géographiques

FRED n'inclut pas directement de données géographiques associées aux comtés par leurs ID. Il faut donc ici récupérer ces données géographiques, et effectuer une jointure sur l'Etat et le nom du comté (donc de nombreuses exceptions).

In [12]:
#On change l'URL pour pouvoir récupérer des données géographiques sur tous les comtés (ainsi que le fips, le nom simplifié)
#L'enjeu ici est d'associer les données aux comtés correspondants : les codes id de l'API n'ont pas de correspondance avec les donneés géographiques à part le nom
url = "https://api.stlouisfed.org/geofred/shapes/file?shape=county"

In [13]:
geom_counties_db = request_db(29802)#The code is arbitrary here, each request gives the geometries of all counties
geom_counties_db = gpd.GeoDataFrame.from_features(geom_counties_db['features'])
geom_counties_db = geom_counties_db.loc[geom_counties_db['hc-group'] == 'admin2']
geom_counties_db['Code_Etat'] = geom_counties_db['hc-key'].apply(lambda x: x.split('-')[1].upper() if len(x.split('-')) > 1 else None)
geom_counties_db['name'] = geom_counties_db['name'].apply(lambda x : x.rsplit("Parish", 1)[0].strip())
geom_counties_db.sample(5)


Unnamed: 0,geometry,hc-group,hc-middle-x,hc-middle-y,hc-key,hc-a2,fips,name,Code_Etat
482,"POLYGON ((5024.000 7722.000, 5062.000 7723.000...",admin2,0.5,0.5,us-ia-195,WO,19195,Worth,IA
2781,"POLYGON ((-714.000 7295.000, -726.000 7299.000...",admin2,0.48,0.51,us-ca-033,LA,6033,Lake,CA
779,"POLYGON ((8330.000 8051.000, 8342.000 8052.000...",admin2,0.51,0.38,us-ny-053,MA,36053,Madison,NY
1972,"POLYGON ((4396.000 8261.000, 4396.000 8277.000...",admin2,0.5,0.5,us-mn-155,TR,27155,Traverse,MN
2028,"MULTIPOLYGON (((7771.000 6486.000, 7779.000 64...",admin2,0.5,0.5,us-wv-063,MO,54063,Monroe,WV


In [14]:
counties_db = pd.merge(left= counties_db, right=geom_counties_db, how='left', left_on=['Nom', 'Code_Etat'], right_on=['name', 'Code_Etat'])
counties_db = counties_db.loc[:,['Nom', 'Etat', 'Code_Etat', 'id_Etat', 'id_county', 'fips', 'geometry']]
counties_db.sample(10)

Unnamed: 0,Nom,Etat,Code_Etat,id_Etat,id_county,fips,geometry
2989,Rockingham,Virginia,VA,27330,33818,51165,"POLYGON ((7971.000 6805.000, 7980.000 6830.000..."
1979,Scotland,North Carolina,NC,27315,29337,37165,"POLYGON ((8092.000 5819.000, 8099.000 5792.000..."
2963,York,Virginia,VA,27330,30335,51199,"POLYGON ((8547.000 6512.000, 8547.000 6512.000..."
1233,Nantucket,Massachusetts,MA,27303,28578,25019,"POLYGON ((9564.000 7883.000, 9583.000 7863.000..."
2338,Darlington,South Carolina,SC,27323,29704,45031,"POLYGON ((8011.000 5695.000, 8036.000 5688.000..."
3002,Fairfax,Virginia,VA,27330,33927,51059,"MULTIPOLYGON (((8314.000 6895.000, 8315.000 68..."
1038,Gallatin,Kentucky,KY,152,883,21077,"POLYGON ((6823.000 6614.000, 6807.000 6607.000..."
2058,Butler,Ohio,OH,27317,29418,39017,"POLYGON ((6790.000 6832.000, 6857.000 6841.000..."
717,Daviess,Indiana,IN,151,532,18027,"POLYGON ((6382.000 6511.000, 6366.000 6510.000..."
215,Modoc,California,CA,27286,27546,6049,"POLYGON ((-214.000 7980.000, -108.000 7950.000..."


Les comtés où la jointure géographique n'est pas possible (11 comtés sur 3195, soit 3‰):

In [15]:
counties_db.loc[counties_db['geometry'] == None]

Unnamed: 0,Nom,Etat,Code_Etat,id_Etat,id_county,fips,geometry
68,Aleutian Islands CA,Alaska,AK,27283,33743,,
90,Prince of Wales-Outer Ketchikan CA,Alaska,AK,27283,27421,,
92,Skagway-Hoonah-Angoon CA,Alaska,AK,27283,27423,,
96,Wade Hampton CA,Alaska,AK,27283,27426,,
97,Wrangell Borough/City,Alaska,AK,27283,33518,,
98,Wrangell-Petersburg CA,Alaska,AK,27283,27427,,
99,Yakutat City and Borough,Alaska,Yakutat City and Borough,27283,32212,,
556,Kalawao Counties,Hawaii,HI,27293,33804,,
2425,Shannon,South Dakota,SD,27324,29791,,
2856,Clifton Forge City,Virginia,VA,27330,30228,,


### Récupération des séries FRED

Ici, le but est de récupérer des données économiques clé pour chaque comté : population, taux de chômage, salaire médian, nombre de bénéficiaires des aides alimentaires, taux de scolarisation.
En raison de la limite de requêtes par minute, nous décomposons cette extraction pour minimiser les erreurs associées.
Dans un premier temps, nous récupérons la liste des séries associées à chaque comté ainsi que l'ID de ces séries.
Une fois l'ID récupéré, nous extrayons toutes les données nécessaires.

In [16]:
#Attempt to optimize the execution time + retrieve multiple series all at once
#Step 1 : Retrieve the ID of all series
dict_series = {}
for id_county in tqdm(counties_db['id_county']):
    url = "https://api.stlouisfed.org/fred/category/series?"
    param = {"api_key" : api_key, "file_type" : "json", "category_id" : id_county}
    response = check_response(url, param)

    dict_series[id_county] = response['seriess']

100%|██████████| 3195/3195 [26:23<00:00,  2.02it/s]  


In [17]:
#Step 2 : Retrieve the ID of the specific series we want to study
series_to_retrieve = {}
for id_county in tqdm(counties_db['id_county']):
    unemp_rate_id = med_h_income_id = resident_pop_id = snap_beneficiaries_id = bachelors_deg_id = 'not found'
    for serie in dict_series[id_county]:
        if ("Unemployment Rate" in serie["title"]) and ("Monthly" in serie["frequency"]):
            unemp_rate_id = serie['id']
        if ("Estimate of Median Household Income" in serie["title"]) and ("Interval" not in serie["title"]):
            med_h_income_id = serie['id']
        if ("Resident Population" in serie["title"]):
            resident_pop_id = serie['id']
        if ("SNAP Benefits Recipients" in serie["title"]):
            snap_beneficiaries_id = serie['id']
        if ("Bachelor's Degree or Higher" in serie["title"]):  
            bachelors_deg_id = serie['id']
    series_to_retrieve[id_county] = (unemp_rate_id, med_h_income_id, resident_pop_id, snap_beneficiaries_id, bachelors_deg_id)
#eg : series_to_retrieve = {id_county_1 : (series_1_id, series_2_id, ...)}

100%|██████████| 3195/3195 [00:00<00:00, 4279.11it/s]


In [31]:
#Step 3 : Determine the value associated with the series
list_of_values = defaultdict(dict)
list_of_variables = ['unemp_rate', 'med_h_income', 'resident_pop', 'snap_beneficiaries', 'bachelors_deg']

for id_county in tqdm(counties_db['id_county']):
    for index, id_series in enumerate(series_to_retrieve[id_county]):
        if id_series == 'not found':
            continue

        # Request data
        url = "https://api.stlouisfed.org/fred/series/observations?"
        param = {
            "api_key": api_key,
            "file_type": "json",
            "series_id": id_series,
            "observation_start": "2013-01-01",
            "observation_end": "2018-01-01",
        }
        observations = check_response(url, param).get("observations", [])

        # Extract values and years
        values = [float(obs["value"]) for obs in observations]
        years = [pd.to_datetime(obs["date"]).year for obs in observations]

        # Save values for each year
        for year, value in zip(years, values):
            list_of_values[id_county][f"{list_of_variables[index]}_year_{year}"] = value

# Convert defaultdict to regular dict for better compatibility
list_of_values = dict(list_of_values)

100%|██████████| 3195/3195 [2:11:06<00:00,  2.46s/it]  


In [40]:
#Step 4 : Convert data in a dataframe, then merge it to obtain our final db
cols = ['unemp_rate_year_2013', 'unemp_rate_year_2014', 'unemp_rate_year_2015', 'unemp_rate_year_2016', 'unemp_rate_year_2017', 'unemp_rate_year_2018', 'med_h_income_year_2013', 'med_h_income_year_2014', 'med_h_income_year_2015', 'med_h_income_year_2016', 'med_h_income_year_2017', 'med_h_income_year_2018', 'resident_pop_year_2013', 'resident_pop_year_2014', 'resident_pop_year_2015', 'resident_pop_year_2016', 'resident_pop_year_2017', 'resident_pop_year_2018', 'snap_beneficiaries_year_2013', 'snap_beneficiaries_year_2014', 'snap_beneficiaries_year_2015', 'snap_beneficiaries_year_2016', 'snap_beneficiaries_year_2017', 'snap_beneficiaries_year_2018', 'bachelors_deg_year_2013', 'bachelors_deg_year_2014', 'bachelors_deg_year_2015', 'bachelors_deg_year_2016', 'bachelors_deg_year_2017', 'bachelors_deg_year_2018']
fred_db = pd.DataFrame.from_dict(list_of_values, orient='index', columns=cols)
fred_db.reset_index(inplace=True)
fred_db.rename(columns={'index': 'id_county'}, inplace=True)
fred_db.head(5)

counties_db = pd.merge(left= counties_db, right=fred_db, how='left', left_on='id_county', right_on='id_county')

## Traitement de la base sur la législation par Etat

La base ici étudiée nous permet de synthétiser les différences de législation encadrant la vente, la possession et l'utilisation d'armes à feu sur le teritoire américain. Pour pouvoir comparer ces différences, nous essayons ici de construire des variables synthétiques traduisant l'intensité des restrictions dans plusieurs catégories : ....

La documentation de la base peut se trouver à l'addresse https://view.officeapps.live.com/op/view.aspx?src=https%3A%2F%2Fmail.statefirearmlaws.org%2Fsites%2Fdefault%2Ffiles%2F2020-07%2Fcodebook_0.xlsx&wdOrigin=BROWSELINK.

In [41]:
legislation_db = pd.read_excel('data/gun_legislation.xlsx')
legislation_db = legislation_db.loc[legislation_db['year'] == 2018].sort_values(by='state')
legislation_db.head(10)

Unnamed: 0,state,year,felony,invcommitment,invoutpatient,danger,drugmisdemeanor,alctreatment,alcoholism,relinquishment,...,expartedating,dvrosurrender,dvrosurrendernoconditions,dvrosurrenderdating,expartesurrender,expartesurrendernoconditions,expartesurrenderdating,dvroremoval,stalking,lawtotal
27,Alabama,2018,0,1,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,10
57,Alaska,2018,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
87,Arizona,2018,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,8
117,Arkansas,2018,1,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,11
147,California,2018,1,1,0,1,0,0,1,1,...,1,1,1,1,1,1,1,1,1,109
177,Colorado,2018,1,0,0,0,0,0,0,0,...,0,1,1,0,0,0,0,0,1,30
207,Connecticut,2018,1,1,1,1,1,0,0,1,...,1,1,1,1,1,1,1,0,1,88
237,Delaware,2018,1,1,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,41
267,Florida,2018,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,30
297,Georgia,2018,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6


Ici, pour synthétiser l'information, nous pouvons construire un score d'intensité de la régulation pour chaque indicatrice, et calculer le score total pour cahque état (en groupant les indicatrices par type de législation).
Les scores sont ici fixés de manière arbitraire, se rangeant de 1(législation modérée) à 5(législation très restrictive). Nous calculons ensuite le maximum de score pour chaque catégorie de législation :
- réglementation des vendeurs
- réglementation des acheteurs
- restriction des armes à haut risque
- *background checks*
- réglementation de la possession d'armes à feu
- réglementation des armes d'assaut
- protetion des enfants
- réglementation du traffic d'armes
- restrictions après des faits de violence domestique


In [42]:
# Adjusted scores for each category
category_scores = {
    'dealer_regulations': {
        'dealer': 5,
        'dealerh': 4,
        'recordsdealer': 3,
        'recordsdealerh': 3,
        'recordsall': 2,
        'recordsallh': 2,
        'reportdealer': 5,
        'reportdealerh': 4,
        'reportall': 3,
        'reportallh': 3,
        'purge': 2,
        'residential': 1,
        'theft': 3,
        'security': 4,
        'inspection': 3,
        'liability': 4,
        'junkgun': 5,
    },
    'buyers_regulations': {
        'waiting': 4,
        'waitingh': 3,
        'permit': 5,
        'permith': 4,
        'permitlaw': 3,
        'fingerprint': 4,
        'training': 3,
        'registration': 5,
        'registrationh': 4,
        'defactoreg': 3,
        'defactoregh': 3,
        'age21handgunsale': 4,
        'age18longgunsale': 3,
        'age21longgunsaled': 4,
        'age21longgunsale': 4,
        'loststolen': 3,
        'onepermonth': 2,
    },
    'high_risk_gun': {
        'felony': 4,
        'violent': 3,
        'violenth': 3,
        'violentpartial': 2,
        'invcommitment': 4,
        'invoutpatient': 3,
        'danger': 3,
        'drugmisdemeanor': 2,
        'alctreatment': 2,
        'alcoholism': 2,
        'relinquishment': 3,
    },
    'background_checks': {
        'universal': 5,
        'universalh': 4,
        'gunshow': 4,
        'gunshowh': 3,
        'universalpermit': 5,
        'universalpermith': 4,
        'backgroundpurge': 2,
        'threedaylimit': 3,
        'mentalhealth': 4,
        'statechecks': 4,
        'statechecksh': 3,
    },
    'possession_regulations': {
        'age21handgunpossess': 5,
        'age18longgunpossess': 4,
        'age21longgunpossess': 5,
        'gvro': 5,
        'gvrolawenforcement': 4,
        'college': 4,
        'collegeconcealed': 3,
        'elementary': 4,
        'opencarryh': 3,
        'opencarryl': 3,
        'opencarrypermith': 4,
        'opencarrypermitl': 4,
    },
    'assault_weapons': {
        'assault': 5,
        'onefeature': 4,
        'assaultlist': 4,
        'assaultregister': 3,
        'assaulttransfer': 3,
        'magazine': 4,
        'tenroundlimit': 3,
        'magazinepreowned': 3,
    },
    'child_access': {
        'lockd': 4,
        'lockp': 3,
        'lockstandards': 3,
        'locked': 4,
        'capliability': 5,
        'capaccess': 4,
        'capuses': 4,
        'capunloaded': 3,
        'cap18': 4,
        'cap16': 4,
        'cap14': 5,
    },
    'gun_trafficking': {
        'traffickingbackground': 4,
        'traffickingprohibited': 5,
        'traffickingprohibitedh': 4,
        'strawpurchase': 5,
        'strawpurchaseh': 4,
        'microstamp': 4,
        'personalized': 3,
    },
    'domestic_violence': {
        'mcdv': 5,
        'mcdvdating': 5,
        'mcdvsurrender': 4,
        'mcdvsurrendernoconditions': 5,
        'mcdvsurrenderdating': 4,
        'mcdvremovalallowed': 3,
        'mcdvremovalrequired': 4,
        'incidentremoval': 3,
        'incidentall': 4,
        'dvro': 5,
        'dvrodating': 4,
        'exparte': 5,
        'expartedating': 4,
        'dvrosurrender': 4,
        'dvrosurrendernoconditions': 5,
        'dvrosurrenderdating': 4,
        'expartesurrender': 4,
        'expartesurrendernoconditions': 5,
        'expartesurrenderdating': 4,
        'dvroremoval': 4,
        'stalking': 4,
    },
    'concealed_carry' : {
    'permitconcealed': 3,
    'mayissue': 2,
    'showing': 2,
    'ccrevoke': 2,
    'ccbackground': 3,
    'ccbackgroundnics': 3,
    'ccrenewbackground': 2,
    }
}

In [43]:
# Applying the adjusted scoring system
for category, indicator_scores in category_scores.items():
    legislation_db[f'{category}_score'] = legislation_db.apply(
        lambda row: max(row[indicator] * score for indicator, score in indicator_scores.items()),
        axis=1
    )


In [44]:
cols = ['state', 'year', 'lawtotal'] + [f'{category}_score' for category in category_scores.keys()]
legislation_db = legislation_db[cols]
legislation_db.sample(5)

Unnamed: 0,state,year,lawtotal,dealer_regulations_score,buyers_regulations_score,high_risk_gun_score,background_checks_score,possession_regulations_score,assault_weapons_score,child_access_score,gun_trafficking_score,domestic_violence_score,concealed_carry_score
897,New Jersey,2018,78,5,5,4,5,5,5,5,3,5,3
1347,Vermont,2018,19,3,4,0,5,4,4,0,0,5,0
477,Kansas,2018,7,0,0,4,0,0,0,0,0,5,0
1497,Wyoming,2018,7,0,4,4,0,4,0,0,0,0,0
447,Iowa,2018,24,0,4,4,4,5,0,5,0,5,3


In [45]:
score_columns = [f'{category}_score' for category in category_scores.keys()]
legislation_db['score_sum'] = legislation_db[score_columns].sum(axis=1)
legislation_db.sample(5)

Unnamed: 0,state,year,lawtotal,dealer_regulations_score,buyers_regulations_score,high_risk_gun_score,background_checks_score,possession_regulations_score,assault_weapons_score,child_access_score,gun_trafficking_score,domestic_violence_score,concealed_carry_score,score_sum
1437,West Virginia,2018,18,4,4,4,0,0,0,0,0,5,0,17
1287,Texas,2018,18,0,3,4,0,4,0,5,0,5,3,24
447,Iowa,2018,24,0,4,4,4,5,0,5,0,5,3,30
417,Indiana,2018,12,4,0,0,0,4,0,0,4,5,3,20
117,Arkansas,2018,11,0,0,4,0,4,0,0,0,0,3,11


In [46]:
category_weights = {
    'dealer_regulations': 0.2,
    'buyers_regulations': 0.15,
    'high_risk_gun': 0.1,
    'background_checks': 0.1,
    'possession_regulations': 0.1,
    'assault_weapons': 0.1,
    'child_access': 0.05,
    'gun_trafficking': 0.05,
    'domestic_violence': 0.1,
    'concealed_carry': 0.15,
}

# Calculate the weighted sum for each row
legislation_db['score_legis'] = 0
for category, weight in category_weights.items():
    legislation_db['score_legis'] += legislation_db[f'{category}_score'] * weight

In [47]:
legislation_db = legislation_db[['state', 'year', 'score_legis']]
legislation_db.sample(5)

Unnamed: 0,state,year,score_legis
387,Illinois,2018,4.5
627,Massachusetts,2018,5.1
807,Nebraska,2018,2.95
27,Alabama,2018,2.15
987,North Carolina,2018,3.6


## Récupération du nombre d'armes vendues chaque année par Etat

Pas de données publiques, le proxy le plus fréquent est le nombre de background checks : https://github.com/BuzzFeedNews/nics-firearm-background-checks/tree/master




In [48]:
bg_checks_db = pd.read_csv('data/bg_checks.csv')
bg_checks_db = bg_checks_db[['month', 'state', 'totals']]
bg_checks_db.sample(5)

Unnamed: 0,month,state,totals
1762,2021-01,Arizona,60548
10379,2008-01,Oregon,12399
4559,2016-11,Virgin Islands,76
1989,2020-09,Florida,154982
13770,2002-11,Maine,4843


In [49]:
bg_checks_db['month'] = pd.to_datetime(bg_checks_db['month'] + '-01')

# Create a new column for the year
bg_checks_db['year'] = bg_checks_db['month'].dt.year

# Create a pivot table
bg_checks_db = bg_checks_db.pivot_table(values='totals', index='state', columns='year', aggfunc='sum', fill_value=0)

# Display the result
bg_checks_db = bg_checks_db.loc[:, 2013:2018]
bg_checks_db.reset_index(inplace=True)

In [50]:
bg_checks_db.columns = ['state', 'bchecks_2013', 'bchecks_2014', 'bchecks_2015', 'bchecks_2016', 'bchecks_2017', 'bchecks_2018']

In [51]:
bg_checks_db.sample(5)

Unnamed: 0,state,bchecks_2013,bchecks_2014,bchecks_2015,bchecks_2016,bchecks_2017,bchecks_2018
14,Illinois,1280613,1344096,1247398,1924070,1601087,2831447
35,North Carolina,574622,1182349,531876,601445,537813,529916
3,Arkansas,279736,234282,257346,266014,237629,248439
1,Alaska,93405,87623,85621,87647,80839,78761
44,South Dakota,92055,83659,100268,111921,98573,90693


## Récupération de la part de vote républicain / démocrate

Pour cela nous disponsons d'une base recensant les votes dans chaque comté aux présidentielles de 2016: https://github.com/tonmcg/US_County_Level_Election_Results_08-20/tree/master. Il sera facile ensuite de merge les résultats car cette base renseigne pour chaque ligne le fips (identifiant des comtés.)

In [62]:
vote_db = pd.read_csv('https://raw.githubusercontent.com/tonmcg/US_County_Level_Election_Results_08-20/master/2016_US_County_Level_Presidential_Results.csv')
vote_db= vote_db[['per_dem', 'per_gop', 'county_name', 'combined_fips']]
vote_db['combined_fips'] = vote_db['combined_fips'].astype(str)
vote_db['combined_fips'] = vote_db['combined_fips'].apply(lambda x: x.zfill(5))
vote_db.sample(5)

Unnamed: 0,per_dem,per_gop,county_name,combined_fips
2654,0.202036,0.764747,Kerr County,48265
2147,0.216165,0.740568,Craig County,40035
2439,0.171026,0.80226,Claiborne County,47025
2935,0.179241,0.799346,Wise County,51195
2969,0.510525,0.3942,Kitsap County,53035


## Fusion des bases et export

In [63]:
final_db = pd.merge(left=counties_db, right=legislation_db, how='left', left_on='Etat', right_on='state').drop(columns=['state', 'year'])
final_db = pd.merge(left=final_db, right=bg_checks_db, how='left', left_on='Etat', right_on='state').drop(columns=['state'])
final_db = pd.merge(left=final_db, right=vote_db, how='left', left_on='fips', right_on='combined_fips').drop(columns=['combined_fips', 'county_name'])

final_db.sample(5)

Unnamed: 0,Nom,Etat,Code_Etat,id_Etat,id_county,fips,geometry,unemp_rate_year_2013_x,unemp_rate_year_2014_x,unemp_rate_year_2015_x,...,bachelors_deg_year_2018_y,score_legis,bchecks_2013,bchecks_2014,bchecks_2015,bchecks_2016,bchecks_2017,bchecks_2018,per_dem,per_gop
3147,Polk,Wisconsin,WI,27333,30481,55095,"POLYGON ((5255.000 8289.000, 5258.000 8210.000...",6.4,5.3,4.8,...,20.4,2.45,434688,334308,383280,561819,526523,452520,0.333524,0.610991
1122,Ascension,Louisiana,LA,27300,28464,22005,"POLYGON ((5769.000 4283.000, 5708.000 4266.000...",4.2,5.2,4.1,...,26.8,2.2,353025,315357,372613,391869,329201,307192,0.301319,0.66105
6,Butler,Alabama,AL,27282,27342,1013,"POLYGON ((6576.000 4764.000, 6575.000 4772.000...",9.3,7.5,7.5,...,16.1,2.15,563880,621305,737509,616947,477345,474294,0.427864,0.563155
11,Choctaw,Alabama,AL,27282,27347,1023,"POLYGON ((6227.000 4878.000, 6274.000 4881.000...",9.0,7.9,8.5,...,13.0,2.15,563880,621305,737509,616947,477345,474294,0.427766,0.564392
1423,Covington,Mississippi,MS,153,28771,28031,"POLYGON ((5942.000 4716.000, 5951.000 4717.000...",6.7,5.7,5.8,...,17.4,0.8,231711,214829,252372,276161,251852,247278,0.371672,0.616951


In [57]:
final_db.to_csv('data/final_db.csv')