# Traitement des données

Dans cette partie, l'objectif est d'importer nos données, de les convertir dans un format adapté à la production de statistiques descriptives, puis d'exporter le résultat pour que ces bases puissent être utilisées par les autres programmes.

Les bases en question sont :
- la base listant tous les incidents de violence par armes à feu aux USA entre 2013 et 2018
- la base listant les caractéristiques générales des comtés et de leurs habitants.

In [3]:
#Pour le traitement classique
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely.geometry import MultiPolygon

#Pour l'interaction avec l'API 
import requests
from statistics import mean
import time

## Traitement de la base d'incidents armes à feu

In [3]:
# Gun-violence incidents dataset, published as a shared Google Drive file
share_link = "https://drive.google.com/file/d/1GGOLMc_Ow9yZC9sICegPegDggQuHOD3t/view?usp=drive_link"
# The file id is the second-to-last path segment of the share link; it is
# appended to the direct-download endpoint so pandas can read the CSV.
drive_file_id = share_link.split("/")[-2]
url = "https://drive.google.com/uc?export=download&confirm=1&id=" + drive_file_id
gun_violence_db = pd.read_csv(url)
gun_violence_db.sample(5)

Unnamed: 0,incident_id,date,state,city_or_county,address,n_killed,n_injured,incident_url,source_url,incident_url_fields_missing,...,participant_age,participant_age_group,participant_gender,participant_name,participant_relationship,participant_status,participant_type,sources,state_house_district,state_senate_district
45186,217138,2014-11-06,Mississippi,Jackson,Clinton Boulevard,0,0,http://www.gunviolencearchive.org/incident/217138,http://www.wapt.com/news/1-sought-in-family-do...,False,...,,1::Adult 18+,1::Male,,,1::Unharmed,0::Victim||1::Subject-Suspect,http://www.wapt.com/news/1-sought-in-family-do...,67.0,28.0
43958,223685,2014-10-28,Massachusetts,North Adams,375 Church Street,0,0,http://www.gunviolencearchive.org/incident/223685,http://www.masslive.com/news/index.ssf/2015/01...,False,...,0::25,0::Adult 18+,0::Male,0::Christopher LaFrance,,"0::Unharmed, Arrested",0::Subject-Suspect,http://www.berkshireeagle.com/local/ci_2698805...,,
76540,363300,2015-06-24,Illinois,East Saint Louis,I-255 and State St,0,0,http://www.gunviolencearchive.org/incident/363300,http://www.bnd.com/news/local/article106217282...,False,...,0::52,0::Adult 18+,0::Male,0::Gregory K. Nelson,,"0::Unharmed, Arrested",0::Subject-Suspect,http://www.bnd.com/news/local/crime/article253...,114.0,57.0
2950,106299,2014-01-19,Minnesota,Saint Cloud,,0,1,http://www.gunviolencearchive.org/incident/106299,http://www.kvsc.org/news-detail/a-man-accident...,False,...,0::26,0::Adult 18+,0::Male,0::Mark Garrett Blankenship,,0::Injured,0::Victim,http://www.kvsc.org/news-detail/a-man-accident...,,14.0
143000,640460,2016-08-26,Virginia,Newport News,700 Pilot House Dr,0,1,http://www.gunviolencearchive.org/incident/640460,http://pilotonline.com/news/local/crime/newpor...,False,...,0::21,0::Adult 18+,0::Male,,,0::Injured,0::Victim,http://pilotonline.com/news/local/crime/newpor...,94.0,2.0


Conformément à la documentation de la base, certaines colonnes sont codées de façon à pouvoir être reconverties en dictionnaires :

In [4]:
def convert_to_dict(value):
    """Decode a ``'0::a||1::b'`` encoded cell into ``{0: 'a', 1: 'b'}``.

    Entries are separated by ``||``. Inside an entry, the integer key and its
    value are separated by ``::``; entries that only contain a single ``:``
    (corrupted data) are split on that instead.

    Args:
        value: The encoded string, or a missing value (NaN / None).

    Returns:
        ``value`` unchanged when it is missing, otherwise a dict mapping
        integer keys to string values. Empty entries (empty string, trailing
        ``||``) are ignored.

    Raises:
        ValueError: If a non-empty entry has no separator at all, or if its
            key is not an integer.
    """
    if pd.isna(value):
        return value

    result_dict = {}
    for pair in value.split('||'):
        if not pair:
            # Nothing to decode in an empty fragment
            continue
        # Some entries are corrupted: "1:" instead of "1::"
        separator = '::' if '::' in pair else ':'
        key, val = pair.split(separator, 1)
        result_dict[int(key)] = val
    return result_dict

# Columns whose cells use the "key::value||key::value" encoding
list_of_dict_columns = ['gun_stolen', 'gun_type', 'participant_age', 'participant_age_group', 'participant_gender', 'participant_name', 'participant_relationship', 'participant_status', 'participant_type']
# Decode each column element-wise with Series.map (missing values are passed to
# convert_to_dict too). This replaces DataFrame.applymap, which recent pandas
# versions deprecate, and works on older versions as well.
gun_violence_db[list_of_dict_columns] = gun_violence_db[list_of_dict_columns].apply(
    lambda column: column.map(convert_to_dict)
)
gun_violence_db.head()



  gun_violence_db[list_of_dict_columns] = gun_violence_db[list_of_dict_columns].applymap(convert_to_dict)


Unnamed: 0,incident_id,date,state,city_or_county,address,n_killed,n_injured,incident_url,source_url,incident_url_fields_missing,...,participant_age,participant_age_group,participant_gender,participant_name,participant_relationship,participant_status,participant_type,sources,state_house_district,state_senate_district
0,461105,2013-01-01,Pennsylvania,Mckeesport,1506 Versailles Avenue and Coursin Street,0,4,http://www.gunviolencearchive.org/incident/461105,http://www.post-gazette.com/local/south/2013/0...,False,...,{0: '20'},"{0: 'Adult 18+', 1: 'Adult 18+', 2: 'Adult 18+...","{0: 'Male', 1: 'Male', 3: 'Male', 4: 'Female'}",{0: 'Julian Sims'},,"{0: 'Arrested', 1: 'Injured', 2: 'Injured', 3:...","{0: 'Victim', 1: 'Victim', 2: 'Victim', 3: 'Vi...",http://pittsburgh.cbslocal.com/2013/01/01/4-pe...,,
1,460726,2013-01-01,California,Hawthorne,13500 block of Cerise Avenue,1,3,http://www.gunviolencearchive.org/incident/460726,http://www.dailybulletin.com/article/zz/201301...,False,...,{0: '20'},"{0: 'Adult 18+', 1: 'Adult 18+', 2: 'Adult 18+...",{0: 'Male'},{0: 'Bernard Gillis'},,"{0: 'Killed', 1: 'Injured', 2: 'Injured', 3: '...","{0: 'Victim', 1: 'Victim', 2: 'Victim', 3: 'Vi...",http://losangeles.cbslocal.com/2013/01/01/man-...,62.0,35.0
2,478855,2013-01-01,Ohio,Lorain,1776 East 28th Street,1,3,http://www.gunviolencearchive.org/incident/478855,http://chronicle.northcoastnow.com/2013/02/14/...,False,...,"{0: '25', 1: '31', 2: '33', 3: '34', 4: '33'}","{0: 'Adult 18+', 1: 'Adult 18+', 2: 'Adult 18+...","{0: 'Male', 1: 'Male', 2: 'Male', 3: 'Male', 4...","{0: 'Damien Bell', 1: 'Desmen Noble', 2: 'Herm...",,"{0: 'Injured, Unharmed, Arrested', 1: 'Unharme...","{0: 'Subject-Suspect', 1: 'Subject-Suspect', 2...",http://www.morningjournal.com/general-news/201...,56.0,13.0
3,478925,2013-01-05,Colorado,Aurora,16000 block of East Ithaca Place,4,0,http://www.gunviolencearchive.org/incident/478925,http://www.dailydemocrat.com/20130106/aurora-s...,False,...,"{0: '29', 1: '33', 2: '56', 3: '33'}","{0: 'Adult 18+', 1: 'Adult 18+', 2: 'Adult 18+...","{0: 'Female', 1: 'Male', 2: 'Male', 3: 'Male'}","{0: 'Stacie Philbrook', 1: 'Christopher Ratlif...",,"{0: 'Killed', 1: 'Killed', 2: 'Killed', 3: 'Ki...","{0: 'Victim', 1: 'Victim', 2: 'Victim', 3: 'Su...",http://denver.cbslocal.com/2013/01/06/officer-...,40.0,28.0
4,478959,2013-01-07,North Carolina,Greensboro,307 Mourning Dove Terrace,2,2,http://www.gunviolencearchive.org/incident/478959,http://www.journalnow.com/news/local/article_d...,False,...,"{0: '18', 1: '46', 2: '14', 3: '47'}","{0: 'Adult 18+', 1: 'Adult 18+', 2: 'Teen 12-1...","{0: 'Female', 1: 'Male', 2: 'Male', 3: 'Female'}","{0: 'Danielle Imani Jameison', 1: 'Maurice Eug...",{3: 'Family'},"{0: 'Injured', 1: 'Injured', 2: 'Killed', 3: '...","{0: 'Victim', 1: 'Victim', 2: 'Victim', 3: 'Su...",http://myfox8.com/2013/01/08/update-mother-sho...,62.0,27.0


In [5]:
# Export the processed incidents table (without the index) for the other programs.
# NOTE(review): the dict-valued columns are presumably written as their string
# representation in the CSV — confirm how downstream readers parse them back.
gun_violence_db.to_csv("data/gun_violence_db.csv", index=False)

## Traitements des bases comtés de l'API


La documentation précisant le mode d'interaction avec l'API de StLouisFed se trouve à la page https://fred.stlouisfed.org/docs/api/fred/#API.

In [245]:
#Each request is categorized with an url and an id
#The gist here is to recover the proper id to retrieve data
import os

# The key can be supplied through the FRED_API_KEY environment variable; the
# literal below is only the fallback used when that variable is not set.
# NOTE(review): a key committed to source should be treated as public — consider rotating it.
api_key = os.environ.get("FRED_API_KEY", "180de2e6a1d1e953d270ebf38341cd44")
# Default query parameters shared by every request; request_db reads this dict
param = {"api_key": api_key, "file_type": "json", "category_id": "27281"}
# Endpoint read by request_db (rebound later in the notebook for the shapes query)
url = "https://api.stlouisfed.org/fred/category/children?"

In [246]:
def request_db(index, timeout=60):
    """Query the endpoint stored in the global ``url`` for category ``index``.

    The request uses the global ``param`` values with ``category_id`` replaced
    by ``index``. The shared ``param`` dict itself is left untouched.

    Args:
        index: Category id sent as the ``category_id`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    # Build a per-call copy so the requested category does not leak into later calls
    query = {**param, "category_id": index}
    response = requests.get(url, params=query, timeout=timeout)
    # Fail here with a clear HTTP error rather than later on a missing JSON key
    response.raise_for_status()
    return response.json()

In [247]:
#Some names are ambiguous between dframes
def simplify_name(name):
    """Shorten a county-level name according to its administrative suffix.

    A trailing "County", "Parish" or "County/city" is removed (and the result
    stripped); the other known suffixes are replaced by an abbreviation.
    Names without a known suffix are returned unchanged.
    """
    # Suffixes that are dropped altogether
    for dropped in ("County", "Parish"):
        if name.endswith(dropped):
            return name.rsplit(dropped, 1)[0].strip()

    # Suffixes that are abbreviated; checked in this order
    abbreviations = (
        ("Census Area", "CA"),
        ("Borough/city", "Cty&Bor"),
        ("Municipality", "Muny"),
        ("Borough/municipality", "Muny"),
    )
    for suffix, short_form in abbreviations:
        if name.endswith(suffix):
            return name.replace(suffix, short_form)

    if name.endswith("County/city"):
        return name.rsplit("County/city", 1)[0].strip()

    return name

In [248]:
# Children of category 27281: one entry per state
us_data = request_db(27281)['categories']
# The dataframe is built from a list of dicts, one dict per row
database = []
for state in us_data:
    id_state = state['id']
    state_name = state['name']

    # The children of a state category give the id needed to list its counties
    state_info = request_db(id_state)["categories"]
    if not state_info:
        # At least one state-level category has no children (not identified yet)
        continue

    # The first child category is the one whose children are the counties
    id_list_of_state_counties = state_info[0]['id']
    list_of_state_counties = request_db(id_list_of_state_counties)["categories"]
    for county in list_of_state_counties:
        id_county = county['id']

        # Category names look like "<county name>, <state code>"
        parts = county['name'].split(', ')
        county_name, state_code = parts[0], parts[-1]

        # A category may group several counties as "A + B". Emit one row per
        # county, each with its OWN dict: appending one shared dict twice and
        # then mutating it would leave both rows with the same name.
        for single_name in county_name.split(' + '):
            database.append({
                'Nom': simplify_name(single_name),
                'Etat': state_name,
                'Code_Etat': state_code,
                'id_Etat': id_state,
                'id_county': id_county,
            })

counties_db = pd.DataFrame(database)

In [249]:
counties_db.sample(5)

Unnamed: 0,Nom,Etat,Code_Etat,id_Etat,id_county
2096,Lorain,Ohio,OH,27317,29456
3026,Lincoln,West Virginia,WV,27332,30398
312,Yuma,Colorado,CO,27287,27643
580,Gem,Idaho,ID,27294,27916
2521,Unicoi,Tennessee,TN,193,29888


In [250]:
#Exception handling for the merge (done case by case because the merge key is Code_Etat + Nom)

# Each rule is ({column: value the row must have}, column to overwrite, new value).
# Rules are applied in order: some later rules undo an earlier blanket rename
# for a single state (e.g. 'DeSoto' goes back to 'De Soto' for LA).
merge_fixes = [
    ({'Code_Etat': 'Aleutian Islands Census Area'}, 'Code_Etat', 'AK'),
    ({'Code_Etat': 'District of Columbia'}, 'Code_Etat', 'DC'),
    ({'Nom': 'De Soto'}, 'Nom', 'DeSoto'),
    ({'Nom': 'DeSoto', 'Code_Etat': 'LA'}, 'Nom', 'De Soto'),
    ({'Nom': 'De Kalb'}, 'Nom', 'DeKalb'),
    ({'Nom': 'Du Page'}, 'Nom', 'DuPage'),
    ({'Nom': 'La Salle'}, 'Nom', 'LaSalle'),
    ({'Nom': 'La Porte'}, 'Nom', 'LaPorte'),
    ({'Nom': 'Lagrange'}, 'Nom', 'LaGrange'),
    ({'Nom': 'LaFourche'}, 'Nom', 'Lafourche'),
    ({'Nom': 'Lac Qui Parle'}, 'Nom', 'Lac qui Parle'),
    ({'Nom': 'Dona Ana'}, 'Nom', 'Doña Ana'),
    ({'Nom': 'La Moure'}, 'Nom', 'LaMoure'),
    ({'Nom': 'De Witt'}, 'Nom', 'DeWitt'),
    ({'Nom': 'DeWitt', 'Code_Etat': 'IL'}, 'Nom', 'De Witt'),
    ({'Nom': 'LaSalle', 'Code_Etat': 'TX'}, 'Nom', 'La Salle'),
    ({'Code_Etat': 'WI (includes Menominee)'}, 'Code_Etat', 'WI'),
    ({'Nom': 'Fond Du Lac'}, 'Nom', 'Fond du Lac'),
]

for conditions, target_column, replacement in merge_fixes:
    # A row is selected when it satisfies every condition of the rule
    row_mask = np.ones(len(counties_db), dtype=bool)
    for column, expected in conditions.items():
        row_mask &= (counties_db[column] == expected).to_numpy()
    counties_db.loc[row_mask, target_column] = replacement

On a maintenant un premier dframe recensant tous les comtés des USA ainsi que les ID permettant de les retrouver dans l'API. On peut désormais extraire pour chaque comté les informations socio-démographiques nous permettant de produire nos statistiques descriptives et notre modèle.

Note : id_county fait office de clé primaire dans cette base (au sein de l'API FRED)

In [15]:
# Export the county table (without the index) so the other programs can load it.
counties_db.to_csv("data/counties_db.csv", index=False)

## Récupération des données géographiques par comté

In [251]:
# Switch the URL to retrieve geographic data for every county (along with the FIPS code and the simplified name).
# The challenge is to attach these data to the right counties: the API id codes have no counterpart in the geographic data, only the name does.
# This rebinds the global `url` that request_db reads.
url = "https://api.stlouisfed.org/geofred/shapes/file?shape=county"

In [252]:
def _state_code_from_hc_key(hc_key):
    """Return the upper-cased second '-'-separated field of ``hc_key``, or None if there is none."""
    fields = hc_key.split('-')
    return fields[1].upper() if len(fields) > 1 else None


# The category id is arbitrary here: each request returns the geometries of all counties
geom_counties_db = request_db(29802)
geom_counties_db = gpd.GeoDataFrame.from_features(geom_counties_db['features'])
# Keep only the 'admin2' features. The explicit copy makes the column
# assignments below act on an independent frame, not on a slice of the full one.
geom_counties_db = geom_counties_db.loc[geom_counties_db['hc-group'] == 'admin2'].copy()
# State code used as part of the merge key (e.g. 'us-il-181' gives 'IL')
geom_counties_db['Code_Etat'] = geom_counties_db['hc-key'].apply(_state_code_from_hc_key)
# Keep only the text before the last "Parish" so these names can match the simplified county names
geom_counties_db['name'] = geom_counties_db['name'].apply(lambda x: x.rsplit("Parish", 1)[0].strip())
geom_counties_db.sample(5)


Unnamed: 0,geometry,hc-group,hc-middle-x,hc-middle-y,hc-key,hc-a2,fips,name,Code_Etat
2442,"POLYGON ((5947.000 6167.000, 5924.000 6165.000...",admin2,0.5,0.5,us-il-181,UN,17181,Union,IL
1918,"POLYGON ((4447.000 8352.000, 4443.000 8321.000...",admin2,0.04,0.5,us-sd-109,RO,46109,Roberts,SD
1229,"MULTIPOLYGON (((8859.000 7570.000, 8850.000 76...",admin2,0.64,0.52,us-ny-119,WE,36119,Westchester,NY
1408,"POLYGON ((5759.000 5728.000, 5756.000 5774.000...",admin2,0.07,0.53,us-ar-093,MI,5093,Mississippi,AR
2259,"POLYGON ((6586.000 5967.000, 6603.000 5968.000...",admin2,0.48,0.57,us-tn-189,WI,47189,Wilson,TN


In [253]:
# Left join: every county of counties_db is kept, and a shape is attached
# when both the simplified name and the state code match.
merged_counties = counties_db.merge(
    geom_counties_db,
    how='left',
    left_on=['Nom', 'Code_Etat'],
    right_on=['name', 'Code_Etat'],
)
# Keep the original county columns plus the FIPS code and the geometry
kept_columns = ['Nom', 'Etat', 'Code_Etat', 'id_Etat', 'id_county', 'fips', 'geometry']
counties_db = merged_counties[kept_columns]

In [255]:
# Counties for which the merge found no shape. isna() is the explicit
# missing-value test and does not depend on how `== None` is evaluated element-wise.
counties_db.loc[counties_db['geometry'].isna()]

Unnamed: 0,Nom,Etat,Code_Etat,id_Etat,id_county,fips,geometry
68,Aleutian Islands CA,Alaska,AK,27283,33743,,
90,Prince of Wales-Outer Ketchikan CA,Alaska,AK,27283,27421,,
92,Skagway-Hoonah-Angoon CA,Alaska,AK,27283,27423,,
96,Wade Hampton CA,Alaska,AK,27283,27426,,
97,Wrangell Borough/City,Alaska,AK,27283,33518,,
98,Wrangell-Petersburg CA,Alaska,AK,27283,27427,,
99,Yakutat City and Borough,Alaska,Yakutat City and Borough,27283,32212,,
2425,Shannon,South Dakota,SD,27324,29791,,
2856,Clifton Forge City,Virginia,VA,27330,30228,,
2945,South Boston City,Virginia,VA,27330,32143,,


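On a donc intégré les données géographiques pour presque tous les comtés : les quelques exceptions listées ci-dessus (surtout en Alaska, ainsi que Shannon dans le Dakota du Sud et deux villes de Virginie) n'ont pas de correspondance entre les deux bases.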