# Traitement des données

Dans cette partie, l'objectif est d'importer nos données, les convertir à un format convenable pour la production de statistiques descriptives, et exporter le résultat pour que ces bases puissent être utilisées par les autres programmes

Les bases en question sont :
- la base listant tous les incidents de violence par armes à feu aux USA entre 2013 et 2018
- la base listant les caractéristiques générales des comtés et de ses habitants.

In [2]:
#Pour le traitement classique
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely.geometry import MultiPolygon

#Pour l'interaction avec l'API 
import requests
from statistics import mean
import time

## Traitement de la base d'incidents armes à feu

In [3]:
#Base incidents armes à feu
url="https://drive.google.com/file/d/1GGOLMc_Ow9yZC9sICegPegDggQuHOD3t/view?usp=drive_link"
url="https://drive.google.com/uc?export=download&confirm=1&id=" + url.split("/")[-2]
gun_violence_db = pd.read_csv(url)
gun_violence_db.sample(5)

Unnamed: 0,incident_id,date,state,city_or_county,address,n_killed,n_injured,incident_url,source_url,incident_url_fields_missing,...,participant_age,participant_age_group,participant_gender,participant_name,participant_relationship,participant_status,participant_type,sources,state_house_district,state_senate_district
45186,217138,2014-11-06,Mississippi,Jackson,Clinton Boulevard,0,0,http://www.gunviolencearchive.org/incident/217138,http://www.wapt.com/news/1-sought-in-family-do...,False,...,,1::Adult 18+,1::Male,,,1::Unharmed,0::Victim||1::Subject-Suspect,http://www.wapt.com/news/1-sought-in-family-do...,67.0,28.0
43958,223685,2014-10-28,Massachusetts,North Adams,375 Church Street,0,0,http://www.gunviolencearchive.org/incident/223685,http://www.masslive.com/news/index.ssf/2015/01...,False,...,0::25,0::Adult 18+,0::Male,0::Christopher LaFrance,,"0::Unharmed, Arrested",0::Subject-Suspect,http://www.berkshireeagle.com/local/ci_2698805...,,
76540,363300,2015-06-24,Illinois,East Saint Louis,I-255 and State St,0,0,http://www.gunviolencearchive.org/incident/363300,http://www.bnd.com/news/local/article106217282...,False,...,0::52,0::Adult 18+,0::Male,0::Gregory K. Nelson,,"0::Unharmed, Arrested",0::Subject-Suspect,http://www.bnd.com/news/local/crime/article253...,114.0,57.0
2950,106299,2014-01-19,Minnesota,Saint Cloud,,0,1,http://www.gunviolencearchive.org/incident/106299,http://www.kvsc.org/news-detail/a-man-accident...,False,...,0::26,0::Adult 18+,0::Male,0::Mark Garrett Blankenship,,0::Injured,0::Victim,http://www.kvsc.org/news-detail/a-man-accident...,,14.0
143000,640460,2016-08-26,Virginia,Newport News,700 Pilot House Dr,0,1,http://www.gunviolencearchive.org/incident/640460,http://pilotonline.com/news/local/crime/newpor...,False,...,0::21,0::Adult 18+,0::Male,,,0::Injured,0::Victim,http://pilotonline.com/news/local/crime/newpor...,94.0,2.0


Conformément à la documentation de la base, certaines colonnes sont codées de façon à pouvoir les reconvertir en dictionnaire :

In [4]:
def convert_to_dict(value):
    if pd.isna(value):
        return value

    pairs = value.split('||')
    result_dict = {}
    for pair in pairs:
        #Some are corrupted : 1: instead of ::
        if '::' in pair:
            key, val = pair.split('::', 1)
            result_dict[int(key)] = val
        else:
            key, val = pair.split(':', 1)
            result_dict[int(key)] = val
    return result_dict

list_of_dict_columns = ['gun_stolen', 'gun_type', 'participant_age', 'participant_age_group', 'participant_gender', 'participant_name', 'participant_relationship', 'participant_status', 'participant_type']
gun_violence_db[list_of_dict_columns] = gun_violence_db[list_of_dict_columns].applymap(convert_to_dict)
gun_violence_db.head()



  gun_violence_db[list_of_dict_columns] = gun_violence_db[list_of_dict_columns].applymap(convert_to_dict)


Unnamed: 0,incident_id,date,state,city_or_county,address,n_killed,n_injured,incident_url,source_url,incident_url_fields_missing,...,participant_age,participant_age_group,participant_gender,participant_name,participant_relationship,participant_status,participant_type,sources,state_house_district,state_senate_district
0,461105,2013-01-01,Pennsylvania,Mckeesport,1506 Versailles Avenue and Coursin Street,0,4,http://www.gunviolencearchive.org/incident/461105,http://www.post-gazette.com/local/south/2013/0...,False,...,{0: '20'},"{0: 'Adult 18+', 1: 'Adult 18+', 2: 'Adult 18+...","{0: 'Male', 1: 'Male', 3: 'Male', 4: 'Female'}",{0: 'Julian Sims'},,"{0: 'Arrested', 1: 'Injured', 2: 'Injured', 3:...","{0: 'Victim', 1: 'Victim', 2: 'Victim', 3: 'Vi...",http://pittsburgh.cbslocal.com/2013/01/01/4-pe...,,
1,460726,2013-01-01,California,Hawthorne,13500 block of Cerise Avenue,1,3,http://www.gunviolencearchive.org/incident/460726,http://www.dailybulletin.com/article/zz/201301...,False,...,{0: '20'},"{0: 'Adult 18+', 1: 'Adult 18+', 2: 'Adult 18+...",{0: 'Male'},{0: 'Bernard Gillis'},,"{0: 'Killed', 1: 'Injured', 2: 'Injured', 3: '...","{0: 'Victim', 1: 'Victim', 2: 'Victim', 3: 'Vi...",http://losangeles.cbslocal.com/2013/01/01/man-...,62.0,35.0
2,478855,2013-01-01,Ohio,Lorain,1776 East 28th Street,1,3,http://www.gunviolencearchive.org/incident/478855,http://chronicle.northcoastnow.com/2013/02/14/...,False,...,"{0: '25', 1: '31', 2: '33', 3: '34', 4: '33'}","{0: 'Adult 18+', 1: 'Adult 18+', 2: 'Adult 18+...","{0: 'Male', 1: 'Male', 2: 'Male', 3: 'Male', 4...","{0: 'Damien Bell', 1: 'Desmen Noble', 2: 'Herm...",,"{0: 'Injured, Unharmed, Arrested', 1: 'Unharme...","{0: 'Subject-Suspect', 1: 'Subject-Suspect', 2...",http://www.morningjournal.com/general-news/201...,56.0,13.0
3,478925,2013-01-05,Colorado,Aurora,16000 block of East Ithaca Place,4,0,http://www.gunviolencearchive.org/incident/478925,http://www.dailydemocrat.com/20130106/aurora-s...,False,...,"{0: '29', 1: '33', 2: '56', 3: '33'}","{0: 'Adult 18+', 1: 'Adult 18+', 2: 'Adult 18+...","{0: 'Female', 1: 'Male', 2: 'Male', 3: 'Male'}","{0: 'Stacie Philbrook', 1: 'Christopher Ratlif...",,"{0: 'Killed', 1: 'Killed', 2: 'Killed', 3: 'Ki...","{0: 'Victim', 1: 'Victim', 2: 'Victim', 3: 'Su...",http://denver.cbslocal.com/2013/01/06/officer-...,40.0,28.0
4,478959,2013-01-07,North Carolina,Greensboro,307 Mourning Dove Terrace,2,2,http://www.gunviolencearchive.org/incident/478959,http://www.journalnow.com/news/local/article_d...,False,...,"{0: '18', 1: '46', 2: '14', 3: '47'}","{0: 'Adult 18+', 1: 'Adult 18+', 2: 'Teen 12-1...","{0: 'Female', 1: 'Male', 2: 'Male', 3: 'Female'}","{0: 'Danielle Imani Jameison', 1: 'Maurice Eug...",{3: 'Family'},"{0: 'Injured', 1: 'Injured', 2: 'Killed', 3: '...","{0: 'Victim', 1: 'Victim', 2: 'Victim', 3: 'Su...",http://myfox8.com/2013/01/08/update-mother-sho...,62.0,27.0


In [5]:
gun_violence_db.to_csv("data/gun_violence_db.csv", index=False)

## Traitements des bases comtés de l'API


La documentation précisant le mode d'interaction avec l'API de StLouisFed se trouve à la page https://fred.stlouisfed.org/docs/api/fred/#API.

In [98]:
#Each request is categorized with an url and an id
#The gist here is to recover the proper id to retrieve data
api_key = "180de2e6a1d1e953d270ebf38341cd44"
param = {"api_key" : api_key, "file_type" : "json", "category_id" : "27281"}
url = "https://api.stlouisfed.org/fred/category/children?"

In [7]:
def request_db(index):
    #this function requests to the API the database associated with the category id index
    param["category_id"] = index #on ajuste les paramètres de la request pour demander la bonne catégorie
    response = requests.get(url, params = param)
    data = response.json()
    return data

In [107]:
def simplify_name(name):
    if name.endswith("County"):
        return name .rsplit("County", 1)[0].strip()
    if name.endswith("Parish"):
        return name .rsplit("Parish", 1)[0].strip()
    if name.endswith("Census Area"):
        return name.replace("Census Area", "CA")
    if name.endswith("Borough/city"):
        return name.replace("Borough/city", "Cty&Bor")      
    if name.endswith("Municipality"):
        return name.replace("Municipality", "Muny")
    if name.endswith("Borough/municipality"):
        return name.replace("Borough/municipality", "Muny")    
    return name

In [108]:
us_data = request_db(27281)['categories']
#We create our dframe by creating a list of dicts, each element is a new row
database = list()
for state in us_data:
    id_state = state['id']
    state_name = state['name']
    
    #Request to recover id in order to extract counties
    state_info = request_db(id_state)["categories"]
    if state_info != []: #One exception : which one ?
        id_list_of_state_counties = state_info[0]['id']
        list_of_state_counties = request_db(id_list_of_state_counties)["categories"]
        for county in list_of_state_counties:
            dict_county = dict()
            id_county = county['id']
            
            if county['name'].count(',') == 1:
                county_name,  state_code = county['name'].split(',')
                state_code = state_code.lstrip()
            else:
                county_name = county['name']
                state_code = np.nan

            #Update the dict to include basic values on each county
            dict_county['Nom'] = simplify_name(county_name)
            dict_county['Etat'] = state_name
            dict_county['Code_Etat'] = state_code
            dict_county['id_Etat'] = id_state
            dict_county['id_county'] = id_county
            database.append(dict_county)

counties_db = pd.DataFrame(database)

In [109]:
counties_db.loc[counties_db['Code_Etat'] == 'AK']

Unnamed: 0,Nom,Etat,Code_Etat,id_Etat,id_county
67,Aleutians East Borough,Alaska,AK,27283,27404
69,Aleutians West CA,Alaska,AK,27283,27405
70,Anchorage Muny,Alaska,AK,27283,27406
71,Bethel CA,Alaska,AK,27283,27407
72,Bristol Bay Borough,Alaska,AK,27283,27408
73,Denali Borough,Alaska,AK,27283,32079
74,Dillingham CA,Alaska,AK,27283,27409
75,Fairbanks North Star Borough,Alaska,AK,27283,27410
76,Haines Borough,Alaska,AK,27283,27411
77,Hoonah-Angoon CA,Alaska,AK,27283,33517


On a maintenant un premier dframe recensant tous les comtés des USA ainsi que les ID permettant de les retrouver dans l'API. On peut désormais extraire pour chaque comté les informations socio-démographiques nous permettant de produire nos statistiques descriptives et notre modèle.

Note : id_county fait office de clé primaire dans cette base

In [15]:
counties_db.to_csv("data/counties_db.csv", index=False)

In [87]:
#On change l'URL pour pouvoir récupérer des données géographiques sur tous les comtés (ainsi que le fips, le nom simplifié)
#L'enjeu ici est d'associer les données aux comtés correspondants : les codes id de l'API n'ont pas de correspondance avec les donneés géographiques à part le nom
url = "https://api.stlouisfed.org/geofred/shapes/file?shape=county"

In [126]:
for id_county in counties_db['id_county']:
    geom_data_county = request_db(id_county)['features'][0]

    simple_name = geom_data_county['properties']['name']
    fips = geom_data_county['properties']['fips']


    

KeyError: 'categories'

In [94]:
test_db = request_db(29802)
test_db = gpd.GeoDataFrame.from_features(test_db['features'])
test_db = test_db.loc[test_db['hc-group'] == 'admin2']
test_db

Unnamed: 0,geometry,hc-group,hc-middle-x,hc-middle-y,hc-key,hc-a2,fips,name
0,"MULTIPOLYGON (((-422.000 5749.000, -422.000 57...",admin2,0.21,0.27,us-ca-083,SB,06083,Santa Barbara
1,"MULTIPOLYGON (((-538.000 5722.000, -524.000 57...",admin2,0.48,0.26,us-ca-111,VE,06111,Ventura
2,"MULTIPOLYGON (((-359.000 5610.000, -321.000 55...",admin2,0.52,0.20,us-ca-037,LA,06037,Los Angeles
3,"MULTIPOLYGON (((9288.000 7745.000, 9279.000 77...",admin2,0.47,0.40,us-ri-009,WA,44009,Washington
4,"MULTIPOLYGON (((8015.000 3042.000, 8011.000 30...",admin2,0.54,0.43,us-fl-087,MO,12087,Monroe
...,...,...,...,...,...,...,...,...
3140,"POLYGON ((43.000 3931.000, 75.000 3920.000, 11...",admin2,0.38,0.56,us-ak-188,,02188,Northwest Arctic Borough
3141,"POLYGON ((-187.000 3393.000, -188.000 3405.000...",admin2,0.49,0.52,us-ak-270,WH,02270,Kusilvak CA
3142,"POLYGON ((962.000 3974.000, 903.000 3972.000, ...",admin2,0.48,0.51,us-ak-185,NS,02185,North Slope Borough
3143,"POLYGON ((990.000 3158.000, 990.000 3150.000, ...",admin2,0.37,0.23,us-ak-282,YA,02282,Yakutat Cty&Bor


In [95]:
test_db.loc[test_db['hc-key'].str.startswith('us-ak')]

Unnamed: 0,geometry,hc-group,hc-middle-x,hc-middle-y,hc-key,hc-a2,fips,name
3116,"POLYGON ((1340.000 3043.000, 1346.000 3033.000...",admin2,0.51,0.46,us-ak-110,JU,2110,Juneau Cty&Bor
3117,"MULTIPOLYGON (((710.000 3060.000, 717.000 3078...",admin2,0.58,0.47,us-ak-261,VC,2261,Valdez-Cordova CA
3118,"MULTIPOLYGON (((0.000 3093.000, -19.000 3088.0...",admin2,0.6,0.45,us-ak-070,DI,2070,Dillingham CA
3119,"MULTIPOLYGON (((-277.000 2715.000, -264.000 27...",admin2,0.57,0.59,us-ak-013,AE,2013,Aleutians East Borough
3120,"MULTIPOLYGON (((-165.000 3831.000, -157.000 38...",admin2,0.69,0.42,us-ak-180,NO,2180,Nome CA
3121,"MULTIPOLYGON (((-563.000 3130.000, -589.000 31...",admin2,0.72,0.92,us-ak-016,AW,2016,Aleutians West CA
3122,"MULTIPOLYGON (((398.000 3041.000, 402.000 3049...",admin2,0.69,0.46,us-ak-150,KI,2150,Kodiak Island Borough
3123,"POLYGON ((427.000 3418.000, 422.000 3384.000, ...",admin2,0.53,0.48,us-ak-290,YK,2290,Yukon-Koyukuk CA
3124,"POLYGON ((427.000 3418.000, 428.000 3427.000, ...",admin2,0.6,0.46,us-ak-170,MS,2170,Matanuska-Susitna Borough
3125,"POLYGON ((668.000 3563.000, 669.000 3563.000, ...",admin2,0.54,0.43,us-ak-068,DE,2068,Denali Borough
