# Traitement des données

Dans cette partie, l'objectif est d'importer nos données, les convertir à un format convenable pour la production de statistiques descriptives, et exporter le résultat pour que ces bases puissent être utilisées par les autres programmes

Les bases en question sont :
- la base listant tous les incidents de violence par armes à feu aux USA entre 2013 et 2018
- la base listant les caractéristiques générales des comtés et de ses habitants.

In [2]:
#Pour le traitement classique
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely.geometry import MultiPolygon
from tqdm import tqdm

#Pour l'interaction avec l'API 
import requests
from statistics import mean
import time

## Traitement de la base d'incidents armes à feu

In [2]:
#Base incidents armes à feu
url="https://drive.google.com/file/d/1GGOLMc_Ow9yZC9sICegPegDggQuHOD3t/view?usp=drive_link"
url="https://drive.google.com/uc?export=download&confirm=1&id=" + url.split("/")[-2]
gun_violence_db = pd.read_csv(url)
gun_violence_db.sample(5)

Unnamed: 0,incident_id,date,state,city_or_county,address,n_killed,n_injured,incident_url,source_url,incident_url_fields_missing,...,participant_age,participant_age_group,participant_gender,participant_name,participant_relationship,participant_status,participant_type,sources,state_house_district,state_senate_district
105952,477684,2016-01-02,California,Richmond,2200 block of Cutting Boulevard,0,1,http://www.gunviolencearchive.org/incident/477684,http://www.insidebayarea.com/crime-courts/ci_2...,False,...,0::52,0::Adult 18+||1::Adult 18+,0::Female||1::Male,0::Claire Dugan,,0::Injured||1::Unharmed,0::Victim||1::Subject-Suspect,http://www.insidebayarea.com/breaking-news/ci_...,15.0,9.0
127111,563704,2016-05-21,Michigan,Detroit,13900 block of Wyoming,0,3,http://www.gunviolencearchive.org/incident/563704,http://www.detroitnews.com/story/news/local/de...,False,...,0::20||1::21||2::21,0::Adult 18+||1::Adult 18+||2::Adult 18+,0::Male||1::Male||2::Male,,,0::Injured||1::Injured||2::Injured,0::Victim||1::Victim||2::Victim,http://www.detroitnews.com/story/news/local/de...,7.0,3.0
40017,216857,2014-10-03,California,Oakland,1100 10th Avenue,0,0,http://www.gunviolencearchive.org/incident/216857,https://data.oaklandnet.com/Public-Safety/Crim...,False,...,,,0::Male||1::Male,,,,0::Victim||1::Subject-Suspect,https://data.oaklandnet.com/Public-Safety/Crim...,18.0,9.0
157829,737796,2016-11-22,Alaska,Soldotna,,0,0,http://www.gunviolencearchive.org/incident/737796,http://www.ktuu.com/content/news/Soldotna-conv...,False,...,0::32,0::Adult 18+,0::Male,0::Scott Hashemian,,"0::Unharmed, Arrested||1::Unharmed",0::Subject-Suspect||1::Subject-Suspect,http://www.ktuu.com/content/news/Soldotna-conv...,29.0,
171978,774387,2017-02-14,Arkansas,Jonesboro,109 Harvester Dr,0,0,http://www.gunviolencearchive.org/incident/774387,http://www.kait8.com/story/34500618/deputys-ca...,False,...,,,,,,,,http://www.kait8.com/story/34500618/deputys-ca...,58.0,21.0


Conformément à la documentation de la base, certaines colonnes sont codées de façon à pouvoir les reconvertir en dictionnaire :

In [3]:
def convert_to_dict(value):
    if pd.isna(value):
        return value

    pairs = value.split('||')
    result_dict = {}
    for pair in pairs:
        #Some are corrupted : 1: instead of ::
        if '::' in pair:
            key, val = pair.split('::', 1)
            result_dict[int(key)] = val
        else:
            key, val = pair.split(':', 1)
            result_dict[int(key)] = val
    return result_dict

list_of_dict_columns = ['gun_stolen', 'gun_type', 'participant_age', 'participant_age_group', 'participant_gender', 'participant_name', 'participant_relationship', 'participant_status', 'participant_type']
gun_violence_db[list_of_dict_columns] = gun_violence_db[list_of_dict_columns].applymap(convert_to_dict)
gun_violence_db.head()



  gun_violence_db[list_of_dict_columns] = gun_violence_db[list_of_dict_columns].applymap(convert_to_dict)


Unnamed: 0,incident_id,date,state,city_or_county,address,n_killed,n_injured,incident_url,source_url,incident_url_fields_missing,...,participant_age,participant_age_group,participant_gender,participant_name,participant_relationship,participant_status,participant_type,sources,state_house_district,state_senate_district
0,461105,2013-01-01,Pennsylvania,Mckeesport,1506 Versailles Avenue and Coursin Street,0,4,http://www.gunviolencearchive.org/incident/461105,http://www.post-gazette.com/local/south/2013/0...,False,...,{0: '20'},"{0: 'Adult 18+', 1: 'Adult 18+', 2: 'Adult 18+...","{0: 'Male', 1: 'Male', 3: 'Male', 4: 'Female'}",{0: 'Julian Sims'},,"{0: 'Arrested', 1: 'Injured', 2: 'Injured', 3:...","{0: 'Victim', 1: 'Victim', 2: 'Victim', 3: 'Vi...",http://pittsburgh.cbslocal.com/2013/01/01/4-pe...,,
1,460726,2013-01-01,California,Hawthorne,13500 block of Cerise Avenue,1,3,http://www.gunviolencearchive.org/incident/460726,http://www.dailybulletin.com/article/zz/201301...,False,...,{0: '20'},"{0: 'Adult 18+', 1: 'Adult 18+', 2: 'Adult 18+...",{0: 'Male'},{0: 'Bernard Gillis'},,"{0: 'Killed', 1: 'Injured', 2: 'Injured', 3: '...","{0: 'Victim', 1: 'Victim', 2: 'Victim', 3: 'Vi...",http://losangeles.cbslocal.com/2013/01/01/man-...,62.0,35.0
2,478855,2013-01-01,Ohio,Lorain,1776 East 28th Street,1,3,http://www.gunviolencearchive.org/incident/478855,http://chronicle.northcoastnow.com/2013/02/14/...,False,...,"{0: '25', 1: '31', 2: '33', 3: '34', 4: '33'}","{0: 'Adult 18+', 1: 'Adult 18+', 2: 'Adult 18+...","{0: 'Male', 1: 'Male', 2: 'Male', 3: 'Male', 4...","{0: 'Damien Bell', 1: 'Desmen Noble', 2: 'Herm...",,"{0: 'Injured, Unharmed, Arrested', 1: 'Unharme...","{0: 'Subject-Suspect', 1: 'Subject-Suspect', 2...",http://www.morningjournal.com/general-news/201...,56.0,13.0
3,478925,2013-01-05,Colorado,Aurora,16000 block of East Ithaca Place,4,0,http://www.gunviolencearchive.org/incident/478925,http://www.dailydemocrat.com/20130106/aurora-s...,False,...,"{0: '29', 1: '33', 2: '56', 3: '33'}","{0: 'Adult 18+', 1: 'Adult 18+', 2: 'Adult 18+...","{0: 'Female', 1: 'Male', 2: 'Male', 3: 'Male'}","{0: 'Stacie Philbrook', 1: 'Christopher Ratlif...",,"{0: 'Killed', 1: 'Killed', 2: 'Killed', 3: 'Ki...","{0: 'Victim', 1: 'Victim', 2: 'Victim', 3: 'Su...",http://denver.cbslocal.com/2013/01/06/officer-...,40.0,28.0
4,478959,2013-01-07,North Carolina,Greensboro,307 Mourning Dove Terrace,2,2,http://www.gunviolencearchive.org/incident/478959,http://www.journalnow.com/news/local/article_d...,False,...,"{0: '18', 1: '46', 2: '14', 3: '47'}","{0: 'Adult 18+', 1: 'Adult 18+', 2: 'Teen 12-1...","{0: 'Female', 1: 'Male', 2: 'Male', 3: 'Female'}","{0: 'Danielle Imani Jameison', 1: 'Maurice Eug...",{3: 'Family'},"{0: 'Injured', 1: 'Injured', 2: 'Killed', 3: '...","{0: 'Victim', 1: 'Victim', 2: 'Victim', 3: 'Su...",http://myfox8.com/2013/01/08/update-mother-sho...,62.0,27.0


In [4]:
gun_violence_db.to_csv("data/gun_violence_db.csv", index=False)

## Traitements des bases comtés de l'API


La documentation précisant le mode d'interaction avec l'API de StLouisFed se trouve à la page https://fred.stlouisfed.org/docs/api/fred/#API.

In [3]:
#Each request is categorized with an url and an id
#The gist here is to recover the proper id to retrieve data
api_key = "180de2e6a1d1e953d270ebf38341cd44"
param = {"api_key" : api_key, "file_type" : "json", "category_id" : "27281"}
url = "https://api.stlouisfed.org/fred/category/children?"

In [4]:
def request_db(index):
    #this function requests to the API the database associated with the category id index
    param["category_id"] = index #on ajuste les paramètres de la request pour demander la bonne catégorie
    response = requests.get(url, params = param)
    data = response.json()
    return data

In [5]:
#Some names are ambiguous between dframes
def simplify_name(name):
    if name.endswith("County"):
        return name.rsplit("County", 1)[0].strip()
    if name.endswith("Parish"):
        return name.rsplit("Parish", 1)[0].strip()
    if name.endswith("Census Area"):
        return name.replace("Census Area", "CA")
    if name.endswith("Borough/city"):
        return name.replace("Borough/city", "Cty&Bor")      
    if name.endswith("Municipality"):
        return name.replace("Municipality", "Muny")
    if name.endswith("Borough/municipality"):
        return name.replace("Borough/municipality", "Muny")    
    if name.endswith("County/city"):
        return name.rsplit("County/city", 1)[0].strip() 

        
    return name

In [6]:
us_data = request_db(27281)['categories']
#We create our dframe by creating a list of dicts, each element is a new row
database = list()
for state in us_data:
    id_state = state['id']
    state_name = state['name']
    
    #Request to recover id in order to extract counties
    state_info = request_db(id_state)["categories"]
    if state_info != []: #One exception : which one ?
        id_list_of_state_counties = state_info[0]['id']
        list_of_state_counties = request_db(id_list_of_state_counties)["categories"]
        for county in list_of_state_counties:
            dict_county = dict()
            id_county = county['id']
            
            parts = county['name'].split(', ')
            county_name, state_code = parts[0], parts[-1]
            
            if '+' in county_name:
                county_name, county_bis = county_name.split(' + ')
                dict_county['Nom'] = simplify_name(county_bis)
                dict_county['Etat'] = state_name
                dict_county['Code_Etat'] = state_code
                dict_county['id_Etat'] = id_state
                dict_county['id_county'] = id_county
                database.append(dict_county)

            #Update the dict to include basic values on each county
            dict_county['Nom'] = simplify_name(county_name)
            dict_county['Etat'] = state_name
            dict_county['Code_Etat'] = state_code
            dict_county['id_Etat'] = id_state
            dict_county['id_county'] = id_county
            database.append(dict_county)

counties_db = pd.DataFrame(database)

In [7]:
counties_db.sample(5)

Unnamed: 0,Nom,Etat,Code_Etat,id_Etat,id_county
1627,Judith Basin,Montana,MT,27308,28977
382,St. Johns,Florida,FL,27291,27717
1208,Charles,Maryland,MD,27302,28552
188,White,Arkansas,AR,149,654
1067,Lewis,Kentucky,KY,152,28408


In [8]:
#Gestion des exceptions pour le merging(obligé de faire du cas par cas car merging sur le code_Etat + nom)

counties_db.loc[(counties_db['Code_Etat'] == 'Aleutian Islands Census Area'), 'Code_Etat'] = 'AK'
counties_db.loc[(counties_db['Code_Etat'] == 'District of Columbia'), 'Code_Etat'] = 'DC'
counties_db.loc[(counties_db['Nom'] == 'De Soto'), 'Nom'] = 'DeSoto'
counties_db.loc[(counties_db['Nom'] == 'DeSoto') & (counties_db['Code_Etat'] == 'LA'), 'Nom'] = 'De Soto'
counties_db.loc[(counties_db['Nom'] == 'De Kalb'), 'Nom'] = 'DeKalb'
counties_db.loc[(counties_db['Nom'] == 'Du Page'), 'Nom'] = 'DuPage'
counties_db.loc[(counties_db['Nom'] == 'La Salle'), 'Nom'] = 'LaSalle'
counties_db.loc[(counties_db['Nom'] == 'La Porte'), 'Nom'] = 'LaPorte'
counties_db.loc[(counties_db['Nom'] == 'Lagrange'), 'Nom'] = 'LaGrange'
counties_db.loc[(counties_db['Nom'] == 'LaFourche'), 'Nom'] = 'Lafourche'
counties_db.loc[(counties_db['Nom'] == 'Lac Qui Parle'), 'Nom'] = 'Lac qui Parle'
counties_db.loc[(counties_db['Nom'] == 'Dona Ana'), 'Nom'] = 'Doña Ana'
counties_db.loc[(counties_db['Nom'] == 'La Moure'), 'Nom'] = 'LaMoure'
counties_db.loc[(counties_db['Nom'] == 'De Witt'), 'Nom'] = 'DeWitt'
counties_db.loc[(counties_db['Nom'] == 'DeWitt') & (counties_db['Code_Etat'] == 'IL'), 'Nom'] = 'De Witt'
counties_db.loc[(counties_db['Nom'] == 'LaSalle') & (counties_db['Code_Etat'] == 'TX'), 'Nom'] = 'La Salle'
counties_db.loc[(counties_db['Code_Etat'] == 'WI (includes Menominee)'), 'Code_Etat'] = 'WI'
counties_db.loc[(counties_db['Nom'] == 'Fond Du Lac'), 'Nom'] = 'Fond du Lac'

On a maintenant un premier dframe recensant tous les comtés des USA ainsi que les ID permettant de les retrouver dans l'API. On peut désormais extraire pour chaque comté les informations socio-démographiques nous permettant de produire nos statistiques descriptives et notre modèle.

Note : id_county fait office de clé primaire dans cette base (au sein de l'API FRED)

In [11]:
counties_db.to_csv("data/counties_db.csv", index=False)

## Récupération des données géographiques par comté

In [9]:
#On change l'URL pour pouvoir récupérer des données géographiques sur tous les comtés (ainsi que le fips, le nom simplifié)
#L'enjeu ici est d'associer les données aux comtés correspondants : les codes id de l'API n'ont pas de correspondance avec les donneés géographiques à part le nom
url = "https://api.stlouisfed.org/geofred/shapes/file?shape=county"

In [10]:
geom_counties_db = request_db(29802)#The code is arbitrary here, each request gives the geometries of all counties
geom_counties_db = gpd.GeoDataFrame.from_features(geom_counties_db['features'])
geom_counties_db = geom_counties_db.loc[geom_counties_db['hc-group'] == 'admin2']
geom_counties_db['Code_Etat'] = geom_counties_db['hc-key'].apply(lambda x: x.split('-')[1].upper() if len(x.split('-')) > 1 else None)
geom_counties_db['name'] = geom_counties_db['name'].apply(lambda x : x.rsplit("Parish", 1)[0].strip())
geom_counties_db.sample(5)


Unnamed: 0,geometry,hc-group,hc-middle-x,hc-middle-y,hc-key,hc-a2,fips,name,Code_Etat
2445,"POLYGON ((7694.000 4481.000, 7665.000 4427.000...",admin2,0.48,0.43,us-fl-125,UN,12125,Union,FL
2335,"POLYGON ((8694.000 7529.000, 8716.000 7537.000...",admin2,0.56,0.54,us-nj-027,MO,34027,Morris,NJ
1288,"POLYGON ((8507.000 6369.000, 8509.000 6367.000...",admin2,0.69,0.58,us-va-620,FR,51620,Franklin City,VA
396,"POLYGON ((8145.000 6171.000, 8116.000 6166.000...",admin2,0.49,0.5,us-nc-145,PE,37145,Person,NC
1511,"POLYGON ((7310.000 5153.000, 7356.000 5083.000...",admin2,0.37,0.5,us-ga-289,TW,13289,Twiggs,GA


In [11]:
counties_db = pd.merge(left= counties_db, right=geom_counties_db, how='left', left_on=['Nom', 'Code_Etat'], right_on=['name', 'Code_Etat'])
counties_db = counties_db.loc[:,['Nom', 'Etat', 'Code_Etat', 'id_Etat', 'id_county', 'fips', 'geometry']]
counties_db.sample(10)

Unnamed: 0,Nom,Etat,Code_Etat,id_Etat,id_county,fips,geometry
2018,Kidder,North Dakota,ND,27316,29377,38043,"POLYGON ((3927.000 8545.000, 3881.000 8547.000..."
1780,Sullivan,New Hampshire,NH,27311,29134,33019,"MULTIPOLYGON (((8985.000 8225.000, 8985.000 82..."
2433,Walworth,South Dakota,SD,27324,29799,46129,"POLYGON ((3869.000 8276.000, 3867.000 8211.000..."
2626,Hall,Texas,TX,27326,29994,48191,"POLYGON ((3576.000 5467.000, 3594.000 5466.000..."
1005,Bath,Kentucky,KY,152,28346,21011,"POLYGON ((7074.000 6508.000, 7097.000 6487.000..."
490,Meriwether,Georgia,GA,27292,27826,13199,"POLYGON ((7063.000 5139.000, 7065.000 5124.000..."
360,Leon,Florida,FL,27291,27696,12073,"POLYGON ((7193.000 4554.000, 7201.000 4555.000..."
1025,Clay,Kentucky,KY,152,28366,21051,"POLYGON ((7073.000 6288.000, 7099.000 6294.000..."
2870,Fauquier,Virginia,VA,27330,30242,51061,"POLYGON ((8198.000 6831.000, 8174.000 6854.000..."
1495,Barton,Missouri,MO,154,28844,29011,"POLYGON ((4947.000 6183.000, 4947.000 6143.000..."


Les comtés où la jointure géographique n'est pas possible :

In [12]:
counties_db.loc[counties_db['geometry'] == None]

Unnamed: 0,Nom,Etat,Code_Etat,id_Etat,id_county,fips,geometry
68,Aleutian Islands CA,Alaska,AK,27283,33743,,
90,Prince of Wales-Outer Ketchikan CA,Alaska,AK,27283,27421,,
92,Skagway-Hoonah-Angoon CA,Alaska,AK,27283,27423,,
96,Wade Hampton CA,Alaska,AK,27283,27426,,
97,Wrangell Borough/City,Alaska,AK,27283,33518,,
98,Wrangell-Petersburg CA,Alaska,AK,27283,27427,,
99,Yakutat City and Borough,Alaska,Yakutat City and Borough,27283,32212,,
2425,Shannon,South Dakota,SD,27324,29791,,
2856,Clifton Forge City,Virginia,VA,27330,30228,,
2945,South Boston City,Virginia,VA,27330,32143,,


## Récupération des séries FRED

Ici, le but est de récupérer des données clé pour chaque comté : population, taux de chômage, bénéficiaires d'aides sociales etc.

In [13]:
def check_response(data, url, param):
    #To optimize execution, we try to retrieve data until the API accepts our requests
    #As the limit is not explicit, wait until we can retrieve again
    if 'error_message' not in data:
        return 
    while data['error_message'] == 'Too Many Requests.  Exceeded Rate Limit':
        time.sleep(5)
        data = requests.get(url, params= param)
        data = data.json()

        if 'error_message' not in data:
            return


        

def recover_unemp_mean_rate(id_county):
     #Aim : recover date for the unemployement rate for each county
     #We choose here to keep only the mean value from 2013 to 2018

    series_id = None
    data_unemp = None
    
    #Request type to retrieve the id of the series
    url = "https://api.stlouisfed.org/fred/category/series?"
    param = {"api_key" : api_key, "file_type" : "json", "category_id" : id_county}
    response = requests.get(url, params= param)
    series_id = response.json()
    check_response(series_id, url, param)


    if 'seriess' not in series_id.keys():
        return None

    series_id = series_id['seriess']

    id_unemp_series = None
    for serie in series_id:
        if ("Unemployment Rate" in serie["title"]) and ("Monthly" in serie["frequency"]):
            id_unemp_series = serie['id']
    
    #Now that the id of the unemployement series is known, it is time to retrieve values
    #Request type to retrieve the values in the series
    unemployment_rate_mean = np.nan
    if id_unemp_series:
        url = "https://api.stlouisfed.org/fred/series/observations?" #on va chercher les séries correspondant au chomage mensuel
        param = {"api_key" : api_key, "file_type" : "json", "series_id" : id_unemp_series, "observation_start" : "2013-01-01", "observation_end" : "2018-03-01"} 
        response = requests.get(url, params= param)
        data_unemp = response.json()
        check_response(data_unemp, url, param)

        if 'observations' not in data_unemp:
            return None

        data_unemp = data_unemp['observations']


        #Mean of all monthly unemp rates
        list_of_unemp_rates = [float(obs["value"]) for obs in data_unemp]
        if len(list_of_unemp_rates) > 1:
            unemployment_rate_mean = mean(list_of_unemp_rates)

    return unemployment_rate_mean

In [14]:
#As expected, the extraction takes time because of the API requests limit.
tqdm.pandas(desc = 'Extraction données chômage')
counties_db['unemp_rate'] = counties_db['id_county'].progress_apply(recover_unemp_mean_rate)

Extraction données chômage: 100%|██████████| 3195/3195 [52:21<00:00,  1.02it/s]  


In [17]:
counties_db

Unnamed: 0,Nom,Etat,Code_Etat,id_Etat,id_county,fips,geometry,unemp_rate
0,Autauga,Alabama,AL,27282,27336,01001,"POLYGON ((6581.000 4919.000, 6555.000 4969.000...",5.219048
1,Baldwin,Alabama,AL,27282,27337,01003,"MULTIPOLYGON (((6355.000 4470.000, 6354.000 44...",5.520635
2,Barbour,Alabama,AL,27282,27338,01005,"POLYGON ((6976.000 4890.000, 6979.000 4880.000...",8.660317
3,Bibb,Alabama,AL,27282,27339,01007,"POLYGON ((6431.000 5078.000, 6453.000 5080.000...",6.439683
4,Blount,Alabama,AL,27282,27340,01009,"POLYGON ((6608.000 5424.000, 6619.000 5411.000...",5.401587
...,...,...,...,...,...,...,...,...
3190,Sweetwater,Wyoming,WY,27334,30524,56037,"POLYGON ((1838.000 7537.000, 1847.000 7605.000...",4.790476
3191,Teton,Wyoming,WY,27334,30525,56039,"POLYGON ((1787.000 7869.000, 1743.000 7876.000...",4.171429
3192,Uinta,Wyoming,WY,27334,30526,56041,"POLYGON ((1614.000 7375.000, 1619.000 7403.000...",5.163492
3193,Washakie,Wyoming,WY,27334,30527,56043,"POLYGON ((2369.000 7855.000, 2361.000 7856.000...",4.560317
