# Traitement des données

Dans cette partie, l'objectif est d'importer nos données, les convertir à un format convenable pour la production de statistiques descriptives, et exporter le résultat pour que ces bases puissent être utilisées par les autres programmes

Les bases en question sont :
- la base listant tous les incidents de violence par armes à feu aux USA entre 2013 et 2018
- la base listant les caractéristiques générales des comtés et de ses habitants.

In [34]:
#Pour le traitement classique
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely.geometry import MultiPolygon

#Pour l'interaction avec l'API 
import requests
from statistics import mean
import time

## Traitement de la base d'incidents armes à feu

In [5]:
#Base incidents armes à feu
url="https://drive.google.com/file/d/1GGOLMc_Ow9yZC9sICegPegDggQuHOD3t/view?usp=drive_link"
url="https://drive.google.com/uc?export=download&confirm=1&id=" + url.split("/")[-2]
gun_violence_db = pd.read_csv(url)
gun_violence_db.sample(5)

Unnamed: 0,incident_id,date,state,city_or_county,address,n_killed,n_injured,incident_url,source_url,incident_url_fields_missing,...,participant_age,participant_age_group,participant_gender,participant_name,participant_relationship,participant_status,participant_type,sources,state_house_district,state_senate_district
79345,373650,2015-07-12,Texas,Corpus Christi,,0,1,http://www.gunviolencearchive.org/incident/373650,http://ccpdblotter.com/,False,...,0::32||1::30,0::Adult 18+||1::Adult 18+,0::Female||1::Male,,1::Family,0::Injured||1::Unharmed,0::Victim||1::Subject-Suspect,http://ccpdblotter.com/,34.0,20.0
24325,154356,2014-07-03,California,San Francisco,Connecticut and 25th,0,2,http://www.gunviolencearchive.org/incident/154356,http://www.ktvu.com/news/news/crime-law/two-in...,False,...,,0::Teen 12-17||1::Adult 18+,0::Male,,,0::Injured||1::Injured,0::Victim||1::Victim,http://www.ktvu.com/news/news/crime-law/two-in...,17.0,11.0
186195,840009,2017-05-08,Louisiana,Eunice,E Dean St,1,0,http://www.gunviolencearchive.org/incident/840009,http://www.theadvertiser.com/story/news/2017/0...,False,...,0::12,0::Teen 12-17,0::Male,,,0::Killed,0::Victim,http://www.theadvertiser.com/story/news/2017/0...,41.0,28.0
184600,831589,2017-04-29,Virginia,Newport News,13200 block of Sojourner Court,1,0,http://www.gunviolencearchive.org/incident/831589,http://wavy.com/2017/05/04/newport-news-police...,False,...,0::27,0::Adult 18+,0::Male,0::Joshua Aaron Kaplan,,0::Killed,0::Victim,http://wavy.com/2017/04/29/deadly-shooting-on-...,95.0,1.0
204596,916982,2017-08-20,South Carolina,Hartsville,New Market Road,0,0,http://www.gunviolencearchive.org/incident/916982,http://www.live5news.com/story/36176046/hartsv...,False,...,0::26,0::Adult 18+,0::Male,0::Bradley Dale Wright,,"0::Unharmed, Arrested",0::Subject-Suspect,http://www.live5news.com/story/36176046/hartsv...,65.0,29.0


Conformément à la documentation de la base, certaines colonnes sont codées de façon à pouvoir les reconvertir en dictionnaire :

In [6]:
def convert_to_dict(value):
    if pd.isna(value):
        return value

    pairs = value.split('||')
    result_dict = {}
    for pair in pairs:
        #Some are corrupted : 1: instead of ::
        if '::' in pair:
            key, val = pair.split('::', 1)
            result_dict[int(key)] = val
        else:
            key, val = pair.split(':', 1)
            result_dict[int(key)] = val
    return result_dict

list_of_dict_columns = ['gun_stolen', 'gun_type', 'participant_age', 'participant_age_group', 'participant_gender', 'participant_name', 'participant_relationship', 'participant_status', 'participant_type']
gun_violence_db[list_of_dict_columns] = gun_violence_db[list_of_dict_columns].applymap(convert_to_dict)
gun_violence_db.head()



  gun_violence_db[list_of_dict_columns] = gun_violence_db[list_of_dict_columns].applymap(convert_to_dict)


Unnamed: 0,incident_id,date,state,city_or_county,address,n_killed,n_injured,incident_url,source_url,incident_url_fields_missing,...,participant_age,participant_age_group,participant_gender,participant_name,participant_relationship,participant_status,participant_type,sources,state_house_district,state_senate_district
0,461105,2013-01-01,Pennsylvania,Mckeesport,1506 Versailles Avenue and Coursin Street,0,4,http://www.gunviolencearchive.org/incident/461105,http://www.post-gazette.com/local/south/2013/0...,False,...,{0: '20'},"{0: 'Adult 18+', 1: 'Adult 18+', 2: 'Adult 18+...","{0: 'Male', 1: 'Male', 3: 'Male', 4: 'Female'}",{0: 'Julian Sims'},,"{0: 'Arrested', 1: 'Injured', 2: 'Injured', 3:...","{0: 'Victim', 1: 'Victim', 2: 'Victim', 3: 'Vi...",http://pittsburgh.cbslocal.com/2013/01/01/4-pe...,,
1,460726,2013-01-01,California,Hawthorne,13500 block of Cerise Avenue,1,3,http://www.gunviolencearchive.org/incident/460726,http://www.dailybulletin.com/article/zz/201301...,False,...,{0: '20'},"{0: 'Adult 18+', 1: 'Adult 18+', 2: 'Adult 18+...",{0: 'Male'},{0: 'Bernard Gillis'},,"{0: 'Killed', 1: 'Injured', 2: 'Injured', 3: '...","{0: 'Victim', 1: 'Victim', 2: 'Victim', 3: 'Vi...",http://losangeles.cbslocal.com/2013/01/01/man-...,62.0,35.0
2,478855,2013-01-01,Ohio,Lorain,1776 East 28th Street,1,3,http://www.gunviolencearchive.org/incident/478855,http://chronicle.northcoastnow.com/2013/02/14/...,False,...,"{0: '25', 1: '31', 2: '33', 3: '34', 4: '33'}","{0: 'Adult 18+', 1: 'Adult 18+', 2: 'Adult 18+...","{0: 'Male', 1: 'Male', 2: 'Male', 3: 'Male', 4...","{0: 'Damien Bell', 1: 'Desmen Noble', 2: 'Herm...",,"{0: 'Injured, Unharmed, Arrested', 1: 'Unharme...","{0: 'Subject-Suspect', 1: 'Subject-Suspect', 2...",http://www.morningjournal.com/general-news/201...,56.0,13.0
3,478925,2013-01-05,Colorado,Aurora,16000 block of East Ithaca Place,4,0,http://www.gunviolencearchive.org/incident/478925,http://www.dailydemocrat.com/20130106/aurora-s...,False,...,"{0: '29', 1: '33', 2: '56', 3: '33'}","{0: 'Adult 18+', 1: 'Adult 18+', 2: 'Adult 18+...","{0: 'Female', 1: 'Male', 2: 'Male', 3: 'Male'}","{0: 'Stacie Philbrook', 1: 'Christopher Ratlif...",,"{0: 'Killed', 1: 'Killed', 2: 'Killed', 3: 'Ki...","{0: 'Victim', 1: 'Victim', 2: 'Victim', 3: 'Su...",http://denver.cbslocal.com/2013/01/06/officer-...,40.0,28.0
4,478959,2013-01-07,North Carolina,Greensboro,307 Mourning Dove Terrace,2,2,http://www.gunviolencearchive.org/incident/478959,http://www.journalnow.com/news/local/article_d...,False,...,"{0: '18', 1: '46', 2: '14', 3: '47'}","{0: 'Adult 18+', 1: 'Adult 18+', 2: 'Teen 12-1...","{0: 'Female', 1: 'Male', 2: 'Male', 3: 'Female'}","{0: 'Danielle Imani Jameison', 1: 'Maurice Eug...",{3: 'Family'},"{0: 'Injured', 1: 'Injured', 2: 'Killed', 3: '...","{0: 'Victim', 1: 'Victim', 2: 'Victim', 3: 'Su...",http://myfox8.com/2013/01/08/update-mother-sho...,62.0,27.0


In [8]:
gun_violence_db.to_csv("data/gun_violence_db.csv", index=False)

## Traitements des bases comtés de l'API


La documentation précisant le mode d'interaction avec l'API de StLouisFed se trouve à la page https://fred.stlouisfed.org/docs/api/fred/#API.

In [156]:
#Each request is categorized with an url and an id
#The gist here is to recover the proper id to retrieve data
api_key = "180de2e6a1d1e953d270ebf38341cd44"
param = {"api_key" : api_key, "file_type" : "json", "category_id" : "27281"}
url = "https://api.stlouisfed.org/fred/category/children?"

In [149]:
def request_db(index):
    #this function requests to the API the database associated with the category id index
    param["category_id"] = index #on ajuste les paramètres de la request pour demander la bonne catégorie
    response = requests.get(url, params = param)
    data = response.json()
    return data

In [152]:
us_data = request_db(27281)['categories']
#We create our dframe by creating a list of dicts, each element is a new row
database = list()
for state in us_data:
    id_state = state['id']
    state_name = state['name']
    
    #Request to recover id in order to extract counties
    state_info = request_db(id_state)["categories"]
    if state_info != []: #One exception : which one ?
        id_list_of_state_counties = state_info[0]['id']
        list_of_state_counties = request_db(id_list_of_state_counties)["categories"]
        for county in list_of_state_counties:
            dict_county = dict()
            id_county = county['id']
            
            if county['name'].count(',') == 1:
                county_name,  state_code = county['name'].split(',')
                state_code = state_code.lstrip()
            else:
                county_name = county['name']
                state_code = np.nan

            #Update the dict to include basic values on each county
            dict_county['Nom'] = county_name
            dict_county['Etat'] = state_name
            dict_county['Code_Etat'] = state_code
            dict_county['id_Etat'] = id_state
            dict_county['id_county'] = id_county
            database.append(dict_county)

counties_db = pd.DataFrame(database)

In [153]:
counties_db.sample(10)

Unnamed: 0,Nom,Etat,Code_Etat,id_Etat,id_county
796,Adams County,Iowa,IA,27297,28136
2711,Palo Pinto County,Texas,TX,27326,30080
580,Gooding County,Idaho,ID,27294,27917
1705,Hooker County,Nebraska,NE,27309,29058
1379,Polk County,Minnesota,MN,27305,28727
2206,Texas County,Oklahoma,OK,27318,29568
2682,Lynn County,Texas,TX,27326,30051
1518,Dallas County,Missouri,MO,154,1049
324,District of Columbia,District of Columbia,,27290,33508
1194,Sagadahoc County,Maine,ME,27301,28538


On a maintenant un premier dframe recensant tous les comtés des USA ainsi que les ID permettant de les retrouver dans l'API. On peut désormais extraire pour chaque comté les informations socio-démographiques nous permettant de produire nos statistiques descriptives et notre modèle.

Note : id_county fait office de clé primaire dans cette base

In [15]:
counties_db.to_csv("data/counties_db.csv", index=False)

In [158]:
#On change l'URL pour pouvoir récupérer des données géographiques sur tous les comtés (ainsi que le fips, le nom simplifié)
url = "https://api.stlouisfed.org/geofred/shapes/file?shape=county&api_key=180de2e6a1d1e953d270ebf38341cd44"

In [126]:
for id_county in counties_db['id_county']:
    geom_data_county = request_db(id_county)['features'][0]

    simple_name = geom_data_county['properties']['name']
    fips = geom_data_county['properties']['fips']


    

KeyError: 'categories'

In [239]:
test = request_db(30505)
test

{'title': 'United States of America, admin2, highres',
 'version': '1.1.2',
 'type': 'FeatureCollection',
 'copyright': 'Copyright (c) 2015 Highsoft AS, Based on data from The United States Census Bureau',
 'copyrightShort': 'USA Census Bureau',
 'copyrightUrl': 'http://www.census.gov',
 'crs': {'type': 'name',
  'properties': {'name': 'urn:ogc:def:crs:EPSG:102004'}},
 'hc-transform': {'default': {'crs': '+proj=lcc +lat_1=33 +lat_2=45 +lat_0=39 +lon_0=-96 +x_0=0 +y_0=0 +datum=NAD83 +units=m +no_defs',
   'scale': 0.000151599359356,
   'jsonres': 15.5,
   'jsonmarginX': -999,
   'jsonmarginY': 9851,
   'xoffset': -2361356.09818,
   'yoffset': 1406281.44289},
  'us-all-all-hawaii-highres': {'xpan': 190,
   'ypan': 417,
   'hitZone': {'type': 'Polygon',
    'coordinates': [[[1747, 3920],
      [3651, 2950],
      [3651, -999],
      [1747, -999],
      [1747, 3920]]]},
   'crs': '+proj=aea +lat_1=8 +lat_2=18 +lat_0=13 +lon_0=-157 +x_0=0 +y_0=0 +datum=NAD83 +units=m +no_defs',
   'scale': 

In [237]:
test2 = pd.json_normalize(test['features'])
test2

Unnamed: 0,type,id,properties.hc-group,properties.hc-middle-x,properties.hc-middle-y,properties.hc-key,properties.hc-a2,properties.fips,properties.name,geometry.type,geometry.coordinates
0,Feature,US.CA.083,admin2,0.21,0.27,us-ca-083,SB,06083,Santa Barbara,MultiPolygon,"[[[[-422, 5749], [-422, 5745], [-425, 5743], [..."
1,Feature,US.CA.111,admin2,0.48,0.26,us-ca-111,VE,06111,Ventura,MultiPolygon,"[[[[-538, 5722], [-524, 5706], [-547, 5712], [..."
2,Feature,US.CA.037,admin2,0.52,0.20,us-ca-037,LA,06037,Los Angeles,MultiPolygon,"[[[[-359, 5610], [-321, 5543], [-338, 5543], [..."
3,Feature,US.RI.009,admin2,0.47,0.40,us-ri-009,WA,44009,Washington,MultiPolygon,"[[[[9288, 7745], [9279, 7744], [9282, 7765], [..."
4,Feature,US.FL.087,admin2,0.54,0.43,us-fl-087,MO,12087,Monroe,MultiPolygon,"[[[[8015, 3042], [8011, 3037], [7998, 3031], [..."
...,...,...,...,...,...,...,...,...,...,...,...
3142,Feature,US.AK.185,admin2,0.48,0.51,us-ak-185,NS,02185,North Slope Borough,Polygon,"[[[962, 3974], [903, 3972], [840, 3974], [778,..."
3143,Feature,US.AK.282,admin2,0.37,0.23,us-ak-282,YA,02282,Yakutat Cty&Bor,Polygon,"[[[990, 3158], [990, 3150], [1013, 3142], [101..."
3144,Feature,US.AK.275,admin2,0.49,0.47,us-ak-275,WR,02275,Wrangell Cty&Bor,Polygon,"[[[1439, 2791], [1445, 2801], [1471, 2820], [1..."
3145,Feature,,__border_lines__,,,,,,,MultiLineString,"[[[1340, 3043], [1346, 3033], [1369, 3023], [1..."


In [238]:
# Replace 'geometry.coordinates' with the actual name of your geometry column
geometry_col_name = 'geometry.coordinates'

# Select the first row of the GeoDataFrame
first_row = test2.iloc[[0]]

# Create a GeoDataFrame containing only the first row
first_row_gdf = gpd.GeoDataFrame(first_row, geometry=gpd.GeoSeries(first_row[geometry_col_name]))

# Plot the first row
first_row_gdf.plot()

# Show the plot
plt.show()

TypeError: Non geometry data passed to GeoSeries constructor, received data of dtype 'object'