Import des packages

In [None]:
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt

#### Import de la base de données sur les incidents par arme à feu aux Etats-Unis entre 2013 et 2018.

In [None]:
# Share link of the CSV hosted on Google Drive; the file id is the
# second-to-last segment of its path.
url = "https://drive.google.com/file/d/1GGOLMc_Ow9yZC9sICegPegDggQuHOD3t/view?usp=drive_link"
file_id = url.split("/")[-2]
# Rebuild the link as a direct-download URL and load it with pandas.
url = "https://drive.google.com/uc?export=download&confirm=1&id=" + file_id
base = pd.read_csv(url)

In [None]:
# Remove the limit on the number of columns shown when a frame is displayed.
pd.set_option('display.max_columns', None)
# Preview the first rows of the raw data.
base.head()

#### Création de fonctions permettant de transformer les colonnes codées pour comporter des dictionnaires et des listes.

In [None]:
import re

In [None]:
def convert_to_dict(value):
    """Parse an encoded ``key::val||key::val`` cell into a dictionary.

    Fragments are normally joined by ``||`` and written ``key::val``. Some
    cells are corrupted and use a single ``|`` between fragments and/or a
    single ``:`` between key and value; both variants are accepted, including
    when they are mixed inside the same cell.

    Parameters
    ----------
    value : object
        Raw cell content. Missing values (``pd.isna``) are returned unchanged.

    Returns
    -------
    dict or missing value
        ``{int(key): val}`` for every fragment. A fragment that contains no
        colon is stored under key ``0``. A cell containing neither ``|`` nor
        ``:`` is returned as ``{0: value}`` with the original object.

    Raises
    ------
    ValueError
        If the text before the first colon of a fragment is not an integer.
    """
    if pd.isna(value):
        return value

    text = str(value)
    if '|' not in text and ':' not in text:
        # Bare value without any key: keep the original object under key 0.
        return {0: value}

    result_dict = {}
    # Any run of pipes separates two fragments ('||' normally, '|' or '|||'
    # when the cell is corrupted).
    for fragment in re.split(r'\|+', text):
        if ':' in fragment:
            # Split on the FIRST '::' (or corrupted ':') only, so a value that
            # itself contains a colon is kept whole instead of breaking the
            # two-name unpacking.
            key, val = re.split(r'::?', fragment, maxsplit=1)
            result_dict[int(key)] = val
        else:
            result_dict[0] = fragment
    return result_dict


# Work on a copy so the raw `base` frame is left untouched.
updated_base = base.copy()
list_of_dict_columns = ['gun_stolen', 'gun_type', 'participant_age', 'participant_age_group', 'participant_gender', 'participant_relationship', 'participant_status', 'participant_type']
# Series.map is applied column by column instead of DataFrame.applymap, which
# is deprecated in recent pandas releases and emits a FutureWarning.
for dict_column in list_of_dict_columns:
    updated_base[dict_column] = updated_base[dict_column].map(convert_to_dict)
updated_base.head()

In [None]:
def convert_to_list(value):
    """Split a ``||``-separated cell into a list of strings.

    Parameters
    ----------
    value : object
        Raw cell content. Missing values (``pd.isna``) are returned unchanged.

    Returns
    -------
    list of str or missing value
        The items found between ``||`` separators. A cell without any
        separator yields a one-element list, so single-item cells are kept
        rather than being turned into ``None``.
    """
    if pd.isna(value):
        return value

    # str.split returns [text] when the separator is absent, which covers the
    # single-item case without a dedicated branch.
    return str(value).split('||')


list_of_list_columns = ['incident_characteristics', 'sources']
# Series.map is applied column by column instead of DataFrame.applymap, which
# is deprecated in recent pandas releases and emits a FutureWarning.
for list_column in list_of_list_columns:
    updated_base[list_column] = updated_base[list_column].map(convert_to_list)
updated_base.head()


### Création d'indicatrices pour pouvoir accéder à l'information disponible dans des listes

In [None]:
def make_indic1(var,description,new_var):
    """Add a 0/1 indicator column to the global ``updated_base`` frame.

    A row gets 1 when ``description`` occurs (case-sensitive substring test)
    in ``str(cell)`` of column ``var``, and 0 otherwise; ``None`` cells get 0.
    The result is stored in ``updated_base[new_var]``; nothing is returned.
    """
    updated_base[new_var] = [
        1 if (cell is not None and description in str(cell)) else 0
        for cell in updated_base[var]
    ]

In [None]:
# One entry per indicator: (source column, substring to look for, new column).
# Most keywords leave out their first letter because the search is
# case-sensitive.
carac = [
    ("incident_characteristics", "uicide", "indic_suicide"),
    ("incident_characteristics", "efensive", "indic_defens"),
    ("incident_characteristics", "chool", "indic_school"),
    ("incident_characteristics", "Home Invasion", "indic_home_inv"),
    ("incident_characteristics", "ccident", "indic_accident"),
]

for var, description, new_var in carac:
    make_indic1(var, description, new_var)

updated_base.head()

Export de la base au format CSV afin qu'elle puisse être utilisée pour les statistiques descriptives dans le fichier stat_desc_gun_violence.ipynb.

In [None]:
# Write the enriched incident table to disk.
# NOTE(review): the last cell of this notebook writes the aggregated frame to
# the same path; confirm which version downstream notebooks expect.
updated_base.to_csv('data/gun_violence_db.csv')

### Création d'une base de données comprenant les informations relatives aux suspects

In [None]:
# Output column of df_suspects -> column of updated_base holding the
# per-participant dictionary the value is read from.
# NOTE(review): gun_type / gun_stolen are looked up with the participant key,
# as before; confirm that gun entries are numbered like participants.
suspect_fields = {
    'age_group_sus': 'participant_age_group',
    'age_sus': 'participant_age',
    'gender_sus': 'participant_gender',
    'status_sus': 'participant_status',
    'gun_type': 'gun_type',
    'gun_stolen': 'gun_stolen',
    'relation': 'participant_relationship',
}

data_suspects = []

# `index` is the row label of the incident; it is stored in 'id' so each
# suspect can be linked back to its incident.
# Assumes updated_base has a unique index (e.g. the default RangeIndex).
for index, participant_types in updated_base['participant_type'].items():
    # Cells that are not dictionaries (missing values) describe no participant.
    if not isinstance(participant_types, dict):
        continue

    for key, value in participant_types.items():
        if value != 'Subject-Suspect':
            continue

        result_dict = {'id': index}
        for out_column, source_column in suspect_fields.items():
            attributes = updated_base.at[index, source_column]
            # NaN when the incident has no dictionary for this attribute or
            # when that dictionary has no entry for this participant.
            if isinstance(attributes, dict):
                result_dict[out_column] = attributes.get(key, np.nan)
            else:
                result_dict[out_column] = np.nan
        data_suspects.append(result_dict)

# Passing the column list keeps every expected column, in a fixed order, even
# when no record was collected.
df_suspects = pd.DataFrame(data_suspects, columns=['id', *suspect_fields])

In [None]:
# Preview the first rows of the suspect table.
df_suspects.head()

On obtient ici un dataframe donnant l'âge, le genre, le statut et l'arme de tous les participants aux incidents caractérisés comme 'Subject-Suspect', afin de pouvoir faire des statistiques descriptives. La colonne 'id' correspond à l'index de l'incident concerné dans la base updated_base. La colonne 'relation' correspond à la relation du suspect à la victime, ou à la circonstance de l'incident.

In [None]:
# Save the suspect table as CSV (the frame index is written as the first column).
df_suspects.to_csv('data/df_suspects.csv')

### Création d'une base de données similaire contenant toutes les informations sur les victimes

In [None]:
# Output column of df_victim -> column of updated_base holding the
# per-participant dictionary the value is read from.
# NOTE(review): gun_type / gun_stolen are looked up with the participant key,
# as before; confirm that gun entries are numbered like participants.
victim_fields = {
    'age_group_vict': 'participant_age_group',
    'age_vict': 'participant_age',
    'gender_vict': 'participant_gender',
    'status_vict': 'participant_status',
    'gun_type': 'gun_type',
    'gun_stolen': 'gun_stolen',
}

data_victimes = []

# `index` is the row label of the incident; it is stored in 'id' so each
# victim can be linked back to its incident.
# Assumes updated_base has a unique index (e.g. the default RangeIndex).
for index, participant_types in updated_base['participant_type'].items():
    # Cells that are not dictionaries (missing values) describe no participant.
    if not isinstance(participant_types, dict):
        continue

    for key, value in participant_types.items():
        if value != 'Victim':
            continue

        result_dict = {'id': index}
        for out_column, source_column in victim_fields.items():
            attributes = updated_base.at[index, source_column]
            # NaN when the incident has no dictionary for this attribute or
            # when that dictionary has no entry for this participant.
            if isinstance(attributes, dict):
                result_dict[out_column] = attributes.get(key, np.nan)
            else:
                result_dict[out_column] = np.nan
        data_victimes.append(result_dict)

# Passing the column list keeps every expected column, in a fixed order, even
# when no record was collected.
df_victim = pd.DataFrame(data_victimes, columns=['id', *victim_fields])

In [None]:
# Preview the first rows of the victim table.
df_victim.head()

In [None]:
# Save the victim table as CSV (the frame index is written as the first column).
df_victim.to_csv('data/df_victim.csv')

### Création d'indicatrices sur les variables d'intérêt identifiées

Avoir des indicatrices sur ces sujets permettra d'obtenir plus facilement certaines informations clefs qui seront importantes lorsque l'on va croiser cette base de données avec des données extérieures

In [None]:
# Build a single frame holding the incident, suspect and victim information.
# The former row index becomes an 'index' column, used as the join key against
# the 'id' column of the suspect and victim tables.
updated_base = updated_base.reset_index()
merge_sus = updated_base.merge(df_suspects, how='outer', left_on='index', right_on='id')
merge_tot = merge_sus.merge(df_victim, how='outer', left_on='index', right_on='id')
merge_tot.head()

In [None]:
def make_indic2(var, val1, new_name):
    """Add an indicator column to the global ``data_indic`` frame.

    For each cell of ``data_indic[var]``: 1 when the cell is a string that
    contains ``val1``, 0 when it is a string that does not, and NaN for any
    non-string cell (``None`` included). The result is stored in
    ``data_indic[new_name]``; nothing is returned.
    """
    def flag(cell):
        if type(cell) is str:
            return 1 if val1 in cell else 0
        return np.nan

    data_indic[new_name] = [flag(cell) for cell in data_indic[var]]

In [None]:
# Work on a copy of the merged frame; the indicator helpers write into `data_indic`.
data_indic=merge_tot.copy()

In [None]:
# Each entry is (source column, substring to look for, new indicator column).
statuses = ['Injured', 'Arrested', 'Unharmed', 'Killed']
gun_kinds = ['Handgun', 'Rifle', '9mm', 'Shotgun']

to_indic = [
    ('age_group_sus', 'Adult 18+', 'sus_adulte'),
    ('age_group_vict', 'Adult 18+', 'vict_adulte'),
    ('gender_sus', 'Male', 'sus_male'),
    ('gender_vict', 'Male', 'vict_male'),
]
to_indic += [('status_sus', status, 'sus_' + status.lower()) for status in statuses]
to_indic += [('status_vict', status, 'vict_' + status.lower()) for status in statuses]
# Two gun-type columns are scanned; their indicators get the _1 / _2 suffix.
to_indic += [('gun_type', kind, kind.lower() + '_1') for kind in gun_kinds]
to_indic += [('gun_type_y', kind, kind.lower() + '_2') for kind in gun_kinds]

for col, val1, new_name in to_indic:
    make_indic2(col, val1, new_name)

In [None]:
# Preview the frame with its new indicator columns.
data_indic.head()

In [None]:
#taking care of the issue of the gun_type variable

def one_col(col1, col2, new_col):
    """Combine two indicator columns of the global ``data_indic`` into one.

    ``data_indic[new_col]`` is 1 where either column equals 1, otherwise 0
    where either column equals 0, otherwise NaN.
    """
    first, second = data_indic[col1], data_indic[col2]
    any_one = (first == 1) | (second == 1)
    any_zero = (first == 0) | (second == 0)

    # Rows matching neither test stay NaN.
    zero_or_missing = np.where(any_zero, 0, np.nan)
    data_indic[new_col] = np.where(any_one, 1, zero_or_missing)

In [None]:
# (first indicator, second indicator, combined column) for every gun kind.
gun_type_list = [
    (kind + '_1', kind + '_2', kind)
    for kind in ('handgun', 'rifle', '9mm', 'shotgun')
]

for col1, col2, new_col in gun_type_list:
    one_col(col1, col2, new_col)

In [None]:
# Remove the eight intermediate *_1 / *_2 indicator columns in place.
data_indic.drop(
    columns=['9mm_1', '9mm_2', 'rifle_1', 'rifle_2', 'handgun_1', 'handgun_2', 'shotgun_1', 'shotgun_2'],
    inplace=True,
)

In [None]:
# Preview the frame once the intermediate indicator columns are removed.
data_indic.head()

In [None]:
# Ages 18 to 25 inclusive, as strings.
young = [str(age) for age in range(18, 26)]

def indic_age(var, list1, new_name):
    """Add a membership indicator column to the global ``data_indic`` frame.

    For each cell of ``data_indic[var]``: 1 when the cell is a string that is
    an element of ``list1``, 0 when it is a string that is not, and NaN for
    any non-string cell (``None`` included). The result is stored in
    ``data_indic[new_name]``; nothing is returned.
    """
    def flag(cell):
        if type(cell) is str:
            return 1 if cell in list1 else 0
        return np.nan

    data_indic[new_name] = [flag(cell) for cell in data_indic[var]]

# Flag suspects and victims whose recorded age is one of the strings in `young`.
indic_age('age_sus', young, 'young_sus')
indic_age('age_vict', young, 'young_vict')

In [None]:
# List every column currently present in data_indic.
col = data_indic.columns.tolist()
col

Maintenant que toutes les variables d'intérêt sont présentes dans le dataframe, on revient au format initial pour lequel il y avait une seule ligne par incident.

In [None]:
def _presence_flag(flags):
    """Return 1 when the group's indicator values sum to at least 1, else 0."""
    return 1 if flags.sum() >= 1 else 0


# Output columns filled with 'first', i.e. the first non-null value of the group.
# For gun_type and relation, several values may exist per incident; keeping one
# is considered good enough information.
first_value_outputs = [
    'date', 'state', 'city_or_county', 'address', 'n_killed', 'n_injured',
    'incident_url', 'source_url', 'incident_url_fields_missing',
    'congressional_district', 'gun_stolen', 'gun_type_dict',
    'incident_characteristics', 'latitude', 'location_description',
    'longitude', 'n_guns_involved', 'notes', 'participant_age',
    'participant_age_group', 'participant_gender', 'participant_name',
    'participant_relationship', 'participant_status', 'participant_type',
    'sources', 'state_house_district', 'state_senate_district',
    'indic_suicide', 'indic_defens', 'indic_school', 'indic_home_inv',
    'indic_accident', 'gun_type', 'relation',
]

# Indicator columns: presence is reported instead of a count, because counting
# would give corrupted answers on the merged (row-duplicated) data.
presence_outputs = [
    'sus_adulte', 'vict_adulte', 'sus_male', 'vict_male',
    'sus_injured', 'sus_arrested', 'sus_unharmed', 'sus_killed',
    'vict_injured', 'vict_arrested', 'vict_unharmed', 'vict_killed',
    'handgun', 'rifle', 'Nine_mm', 'shotgun', 'young_sus', 'young_vict',
]

# Output columns whose source column has a different name.
source_of = {
    'gun_stolen': 'gun_stolen_x',
    'gun_type_dict': 'gun_type_x',
    'gun_type': 'gun_type_y',
    'Nine_mm': '9mm',
}

named_aggs = {name: (source_of.get(name, name), 'first') for name in first_value_outputs}
named_aggs.update(
    {name: (source_of.get(name, name), _presence_flag) for name in presence_outputs}
)

# One row per incident; incident_id is turned back into a regular column.
aggregated_data = data_indic.groupby('incident_id').agg(**named_aggs).reset_index()

In [None]:
# Preview the per-incident aggregated frame.
aggregated_data.head()

In [None]:
# Order the incidents by date (ascending), as in the original dataframe.
agg_sorted = aggregated_data.sort_values('date')
agg_sorted

In [30]:
# Save the aggregated, date-sorted frame as CSV.
# NOTE(review): an earlier cell writes `updated_base` to this same path, so
# this call overwrites that file; confirm this is intended.
agg_sorted.to_csv('data/gun_violence_db.csv')