In [1]:
import pandas as pd
import numpy as np
import pickle as pkl
from copy import deepcopy

In [2]:
pd.set_option('display.float_format', lambda x: '%.2f' % x)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 1000)

In [48]:
df = pd.read_csv('../Data Sources/gun_violence_archive_github_download.csv')

df['participant_type'] = df['participant_type'].replace('\|', ':', regex = True).str.split(":")
df['participant_status'] = df['participant_status'].replace('\|', ':', regex = True).str.split(":")
df['participant_gender'] = df['participant_gender'].replace('\|', ':', regex = True).str.split(":")
df['participant_age'] = df['participant_age'].replace('\|', ':', regex = True).str.split(":")

df.rename({'n_guns_involved': 'num_guns_involved'}, axis = 1, inplace = True)
df.drop(['n_killed', 'n_injured', 'source_url', 'incident_url_fields_missing'], axis = 1, inplace = True)

merge here

In [51]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214814 entries, 0 to 214813
Data columns (total 25 columns):
incident_id                 214814 non-null int64
date                        214814 non-null object
state                       214814 non-null object
city_or_county              214814 non-null object
address                     199576 non-null object
incident_url                214814 non-null object
congressional_district      204128 non-null float64
gun_stolen                  124623 non-null object
gun_type                    124655 non-null object
incident_characteristics    214537 non-null object
latitude                    207413 non-null float64
location_description        37296 non-null object
longitude                   207413 non-null float64
n_guns_involved             124655 non-null float64
notes                       142863 non-null object
participant_age             147379 non-null object
participant_age_group       197558 non-null object
participant_gender  

In [49]:
null_columns = df.columns[df.isnull().any()]
print(df[null_columns].isnull().sum())

address                      16497
congressional_district       11944
gun_stolen                   99498
gun_type                     99451
incident_characteristics       326
latitude                      7923
location_description        197588
longitude                     7923
n_guns_involved              99451
notes                        81017
participant_age              92298
participant_age_group        42119
participant_gender           36362
participant_name            122253
participant_relationship    223903
participant_status           27626
participant_type             24863
sources                        609
state_house_district         38772
state_senate_district        32335
dtype: int64


In [50]:
df = deepcopy(df[df['participant_type'].isnull()==False].reset_index())
df.drop('index', axis = 1, inplace = True)

In [52]:
df['date'] = pd.to_datetime(df['date'])
df['gun_stolen'] = df['gun_stolen'].astype(str)
df['incident_characteristics'] = df['incident_characteristics'].astype(str)
df['participant_relationship'] = df['participant_relationship'].astype(str)

In [53]:
df.loc[(['Stolen' in s for s in df['gun_stolen']]) or
       (['stolen' in s for s in df['gun_stolen']]) or
       (['Gun(s) stolen' in s for s in df['incident_characteristics']]) or
       (['Stolen/Illegally owned gun{s}' in s for s in df['incident_characteristics']]) or
       (['stolen gun' in s for s in df['incident_characteristics']]), 'gun_stolen'] = 1
df.loc[df['gun_stolen']!=1, 'gun_stolen'] = 0
df.loc[(['Suicide' in s for s in df['incident_characteristics']]) or
       (['suicide' in s for s in df['incident_characteristics']]), 'suicide'] = 1
df.loc[df['suicide']!=1, 'suicide'] = 0
df.loc[(['Accidental' in s for s in df['incident_characteristics']]) or
       (['accidental' in s for s in df['incident_characteristics']]), 'accidental'] = 1
df.loc[df['accidental']!=1, 'accidental'] = 0
df.loc[(['Domestic Violence' in s for s in df['incident_characteristics']]), 'domestic_violence'] = 1
df.loc[(['Domestic Violence' not in s for s in df['incident_characteristics']]), 'domestic_violence'] = 0
df.loc[(['Gang' in s for s in df['participant_relationship']]) or
       (['Gang' in s for s in df['incident_characteristics']]), 'gang_related'] = 1
df.loc[df['gang_related']!=1, 'gang_related'] = 0
df.loc[(['Non-Shooting Incident' in s for s in df['incident_characteristics']]), 'non_shooting_incident'] = 1
df.loc[df['non_shooting_incident']!=1, 'non_shooting_incident'] = 0
df.loc[(['Gun shop robbery' in s for s in df['incident_characteristics']]), 'gun_shop_robbery'] = 1
df.loc[df['gun_shop_robbery']!=1, 'gun_shop_robbery'] = 0
df.loc[(['Drive-by' in s for s in df['incident_characteristics']]), 'drive_by_shooting'] = 1
df.loc[df['drive_by_shooting']!=1, 'drive_by_shooting'] = 0
df.loc[(['Officer Involved Shooting' in s for s in df['incident_characteristics']]), 'officer_involved_shooting'] = 1
df.loc[df['officer_involved_shooting']!=1, 'officer_involved_shooting'] = 0
df.loc[(['Child' in s for s in df['incident_characteristics']]), 'child_involved'] = 1
df.loc[df['child_involved']!=1, 'child_involved'] = 0
df.loc[(['Mass Shooting' in s for s in df['incident_characteristics']]), 'mass_shooting'] = 1
df.loc[df['mass_shooting']!=1, 'mass_shooting'] = 0
df.loc[(['Drug' in s for s in df['incident_characteristics']]), 'drug_involved'] = 1
df.loc[df['drug_involved']!=1, 'drug_involved'] = 0
df.loc[(['Assault weapon' in s for s in df['incident_characteristics']]), 'assault_weapon'] = 1
df.loc[df['assault_weapon']!=1, 'assault_weapon'] = 0

In [11]:
boolean_columns = ['gun_stolen', 'suicide', 'accidental', 'domestic_violence',
                   'gang_related', 'non_shooting_incident', 'gun_shop_robbery',
                   'drive_by_shooting', 'officer_involved_shooting', 'child_involved',
                   'mass_shooting', 'drug_involved', 'assault_weapon']

for column in boolean_columns:
    df[column] = df[column].astype(int)
    print("{}: ".format(column) + str(len(df[df[column]==1])))

gun_stolen: 5543
suicide: 6101
accidental: 8131
domestic_violence: 10794
gang_related: 288
non_shooting_incident: 38321
gun_shop_robbery: 422
drive_by_shooting: 12030
officer_involved_shooting: 13810
child_involved: 2104
mass_shooting: 1637
drug_involved: 16722
assault_weapon: 1877


In [54]:
df.drop(['address', 'incident_url', 'sources',
         'incident_characteristics', 'participant_name',
         'participant_gender', 'participant_relationship',
         'gun_type', 'participant_age_group',
         'location_description', 'notes'], axis = 1, inplace = True)

In [None]:
df.to_pickle('../Pickles/df1.pkl')

In [9]:
df = pd.read_pickle('../Pickles/df1.pkl')

In [4]:
len(df)

214814

In [56]:
df.head(3)

Unnamed: 0,incident_id,date,state,city_or_county,congressional_district,gun_stolen,latitude,longitude,num_guns_involved,participant_age,participant_status,participant_type,state_house_district,state_senate_district,suicide,accidental,domestic_violence,gang_related,non_shooting_incident,gun_shop_robbery,drive_by_shooting,officer_involved_shooting,child_involved,mass_shooting,drug_involved,assault_weapon
0,461105,2013-01-01,Pennsylvania,Mckeesport,14.0,0,40.35,-79.86,,"[0, , 20]","[0, , Arrested, , 1, , Injured, , 2, , Injured, , 3, , Injured, , 4, , Injured]","[0, , Victim, , 1, , Victim, , 2, , Victim, , 3, , Victim, , 4, , Subject-Suspect]",,,0,0,0,0,0,0,0,0,0,1,0,0
1,460726,2013-01-01,California,Hawthorne,43.0,0,33.91,-118.33,,"[0, , 20]","[0, , Killed, , 1, , Injured, , 2, , Injured, , 3, , Injured]","[0, , Victim, , 1, , Victim, , 2, , Victim, , 3, , Victim, , 4, , Subject-Suspect]",62.0,35.0,0,0,0,0,0,0,0,0,0,1,0,0
2,478855,2013-01-01,Ohio,Lorain,9.0,0,41.45,-82.14,2.0,"[0, , 25, , 1, , 31, , 2, , 33, , 3, , 34, , 4, , 33]","[0, , Injured, Unharmed, Arrested, , 1, , Unharmed, Arrested, , 2, , Killed, , 3, , Injured, , 4, , Injured]","[0, , Subject-Suspect, , 1, , Subject-Suspect, , 2, , Victim, , 3, , Victim, , 4, , Victim]",56.0,13.0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
part_type_df = df.set_index(['incident_id'])['participant_type'].apply(pd.Series).stack().reset_index(level=1, drop=True)
part_type_df = part_type_df.reset_index()
part_type_df.columns = ['incident_id','participant_type']
part_type_df = part_type_df[(part_type_df['participant_type']=='Victim') | (part_type_df['participant_type']=='Subject-Suspect')]
part_type_df['participant_index'] = part_type_df.groupby('incident_id').cumcount()
part_type_df = part_type_df.reset_index()
part_type_df.drop(['index'], axis = 1, inplace = True)

part_age_df = df.set_index(['incident_id'])['participant_age'].apply(pd.Series).stack().reset_index(level=1, drop=True)
part_age_df = part_age_df.reset_index()
part_age_df.columns = ['incident_id','participant_age']
part_age_df = part_age_df[part_age_df['participant_age'] != '']
part_age_df = pd.DataFrame({'incident_id': part_age_df['incident_id'].iloc[::2].values, 'participant_index': part_age_df['participant_age'].iloc[::2].values, 'participant_age': part_age_df['participant_age'].iloc[1::2].values})
part_age_df['participant_index'] = part_age_df['participant_index'].astype(int)

part_status_df = df.set_index(['incident_id'])['participant_status'].apply(pd.Series).stack().reset_index(level=1, drop=True)
part_status_df = part_status_df.reset_index()
part_status_df.columns = ['incident_id','participant_status']
part_status_df = part_status_df[part_status_df['participant_status'] != '']
part_status_df = pd.DataFrame({'incident_id': part_status_df['incident_id'].iloc[::2].values, 'participant_index': part_status_df['participant_status'].iloc[::2].values, 'participant_status': part_status_df['participant_status'].iloc[1::2].values})
part_status_df['participant_index'] = part_status_df['participant_index'].astype(int)

part_df = pd.merge(part_type_df, part_age_df, on=['incident_id', 'participant_index'], how='outer')
part_df = pd.merge(part_df, part_status_df, on=['incident_id', 'participant_index'], how='outer')

part_df['participant_type'] = part_df['participant_type'].astype(str)
part_df['participant_status'] = part_df['participant_status'].astype(str)
part_df['participant_age'] = part_df['participant_age'].astype(float)

part_df.loc[(['Unharmed' in s for s in part_df['participant_status']]), 'num_unharmed'] = 1
part_df.loc[(['Unharmed' not in s for s in part_df['participant_status']]), 'num_unharmed'] = 0
part_df.loc[(['Arrested' in s for s in part_df['participant_status']]), 'num_arrested'] = 1
part_df.loc[(['Arrested' not in s for s in part_df['participant_status']]), 'num_arrested'] = 0
part_df.loc[(['Killed' in s for s in part_df['participant_status']]), 'num_killed'] = 1
part_df.loc[(['Killed' not in s for s in part_df['participant_status']]), 'num_killed'] = 0
part_df.loc[(['Injured' in s for s in part_df['participant_status']]), 'num_injured'] = 1
part_df.loc[(['Injured' not in s for s in part_df['participant_status']]), 'num_injured'] = 0

In [None]:
len(part_df['incident_id'].unique())==len(df['incident_id'].unique())

In [None]:
part_df.head()

In [None]:
aggregations_1 = {
    'num_unharmed': 'sum',
    'num_arrested': 'sum',
    'num_killed': 'sum',
    'num_injured': 'sum',
    }

aggregations_2 = {
    'participant_index': 'count',
    'participant_age': 'mean',
    }

part_status_df_group = part_df.groupby(['incident_id']).agg(aggregations_1).reset_index()
part_df_group = part_df.groupby(['incident_id']).agg(aggregations_2).reset_index()
part_type_df_group = part_df.groupby(['incident_id', 'participant_type']).agg(aggregations_2).reset_index()
victim_df = deepcopy(part_type_df_group[part_type_df_group['participant_type']=='Victim'])
suspect_df = deepcopy(part_type_df_group[part_type_df_group['participant_type']=='Subject-Suspect'])

part_df_group.rename({'participant_age': 'avg_participant_age', 'participant_index': 'num_participants'}, axis = 1, inplace = True)
victim_df.rename({'participant_age': 'avg_victim_age', 'participant_index': 'num_victims'}, axis = 1, inplace = True)
suspect_df.rename({'participant_age': 'avg_suspect_age', 'participant_index': 'num_suspects'}, axis = 1, inplace = True)

victim_df.drop(['participant_type'], axis = 1, inplace = True)
suspect_df.drop(['participant_type'], axis = 1, inplace = True)

In [None]:
len(df)

In [None]:
df = pd.merge(df, part_df_group, on=['incident_id'], how='outer')
df = pd.merge(df, part_status_df_group, on=['incident_id'], how='outer')
df = pd.merge(df, victim_df, on=['incident_id'], how='outer')
df = pd.merge(df, suspect_df, on=['incident_id'], how='outer')

df.loc[df['num_suspects'].isnull(), 'num_suspects'] = 0
df.drop(['participant_age', 'participant_status', 'participant_type'], axis = 1, inplace = True)

In [None]:
# # id for las vegas shooting chosing arbitrarily
# # added las vegas shooting (originally missing from dataset)
# df_append = pd.DataFrame(np.array([
#     100117, pd.Timestamp('2017-10-01'), 'Nevada', 'Las Vegas', 59,
#     489, None, 0, 36.08833298, -115.171499314, 47,
#     None, None, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1,
#     910, 35.23232323, 0.028383028, 851, 0, 195,
#     34.93877551, 5.581191589, 1, 64, 0.015625]).reshape(1, -1),
#                            columns = list(df.columns))
# df = df.append([df_append], ignore_index = True)

In [None]:
# # added incident 1081885 (originally missing from dataset)
# df_append_2 = pd.DataFrame(np.array([
#     1081885, '2018-02-18', 'Arkansas', 'Hermitage',
#     1, 0, 4, 0, 33.393644, -92.224399, 1,
#     8, 26, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
#     2, 48, 0.04166666666, 1, 1, 1,
#     38, 0.02631578947, 1, 58, 0.01724137931]).reshape(1, -1),
#                          columns = list(df.columns))
# df = df.append([df_append_1], ignore_index = True)

In [None]:
df['date'] = pd.to_datetime(df['date'])
df['num_killed'] = df['num_killed'].astype(int)
df['num_injured'] = df['num_injured'].astype(int)
df['gun_stolen'] = df['gun_stolen'].astype(int)
df['suicide'] = df['suicide'].astype(int)
df['accidental'] = df['accidental'].astype(int)
df['domestic_violence'] = df['domestic_violence'].astype(int)
df['gang_related'] = df['gang_related'].astype(int)
df['non_shooting_incident'] = df['non_shooting_incident'].astype(int)
df['gun_shop_robbery'] = df['gun_shop_robbery'].astype(int)
df['drive_by_shooting'] = df['drive_by_shooting'].astype(int)
df['officer_involved_shooting'] = df['officer_involved_shooting'].astype(int)
df['child_involved'] = df['child_involved'].astype(int)
df['mass_shooting'] = df['mass_shooting'].astype(int)
df['drug_involved'] = df['drug_involved'].astype(int)
df['assault_weapon'] = df['assault_weapon'].astype(int)
df['incident_id'] = df['incident_id'].astype(int)
df['num_suspects'] = df['num_suspects'].astype(int)
df['latitude'] = df['latitude'].astype(float)
df['longitude'] = df['longitude'].astype(float)
df['num_guns_involved'] = df['num_guns_involved'].astype(float)
df['congressional_district'] = df['congressional_district'].astype(float)
df['state_house_district'] = df['state_house_district'].astype(float)
df['state_senate_district'] = df['state_senate_district'].astype(float)
df['avg_participant_age'] = df['avg_participant_age'].astype(float)
df['num_unharmed'] = df['num_unharmed'].astype(float)
df['num_arrested'] = df['num_arrested'].astype(float)
df['num_victims'] = df['num_victims'].astype(float)
df['avg_victim_age'] = df['avg_victim_age'].astype(float)
df['avg_suspect_age'] = df['avg_suspect_age'].astype(float)
df['num_participants'] = df['num_participants'].astype(float)

In [None]:
len(df)

In [None]:
df.to_pickle('../Pickles/df2.pkl')

In [None]:
df = pd.read_pickle('../Pickles/df2.pkl')

In [None]:
len(df)

In [None]:
df.head()

In [None]:
aggregations = {
    'incident_id': 'count',
    'num_killed': 'sum',
    'num_injured': 'sum',
    'congressional_district': 'mean',
    'gun_stolen': 'sum',
    'num_guns_involved': 'sum',
    'state_house_district': 'mean',
    'state_senate_district': 'mean',
    'suicide': 'sum',
    'accidental': 'sum',
    'domestic_violence': 'sum',
    'gang_related': 'sum',
    'non_shooting_incident': 'sum',
    'gun_shop_robbery': 'sum',
    'drive_by_shooting': 'sum',
    'officer_involved_shooting': 'sum',
    'child_involved': 'sum',
    'mass_shooting': 'sum',
    'drug_involved': 'sum',
    'assault_weapon': 'sum',
    'num_unharmed': 'sum',
    'num_arrested': 'sum',
    'num_suspects': 'sum',
    'avg_suspect_age': 'mean',
    'num_victims': 'sum',
    'avg_victim_age': 'mean',
    'num_participants': 'sum',
    'avg_participant_age': 'mean',
    }

df_states = df.groupby(['state', 'date']).agg(aggregations).reset_index()
df_states.rename({'incident_id': 'num_incidents'}, axis = 1, inplace = True)
df_date = df.groupby(['date']).agg(aggregations).reset_index()
df_date.rename({'incident_id': 'num_incidents'}, axis = 1, inplace = True)

In [None]:
df_states.to_pickle('../Pickles/df_states.pkl')
df_date.to_pickle('../Pickles/df_date.pkl')

In [40]:
df_states = pd.read_pickle('../Pickles/df_states.pkl')
df_date = pd.read_pickle('../Pickles/df_date.pkl')
state_laws_df = pd.read_pickle('../Pickles/state_firearm_laws.pkl')

In [41]:
df_states.drop(['avg_victim_age', 'avg_suspect_age',
                'num_participants', 'avg_participant_age',
                'congressional_district', 'state_house_district',
                'state_senate_district'],
               axis = 1, inplace = True)
df_date.drop(['avg_victim_age', 'avg_suspect_age', 'num_participants',
              'avg_participant_age', 'congressional_district',
              'state_house_district', 'state_senate_district'],
             axis = 1, inplace = True)

In [42]:
missing_dates = set(pd.period_range(min(df_date['date']), max(df_date['date']))) - set(list(pd.DatetimeIndex(df_date['date']).to_period('D')))
print("{} missing dates across the US.".format(len(missing_dates)))
df_append = pd.DataFrame(list(missing_dates), columns = ['date'])
df_append['date'] = pd.to_datetime(df_append['date'].astype(str))
for column in df_date.columns:
    if column != 'date':
        df_append[column] = 0
df_date = df_date.append(df_append, sort=False, ignore_index = True)

for state in df_states['state'].unique():
    missing_dates = set(pd.period_range(min(df_states['date']), max(df_states['date']))) - set(list(pd.DatetimeIndex(df_states[df_states['state']==state]['date']).to_period('D')))
    print("{} missing dates in {}.".format(len(missing_dates), state))
    df_append = pd.DataFrame(list(missing_dates), columns = ['date'])
    df_append['date'] = pd.to_datetime(df_append['date'].astype(str))
    df_append['state'] = state
    for column in df_states.columns:
        if column != 'date' and column != 'state':
            df_append[column] = 0
    df_states = df_states.append(df_append, sort=False, ignore_index = True)

191 missing dates across the US.
449 missing dates in Alabama.
1174 missing dates in Alaska.
812 missing dates in Arizona.
683 missing dates in Arkansas.
331 missing dates in California.
681 missing dates in Colorado.
685 missing dates in Connecticut.
957 missing dates in Delaware.
814 missing dates in District of Columbia.
356 missing dates in Florida.
384 missing dates in Georgia.
1681 missing dates in Hawaii.
1424 missing dates in Idaho.
354 missing dates in Illinois.
420 missing dates in Indiana.
797 missing dates in Iowa.
848 missing dates in Kansas.
525 missing dates in Kentucky.
379 missing dates in Louisiana.
1378 missing dates in Maine.
437 missing dates in Maryland.
465 missing dates in Massachusetts.
424 missing dates in Michigan.
899 missing dates in Minnesota.
567 missing dates in Mississippi.
420 missing dates in Missouri.
1460 missing dates in Montana.
1057 missing dates in Nebraska.
887 missing dates in Nevada.
1300 missing dates in New Hampshire.
459 missing dates in N

In [43]:
len(list(set(df_states['state'].value_counts().values)))==1

True

In [44]:
for column in df_states.columns:
    if column=='date':
        df_states['date'] = pd.to_datetime(df_states['date'])
    elif column=='state':
        pass
    else:
        df_states[column] = df_states[column].astype(int)
        
for column in df_date.columns:
    if column=='date':
        df_date['date'] = pd.to_datetime(df_date['date'])
    else:
        df_date[column] = df_date[column].astype(int)

In [45]:
df_states['year'] = df_states['date'].dt.year
df_states = pd.merge(df_states, state_laws_df, on=['state', 'year'], how='left')
df_states = df_states.reset_index()
df_states.drop(['index'], axis = 1, inplace = True)

df_states = df_states.sort_values(by = 'date', ascending = True)
# exporting df_date un_scaled data to assets repository for d3.js visualization
df_date = df_date.sort_values(by = 'date', ascending = True)

df_states.to_pickle('../Pickles/df_states_2.pkl')
df_date.to_pickle('../Pickles/df_date_2.pkl')
df_date.to_csv('../../cyaris.github.io/assets/df_date_observations.csv')

In [46]:
len(df_date)==len(df_states)/51

True

In [47]:
df_states = pd.read_pickle('../Pickles/df_states_2.pkl')
df_date = pd.read_pickle('../Pickles/df_date_2.pkl')

In [None]:
# print(len(df_date[df_date['num_incidents']==0]))
# print(len(df_states[df_states['num_incidents']==0]))