In [6]:
import numpy as np
import pandas as pd
import sqlite3
import dask.dataframe as ddf
import matplotlib.pyplot as plt
import seaborn as sns

# plt.style.use("classic")
# %matplotlib inline
sns.set_style('darkgrid')

In [7]:
db = sqlite3.connect('switrs.sqlite')
cur = db.cursor()
date_threshold = "2016-01-01"

tables = [name[0] for name in cur.execute("SELECT name FROM sqlite_master")]
tables = ['collisions', 'parties']
df = {}

In [9]:
# Read collisions table
cmd = \
    "SELECT c.*\
        ,p.drivers_using_phone\
        ,p.drivers_on_drugs\
        ,p.parties_not_safe\
    FROM collisions c\
    LEFT JOIN\
        (SELECT case_id\
                ,SUM(CASE WHEN party_type='driver' THEN cellphone_in_use END) AS drivers_using_phone\
                ,SUM(CASE WHEN party_type='driver' AND party_drug_physical='under drug influence' THEN 1\
                            ELSE 0\
                        END) AS drivers_on_drugs\
                ,SUM(CASE WHEN party_safety_equipment_1 IN ('air bag deployed') OR\
                            party_safety_equipment_2 IN ('air bag deployed')\
                    THEN 0\
                    WHEN party_safety_equipment_1 IN ('lap/shoulder harness used','lap belt used','shoulder harness used') OR\
                            party_safety_equipment_2 IN ('lap/shoulder harness used','lap belt used','shoulder harness used')\
                    THEN 0.5\
                    ELSE 1\
                    END) AS parties_not_safe\
        FROM parties\
        GROUP BY case_id) p\
    ON c.case_id = p.case_id\
    WHERE c.collision_date>='{}'".format(date_threshold)
collisions = pd.read_sql_query(cmd, db)
# names = list(map(lambda x: x[0], cur.description))
# collisions = pd.DataFrame([list(x) for x in cur.execute(cmd)], columns=names+['drivers_using_phone','drivers_on_drugs', 'parties_not_safe'])

# Display
display(collisions.head())
collisions.info()

Unnamed: 0,case_id,jurisdiction,officer_id,reporting_district,chp_shift,population,county_city_location,county_location,special_condition,beat_type,...,primary_ramp,secondary_ramp,latitude,longitude,collision_date,collision_time,process_date,drivers_using_phone,drivers_on_drugs,parties_not_safe
0,81715,1941.0,11342,212,not chp,>250000,1941,los angeles,0,not chp,...,,,,,2020-03-14,07:45:00,2020-06-22,0.0,0,0.5
1,726202,3600.0,8945,64,not chp,50000 to 100000,3612,san bernardino,0,not chp,...,,,,,2020-07-26,02:50:00,2020-09-30,,0,1.0
2,6292799,1942.0,41054,729,not chp,>250000,1942,los angeles,0,not chp,...,,,,,2016-03-11,11:30:00,2018-09-25,0.0,1,1.5
3,6292800,1942.0,40164,1415,not chp,>250000,1942,los angeles,0,not chp,...,,,,,2016-04-13,05:15:00,2018-09-27,0.0,0,1.5
4,6292881,1602.0,1484,50,not chp,25000 to 50000,1602,kings,0,not chp,...,,,36.34256,-119.62345,2017-08-31,01:07:00,2019-04-05,0.0,1,1.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2429886 entries, 0 to 2429885
Data columns (total 78 columns):
 #   Column                           Dtype  
---  ------                           -----  
 0   case_id                          object 
 1   jurisdiction                     float64
 2   officer_id                       object 
 3   reporting_district               object 
 4   chp_shift                        object 
 5   population                       object 
 6   county_city_location             object 
 7   county_location                  object 
 8   special_condition                object 
 9   beat_type                        object 
 10  chp_beat_type                    object 
 11  city_division_lapd               object 
 12  chp_beat_class                   object 
 13  beat_number                      object 
 14  primary_road                     object 
 15  secondary_road                   object 
 16  distance                         float64
 17  directio

In [10]:
# Alcohol Drugs involved
def alcohol_drugs_involved(df):
    alcohol = df['alcohol_involved']
    drugs = df['drivers_on_drugs']
    
    if alcohol==1 or drugs>0:
        return 1
    else:
        return 0

# Get hour from datetime
def hour(time):
    days, seconds = time.days, time.seconds
    return days * 24 + seconds // 3600

# Get part of day from collision time
def part_of_day(hour):
    if hour>=6 and hour<12:
        return 'Morning'
    elif hour>=12 and hour<17:
        return 'Afternoon'
    elif hour>=17 and hour<20:
        return 'Evening'
    elif hour>=20 and hour<6:
        return 'Night'

# Is caused by condition
def is_bc_condition(df):
    return 0;

# Is caused by human error
def is_bc_human_error(df):
    return 0;

# Is caused by violation
def is_bc_violation(df):
    return 0;

# Adding columns from date
collisions["collision_date"] = pd.to_datetime(collisions["collision_date"])
collisions["collision_year_month"] = collisions["collision_date"].dt.to_period('M')
collisions["collision_year"] = collisions["collision_date"].dt.year
collisions["collision_month"] = collisions["collision_date"].dt.month
collisions['collision_time'] = pd.to_timedelta(collisions['collision_time'])
collisions["collision_hour"] = collisions['collision_time'].apply(lambda x: hour(x))
collisions["collision_dow"] = collisions["collision_date"].dt.day_of_week
collisions["collision_day_name"] = collisions["collision_date"].dt.day_name()
collisions['has_injured'] = collisions['injured_victims'].apply(lambda x: 1 if x>0 else 0)
collisions['killed_victims'] = collisions['killed_victims'].astype(np.float)
collisions['is_fatal'] = np.array([1 if x > 0 else 0 for x in collisions['killed_victims']])
collisions['alcohol_involved'] = collisions['alcohol_involved'].apply(lambda x: 1 if x==1 else 0)
collisions['alcohol_drugs_involved'] = collisions[['alcohol_involved','drivers_on_drugs']].apply(alcohol_drugs_involved, axis=1)
collisions['part_of_day'] = collisions['collision_hour'].apply(lambda x: part_of_day(x))
# collisions['is_bc_condition'] = collisions[['road_surface','weather_1','weather_2']].apply(part_of_day)
# collisions['is_bc_human_error'] = collisions[['pcf_violation_category','','']].apply(is_bc_human_error)
collisions.head()

# Display
display(collisions.head())
display(collisions.info())

# Save as CSV
collisions.to_csv("collisions.csv")
del collisions

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  collisions['killed_victims'] = collisions['killed_victims'].astype(np.float)


Unnamed: 0,case_id,jurisdiction,officer_id,reporting_district,chp_shift,population,county_city_location,county_location,special_condition,beat_type,...,collision_year_month,collision_year,collision_month,collision_hour,collision_dow,collision_day_name,has_injured,is_fatal,alcohol_drugs_involved,part_of_day
0,81715,1941.0,11342,212,not chp,>250000,1941,los angeles,0,not chp,...,2020-03,2020,3,7.0,5,Saturday,0,0,0,Morning
1,726202,3600.0,8945,64,not chp,50000 to 100000,3612,san bernardino,0,not chp,...,2020-07,2020,7,2.0,6,Sunday,0,0,0,
2,6292799,1942.0,41054,729,not chp,>250000,1942,los angeles,0,not chp,...,2016-03,2016,3,11.0,4,Friday,0,1,1,Morning
3,6292800,1942.0,40164,1415,not chp,>250000,1942,los angeles,0,not chp,...,2016-04,2016,4,5.0,2,Wednesday,0,1,0,
4,6292881,1602.0,1484,50,not chp,25000 to 50000,1602,kings,0,not chp,...,2017-08,2017,8,1.0,3,Thursday,1,1,1,


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2429886 entries, 0 to 2429885
Data columns (total 88 columns):
 #   Column                           Dtype          
---  ------                           -----          
 0   case_id                          object         
 1   jurisdiction                     float64        
 2   officer_id                       object         
 3   reporting_district               object         
 4   chp_shift                        object         
 5   population                       object         
 6   county_city_location             object         
 7   county_location                  object         
 8   special_condition                object         
 9   beat_type                        object         
 10  chp_beat_type                    object         
 11  city_division_lapd               object         
 12  chp_beat_class                   object         
 13  beat_number                      object         
 14  primary_road      

None

In [11]:
# Read collisions table
cmd = "SELECT *\
        FROM parties p\
        LEFT JOIN \
            (SELECT case_id, collision_date\
            FROM collisions) c\
        ON p.case_id = c.case_id\
        WHERE collision_date>='{}'".format(date_threshold)
parties = pd.read_sql_query(cmd, db)

In [12]:
# Get age bucket
def party_age_bucket(age):
    if age<16:
        return '<16'
    elif age<25:
        return '16-24'
    elif age<35:
        return '25-34'
    elif age<45:
        return '35-44'
    elif age<55:
        return '45-54'
    elif age<65:
        return '55-64'
    elif age>=65:
        return '65+'
    else:
        return ''

parties["party_age_bucket"] = parties["party_age"].apply(lambda x: party_age_bucket(x))

# Display
parties.to_csv('parties.csv',index=False)
display(parties.head())
parties.info()

Unnamed: 0,id,case_id,party_number,party_type,at_fault,party_sex,party_age,party_sobriety,party_drug_physical,direction_of_travel,...,movement_preceding_collision,vehicle_year,vehicle_make,statewide_vehicle_type,chp_vehicle_type_towing,chp_vehicle_type_towed,party_race,case_id.1,collision_date,party_age_bucket
0,1,81715,1,driver,1,female,35.0,not applicable,not applicable,north,...,proceeding straight,2007.0,ford,,,,other,81715,2020-03-14,35-44
1,2,81715,2,driver,0,female,43.0,not applicable,not applicable,north,...,proceeding straight,2019.0,,,,,hispanic,81715,2020-03-14,35-44
2,3,726202,1,driver,1,,,impairment unknown,G,north,...,proceeding straight,2005.0,,passenger car,"passenger car, station",,,726202,2020-07-26,
3,3846789,6292799,1,driver,0,male,44.0,had not been drinking,,west,...,making left turn,2015.0,chevrolet,passenger car,"passenger car, station",,black,6292799,2016-03-11,35-44
4,3846790,6292799,2,driver,1,male,41.0,,under drug influence,east,...,proceeding straight,2005.0,kawasaki,motorcycle or scooter,motorcycle,,white,6292799,2016-03-11,35-44


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4836559 entries, 0 to 4836558
Data columns (total 35 columns):
 #   Column                        Dtype  
---  ------                        -----  
 0   id                            int64  
 1   case_id                       object 
 2   party_number                  int64  
 3   party_type                    object 
 4   at_fault                      int64  
 5   party_sex                     object 
 6   party_age                     float64
 7   party_sobriety                object 
 8   party_drug_physical           object 
 9   direction_of_travel           object 
 10  party_safety_equipment_1      object 
 11  party_safety_equipment_2      object 
 12  financial_responsibility      object 
 13  hazardous_materials           float64
 14  cellphone_in_use              float64
 15  cellphone_use_type            object 
 16  school_bus_related            float64
 17  oaf_violation_code            object 
 18  oaf_violation_category