# COGS 108 - Capstone Project

## Project links, files, and basic information

### Websites with datasets:
- San Diego Vehicle Stops:  https://data.sandiego.gov/datasets/police-vehicle-stops/
- San Diego Population Data:  http://www.city-data.com/city/San-Diego-California.html

### Websites of needed information:
- San Diego police service areas: https://www.sandiego.gov/police/services/divisions (vehicle stop data only records the first two digits)
- San Diego zip code map: http://www.city-data.com/zipmaps/San-Diego-California.html

### Names of datasets
#### *Vehicle Stops*
- 'vehicle_stops_2017.csv'
- 'vehicle_stops_2016.csv'
- 'vehicle_stops_2015.csv'
- 'vehicle_stops_2014.csv'

#### *Vehicle Stops Details*
- 'vehicle_stops_search_details_2017.csv'
- 'vehicle_stops_search_details_2016.csv'
- 'vehicle_stops_search_details_2015.csv'
- 'vehicle_stops_search_details_2014.csv'

#### *Files needed to read Vehicle Stops information*
- Race Codes: 'vehicle_stops_race_codes.csv'    
- Title explanations for Vehicle Stops data: 'vehicle_stops_dictionary.csv'
- Title explanations for Vehicle Stops Details data: 'vehicle_stops_search_details_dictionary.csv'
- Possible actions taken when stopped for Vehicle Stops Details data: 'vehicle_stops_search_details_description_list.csv'

# TODO
 - Map zip-code and police area 
 - Combine stops and info datasets (careful not to lose information or count duplicates)
 - Web scrape zip-code demographics
 - Plan/start thinking about how we are going to correlate the data

## Imports

In [None]:
%matplotlib inline

# Basics
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Web scraping
import sys
#!conda install --yes --prefix {sys.prefix} beautifulsoup4

# Data analysis
import patsy
import statsmodels.api as sm
import scipy.stats as stats
from scipy.stats import ttest_ind, chisquare, normaltest

## Cleaning stops dataframe - functions

- ### Clean Unwanted Columns

In [None]:
# Column titles of the stops dataframe that are kept; clean_stops_cols drops
# every other column
stops_col_titles = [
    'stop_id',
    'stop_cause',
    'service_area',
    'subject_race',
    'subject_sex',
    'subject_age',
    'arrested',
    'searched',
    'contraband_found',
    'property_seized',
]

In [None]:
# Function to remove the unwanted columns of a vehicle stops dataset - Alberto
# Params: stops - dataframe of stops to clean (modified in place)
# Returns: the same dataframe, without the columns missing from stops_col_titles
def clean_stops_cols(stops):

    # Every column title that is not in the wanted list gets dropped
    unwanted_cols = np.setdiff1d(list(stops.columns), stops_col_titles)
    stops.drop(columns=unwanted_cols, inplace=True)

    return stops

- ### Clean Nans and missing values of stops dataset

In [None]:
# A row is dropped when it has a nan in any of these columns
clean_nans_cols = [
    'stop_cause',
    'stop_id',
    'subject_race',
    'subject_sex',
    'subject_age',
]

In [None]:
# Function to handle the nans of a vehicle stops dataset - Alberto
# Params: stops - dataframe of stops to clean (modified in place)
# Returns: the same dataframe, with nans filled in or their rows removed
def clean_stops_nans(stops):

    # In the outcome columns a nan is assumed to mean 'N' (No), since the
    # majority of the entries had nan instead of 'N'
    for outcome_col in ('arrested', 'searched', 'contraband_found', 'property_seized'):
        stops[outcome_col] = stops[outcome_col].replace({np.nan: 'N'})

    # Rows with a nan in any of the columns of clean_nans_cols are discarded
    stops.dropna(subset=clean_nans_cols, how='any', inplace=True)

    return stops

## Cleaning stops detail dataframe - functions

- ### Clean Unwanted Columns of stop details dataset

In [None]:
# Column titles of the stops information dataframe that are kept;
# clean_stops_info_cols drops every other column
stops_info_col_titles = [
    'stop_id',
    'search_details_type',
    'search_details_description',
]

In [None]:
# Function to remove the unwanted columns of a vehicle stop information dataset - Alberto
# Params: stops_info - dataframe of stops information to clean (modified in place)
# Returns: the same dataframe, without the columns missing from stops_info_col_titles
def clean_stops_info_cols(stops_info):

    # Every column title that is not in the wanted list gets dropped
    unwanted_cols = np.setdiff1d(list(stops_info.columns), stops_info_col_titles)
    stops_info.drop(columns=unwanted_cols, inplace=True)

    return stops_info

- ### Clean Nans and missing values of stops details dataset

In [None]:
# Take out the entries that carry no usable information
# Params: stops_info - dataframe of stops information to clean
# Returns: a filtered dataframe without the meaningless rows
def clean_stops_info_meaningless(stops_info):

    detail_type = stops_info['search_details_type']
    detail_desc = stops_info['search_details_description']

    # 'ActionTakenOther' rows that have no description at all
    empty_other_action = (detail_type == 'ActionTakenOther') & detail_desc.isnull()

    # 'ActionTaken' / 'SearchBasis' rows whose description is only 'Other'
    vague_other = detail_type.isin(['ActionTaken', 'SearchBasis']) & (detail_desc == 'Other')

    return stops_info[~(empty_other_action | vague_other)]

In [None]:
# Standardize an action type entry
# Params: action_type - value to be standardized (converted to a string first)
# Returns: 'action' or 'search' when the lower-cased text contains that word
#          ('action' is checked first), otherwise the lower-cased text itself
def standardize_action_type(action_type):
    lowered = str(action_type).lower()

    for keyword in ('action', 'search'):
        if keyword in lowered:
            return keyword

    return lowered

In [None]:
# Ordered (keyword, label) pairs used by standardize_action_desc.
# The first keyword found in the lower-cased description wins, so the order
# matters: e.g. 'no dui' has to be tested before 'dui', 'license' before
# 'suspen', and 'suspen' before 'susp'.
_ACTION_DESC_KEYWORDS = (
    ('arrest', 'arrest'),
    ('310', '310'),
    ('imp', 'impound'),
    ('tow', 'tow'),
    ('mistake', 'released'),
    ('released', 'released'),
    ('leave', 'released'),
    ('free', 'released'),
    ('no vio', 'released'),
    ('no dui', 'released'),
    ('nothing', 'released'),
    ('notice', 'suspension notice'),
    ('plate', 'check plate'),
    ('passenger', 'passenger'),
    ('license', 'license'),
    ('dui', 'dui eval'),
    ('det', 'detention'),
    ('contact', 'contact'),
    ('suspen', 'suspension'),
    ('susp', 'suspect'),
    ('cit', 'citation'),
    ('dmv', 'DMV issue'),
)


# Standardize an action details entry
# Params: action - description to be standardized
# Returns: np.nan for a missing value (None, a float nan or the text 'nan'),
#          a new one-element list holding the standard label of the first
#          matching keyword, or the string 'Other' when no keyword matches
def standardize_action_desc(action):

    # Detect missing values before converting to text. Searching for the
    # substring 'nan' in the text would also discard real descriptions that
    # merely contain those letters (e.g. 'tenant', 'maintenance').
    if action is None or (isinstance(action, float) and np.isnan(action)):
        return np.nan

    action = str(action).lower()

    # A description that is literally 'nan' is still treated as missing
    if action.strip() == 'nan':
        return np.nan

    for keyword, label in _ACTION_DESC_KEYWORDS:
        if keyword in action:
            # Build a fresh list on every call so that extending one entry
            # later on never changes another entry
            return [label]

    return 'Other'

In [None]:
# Clean nans and reduce the descriptions to standard labels
# Params: stops_info - dataframe of stops information to clean
# Returns: a new cleaned dataframe; the dataframe passed in is not modified
def clean_stops_info_nans(stops_info):

    type_title = 'search_details_type'
    desc_title = 'search_details_description'

    # Clean meaningless rows. The filter hands back a selection of the input,
    # so take an explicit copy before assigning columns; assigning to the
    # selection directly is chained assignment (SettingWithCopyWarning).
    stops_info = clean_stops_info_meaningless(stops_info).copy()

    # Clean type column
    stops_info[type_title] = stops_info[type_title].apply(standardize_action_type)

    # Clean details column
    stops_info[desc_title] = stops_info[desc_title].apply(standardize_action_desc)

    # Remove 'Other' and nan entries as they do not give us any extra information
    stops_info = stops_info[~(stops_info[desc_title] == 'Other')]

    # dropna without inplace returns a new dataframe instead of modifying a selection
    return stops_info.dropna(how='any', subset=stops_info_col_titles)

## Final cleaning functions

In [None]:
# Run every cleaning step of the stops dataframe in one call
# Params: stops - stops dataframe to be cleaned
# Returns: the dataframe after the column cleanup followed by the nan cleanup
def clean_stops(stops):
    return clean_stops_nans(clean_stops_cols(stops))

In [None]:
# Run every cleaning step of the stops information dataframe in one call
# Params: stops_info - stops information dataframe to be cleaned
# Returns: the dataframe after the column cleanup followed by the nan cleanup
def clean_stops_info(stops_info):
    return clean_stops_info_nans(clean_stops_info_cols(stops_info))

## Cleaning Testing

In [None]:
# Load the 2017 stops dataset and clean it right away
df_stops_17 = clean_stops(pd.read_csv('vehicle_stops_2017.csv'))

In [None]:
# No cell of the cleaned stops dataframe may be null
assert not df_stops_17.isnull().any().any()

In [None]:
# Load the 2017 stop details dataset and clean it right away
df_stops_info_17 = clean_stops_info(pd.read_csv('vehicle_stops_search_details_2017.csv'))

In [None]:
# No cell of the cleaned stop details dataframe may be null
assert not df_stops_info_17.isnull().any().any()

# Merging the stops and details datasets

In [None]:
# Helper function: Merges the rows of the information dataset that share a stop_id
# Params: info - dataframe with stops information; needs a 'stop_id' column and
#                a 'search_details_description' column whose cells are lists.
#                Index labels are assumed to be unique (merge_dataframes resets
#                the index before calling this).
# Returns: the same dataframe (modified in place) with one row per stop_id.
#          The first row of every stop_id keeps its index label, and its list
#          is extended in place with the entries of the later rows.
def merge_duplicates(info):

    desc_title = 'search_details_description'

    first_desc_by_stop = {}   # stop_id -> list held by the first row of that stop
    duplicate_labels = []     # index labels of the rows folded into an earlier row

    # A single pass over the rows replaces the manual look-ahead, whose bounds
    # check was off by one and folded the last row into the preceding run of
    # duplicates even when its stop_id was different.
    for label, stop_id, desc in zip(info.index, info['stop_id'], info[desc_title]):

        if stop_id not in first_desc_by_stop:
            first_desc_by_stop[stop_id] = desc
            continue

        # Append the duplicate's entries to the list of the first occurrence
        first_desc_by_stop[stop_id].extend(desc if isinstance(desc, list) else [desc])
        duplicate_labels.append(label)

    # Drop all duplicate rows at once, after the iteration has finished
    info.drop(duplicate_labels, inplace=True)

    return info

In [None]:
# Function: Merge the stops and details dataframes
# Params: stops - dataframe with stops information
#         info - dataframe with stop details
# Returns: a new dataframe holding every row of stops, left-joined on
#          'stop_id' with the de-duplicated stop details
def merge_dataframes(stops, info):

    desc_title = 'search_details_description'

    # Drop the type information on a new dataframe, so the caller's dataframe
    # keeps its columns and a second call does not fail on the missing column
    info = info.drop(columns='search_details_type', errors='ignore')

    # merge_duplicates extends the description lists in place; copy them so
    # the lists of the caller's dataframe stay as they are
    info[desc_title] = info[desc_title].map(
        lambda desc: list(desc) if isinstance(desc, list) else desc)

    # Reset indices so the labels are consecutive again
    info = info.reset_index(drop=True)

    # Merge duplicates of information dataset
    info = merge_duplicates(info)

    # Join the dataframes that were passed in (not the notebook-level globals),
    # so the de-duplicated details are the ones that end up in the result
    df_merged = stops.merge(info, on=['stop_id'], how='left')

    return df_merged

## Testing cleaning and merging

In [1246]:
# Load and clean the 2014 stops dataset
# NOTE: the variables keep their `_17` suffix although the 2014 files are read here
df_stops_17 = clean_stops(pd.read_csv('vehicle_stops_2014.csv'))

# Load and clean the 2014 stop details dataset
df_stops_info_17 = clean_stops_info(pd.read_csv('vehicle_stops_search_details_2014.csv'))

In [1247]:
# Merge the cleaned stops dataframe with the cleaned stop details dataframe
df_merged = merge_dataframes(df_stops_17, df_stops_info_17)

In [1249]:
# Every stop_id may appear only once in the merged dataframe.
# Test the duplicate flags themselves: summing the values of the duplicated
# rows mixes numbers, strings and lists and does not count duplicates.
assert not df_merged['stop_id'].duplicated().any()

In [1251]:
# Show the first row of the merged dataframe
df_merged.head(1)

Unnamed: 0,stop_id,stop_cause,service_area,subject_race,subject_sex,subject_age,arrested,searched,contraband_found,property_seized,search_details_description
0,1444799,Moving Violation,120,I,M,37,N,N,N,N,[citation]


In [None]:
# TODO: clean missing values in service area, race, sex, and age.