In [5]:
import pandas
import numpy as np
import scipy.optimize

# Police dataset
This dataset is from [https://data.police.uk/](https://data.police.uk/). Code for retrieving the dataset is given at the bottom.

In [3]:
import os
if os.path.exists('stop-and-search.csv'):
    print("file already downloaded")
else:
    !wget "https://www.cl.cam.ac.uk/teaching/2021/DataSci/data/stop-and-search.csv"
police = pandas.read_csv('stop-and-search.csv')

file already downloaded


### Basic tabulations

In [13]:
print(police['outcome'].value_counts())
print('Missing values:', np.sum(pandas.isna(police['outcome'])))

A no further action disposal                                    467106
False                                                           239660
Arrest                                                           93384
Suspect arrested                                                 63191
Community resolution                                             35319
Summons / charged by post                                        11168
Penalty Notice for Disorder                                      10628
Local resolution                                                  8164
Article found - Detailed outcome unavailable                      6705
Suspect summonsed to court                                        5758
Offender given penalty notice                                     5623
Caution (simple or conditional)                                   2947
Offender cautioned                                                1747
Suspected psychoactive substances seized - No further action        17
Name: 

In [14]:
print(police['officer_defined_ethnicity'].value_counts())
print('Missing values:', np.sum(pandas.isna(police['officer_defined_ethnicity'])))

White    532584
Black    253315
Asian    125646
Other     27809
Mixed      1644
Name: officer_defined_ethnicity, dtype: int64
Missing values: 72917


In [15]:
print(police['gender'].value_counts())
print('Missing values:', np.sum(pandas.isna(police['gender'])))

Male      861605
Female     76568
Other       2467
Name: gender, dtype: int64
Missing values: 73275


# 3.4 Analysis of gender + ethnicity as they affect police actions

### Data preparation

In [45]:
# Let's treat outcomes "A no further action disposal" and "False" as nothing found
# Prepare vectors y=1[something found], eth, and gender,
# and remove rows with missing values.

df = police[['outcome', 'officer_defined_ethnicity', 'gender']].dropna()
df['outcome'] = np.where(df['outcome'].isin(['False','A no further action disposal']), 'nothing', 'find')

y = np.where(df['outcome']=='find', 1, 0)
eth = df['officer_defined_ethnicity']
gender = df['gender']

# Define e and g to be integer-encoded versions of ethnicity and gender.
# (For the purposes of this analysis, it'll be more useful than one-hot encoding.)

ethnicity_levels = ['Asian','Black','Mixed','Other','White']
gender_levels = ['Female', 'Male', 'Other']
assert all(eth.isin(ethnicity_levels))
assert all(gender.isin(gender_levels))
ethnicity_code = {k:i for i,k in enumerate(ethnicity_levels)}
gender_code = {k:i for i,k in enumerate(gender_levels)}
e = np.array([ethnicity_code[v] for v in eth])
g = np.array([gender_code[v] for v in gender])

In [42]:
x = df.groupby(['outcome','gender','officer_defined_ethnicity']).apply(len).unstack(fill_value=0)
print(x)

officer_defined_ethnicity  Asian   Black  Mixed  Other   White
outcome gender                                                
find    Female              1051    3327     36    346   15789
        Male               31557   66128    468   7293  138926
        Other                 21      40      0     32     135
nothing Female              2375    8189    108    706   41064
        Male               79242  172521    996  17356  317957
        Other                 97     157      3     71     460


### A simple Bernoulli model

In [71]:
def loglik1(β):
    ξ = β[e] # get a vector [β[e_1], β[e_2], ..., β[e_n]]
    return np.sum(np.log(np.where(y==1, ξ, 1-ξ)))

initial_guess = np.full(5,.1)
βhat = scipy.optimize.fmin(lambda β: -loglik1(β), initial_guess)

pandas.Series(βhat, index=ethnicity_levels)

  return np.sum(np.log(np.where(y==1, ξ, 1-ξ)))


Optimization terminated successfully.
         Current function value: 547602.977403
         Iterations: 334
         Function evaluations: 549


Asian    0.285352
Black    0.277579
Mixed    0.312868
Other    0.297266
White    0.301073
dtype: float64

In [44]:
x = pandas.Series(βhat, index=ethnicity_levels)
print(x.to_frame().T.to_string(index=False))

   Asian     Black     Mixed     Other     White
 0.28233  0.281194  0.319455  0.266409  0.301475


### A Bernoulli model with interaction

In [56]:
def loglik2(θ):
    β, γ = θ[:5], θ[5:]
    ξ = β[e] + γ[g]
    return np.sum(np.log(np.where(y==1, ξ, 1-ξ)))

initial_guess = np.full(8, .2)
mle = scipy.optimize.fmin(lambda θ: -loglik2(θ), initial_guess)
βhat, γhat = mle[:5], mle[5:]

Optimization terminated successfully.
         Current function value: 547547.227247
         Iterations: 675
         Function evaluations: 999


In [58]:
βhat, γhat

(array([0.34922542, 0.34169947, 0.37748171, 0.3616269 , 0.36619801]),
 array([-0.07964008, -0.0632652 , -0.13476383]))

In [57]:
ξhat = βhat[np.newaxis, :] + γhat[:, np.newaxis]
print(pandas.DataFrame(np.round(ξhat,3), index=gender_levels, columns=ethnicity_levels))

        Asian  Black  Mixed  Other  White
Female  0.270  0.262  0.298  0.282  0.287
Male    0.286  0.278  0.314  0.298  0.303
Other   0.214  0.207  0.243  0.227  0.231


In [94]:
# Identifiable version of the model

def loglik2b(θ):
    α, β, γ = θ[0], θ[1:5], θ[5:]
    β, γ = np.insert(β,0,0), np.insert(γ,0,0)
    ξ = α + β[e] + γ[g]
    return np.sum(np.log(np.where(y==1, ξ, 1-ξ)))

initial_guess = np.array([.5, 0,0,0,0, 0,0])
θhat = scipy.optimize.fmin(lambda θ: -loglik2b(θ), initial_guess)

Optimization terminated successfully.
         Current function value: 547592.263723
         Iterations: 423
         Function evaluations: 678


In [96]:
α,β,γ = θhat[0], θhat[1:5], θhat[5:]
β,γ = np.insert(β,0,0), np.insert(γ,0,0)
ξ = α + β[np.newaxis,:] + γ[:,np.newaxis]
print(pandas.DataFrame(np.round(ξ,3), index=gender_levels, columns=ethnicity_levels))

        Asian  Black  Mixed  Other  White
Female  0.270  0.265  0.261  0.262  0.287
Male    0.286  0.281  0.277  0.277  0.303
Other   0.271  0.266  0.262  0.262  0.288


### Natural parameters (full logistic regression)

In [101]:
def loglik3(θ):
    α, β, γ = θ[0], θ[1:5], θ[5:]
    β, γ = np.insert(β,0,0), np.insert(γ,0,0)
    ξ = α + β[e] + γ[g]
    p1 = np.exp(ξ) / (1 + np.exp(ξ))
    return np.sum(np.where(y==1, np.log(p1), np.log(1-p1)))
    
initial_guess = np.array([.5, 0,0,0,0, 0,0])
θhat = scipy.optimize.fmin(lambda θ: -loglik3(θ), initial_guess)

Optimization terminated successfully.
         Current function value: 547581.066001
         Iterations: 640
         Function evaluations: 984


In [103]:
α,β,γ = θhat[0], θhat[1:5], θhat[5:]
β,γ = np.insert(β,0,0), np.insert(γ,0,0)
ξ = α + β[np.newaxis,:] + γ[:,np.newaxis]
p = np.exp(ξ) / (1 + np.exp(ξ))
print(pandas.DataFrame(np.round(p,3), index=gender_levels, columns=ethnicity_levels))

        Asian  Black  Mixed  Other  White
Female  0.278  0.264  0.287  0.271  0.288
Male    0.291  0.278  0.301  0.284  0.302
Other   0.291  0.278  0.301  0.284  0.302


In [104]:
x = pandas.Series(β, index=ethnicity_levels)
print(x.to_frame().T.to_string(index=False))

 Asian     Black     Mixed     Other     White
   0.0 -0.066871  0.047821 -0.034717  0.052221


In [105]:
x = pandas.Series(γ, index=gender_levels)
print(x.to_frame().T.to_string(index=False))

 Female      Male     Other
    0.0  0.067568  0.067881


# Retrieving the dataset

In [1]:
import numpy as np
import pandas
import requests, requests_cache, urllib, io
import json, re
import collections, numbers
requests_cache.install_cache('data/cache')
BASE_URL = 'https://data.police.uk/api/'

In [None]:
# Fetch all available forces*dates, and cache them
availability_url = urllib.parse.urljoin(BASE_URL, 'crimes-street-dates')
availability = requests.get(availability_url).json()
for df in availability:
    date = df['date']
    print(date)
    for force in df['stop-and-search']:
        p = [('force',force), ('date',date)]
        url = urllib.parse.urljoin(BASE_URL, 'stops-force') + '?' + urllib.parse.urlencode(p)
        requests.get(url)

In [9]:
# Fetch a list of all forces * dates
availability_url = urllib.parse.urljoin(BASE_URL, 'crimes-street-dates')
availability = requests.get(availability_url).json()
availability = [(force, df['date']) for df in availability for force in df['stop-and-search']]

def get_dataframe(force, date):
    p = [('force',force), ('date',date)]
    url = urllib.parse.urljoin(BASE_URL, 'stops-force') + '?' + urllib.parse.urlencode(p)
    response = requests.get(url)
    df = response.json()
    df = pandas.json_normalize(df, sep='_')
    df.insert(0, 'month', date)
    df.insert(0, 'force', force)
    return df
df = [get_dataframe(*x) for x in availability]

In [10]:
police = pandas.concat(df, axis=0, ignore_index=True, sort=False)

In [11]:
police.to_csv('data/stop-and-search.csv', index=False)