In [1]:
import json
import joblib
import pickle
import pandas as pd
import numpy as np
# import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, roc_auc_score

import matplotlib.pyplot as plt
import matplotlib.image as mpimg
%matplotlib inline

import warnings
# Silences every warning for the rest of the session. This also hides pandas and
# scikit-learn deprecation notices, so comment it out while debugging.
warnings.filterwarnings('ignore')

In [2]:
train = pd.read_csv('data/train.csv')

# Parse the Date column into datetimes.
# NOTE(review): `infer_datetime_format` is deprecated as of pandas 2.0, where
# strict format inference is the default; drop the argument when upgrading.
train['Date'] = pd.to_datetime(train['Date'], infer_datetime_format=True)

# Auxiliary year-month key (year * 100 + month) used for temporal analysis.
train['referencia'] = train['Date'].dt.year * 100 + train['Date'].dt.month

# According to Dr Wilson's instructions:
# "If there was a search and the outcome linked to object of search has not
# been written please consider it to be False"
train['Outcome linked to object of search'] = (
    train['Outcome linked to object of search'].fillna(False)
)

# According to Dr Wilson's instructions, a missing value means False.
train['Part of a policing operation'] = train['Part of a policing operation'].fillna(False)

# Should a missing "outer clothing" value be filled with False?
# Yes, except when it is just a vehicle search, in which case the field makes
# no sense and is kept as NaN.
# The mask selects non-vehicle searches whose value is neither True nor False.
mask = (
    (train['Type'] != 'Vehicle search')
    & (train['Removal of more than just outer clothing'] != True)
    & (train['Removal of more than just outer clothing'] != False)
)
train.loc[mask, 'Removal of more than just outer clothing'] = False

# Outcome labels treated as a "positive" result of the search.
positive_outcome = ['Local resolution',
           'Community resolution',
           'Arrest',
           'Article found - Detailed outcome unavailable',
           'Caution (simple or conditional)',
           'Khat or Cannabis warning',
           'Offender cautioned',
           'Offender given drugs possession warning',
           'Offender given penalty notice',
           'Penalty Notice for Disorder',
           'Summons / charged by post',
           'Suspect arrested',
           'Suspect summonsed to court',
           'Suspected psychoactive substances seized - No further action']

train['positive_outcome'] = train['Outcome'].isin(positive_outcome)

# target is True only when the outcome is positive AND it is linked to the
# object of the search. Built in one vectorized step instead of
# "fill with False, then overwrite the matching rows".
mask = train['positive_outcome'] & (train['Outcome linked to object of search'] == True)
train['target'] = mask

stations_to_exclude = ['metropolitan', 'gwent', 'humberside']
# Compute the exclusion mask once and reuse it for the filter and the report.
excluded = train['station'].isin(stations_to_exclude)
# .copy() makes train_filter an independent frame rather than a slice of
# `train`, so later column assignments on it are unambiguous.
train_filter = train[~excluded].copy()
print("Exclusion of {} observations".format(train.loc[excluded, 'observation_id'].nunique()))
train_filter.shape

Exclusion of 355849 observations


(304762, 19)

In [7]:
# Non-null Outcome counts for every combination of the "linked to object of
# search" flag (rows) and positive_outcome (columns); NaN keys are kept.
(
    train
    .groupby(by=['Outcome linked to object of search', 'positive_outcome'], dropna=False)['Outcome']
    .count()
    .unstack(level=-1)
)

positive_outcome,False,True
Outcome linked to object of search,Unnamed: 1_level_1,Unnamed: 2_level_1
False,433543,126795
True,37987,62286


### Discrimination

In [3]:
# Non-null Outcome counts per station (rows), one column per officer-defined
# ethnicity; NaN keys are kept as their own group.
(
    train
    .groupby(by=['station', 'Officer-defined ethnicity'], dropna=False)['Outcome']
    .count()
    .unstack(level=-1)
)

Officer-defined ethnicity,Asian,Black,Mixed,Other,White
station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
avon-and-somerset,288.0,1468.0,428.0,143.0,7688.0
bedfordshire,974.0,832.0,2.0,59.0,2342.0
btp,757.0,2931.0,,177.0,5690.0
cambridgeshire,67.0,106.0,,23.0,681.0
cheshire,101.0,151.0,,19.0,4574.0
city-of-london,753.0,792.0,,135.0,1895.0
cleveland,111.0,50.0,,24.0,1687.0
cumbria,35.0,24.0,,9.0,2061.0
derbyshire,315.0,281.0,,44.0,2212.0
devon-and-cornwall,79.0,227.0,,47.0,7216.0


In [10]:
# Non-null Outcome counts per station (rows), one column per gender;
# NaN keys are kept as their own group.
(
    train
    .groupby(by=['station', 'Gender'], dropna=False)['Outcome']
    .count()
    .unstack(level=-1)
)

Gender,Female,Male,Other
station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
avon-and-somerset,1336.0,8679.0,
bedfordshire,379.0,3825.0,5.0
btp,703.0,8849.0,3.0
cambridgeshire,137.0,739.0,1.0
cheshire,429.0,4416.0,
city-of-london,354.0,3218.0,3.0
cleveland,175.0,1697.0,
cumbria,303.0,1825.0,1.0
derbyshire,279.0,2572.0,1.0
devon-and-cornwall,1178.0,6391.0,


In [11]:
# Non-null Outcome counts per station (rows), split by the target flag (columns).
(
    train
    .groupby(by=['station', 'target'], dropna=False)['Outcome']
    .count()
    .unstack(level=-1)
)

target,False,True
station,Unnamed: 1_level_1,Unnamed: 2_level_1
avon-and-somerset,7571.0,2444.0
bedfordshire,3396.0,813.0
btp,7455.0,2100.0
cambridgeshire,722.0,155.0
cheshire,3492.0,1353.0
city-of-london,2581.0,994.0
cleveland,1406.0,466.0
cumbria,1647.0,482.0
derbyshire,2149.0,703.0
devon-and-cornwall,5896.0,1673.0


In [4]:
# Non-null Outcome counts per station (rows), split by positive_outcome (columns).
(
    train
    .groupby(by=['station', 'positive_outcome'], dropna=False)['Outcome']
    .count()
    .unstack(level=-1)
)

positive_outcome,False,True
station,Unnamed: 1_level_1,Unnamed: 2_level_1
avon-and-somerset,6871,3144
bedfordshire,3151,1058
btp,6971,2584
cambridgeshire,541,336
cheshire,2901,1944
city-of-london,2363,1212
cleveland,1272,600
cumbria,1543,586
derbyshire,1955,897
devon-and-cornwall,5235,2334


In [14]:
# Mean of the boolean target (the share of rows where target is True)
# for each station (rows) and gender (columns), over the full train frame.
pd.crosstab(
    index=train["station"],
    columns=train["Gender"],
    values=train["target"],
    aggfunc='mean',
)

Gender,Female,Male,Other
station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
avon-and-somerset,0.188623,0.252564,
bedfordshire,0.16095,0.196601,0.0
btp,0.236131,0.218443,0.333333
cambridgeshire,0.182482,0.175913,0.0
cheshire,0.270396,0.280118,
city-of-london,0.200565,0.286513,0.333333
cleveland,0.2,0.253978,
cumbria,0.221122,0.227397,0.0
derbyshire,0.18638,0.25311,0.0
devon-and-cornwall,0.204584,0.224065,


In [27]:
# Non-null Outcome counts per gender (rows), split by the target flag (columns),
# on the station-filtered frame.
(
    train_filter
    .groupby(by=['Gender', 'target'], dropna=False)['Outcome']
    .count()
    .unstack(level=-1)
)

target,False,True
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,25205,5691
Male,217047,56547
Other,224,48


In [20]:
# Non-null Outcome counts per officer-defined ethnicity (rows), split by
# positive_outcome (columns), on the station-filtered frame.
(
    train_filter
    .groupby(by=['Officer-defined ethnicity', 'positive_outcome'], dropna=False)['Outcome']
    .count()
    .unstack(level=-1)
)

positive_outcome,False,True
Officer-defined ethnicity,Unnamed: 1_level_1,Unnamed: 2_level_1
Asian,16858,7740
Black,22167,10176
Mixed,1267,563
Other,4057,1926
White,168556,71452


In [21]:
# Non-null Outcome counts per officer-defined ethnicity (rows), split by the
# target flag (columns), on the station-filtered frame.
(
    train_filter
    .groupby(by=['Officer-defined ethnicity', 'target'], dropna=False)['Outcome']
    .count()
    .unstack(level=-1)
)

target,False,True
Officer-defined ethnicity,Unnamed: 1_level_1,Unnamed: 2_level_1
Asian,19626,4972
Black,25596,6747
Mixed,1447,383
Other,4707,1276
White,191100,48908


In [22]:
# Mean of the boolean target (the share of rows where target is True)
# for each station (rows) and officer-defined ethnicity (columns),
# on the station-filtered frame.
pd.crosstab(
    index=train_filter["station"],
    columns=train_filter["Officer-defined ethnicity"],
    values=train_filter["target"],
    aggfunc='mean',
)

Officer-defined ethnicity,Asian,Black,Mixed,Other,White
station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
avon-and-somerset,0.267361,0.219346,0.21729,0.335664,0.247659
bedfordshire,0.212526,0.176683,0.5,0.101695,0.192997
btp,0.217966,0.174684,,0.220339,0.243234
cambridgeshire,0.328358,0.113208,,0.086957,0.174743
cheshire,0.336634,0.324503,,0.315789,0.276345
city-of-london,0.25498,0.255051,,0.17037,0.304485
cleveland,0.306306,0.22,,0.291667,0.245406
cumbria,0.257143,0.291667,,0.444444,0.224163
derbyshire,0.307937,0.252669,,0.159091,0.238698
devon-and-cornwall,0.21519,0.23348,,0.212766,0.220759


In [24]:
# Frequency of each "Object of search" value in the station-filtered frame;
# missing values are counted too (dropna=False).
(
    train_filter['Object of search']
    .value_counts(dropna=False)
)

Controlled drugs                              192161
Offensive weapons                              35391
Article for use in theft                       30287
Stolen goods                                   26617
Articles for use in criminal damage             6494
Anything to threaten or harm anyone             5241
Firearms                                        2957
Evidence of offences under the Act              1930
Psychoactive substances                         1701
Fireworks                                       1695
Detailed object of search unavailable            129
Game or poaching equipment                        96
Goods on which duty has not been paid etc.        23
Evidence of wildlife offences                     21
Crossbows                                         17
Seals or hunting equipment                         2
Name: Object of search, dtype: int64

In [25]:
# Mean of the boolean target (the share of rows where target is True)
# for each station (rows) and object of search (columns),
# on the station-filtered frame.
pd.crosstab(
    index=train_filter["station"],
    columns=train_filter["Object of search"],
    values=train_filter["target"],
    aggfunc='mean',
)

Object of search,Anything to threaten or harm anyone,Article for use in theft,Articles for use in criminal damage,Controlled drugs,Crossbows,Detailed object of search unavailable,Evidence of offences under the Act,Evidence of wildlife offences,Firearms,Fireworks,Game or poaching equipment,Goods on which duty has not been paid etc.,Offensive weapons,Psychoactive substances,Seals or hunting equipment,Stolen goods
station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
avon-and-somerset,0.058824,0.132075,0.119565,0.296877,0.0,,0.185185,,0.070423,0.0,0.0,0.25,0.112163,0.8,,0.219474
bedfordshire,0.0,0.067797,0.034483,0.253642,,,,,0.086957,0.111111,,,0.079433,0.0,,0.164502
btp,0.063465,0.192612,0.333333,0.276552,0.0,,0.117647,1.0,0.106383,,,0.0,0.087664,0.454545,,0.268357
cambridgeshire,,0.0,0.0,0.280943,,,,,0.0,,,,0.023952,,,0.195122
cheshire,,0.122402,0.218182,0.330009,0.0,,,,0.266667,0.0,,,0.123641,,,0.359431
city-of-london,0.078947,0.20649,0.074074,0.359896,,,0.182796,,0.114286,0.0,,,0.125984,0.0,,0.203488
cleveland,0.25641,0.161392,0.185185,0.286307,,,0.352941,,0.178571,,,,0.215447,,,0.264423
cumbria,,0.119318,0.131579,0.239055,,,,,0.333333,0.0,0.0,,0.147186,,,0.336323
derbyshire,,0.1675,0.130435,0.294851,,,0.142857,,0.290323,0.0,,0.0,0.118421,,,0.224806
devon-and-cornwall,,0.076923,0.090909,0.255546,,,,,0.142857,,,,0.1,,,0.168293


In [26]:
# Non-null Outcome counts per object of search (rows), split by the target
# flag (columns), on the station-filtered frame.
(
    train_filter
    .groupby(by=['Object of search', 'target'], dropna=False)['Outcome']
    .count()
    .unstack(level=-1)
)

target,False,True
Object of search,Unnamed: 1_level_1,Unnamed: 2_level_1
Anything to threaten or harm anyone,4866,375
Article for use in theft,27240,3047
Articles for use in criminal damage,6211,283
Controlled drugs,143764,48397
Crossbows,15,2
Detailed object of search unavailable,104,25
Evidence of offences under the Act,1722,208
Evidence of wildlife offences,16,5
Firearms,2645,312
Fireworks,1665,30


In [4]:
# Non-null Outcome counts per age range (rows), split by the target flag
# (columns), on the station-filtered frame.
(
    train_filter
    .groupby(by=['Age range', 'target'], dropna=False)['Outcome']
    .count()
    .unstack(level=-1)
)

target,False,True
Age range,Unnamed: 1_level_1,Unnamed: 2_level_1
10-17,51691,8320
18-24,78129,25894
25-34,57724,16150
over 34,54663,11877
under 10,269,45


### Clothing removal

In [9]:
# Non-null Outcome counts per station (rows), split by the clothing-removal
# flag (columns, NaN kept as its own column), over the full train frame.
(
    train
    .groupby(by=['station', 'Removal of more than just outer clothing'], dropna=False)['Outcome']
    .count()
    .unstack(level=-1)
)

Removal of more than just outer clothing,False,True,NaN
station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
avon-and-somerset,9384.0,631.0,
bedfordshire,4143.0,54.0,12.0
btp,9526.0,29.0,
cambridgeshire,855.0,22.0,
cheshire,4636.0,209.0,
city-of-london,3452.0,123.0,
cleveland,1870.0,,2.0
cumbria,1978.0,146.0,5.0
derbyshire,2626.0,226.0,
devon-and-cornwall,7422.0,147.0,


In [28]:
# Non-null Outcome counts per clothing-removal flag (rows, NaN kept), split by
# the target flag (columns), over the full train frame.
(
    train
    .groupby(by=['Removal of more than just outer clothing', 'target'], dropna=False)['Outcome']
    .count()
    .unstack(level=-1)
)

target,False,True
Removal of more than just outer clothing,Unnamed: 1_level_1,Unnamed: 2_level_1
False,589958,59087
True,7483,3182
,884,17


In [29]:
# Non-null Outcome counts per clothing-removal flag (rows, NaN kept), split by
# the target flag (columns), on the station-filtered frame.
(
    train_filter
    .groupby(by=['Removal of more than just outer clothing', 'target'], dropna=False)['Outcome']
    .count()
    .unstack(level=-1)
)

target,False,True
Removal of more than just outer clothing,Unnamed: 1_level_1,Unnamed: 2_level_1
False,235051,59087
True,7379,3182
,46,17


In [31]:
# Non-null Outcome counts per gender (rows), split by the clothing-removal
# flag (columns, NaN kept), on the station-filtered frame.
(
    train_filter
    .groupby(by=['Gender', 'Removal of more than just outer clothing'], dropna=False)['Outcome']
    .count()
    .unstack(level=-1)
)

Removal of more than just outer clothing,False,True,NaN
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Female,29602.0,1287.0,7.0
Male,264276.0,9262.0,56.0
Other,260.0,12.0,


In [34]:
# Non-null Outcome counts per age range (rows), split by the clothing-removal
# flag (columns, NaN kept), on the station-filtered frame.
(
    train_filter
    .groupby(by=['Age range', 'Removal of more than just outer clothing'], dropna=False)['Outcome']
    .count()
    .unstack(level=-1)
)

Removal of more than just outer clothing,False,True,NaN
Age range,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10-17,59049.0,953.0,9.0
18-24,99972.0,4022.0,29.0
25-34,70862.0,2998.0,14.0
over 34,63953.0,2576.0,11.0
under 10,302.0,12.0,


In [48]:
# Non-null Outcome counts per officer-defined ethnicity (rows), split by the
# clothing-removal flag (columns, NaN kept), on the station-filtered frame.
(
    train_filter
    .groupby(by=['Officer-defined ethnicity', 'Removal of more than just outer clothing'], dropna=False)['Outcome']
    .count()
    .unstack(level=-1)
)

Removal of more than just outer clothing,False,True,NaN
Officer-defined ethnicity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Asian,23619,971,8
Black,30046,2288,9
Mixed,1719,110,1
Other,5769,209,5
White,232985,6983,40


In [33]:
# Non-null Outcome counts per (age range, gender) pair (rows), split by the
# clothing-removal flag (columns, NaN kept), on the station-filtered frame.
(
    train_filter
    .groupby(by=['Age range', 'Gender', 'Removal of more than just outer clothing'], dropna=False)['Outcome']
    .count()
    .unstack(level=-1)
)

Unnamed: 0_level_0,Removal of more than just outer clothing,False,True,NaN
Age range,Gender,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10-17,Female,5720.0,81.0,3.0
10-17,Male,53261.0,872.0,6.0
10-17,Other,68.0,,
18-24,Female,8611.0,315.0,1.0
18-24,Male,91272.0,3703.0,28.0
18-24,Other,89.0,4.0,
25-34,Female,7352.0,405.0,1.0
25-34,Male,63464.0,2589.0,13.0
25-34,Other,46.0,4.0,
over 34,Female,7891.0,482.0,2.0


In [47]:
# Non-null Outcome counts per (age range, gender, officer-defined ethnicity)
# triple (rows), split by the clothing-removal flag (columns, NaN kept),
# on the station-filtered frame.
t = (
    train_filter
    .groupby(
        by=['Age range', 'Gender', 'Officer-defined ethnicity', 'Removal of more than just outer clothing'],
        dropna=False,
    )['Outcome']
    .count()
    .unstack(level=-1)
)
# Show only the last 30 rows of the table.
t.tail(30)

Unnamed: 0_level_0,Unnamed: 1_level_0,Removal of more than just outer clothing,False,True,NaN
Age range,Gender,Officer-defined ethnicity,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
25-34,Male,Other,1326.0,54.0,
25-34,Male,White,49634.0,1728.0,9.0
25-34,Other,Asian,6.0,,
25-34,Other,Black,4.0,1.0,
25-34,Other,Other,1.0,,
25-34,Other,White,35.0,3.0,
over 34,Female,Asian,126.0,8.0,
over 34,Female,Black,343.0,15.0,
over 34,Female,Mixed,24.0,4.0,
over 34,Female,Other,90.0,6.0,


In [30]:
# Non-null Outcome counts per station (rows), split by the clothing-removal
# flag (columns, NaN kept), on the station-filtered frame.
(
    train_filter
    .groupby(by=['station', 'Removal of more than just outer clothing'], dropna=False)['Outcome']
    .count()
    .unstack(level=-1)
)

Removal of more than just outer clothing,False,True,NaN
station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
avon-and-somerset,9384.0,631.0,
bedfordshire,4143.0,54.0,12.0
btp,9526.0,29.0,
cambridgeshire,855.0,22.0,
cheshire,4636.0,209.0,
city-of-london,3452.0,123.0,
cleveland,1870.0,,2.0
cumbria,1978.0,146.0,5.0
derbyshire,2626.0,226.0,
devon-and-cornwall,7422.0,147.0,
