In [28]:
import json
import joblib
import pickle
import pandas as pd
import numpy as np
# import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, roc_auc_score
from sklearn.metrics import precision_recall_curve

import matplotlib.pyplot as plt
import matplotlib.image as mpimg
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

from lightgbm import LGBMClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.ensemble import GradientBoostingClassifier

### Import dataset

In [29]:
# Load the raw training observations from the project's data folder.
train = pd.read_csv(
    'data/train.csv',
)

In [30]:
# Tally fully duplicated rows; a single `False` bucket means there are none.
row_is_duplicate = train.duplicated()
row_is_duplicate.value_counts()

False    660611
dtype: int64

In [31]:
# Tally repeated observation_id values; a single `False` bucket means the id is unique.
observation_ids = train['observation_id']
observation_ids.duplicated().value_counts()

False    660611
Name: observation_id, dtype: int64

### Features adjustments

In [32]:
# Parse the raw Date column into pandas datetimes.
train['Date'] = pd.to_datetime(train['Date'], infer_datetime_format=True)

# Auxiliary year-month key (YYYYMM) used for temporal analysis.
date_parts = train['Date'].dt
train['referencia'] = date_parts.year * 100 + date_parts.month

# Per Dr Wilson's instructions, a missing value in either of these flags means False:
# "If there was a search and the outcome linked to object of search has not been written
# please consider it to be False" (the same rule applies to the policing-operation flag).
for flag_column in ('Outcome linked to object of search', 'Part of a policing operation'):
    train[flag_column] = train[flag_column].fillna(False)

# Should the outer-clothing flag be filled with False?
# Yes, except for pure vehicle searches, where the flag makes no sense and stays NaN.
clothing_column = 'Removal of more than just outer clothing'
clothing_recorded = (train[clothing_column] == True) | (train[clothing_column] == False)
not_vehicle_only = train['Type'] != 'Vehicle search'
mask = not_vehicle_only & ~clothing_recorded

train.loc[mask, clothing_column] = False

### Filtering / Exclusions

In [33]:
# The Metropolitan station (which is very large and has lots of data in the training set) has the features 
# Outcome linked to object of search and Removal of outer clothing without any data (always missing). 
# Is this a known problem, and if so how should we proceed? 
# Thank you for bringing this to our attention, I’ve contacted the administration at the Metropolitan and asked that 
# they fix their data entry. Please include this in your report, and do not use the Metropolitan station’s data 
# for training your models. They will not be in the test set. 

In [34]:
# Gwent and Humberside have the same problem as the Metropolitan station, so I will exclude their observations as well

In [35]:
# Stations whose records are dropped from the modelling set.
stations_to_exclude = ['metropolitan', 'gwent', 'humberside']
is_excluded_station = train['station'].isin(stations_to_exclude)

# Take an explicit copy: later cells add and overwrite columns on `train_model`,
# and doing that on a filtered slice of `train` is chained assignment
# (SettingWithCopyWarning, with results that are not guaranteed to stick).
train_model = train[~is_excluded_station].copy()

print("Exclusion of {} observations".format(train.loc[is_excluded_station, 'observation_id'].nunique()))
train_model.shape

Exclusion of 355849 observations


(304762, 17)

In [36]:
# Searches of individuals under 10 years old seem unrealistic, so I considered removing them (the exclusion code below is currently commented out)

In [37]:
# age_to_exclude = ['under 10']
# print("Exclusion of {} observations".format(train_model[train_model['Age range'].isin(age_to_exclude)]['observation_id'].nunique()))
# train_model = train_model[~train_model['Age range'].isin(age_to_exclude)]
# train_model.shape

### Target

In [38]:
# Outcome labels that count as a "positive" result of a search.
positive_outcome = [
    'Local resolution',
    'Community resolution',
    'Arrest',
    'Article found - Detailed outcome unavailable',
    'Caution (simple or conditional)',
    'Khat or Cannabis warning',
    'Offender cautioned',
    'Offender given drugs possession warning',
    'Offender given penalty notice',
    'Penalty Notice for Disorder',
    'Summons / charged by post',
    'Suspect arrested',
    'Suspect summonsed to court',
    'Suspected psychoactive substances seized - No further action',
]

# Boolean flag: True when the recorded Outcome is one of the labels above.
outcome_labels = train_model['Outcome']
train_model['positive_outcome'] = outcome_labels.isin(positive_outcome)

In [39]:
# target = 1 only when the outcome is positive AND it is linked to the object of search;
# every other row gets 0.
mask = (
    (train_model["positive_outcome"] == True)
    & (train_model["Outcome linked to object of search"] == True)
)
train_model['target'] = mask.astype('int64')

In [40]:
# Contingency table of the two ingredients of the target, with row/column totals.
pd.crosstab(
    index=train_model["positive_outcome"],
    columns=train_model["Outcome linked to object of search"],
    margins=True,
    margins_name="Total",
    dropna=False,
)

Outcome linked to object of search,False,True,Total
positive_outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,174918,37987,212905
True,29571,62286,91857
Total,204489,100273,304762


In [41]:
# Class balance of the target: absolute counts first, then proportions.
target_counts = train_model['target'].value_counts(dropna=False)
print(target_counts)
train_model['target'].value_counts(dropna=False, normalize=True)

0    242476
1     62286
Name: target, dtype: int64


0    0.795624
1    0.204376
Name: target, dtype: float64

In [42]:
# print(train_model['positive_outcome'].value_counts(dropna=False))
# train_model['positive_outcome'].value_counts(dropna=False,normalize=True)
# pd.crosstab(train_model["referencia"],train_model["target"],normalize='index')
# pd.crosstab(train_model["Gender"],train_model["target"],normalize='index')
# pd.crosstab(train_model["station"],train_model["target"],normalize='index')
# pd.crosstab(train_model["station"],train_model["positive_outcome"],normalize='index')

### Feature engineering

In [43]:
# Derive hour, month and weekday name from the search timestamp; they may carry signal.
search_timestamp = train_model['Date'].dt
train_model['hour'] = search_timestamp.hour
train_model['month'] = search_timestamp.month
train_model['day_of_week'] = search_timestamp.day_name()

In [44]:
# Legislation: collapse the long tail of categories into a single 'others' bucket.
# train_model.groupby(['Legislation','target'], dropna=False)['Outcome'].count().unstack()

# Categories kept as-is (lower-cased); anything else becomes 'others'.
legislation_categories = [
    'misuse of drugs act 1971 (section 23)',
    'police and criminal evidence act 1984 (section 1)',
    'criminal justice and public order act 1994 (section 60)',
    'firearms act 1968 (section 47)',
    'missing infomation',
]

# Missing values get their own label before everything is lower-cased.
legislation = train_model['Legislation'].fillna('missing infomation').astype(str).str.lower()

mask = ~legislation.isin(legislation_categories)
train_model['Legislation'] = legislation.where(~mask, 'others')

train_model['Legislation'].value_counts()

misuse of drugs act 1971 (section 23)                      178620
police and criminal evidence act 1984 (section 1)           92651
missing infomation                                          27931
criminal justice and public order act 1994 (section 60)      2670
firearms act 1968 (section 47)                               1830
others                                                       1060
Name: Legislation, dtype: int64

### Text Data normalization

In [45]:
# Normalise the free-text category to lower case (values are stringified first).
object_of_search = train_model['Object of search'].astype(str)
train_model['Object of search'] = object_of_search.str.lower()
train_model['Object of search'].value_counts()

controlled drugs                              192161
offensive weapons                              35391
article for use in theft                       30287
stolen goods                                   26617
articles for use in criminal damage             6494
anything to threaten or harm anyone             5241
firearms                                        2957
evidence of offences under the act              1930
psychoactive substances                         1701
fireworks                                       1695
detailed object of search unavailable            129
game or poaching equipment                        96
goods on which duty has not been paid etc.        23
evidence of wildlife offences                     21
crossbows                                         17
seals or hunting equipment                         2
Name: Object of search, dtype: int64

In [46]:
# Normalise station names to lower case (values are stringified first).
station_names = train_model['station'].astype(str)
train_model['station'] = station_names.str.lower()
# train_model['station'].value_counts()

### Latitude and Longitude

In [47]:
# Summary statistics of both coordinates: Latitude is printed, Longitude is the cell output.
latitude_summary = train_model['Latitude'].describe()
print(latitude_summary)
train_model['Longitude'].describe()

count    228038.000000
mean         52.511589
std           1.133850
min          49.892149
25%          51.485978
50%          52.609826
75%          53.422686
max          57.143856
Name: Latitude, dtype: float64


count    228038.000000
mean         -1.343263
std           1.365160
min          -8.053397
25%          -2.599163
50%          -1.464553
75%          -0.207006
max           1.756480
Name: Longitude, dtype: float64

In [48]:
# Replace missing coordinates with constant placeholders.
# Each column must be filled from ITSELF: building 'Latitude' from the 'Longitude'
# column would overwrite every latitude with a longitude value.
# NOTE(review): the placeholders 0 (Latitude) and 50 (Longitude) are kept from the
# original cell; both lie outside the ranges shown by describe() — confirm they are
# meant as out-of-range sentinels rather than as a swapped (50, 0) location.
train_model['Latitude'] = train_model['Latitude'].fillna(0)
train_model['Longitude'] = train_model['Longitude'].fillna(50)

### Data split

In [49]:
# Hold out 30% of the modelling set for evaluation; the seed keeps the split reproducible.
df_train, df_test = train_test_split(
    train_model,
    test_size=0.3,
    random_state=42,
)

In [50]:
# (rows, columns) of the training partition.
df_train.shape

(213333, 22)

In [51]:
# (rows, columns) of the held-out partition.
df_test.shape

(91429, 22)

In [52]:
# Number of training searches per weekday.
df_train['day_of_week'].value_counts()

Saturday     34161
Friday       33994
Thursday     30593
Wednesday    30283
Tuesday      28674
Monday       27828
Sunday       27800
Name: day_of_week, dtype: int64

### Hyperparameter Tuning

In [53]:
# Columns fed to the model, and how each group is preprocessed.
all_features = ['Type', 'Part of a policing operation', 'Age range', 'Latitude', 'Longitude',
                'Legislation', 'hour', 'month', 'day_of_week',
                'Gender', 'Officer-defined ethnicity']

numerical_features = ['hour', 'month','Latitude', 'Longitude']

categorical_features = ['Type', 'Part of a policing operation', 'Age range',
                        'Legislation', 'day_of_week','Gender', 'Officer-defined ethnicity']

X_train = df_train[all_features]
y_train = df_train['target']

X_test = df_test[all_features]
y_test = df_test['target']

# One-hot encode ONLY the categorical columns and pass the numeric ones through.
# Fitting a OneHotEncoder on every column would turn each distinct Latitude /
# Longitude / hour / month value into its own dummy column, discarding their
# ordering and inflating the matrix width.
# `ohe` keeps its name and its fit/transform interface for the cells below.
ohe = ColumnTransformer(
    transformers=[
        ('categorical', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('numerical', 'passthrough', numerical_features),
    ]
)
ohe.fit(X_train)
X_train_ohe = ohe.transform(X_train)

In [59]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Base model shared by both hyperparameter searches. The seed (same value as the
# train/test split and the randomized search) makes repeated fits reproducible.
estimator = GradientBoostingClassifier(random_state=42)

In [65]:
# Scratch check of a candidate value range (50, 60, ..., 190).
range(50, 200,10)

range(50, 200, 10)

#### Grid Search

In [66]:
# Exhaustive search space: 15 n_estimators x 19 max_depth x 2 losses = 570 candidates,
# i.e. 2850 model fits with 5-fold CV.
# NOTE(review): 'deviance' was renamed 'log_loss' in scikit-learn 1.1 and removed in 1.3 —
# update the value if the environment is upgraded.
grid_search_parameter_space = {
    "n_estimators": range(50, 200, 10),
    "max_depth": range(1, 20, 1),
    "loss": ['deviance', 'exponential'],
}

grid_search = GridSearchCV(
    estimator,
    grid_search_parameter_space,
    cv=5,
    scoring="roc_auc",
    return_train_score=True,
    # Fits are independent, so spread them over all available cores;
    # run sequentially, a search of this size is impractically slow.
    n_jobs=-1,
)

In [67]:
%%timeit -n 1 -r 1

# Time a single execution (one loop, one repeat) of the exhaustive search on the encoded training data.
grid_search.fit(X_train_ohe, y_train)

KeyboardInterrupt: 

In [None]:
# Best model found by the grid search (only available once the fit has completed).
grid_search.best_estimator_

In [None]:
# Full hyperparameter dictionary of the best grid-search model.
grid_search.best_estimator_.get_params()

#### Randomized Search

In [None]:
# Sampling distributions for the randomized search:
# integer-uniform ranges for the tree count and depth, a fixed list for the loss.
random_search_parameter_space_dist = dict(
    n_estimators=randint(50, 200),
    max_depth=randint(1, 100),
    loss=['deviance', 'exponential'],
)

In [None]:
# Randomized counterpart of the grid search: 250 sampled candidates, 5-fold CV each.
randomized_search = RandomizedSearchCV(
    estimator,
    random_search_parameter_space_dist,
    cv=5,
    n_iter=250,
    # Score with ROC AUC, like the grid search: without `scoring` the classifier's
    # default accuracy is used, which is not comparable with the grid-search results
    # and is uninformative on a target that is ~80% negative.
    scoring="roc_auc",
    random_state=42,
    return_train_score=True,
    # Candidate fits are independent, so run them on all available cores.
    n_jobs=-1,
)

In [None]:
%%timeit -n 1 -r 1

# Time a single execution (one loop, one repeat) of the randomized search on the encoded training data.
randomized_search.fit(X_train_ohe, y_train)

In [None]:
# Best model found by the randomized search (only available once the fit has completed).
randomized_search.best_estimator_

In [None]:
# Mean cross-validated score of the best randomized-search candidate.
randomized_search.best_score_