In [45]:
import json
import joblib
import pickle
import pandas as pd
import numpy as np
# import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, roc_auc_score
from sklearn.metrics import precision_recall_curve

import matplotlib.pyplot as plt
import matplotlib.image as mpimg
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

from lightgbm import LGBMClassifier

### Import dataset

In [46]:
train = pd.read_csv('data/train.csv')

In [47]:
# no duplicate lines
train.duplicated().value_counts()

False    660611
dtype: int64

In [48]:
# no duplicate observation_id
train['observation_id'].duplicated().value_counts()

False    660611
Name: observation_id, dtype: int64

### Features adjustments

In [49]:
# transforming Date column in datetime
train['Date']=pd.to_datetime(train['Date'], infer_datetime_format=True)

# auxiliar Date feature (for temporal analyzis)
train['referencia'] = train['Date'].dt.year*100 + train['Date'].dt.month

# according to Dr Wilson's instructions: 
# "If there was a search and the outcome linked to object of search has not been written please consider it to be False"
train['Outcome linked to object of search'] = train['Outcome linked to object of search'].fillna(False)

# according to Dr Wilson's instructions missing values means False 
train['Part of a policing operation'] = train['Part of a policing operation'].fillna(False)

# Outer clothing be filled with False? 
#Yes, except when it’s just a vehicle search, in which case it makes no sense and should be kept as NaN
mask= ((train['Type']!='Vehicle search') & 
      (train['Removal of more than just outer clothing'] != True) &
      (train['Removal of more than just outer clothing'] != False))

train.loc[mask, 'Removal of more than just outer clothing']=False

### Filtering / Exclusions

In [50]:
# The Metropolitan station (which is very large and has lots of data in the training set) has the features 
# Outcome linked to object of search and Removal of outer clothing without any data (always missing). 
# Is this a known problem, and if so how should we proceed? 
# Thank you for bringing this to our attention, I’ve contacted the administration at the Metropolitan and asked that 
# they fix their data entry. Please include this in your report, and do not use the Metropolitan station’s data 
# for training your models. They will not be in the test set. 

In [51]:
# Gwent and Humberside have the same problem of Metropolitan. So I will exclude this observations

In [52]:
stations_to_exclude = ['metropolitan', 'gwent', 'humberside']
train_model = train[~train['station'].isin(stations_to_exclude)]
print("Exclusion of {} observations".format(train[train['station'].isin(stations_to_exclude)]['observation_id'].nunique()))
train_model.shape

Exclusion of 355849 observations


(304762, 17)

In [53]:
# Search on individuals with less than 10 years seems unrealist, so I will remove it

In [54]:
# age_to_exclude = ['under 10']
# print("Exclusion of {} observations".format(train_model[train_model['Age range'].isin(age_to_exclude)]['observation_id'].nunique()))
# train_model = train_model[~train_model['Age range'].isin(age_to_exclude)]
# train_model.shape

### Target

In [55]:
positive_outcome = ['Local resolution',
           'Community resolution',
           'Arrest',
           'Article found - Detailed outcome unavailable',
           'Caution (simple or conditional)',
           'Khat or Cannabis warning',
           'Offender cautioned',
           'Offender given drugs possession warning',
           'Offender given penalty notice',
           'Penalty Notice for Disorder',
           'Summons / charged by post',
           'Suspect arrested',
           'Suspect summonsed to court',
           'Suspected psychoactive substances seized - No further action']

train_model['positive_outcome'] = train_model['Outcome'].isin(positive_outcome)

In [56]:
train_model['target'] = 0
mask=((train_model["positive_outcome"] == True) & (train_model["Outcome linked to object of search"]==True))
train_model.loc[mask, 'target']=1

In [57]:
pd.crosstab(train_model["positive_outcome"],train_model["Outcome linked to object of search"],
            margins=True, margins_name="Total",dropna=False)

Outcome linked to object of search,False,True,Total
positive_outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,174918,37987,212905
True,29571,62286,91857
Total,204489,100273,304762


In [58]:
print(train_model['target'].value_counts(dropna=False))
train_model['target'].value_counts(dropna=False,normalize=True)

0    242476
1     62286
Name: target, dtype: int64


0    0.795624
1    0.204376
Name: target, dtype: float64

In [59]:
# print(train_model['positive_outcome'].value_counts(dropna=False))
# train_model['positive_outcome'].value_counts(dropna=False,normalize=True)
# pd.crosstab(train_model["referencia"],train_model["target"],normalize='index')
# pd.crosstab(train_model["Gender"],train_model["target"],normalize='index')
# pd.crosstab(train_model["station"],train_model["target"],normalize='index')
# pd.crosstab(train_model["station"],train_model["positive_outcome"],normalize='index')

### Feature engineering

In [60]:
# get the hour and day of the week, maybe they will be useful 
train_model['hour'] = train_model['Date'].dt.hour
train_model['month'] = train_model['Date'].dt.month
train_model['day_of_week'] = train_model['Date'].dt.day_name()

In [61]:
# Legislation - reducing the quantity of categories
# train_model.groupby(['Legislation','target'], dropna=False)['Outcome'].count().unstack()

train_model['Legislation']=train_model['Legislation'].fillna('missing infomation')
train_model['Legislation'] = train_model['Legislation'].apply(lambda x: str(x).lower())

legislation_categories = ['misuse of drugs act 1971 (section 23)', 'police and criminal evidence act 1984 (section 1)', 
                          'criminal justice and public order act 1994 (section 60)', 'firearms act 1968 (section 47)',
                          'missing infomation']

mask=(~train_model['Legislation'].isin(legislation_categories))
train_model.loc[mask, 'Legislation']='others'


### Text Data normalization

In [62]:
train_model['Object of search'] = train_model['Object of search'].apply(lambda x: str(x).lower())

In [63]:
train_model['station'] = train_model['station'].apply(lambda x: str(x).lower())
# train_model['station'].value_counts()

### Latitude and Longitude

In [64]:
# print(train_model['Latitude'].describe())
# train_model['Longitude'].describe()

In [65]:
train_model['Latitude'] = train_model['Longitude'].fillna(0)
train_model['Longitude'] = train_model['Longitude'].fillna(50)

### Data split

In [66]:
df_train, df_test = train_test_split(train_model, test_size=0.3, random_state=42)

In [67]:
df_train.shape

(213333, 22)

In [68]:
df_test.shape

(91429, 22)

In [69]:
# df_train.day_of_week.value_counts()

### Hyperparameter Tuning

In [70]:
all_features = ['Type', 'Part of a policing operation', 'Age range', 'Latitude', 'Longitude',
                'Legislation', 'hour', 'month', 'day_of_week',
                'Gender', 'Officer-defined ethnicity']

numerical_features = ['hour', 'month','Latitude', 'Longitude']

categorical_features = ['Type', 'Part of a policing operation', 'Age range',
                        'Legislation', 'day_of_week','Gender', 'Officer-defined ethnicity']

# numeric_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='median')),
#     ('scaler', StandardScaler())])

# categorical_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
#     ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', numeric_transformer, numerical_features),
#         ('cat', categorical_transformer, categorical_features)])

# pipeline = make_pipeline(
#     preprocessor,
# )

# pipeline.fit(X_train, y_train)

X_train = df_train[all_features]
y_train = df_train['target']

X_test = df_test[all_features]
y_test = df_test['target']


In [71]:
ohe = OneHotEncoder(handle_unknown='ignore')
ohe.fit(X_train)
X_train_ohe = ohe.transform(X_train)

In [72]:
X_train_ohe

<213333x131019 sparse matrix of type '<class 'numpy.float64'>'
	with 2346663 stored elements in Compressed Sparse Row format>

In [73]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

estimator = RandomForestClassifier()

#### Grid Search

In [74]:
grid_search_parameter_space = {'class_weight': ['balanced', None]}

In [75]:
grid_search = GridSearchCV(
                estimator,
                grid_search_parameter_space,
#                 cv=5,
                scoring="roc_auc",
                return_train_score=True
                )

In [76]:
%%timeit -n 1 -r 1

grid_search.fit(X_train_ohe, y_train)

KeyboardInterrupt: 

In [None]:
grid_search.best_estimator_

In [None]:
grid_search.best_estimator_.get_params()

In [33]:
?RandomForestClassifier

#### Randomized Search

In [None]:
from scipy.stats import randint

random_search_parameter_space_dist = {
                   "max_depth": randint(1, 100),
                   "min_samples_leaf": randint(0, 0.1),
                   "class_weight": ["balanced", None]
                  }


In [None]:
randomized_search = RandomizedSearchCV(
                        estimator, 
                        random_search_parameter_space_dist,
                        cv=5, n_iter=250,
                        random_state=42,
                        return_train_score=True
                        )

In [None]:
%%timeit -n 1 -r 1

randomized_search.fit(X_train_ohe, y_train)

In [None]:
randomized_search.best_estimator_

In [None]:
randomized_search.best_score_