In [1]:
import json
import joblib
import pickle
import pandas as pd
import numpy as np
# import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, roc_auc_score
from sklearn.metrics import precision_recall_curve

import matplotlib.pyplot as plt
import matplotlib.image as mpimg
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')


In [3]:
def check_request(request):
    """
        Validates that our request is well formatted
        
        Returns:
        - assertion value: True if request is ok, False otherwise
        - error message: empty if request is ok, False otherwise
    """
    
    if "observation_id" not in request:
        error = "Field 'observation_id' missing from request: {}".format(request)
        return False, error
    
    return True, ""


In [4]:
def check_valid_column(observation):
    """
        Validates that our observation only has valid columns
        
        Returns:
        - assertion value: True if all provided columns are valid, False otherwise
        - error message: empty if all provided columns are valid, False otherwise
    """
    
    valid_columns = {
        "observation_id",
        "Type",
        "Date",
        "Part of a policing operation",
        "Latitude",
        "Longitude",
        "Gender",
        "Age range",
        "Officer-defined ethnicity",
        "Legislation",
        "Object of search",
        "station"
    }
    
    keys = set(observation.keys())
    
    if len(valid_columns - keys) > 0: 
        missing = valid_columns - keys
        error = "Missing columns: {}".format(missing)
        return False, error
    
    if len(keys - valid_columns) > 0: 
        extra = keys - valid_columns
        error = "Unrecognized columns provided: {}".format(extra)
        return False, error    

    return True, ""

In [86]:
def check_latitude_longitude(observation):
    """
        Validates that latitude and longitude have valid values
        
        Returns:
        - assertion value: True if latitude/longitude is valid, False otherwise
        - error message: empty if latitude/longitude is valid, False otherwise
    """
    
    lat = observation.get("Latitude")
    lon = observation.get("Longitude")
        
    if lat == None:
        error = "Field 'Latitude' is missing"
        return False, error
    
    if lon == None: 
        error = "Field 'Longitude' is missing"
        return False, error

    if not isinstance(lat, float):
        if not isinstance(lat, int):
            error = "Field 'Latitude' is not a number"
            return False, error
    
    if not isinstance(lon, float):
        if not isinstance(lon, int):
            error = "Field 'Longitude' is not a number"
            return False, error

    return True, ""

In [198]:
def check_Part_policing_operation(observation):
    """
        Validates that Part of a policing operation has valid values
        
        Returns:
        - assertion value: True if Part of a policing operation is valid, False otherwise
        - error message: empty if Part of a policing operation is valid, False otherwise
    """
    
    part_oper = observation.get("Part of a policing operation")
    
    if not isinstance(part_oper, bool):
        error = "Field 'Part of a policing operation' is not a boolean"
        return False, error        

    if not ((part_oper == False) or (part_oper == True)):
        error = "Field 'Part of a policing operation' is not a boolean"
        return False, error
    
    return True, ""

In [4]:
def transform_date(observation):
    
    """
        Creates month, hour and day_of_week features from Date
        
        Returns:
        - values if feature Date can be read as a date
        - None for all features if Date cannot be read as a date
    """
    
    date_ = observation.get("Date")
    
    try:
        date = pd.Timestamp(date_)
        hour = date.hour
        month = date.month
        day_of_week = date.day_name()
    except:
        hour = np.nan
        month = np.nan
        day_of_week = np.nan    

    return hour, month, day_of_week

In [224]:
req_test ={
  "observation_id": "teste1 X",
  "Type": "x",
  "Date": "2019-12-06T15:40:00+00:00",
  "Part of a policing operation": False,
  "Latitude": 0,
  "Longitude": 0,
  "Gender": "string",
  "Age range": "string",
  "Officer-defined ethnicity": "strinXXg",
  "Legislation": "string",
  "Object of search": "strinAAg",
  "station": "stringX"
}

In [204]:
print(check_request(req_test))
print(check_valid_column(req_test))
print(check_latitude_longitude(req_test))
print(check_Part_policing_operation(req_test))
print(transform_date(req_test))

(True, '')
(True, '')
(True, '')
(True, '')
(15, 12, 'Friday')


#### Criar dict para chamar prediction

In [209]:
hour, month, day_of_week  = transform_date(req_test)
observation=req_test
obs_dataframe = {
    "Type": observation.get("Type"),
    "Part of a policing operation": observation.get("Part of a policing operation"),
    "Age range": observation.get("Age range"),
    "Latitude": observation.get("Latitude"),
    "Longitude": observation.get("Longitude"),
    "Legislation": observation.get("Legislation"),
    "hour": hour,
    "month": month,
    "day_of_week": day_of_week,
    "Gender": observation.get("Gender"),
    "Officer-defined ethnicity": observation.get("Officer-defined ethnicity")
}

In [210]:
obs_dataframe

{'Type': 'x',
 'Part of a policing operation': False,
 'Age range': 'string',
 'Latitude': 0,
 'Longitude': 0,
 'Legislation': 'string',
 'hour': 15,
 'month': 12,
 'day_of_week': 'Friday',
 'Gender': 'string',
 'Officer-defined ethnicity': 'string'}

In [211]:
with open('columns.json') as fh:
    columns = json.load(fh)

pipeline = joblib.load('pipeline.pickle')

with open('dtypes.pickle', 'rb') as fh:
    dtypes = pickle.load(fh)

In [212]:
pd.DataFrame([obs_dataframe], columns=columns).astype(dtypes)

Unnamed: 0,Type,Part of a policing operation,Age range,Latitude,Longitude,Legislation,hour,month,day_of_week,Gender,Officer-defined ethnicity
0,x,False,string,0.0,0.0,string,15,12,Friday,string,string


In [213]:
pipeline.predict(pd.DataFrame([obs_dataframe], columns=columns).astype(dtypes))[0]

0

In [215]:
pipeline.predict_proba(pd.DataFrame([obs_dataframe], columns=columns).astype(dtypes))[0, 1]

0.2426666666666667

In [227]:
categorical_features = ['Type', 'Age range', 'Legislation', 'Gender', 'Officer-defined ethnicity']
observation=req_test
for column in categorical_features:
    observation[column] =str(observation[column]).strip().lower()

In [228]:
observation

{'observation_id': 'teste1 X',
 'Type': 'x',
 'Date': '2019-12-06T15:40:00+00:00',
 'Part of a policing operation': False,
 'Latitude': 0,
 'Longitude': 0,
 'Gender': 'string',
 'Age range': 'string',
 'Officer-defined ethnicity': 'strinxxg',
 'Legislation': 'string',
 'Object of search': 'strinAAg',
 'station': 'stringX'}

In [240]:
obs_dataframe['Officer-defined ethnicity'] = obs_dataframe['Officer-defined ethnicity'].replace('mixed', 'other')

In [236]:
type(obs_dataframe['Officer-defined ethnicity'])

str

In [237]:
type(obs_dataframe['Age range'])

str

In [242]:
obs_dataframe['Officer-defined ethnicity'].replace('mixed', 'other')

'string'

In [5]:
req_test ={
  "observation_id": "teste1 X",
  "Type": "x",
  "Date": "2019-12-0600:00",
  "Part of a policing operation": False,
  "Latitude": 0,
  "Longitude": 0,
  "Gender": "string",
  "Age range": "string",
  "Officer-defined ethnicity": "strinXXg",
  "Legislation": "string",
  "Object of search": "strinAAg",
  "station": "stringX"
}


transform_date(req_test)

(nan, nan, nan)

In [8]:
isinstance(null, str)

NameError: name 'null' is not defined