In [1]:
import numpy as np
import pandas as pd
import math
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

# load dataset
df = pd.read_csv('mach_learn_df.csv')

df = df.drop(columns=['Unnamed: 0','City'])

# build X and y matrices
X = df.drop(['Offense_Type'], axis=1)
y = df[['Offense_Type']].values.reshape(-1)



In [2]:
from sklearn.preprocessing import LabelEncoder

# Step 1: Label-encode data set
label_encoder = LabelEncoder()
encoded_y = label_encoder.fit(y)
transformed_y = label_encoder.transform(y)

transformed_y

array([2, 2, 2, ..., 7, 3, 7])

In [3]:
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

class MultiColumnLabelEncoder:
    def __init__(self,columns = None):
        self.columns = columns # array of column names to encode

    def fit(self,X,y=None):
        return self # not relevant here

    def transform(self,X):
        '''
        Transforms columns of X specified in self.columns using
        LabelEncoder(). If no columns specified, transforms all
        columns in X.
        '''
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            for colname,col in output.iteritems():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self,X,y=None):
        return self.fit(X,y).transform(X)
    
encoded_X = MultiColumnLabelEncoder(columns = ['Day_of_Week', 'Premise', 'Tract', 'Weather']).fit_transform(X)

encoded_X

Unnamed: 0,Hour,Premise,Tract,Month,Day_of_Week,Temperature,Weather
0,0,125,156,1,1,46.56,1
1,0,9,180,1,1,46.56,1
2,0,131,385,1,1,46.56,1
3,0,131,204,1,1,46.56,1
4,0,33,153,1,1,46.56,1
...,...,...,...,...,...,...,...
309954,23,126,491,5,3,74.26,6
309955,23,126,486,5,3,74.26,6
309956,23,18,486,5,3,74.26,6
309957,23,126,361,5,3,74.26,6


In [4]:
import numpy as np
import pandas as pd
import pickle
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import f1_score, accuracy_score
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import RandomizedSearchCV, KFold, train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
import tqdm

def evaluate(pipeline, X_train, X_test, y_train, y_test):
    '''
    Evaluate a pipeline on training and test datasets
    '''    
    pipeline.fit(X_train, y_train)
    y_train_hat = pipeline.predict(X_train)
    y_test_hat = pipeline.predict(X_test)
    train_f1 = f1_score(y_train_hat, y_train, average='weighted')
    train_acc = accuracy_score(y_train_hat, y_train)
    test_f1 = f1_score(y_test_hat, y_test, average='weighted')
    test_acc = accuracy_score(y_test_hat, y_test)

    print(f"========== Predictor: {type(pipeline).__name__} ==========")
    print(f"Training result: f1: {train_f1:.3f}, acc: {train_acc:.3f}")
    print(f"Test result: f1: {test_f1:.3f}, acc: {test_acc:.3f}")
    print()

# split to training and testing sets
X_train, X_test, y_train, y_test = train_test_split(encoded_X, transformed_y, test_size=0.2, random_state=42, stratify=y)

# try LogisticRegression to establish a baseline performance
pipeline = Pipeline([
    ('scale', StandardScaler()), # remember to scale first before feeding data into lgr
    ('lgr', LogisticRegression()),
])
evaluate(pipeline, X_train, X_test, y_train, y_test)

# try other predictors
evaluate(XGBClassifier(n_jobs=-1), X_train, X_test, y_train, y_test)
evaluate(LGBMClassifier(n_jobs=-1), X_train, X_test, y_train, y_test)
evaluate(RandomForestClassifier(n_jobs=-1), X_train, X_test, y_train, y_test)
evaluate(GradientBoostingClassifier(), X_train, X_test, y_train, y_test)


Training result: f1: 0.606, acc: 0.439
Test result: f1: 0.606, acc: 0.439

Training result: f1: 0.589, acc: 0.524
Test result: f1: 0.572, acc: 0.505

Training result: f1: 0.573, acc: 0.506
Test result: f1: 0.568, acc: 0.500

Training result: f1: 0.963, acc: 0.962
Test result: f1: 0.482, acc: 0.447

Training result: f1: 0.569, acc: 0.494
Test result: f1: 0.568, acc: 0.492



In [8]:
# RandomizedSearchCV on XGB
xgb_param_grid = {
    'n_estimators': [10, 20, 50, 100, 200, 300, 400],
    'max_depth': np.arange(5, 20),
    'learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2],
    'subsample': np.arange(0.5, 1.0, 0.05),
    'min_child_weight': np.arange(1, 10),
    'colsample_bytree': np.arange(0.2, 1.0, 0.1),
    'gamma': [0, 0.001, 0.002, 0.003, 0.004, 0.005, 1e-2],
    'n_jobs': [-1]
}

predictor = XGBClassifier()
rs = RandomizedSearchCV(predictor, xgb_param_grid, cv=5, scoring='accuracy', n_jobs=-1, n_iter=20, verbose=1)
rs.fit(X_train[:5000], y_train[:5000])
evaluate(rs.best_estimator_, X_train[:5000], X_test[:5000], y_train[:5000], y_test[:5000])


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  3.4min finished


Training result: f1: 0.581, acc: 0.520
Test result: f1: 0.534, acc: 0.470



In [11]:
# evaluate model with kfold
kfold = KFold(n_splits=10)
results = cross_val_score(rs.best_estimator_, X_test[:5000], y_test[:5000], cv=kfold, n_jobs=-1)
print("Results: %.2f (%.2f) accuracy" % (results.mean(), results.std()))

Results: 0.48 (0.03) accuracy
