In [3]:
# First XGBoost model for the crime offense-type dataset (mach_learn_df.csv)
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd

In [4]:
# Read the modelling table and discard the two columns that are not used as features.
df = pd.read_csv('mach_learn_df.csv').drop(columns=['Unnamed: 0', 'City'])

# Feature matrix: every remaining column except the target.
X = df.drop(columns=['Offense_Type'])
# Target vector: the Offense_Type column flattened to a 1-D array.
y = df[['Offense_Type']].to_numpy().ravel()

In [5]:
from sklearn.preprocessing import LabelEncoder

# Step 1: turn the target labels in y into integer class ids.
label_encoder = LabelEncoder()
# fit() hands back the fitted encoder, so encoded_y refers to the encoder object.
encoded_y = label_encoder.fit(y)
transformed_y = encoded_y.transform(y)

transformed_y

array([2, 2, 2, ..., 7, 3, 7])

In [6]:
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

class MultiColumnLabelEncoder:
    """Label-encode several DataFrame columns in a single call.

    Exposes ``fit`` / ``transform`` / ``fit_transform`` methods. ``fit`` is a
    no-op: every ``transform`` call fits a brand-new ``LabelEncoder`` per
    column on the frame it is given.

    NOTE: because encoders are refit on each call, the integer codes are
    derived only from the frame passed to that call; two different frames
    are not guaranteed to share the same label-to-code mapping.
    """

    def __init__(self, columns=None):
        """Store the column names to encode (``None`` means every column)."""
        self.columns = columns

    def fit(self, X, y=None):
        """Return ``self`` unchanged; no state is learned here."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the selected columns label-encoded.

        Parameters
        ----------
        X : pandas.DataFrame
            Input frame. It is copied, so the caller's frame is not modified.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X`` in which each column listed in ``self.columns``
            (or every column when ``self.columns`` is ``None``) has been
            replaced by the output of ``LabelEncoder().fit_transform``.
        """
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            # DataFrame.iteritems() was removed in pandas 2.0; items() yields
            # the same (column name, Series) pairs.
            for colname, col in output.items():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self, X, y=None):
        """Call ``fit`` followed by ``transform`` on the same frame."""
        return self.fit(X, y).transform(X)
    
# Label-encode the four listed feature columns; the encoder copies X first.
categorical_encoder = MultiColumnLabelEncoder(
    columns=['Day_of_Week', 'Premise', 'Tract', 'Weather']
)
encoded_X = categorical_encoder.fit_transform(X)

encoded_X

Unnamed: 0,Hour,Premise,Tract,Month,Day_of_Week,Temperature,Weather
0,0,125,156,1,1,46.56,1
1,0,9,180,1,1,46.56,1
2,0,131,385,1,1,46.56,1
3,0,131,204,1,1,46.56,1
4,0,33,153,1,1,46.56,1
...,...,...,...,...,...,...,...
309954,23,126,491,5,3,74.26,6
309955,23,126,486,5,3,74.26,6
309956,23,18,486,5,3,74.26,6
309957,23,126,361,5,3,74.26,6


In [7]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
test_size = 0.2
seed = 7
X_train, X_test, y_train, y_test = train_test_split(
    encoded_X,
    transformed_y,
    test_size=test_size,
    random_state=seed,
)


In [8]:
# Fit an XGBoost classifier with default settings on the training data.
model = XGBClassifier()
model.fit(X=X_train, y=y_train)


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [13]:
# Predict on the held-out rows, then round each predicted value.
y_pred = model.predict(X_test)
predictions = list(map(round, y_pred))
# model.score(X_test, y_test)

In [None]:
from sklearn.model_selection import RandomizedSearchCV, KFold, train_test_split, cross_val_score

# RandomizedSearchCV on XGB: sample 10 hyperparameter combinations from the
# grid below and score each with 5-fold cross-validated accuracy.
# NOTE(review): each list is a set of discrete candidate values. Lists such as
# [0.5, 1.0, 0.05] look like (start, stop, step) triples -- confirm that 0.05
# is really meant to be a candidate value for 'subsample'.
xgb_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 6, 7, 8],
    'learning_rate': [.3, .33, .35, .4],
    'subsample': [0.5, 1.0, 0.05],
    'min_child_weight': [1, 2, 3, 10],
    'colsample_bytree': [0.2, 1.0, 0.5],
    'gamma': [0, 0.001, 0.002],
    'n_jobs': [-1]
}

predictor = XGBClassifier()
rs = RandomizedSearchCV(
    predictor,
    xgb_param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    n_iter=10,
    verbose=1,
    # Reuse the seed from the train/test split cell so the same candidates
    # are sampled on every run.
    random_state=seed,
)
rs.fit(X_train, y_train)
# NOTE(review): `evaluate` is not defined in the cells visible here -- confirm
# it is defined earlier in the notebook before running this cell.
evaluate(rs.best_estimator_, X_train, X_test, y_train, y_test)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


In [14]:
# Compare the predictions with the held-out labels and report accuracy as a percentage.
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print("Accuracy: %.2f%%" % (100.0 * accuracy,))

Accuracy: 50.60%


In [17]:
import joblib

# Save the fitted model to disk with joblib.
# A plain string literal is used: the original f-string had no placeholders.
with open('crime_xgb_model.sav', 'wb') as f:
    joblib.dump(model, f)

In [18]:
# Load the saved model back from disk.
xgb_model = joblib.load(filename='crime_xgb_model.sav')

In [19]:
# Sanity check: run the reloaded model on the first five held-out rows.
xgb_model.predict(X_test.iloc[:5])

array([7, 7, 1, 1, 7])