In [1]:
# Initial Dependencies
from numpy import loadtxt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd

In [2]:
# load dataset
df = pd.read_csv('mach_learn_df.csv')

df = df.drop(columns=['Unnamed: 0','City'])

# build X and y matrices
X = df.drop(['Offense_Type'], axis=1)
y = df[['Offense_Type']]

In [3]:
# Build Test/Train Split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [4]:
# Import CatBoostClassifier and Fit a Model
from catboost import Pool, CatBoostClassifier

# Set Features
train_data = X_train
eval_data = X_test

# Call Categorical Features
cat_features = [1, 2, 4, 6]

# Set Outcomes
train_label = y_train
eval_label = y_test

# Pool Training Data
train_dataset = Pool(data=train_data,
                     label=train_label,
                     cat_features=cat_features)

# Pool Testing Data
eval_dataset = Pool(data=eval_data,
                    label=eval_label,
                    cat_features=cat_features)

# Initialize CatBoostClassifier
model = CatBoostClassifier(iterations=100,
                           learning_rate=1,
                           depth=6,
                           thread_count=-1)

# Fit Model
model.fit(train_dataset, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 1.7614111	total: 2.23s	remaining: 3m 41s
1:	learn: 1.6128628	total: 4.79s	remaining: 3m 54s
2:	learn: 1.9080411	total: 6.61s	remaining: 3m 33s
3:	learn: 13.7224479	total: 8.13s	remaining: 3m 15s
4:	learn: 13.1294394	total: 9.51s	remaining: 3m
5:	learn: 33.1188030	total: 10.8s	remaining: 2m 49s
6:	learn: 48.5023179	total: 12.1s	remaining: 2m 40s
7:	learn: 53.6696737	total: 13.8s	remaining: 2m 39s
8:	learn: 50.0697539	total: 15.1s	remaining: 2m 32s
9:	learn: 77.5117865	total: 16.2s	remaining: 2m 25s
10:	learn: 65.7008673	total: 17.2s	remaining: 2m 19s
11:	learn: 65.3167188	total: 18.4s	remaining: 2m 15s
12:	learn: 92.0054860	total: 19.6s	remaining: 2m 11s
13:	learn: 113.9066850	total: 20.9s	remaining: 2m 8s
14:	learn: 81.0081002	total: 22.3s	remaining: 2m 6s
15:	learn: 137.2528200	total: 23.9s	remaining: 2m 5s
16:	learn: 52.7082448	total: 25.2s	remaining: 2m 3s
17:	learn: 66.2336325	total: 26.5s	remaining: 2m
18:	learn: 69.2839793	total: 27.7s	remaining: 1m 58s
19:	learn: 68.60

<catboost.core.CatBoostClassifier at 0x1a1c8e0410>

In [5]:
model.score(X_test, y_test)

0.37991998967608726

In [66]:
# Import CatBoostClassifier and Fit a Model
from catboost import Pool, CatBoostClassifier

# Set Features
train_data = X_train
eval_data = X_test

# Call Categorical Features
cat_features = [1, 2, 4, 6]

# Set Outcomes
train_label = y_train
eval_label = y_test

# Pool Training Data
train_dataset = Pool(data=train_data,
                     label=train_label,
                     cat_features=cat_features)

# Pool Testing Data
eval_dataset = Pool(data=eval_data,
                    label=eval_label,
                    cat_features=cat_features)

# Initialize CatBoostClassifier
model = CatBoostClassifier(iterations=500,
                           learning_rate=.5,
                           depth=3,
                           loss_function='MultiClass',
                           eval_metric='Accuracy',
                           custom_metric='F1',
                           use_best_model=True,
                           thread_count=-1)

# Fit Model
model.fit(train_dataset, plot=True, eval_set=eval_dataset)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.4576223	test: 0.4597109	best: 0.4597109 (0)	total: 1.69s	remaining: 14m 4s
1:	learn: 0.4838710	test: 0.4846432	best: 0.4846432 (1)	total: 3.44s	remaining: 14m 15s
2:	learn: 0.4858067	test: 0.4855981	best: 0.4855981 (2)	total: 4.57s	remaining: 12m 36s
3:	learn: 0.4850066	test: 0.4851207	best: 0.4855981 (2)	total: 5.62s	remaining: 11m 37s
4:	learn: 0.4871660	test: 0.4873661	best: 0.4873661 (4)	total: 6.79s	remaining: 11m 11s
5:	learn: 0.4880522	test: 0.4889534	best: 0.4889534 (5)	total: 7.88s	remaining: 10m 49s
6:	learn: 0.4900309	test: 0.4910053	best: 0.4910053 (6)	total: 8.82s	remaining: 10m 21s
7:	learn: 0.4905041	test: 0.4918570	best: 0.4918570 (7)	total: 9.94s	remaining: 10m 11s
8:	learn: 0.4913859	test: 0.4926184	best: 0.4926184 (8)	total: 11s	remaining: 9m 59s
9:	learn: 0.4925173	test: 0.4936637	best: 0.4936637 (9)	total: 12s	remaining: 9m 48s
10:	learn: 0.4923925	test: 0.4935088	best: 0.4936637 (9)	total: 12.9s	remaining: 9m 35s
11:	learn: 0.4927410	test: 0.4948122	be

93:	learn: 0.5038263	test: 0.5048006	best: 0.5048264 (91)	total: 2m 4s	remaining: 8m 56s
94:	learn: 0.5038650	test: 0.5048393	best: 0.5048393 (94)	total: 2m 5s	remaining: 8m 55s
95:	learn: 0.5037962	test: 0.5047232	best: 0.5048393 (94)	total: 2m 6s	remaining: 8m 54s
96:	learn: 0.5038392	test: 0.5046070	best: 0.5048393 (94)	total: 2m 8s	remaining: 8m 52s
97:	learn: 0.5038650	test: 0.5046458	best: 0.5048393 (94)	total: 2m 9s	remaining: 8m 51s
98:	learn: 0.5040199	test: 0.5047232	best: 0.5048393 (94)	total: 2m 10s	remaining: 8m 49s
99:	learn: 0.5040500	test: 0.5048006	best: 0.5048393 (94)	total: 2m 11s	remaining: 8m 47s
100:	learn: 0.5040457	test: 0.5048006	best: 0.5048393 (94)	total: 2m 13s	remaining: 8m 45s
101:	learn: 0.5039640	test: 0.5051361	best: 0.5051361 (101)	total: 2m 14s	remaining: 8m 44s
102:	learn: 0.5039683	test: 0.5051361	best: 0.5051361 (101)	total: 2m 15s	remaining: 8m 42s
103:	learn: 0.5039726	test: 0.5051361	best: 0.5051361 (101)	total: 2m 16s	remaining: 8m 40s
104:	lea

183:	learn: 0.5052932	test: 0.5060008	best: 0.5061814 (177)	total: 4m 5s	remaining: 7m 1s
184:	learn: 0.5052545	test: 0.5060008	best: 0.5061814 (177)	total: 4m 7s	remaining: 7m
185:	learn: 0.5052760	test: 0.5058072	best: 0.5061814 (177)	total: 4m 8s	remaining: 6m 59s
186:	learn: 0.5051813	test: 0.5059362	best: 0.5061814 (177)	total: 4m 10s	remaining: 6m 58s
187:	learn: 0.5052760	test: 0.5059750	best: 0.5061814 (177)	total: 4m 11s	remaining: 6m 58s
188:	learn: 0.5052373	test: 0.5060395	best: 0.5061814 (177)	total: 4m 13s	remaining: 6m 57s
189:	learn: 0.5052201	test: 0.5060653	best: 0.5061814 (177)	total: 4m 14s	remaining: 6m 55s
190:	learn: 0.5052502	test: 0.5061040	best: 0.5061814 (177)	total: 4m 16s	remaining: 6m 55s
191:	learn: 0.5052201	test: 0.5061298	best: 0.5061814 (177)	total: 4m 18s	remaining: 6m 54s
192:	learn: 0.5053104	test: 0.5059879	best: 0.5061814 (177)	total: 4m 19s	remaining: 6m 52s
193:	learn: 0.5052588	test: 0.5061943	best: 0.5061943 (193)	total: 4m 21s	remaining: 6m 

273:	learn: 0.5061019	test: 0.5063879	best: 0.5066847 (258)	total: 6m 18s	remaining: 5m 12s
274:	learn: 0.5061793	test: 0.5064008	best: 0.5066847 (258)	total: 6m 20s	remaining: 5m 10s
275:	learn: 0.5061836	test: 0.5063363	best: 0.5066847 (258)	total: 6m 21s	remaining: 5m 9s
276:	learn: 0.5061234	test: 0.5064137	best: 0.5066847 (258)	total: 6m 23s	remaining: 5m 8s
277:	learn: 0.5062567	test: 0.5064137	best: 0.5066847 (258)	total: 6m 24s	remaining: 5m 7s
278:	learn: 0.5062438	test: 0.5064008	best: 0.5066847 (258)	total: 6m 26s	remaining: 5m 6s
279:	learn: 0.5062137	test: 0.5064654	best: 0.5066847 (258)	total: 6m 28s	remaining: 5m 4s
280:	learn: 0.5063772	test: 0.5066073	best: 0.5066847 (258)	total: 6m 29s	remaining: 5m 3s
281:	learn: 0.5062955	test: 0.5066202	best: 0.5066847 (258)	total: 6m 32s	remaining: 5m 3s
282:	learn: 0.5062740	test: 0.5067364	best: 0.5067364 (282)	total: 6m 34s	remaining: 5m 2s
283:	learn: 0.5062137	test: 0.5068138	best: 0.5068138 (283)	total: 6m 36s	remaining: 5m 

363:	learn: 0.5070827	test: 0.5069041	best: 0.5071106 (301)	total: 8m 20s	remaining: 3m 6s
364:	learn: 0.5069493	test: 0.5068525	best: 0.5071106 (301)	total: 8m 21s	remaining: 3m 5s
365:	learn: 0.5069321	test: 0.5068396	best: 0.5071106 (301)	total: 8m 22s	remaining: 3m 3s
366:	learn: 0.5070569	test: 0.5069428	best: 0.5071106 (301)	total: 8m 23s	remaining: 3m 2s
367:	learn: 0.5070784	test: 0.5069428	best: 0.5071106 (301)	total: 8m 25s	remaining: 3m 1s
368:	learn: 0.5071687	test: 0.5068912	best: 0.5071106 (301)	total: 8m 26s	remaining: 2m 59s
369:	learn: 0.5070396	test: 0.5067493	best: 0.5071106 (301)	total: 8m 27s	remaining: 2m 58s
370:	learn: 0.5071042	test: 0.5066976	best: 0.5071106 (301)	total: 8m 29s	remaining: 2m 56s
371:	learn: 0.5071085	test: 0.5066976	best: 0.5071106 (301)	total: 8m 30s	remaining: 2m 55s
372:	learn: 0.5070655	test: 0.5066847	best: 0.5071106 (301)	total: 8m 31s	remaining: 2m 54s
373:	learn: 0.5071644	test: 0.5067105	best: 0.5071106 (301)	total: 8m 33s	remaining: 

453:	learn: 0.5077623	test: 0.5072267	best: 0.5072525 (450)	total: 10m 17s	remaining: 1m 2s
454:	learn: 0.5078182	test: 0.5073816	best: 0.5073816 (454)	total: 10m 18s	remaining: 1m 1s
455:	learn: 0.5077795	test: 0.5073687	best: 0.5073816 (454)	total: 10m 19s	remaining: 59.8s
456:	learn: 0.5077924	test: 0.5073558	best: 0.5073816 (454)	total: 10m 20s	remaining: 58.4s
457:	learn: 0.5079258	test: 0.5073042	best: 0.5073816 (454)	total: 10m 22s	remaining: 57.1s
458:	learn: 0.5079645	test: 0.5072784	best: 0.5073816 (454)	total: 10m 23s	remaining: 55.7s
459:	learn: 0.5079645	test: 0.5074074	best: 0.5074074 (459)	total: 10m 24s	remaining: 54.3s
460:	learn: 0.5079559	test: 0.5073687	best: 0.5074074 (459)	total: 10m 26s	remaining: 53s
461:	learn: 0.5079559	test: 0.5073687	best: 0.5074074 (459)	total: 10m 28s	remaining: 51.7s
462:	learn: 0.5078355	test: 0.5072784	best: 0.5074074 (459)	total: 10m 30s	remaining: 50.3s
463:	learn: 0.5078785	test: 0.5073300	best: 0.5074074 (459)	total: 10m 31s	remaini

<catboost.core.CatBoostClassifier at 0x1a305e1ed0>

In [34]:
# Cross Validation
from catboost import cv

cv_data = X_train

labels = y_train

cat_features = [1, 2, 4, 6]

cv_dataset = Pool(data=cv_data,
                  label=labels,
                  cat_features=cat_features)

params = {"iterations": 250,
          "depth": 3,
          "loss_function": "Logloss",
          "verbose": True}

scores = cv(cv_dataset,
            params,
            fold_count=2, 
            plot="True")

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

CatBoostError: catboost/private/libs/target/target_converter.cpp:380: Target with classes must contain only 2 unique values for binary classification

In [31]:
scores.head()

Unnamed: 0,iterations,test-MultiClass-mean,test-MultiClass-std,train-MultiClass-mean,train-MultiClass-std
0,0,2.137761,0.000163,2.137998,1.3e-05
1,1,2.086617,0.000274,2.0869,0.000553
2,2,2.04258,0.000206,2.043108,0.000218
3,3,2.002672,0.000741,2.003211,0.000687
4,4,1.966527,0.000666,1.967235,0.000517


In [4]:
from catboost import CatBoost

# Grid Search to Improve Model

cat_features = [1, 2, 4, 6]

train_data = Pool(data=X_train[:1000],
                  label=y_train[:1000],
                  cat_features=cat_features)

model = CatBoostClassifier(loss_function='MultiClass', custom_metric=['Accuracy','F1'], thread_count=-1)

grid = {'learning_rate': [0.03, 0.1, .5],
        'depth': [3, 6, 10],
        }

grid_search_result = model.grid_search(grid, 
                                       X=train_data, 
                                       plot=True,
                                       verbose=True)

NameError: name 'Pool' is not defined

In [65]:
grid_search_result

{'params': {'depth': 3, 'learning_rate': 0.5},
 'cv_results': defaultdict(list,
             {'iterations': [0,
               1,
               2,
               3,
               4,
               5,
               6,
               7,
               8,
               9,
               10,
               11,
               12,
               13,
               14,
               15,
               16,
               17,
               18,
               19,
               20,
               21,
               22,
               23,
               24,
               25,
               26,
               27,
               28,
               29,
               30,
               31,
               32,
               33,
               34,
               35,
               36,
               37,
               38,
               39,
               40,
               41,
               42,
               43,
               44,
               45,
               46,
               47,
    

In [52]:
# View Parameters in Model
print(model.get_all_params())

{'nan_mode': 'Min', 'eval_metric': 'Accuracy', 'combinations_ctr': ['Borders:CtrBorderCount=15:CtrBorderType=Uniform:TargetBorderCount=8:TargetBorderType=MinEntropy:Prior=0/1:Prior=0.5/1:Prior=1/1', 'Counter:CtrBorderCount=15:CtrBorderType=Uniform:Prior=0/1'], 'iterations': 250, 'sampling_frequency': 'PerTree', 'fold_permutation_block': 0, 'leaf_estimation_method': 'Newton', 'counter_calc_method': 'SkipTest', 'grow_policy': 'SymmetricTree', 'penalties_coefficient': 1, 'boosting_type': 'Plain', 'model_shrink_mode': 'Constant', 'feature_border_type': 'GreedyLogSum', 'ctr_leaf_count_limit': 18446744073709551615, 'bayesian_matrix_reg': 0.10000000149011612, 'one_hot_max_size': 2, 'l2_leaf_reg': 3, 'random_strength': 1, 'rsm': 1, 'boost_from_average': False, 'max_ctr_complexity': 4, 'model_size_reg': 0.5, 'simple_ctr': ['Borders:CtrBorderCount=15:CtrBorderType=Uniform:TargetBorderCount=8:TargetBorderType=MinEntropy:Prior=0/1:Prior=0.5/1:Prior=1/1', 'Counter:CtrBorderCount=15:CtrBorderType=Un

In [75]:
# View Feature Importances
print(model.feature_names_)
print(model.feature_importances_)

['Hour', 'Premise', 'Tract', 'Month', 'Day_of_Week', 'Temperature', 'Weather']
[11.92143028 52.30474141 21.48938952  3.29399789  4.61435664  3.31147863
  3.06460563]


In [67]:
# See Model Score
model.score(X_test, y_test)

0.5074332171893148

In [45]:
# # Save Model
# import pickle
# pickle.dump(model, open( "crime_CatBoost", "wb" ) )

In [46]:
# # Load Model
# cb = pickle.load( open( "crime_CatBoost", "rb" ) )
# cb.score(X_test, y_test)

0.5064137308039747

In [43]:
print(cb.get_all_params())

{'nan_mode': 'Min', 'eval_metric': 'AUC', 'combinations_ctr': ['Borders:CtrBorderCount=15:CtrBorderType=Uniform:TargetBorderCount=8:TargetBorderType=MinEntropy:Prior=0/1:Prior=0.5/1:Prior=1/1', 'Counter:CtrBorderCount=15:CtrBorderType=Uniform:Prior=0/1'], 'iterations': 250, 'sampling_frequency': 'PerTree', 'fold_permutation_block': 0, 'leaf_estimation_method': 'Newton', 'counter_calc_method': 'SkipTest', 'grow_policy': 'SymmetricTree', 'penalties_coefficient': 1, 'boosting_type': 'Plain', 'model_shrink_mode': 'Constant', 'feature_border_type': 'GreedyLogSum', 'ctr_leaf_count_limit': 18446744073709551615, 'bayesian_matrix_reg': 0.10000000149011612, 'one_hot_max_size': 2, 'l2_leaf_reg': 3, 'random_strength': 1, 'rsm': 1, 'boost_from_average': False, 'max_ctr_complexity': 4, 'model_size_reg': 0.5, 'simple_ctr': ['Borders:CtrBorderCount=15:CtrBorderType=Uniform:TargetBorderCount=8:TargetBorderType=MinEntropy:Prior=0/1:Prior=0.5/1:Prior=1/1', 'Counter:CtrBorderCount=15:CtrBorderType=Uniform

In [31]:
# Predict with Model
cb.predict(X_test[:1])[0][0]

'Theft'

In [17]:
# See Model Classes
classes = cb.classes_
classes

array(['Arson', 'Assault', 'Burglary/Robbery', 'Intimidation',
       'Kidnapping', 'Murder/Manslaughter', 'Sex Crime', 'Theft',
       'Vandalism'], dtype=object)

In [18]:
X_test[2:3]

Unnamed: 0,Hour,Premise,Tract,Month,Day_of_Week,Temperature,Weather
308922,15,"Highway, Road, Street, Alley",100000,5,Friday,80.22,Clouds


In [32]:
# Predict with Specific Inputs
cb.predict([15, 'Highway, Road, Street, Alley', '100000', 5, 'Friday', 80.22, 'Clouds'])
# cb.predict(X_test[2:3])[0]

array(['Assault'], dtype=object)

In [33]:
# Get Outcome Probabilities
proba_array = (cb.predict_proba([15, 'Highway, Road, Street, Alley', '100000', 5, 'Friday', 80.22, 'Clouds'])*100)
# proba_array = (cb.predict_proba(X_test[2:3])*100)
proba_array

array([ 0.09429496, 36.47124917,  5.88076777, 11.07312943,  0.06090333,
        0.11054445,  0.59674195, 35.41300935, 10.29935959])

In [34]:
# Zip Outcomes with Probabilities 
class_probs = zip(classes, proba_array)

for offense, prob in class_probs:
    print(f'Crime: {offense} - Probability: {"%.2f"%(prob)}%')

Crime: Arson - Probability: 0.09%
Crime: Assault - Probability: 36.47%
Crime: Burglary/Robbery - Probability: 5.88%
Crime: Intimidation - Probability: 11.07%
Crime: Kidnapping - Probability: 0.06%
Crime: Murder/Manslaughter - Probability: 0.11%
Crime: Sex Crime - Probability: 0.60%
Crime: Theft - Probability: 35.41%
Crime: Vandalism - Probability: 10.30%


In [None]:
# import warnings
# warnings.filterwarnings("ignore")

# # load libraries
# from sklearn import datasets
# from sklearn.model_selection import train_test_split
# from sklearn.model_selection import GridSearchCV
# from catboost import Pool, CatBoostClassifier

# # load the dataset
# train_data = X_train

# eval_data = X_test

# cat_features = [1, 2, 4, 6]

# train_label = y_train
# eval_label = y_test


# train_dataset = Pool(data=train_data,
#                      label=train_label,
#                      cat_features=cat_features)

# eval_dataset = Pool(data=eval_data,
#                     label=eval_label,
#                     cat_features=cat_features)

# # Initialize CatBoostClassifier
# model = CatBoostClassifier()

# parameters = {'depth'         : [3, 6,8,10],
#               'learning_rate' : [0.05, 0.1, .5],
#               'iterations'    : [30, 50, 100]
#              }
# grid = GridSearchCV(estimator=model, param_grid = parameters, n_jobs=-1)
# grid.fit(X_train, y_train, cat_features=cat_features)    

# # Results from Grid Search
# print("\n========================================================")
# print(" Results from Grid Search " )
# print("========================================================")    

# print("\n The best estimator across ALL searched params:\n",
#       grid.best_estimator_)

# print("\n The best score across ALL searched params:\n",
#       grid.best_score_)

# print("\n The best parameters across ALL searched params:\n",
#       grid.best_params_)

# print("\n ========================================================")