In [1]:
# Report the GPU attached to this runtime (model, driver / CUDA version, memory in use).
!nvidia-smi

Sat Dec  4 16:26:16 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   53C    P8    30W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
# Install the third-party packages used below (-q keeps pip quiet).
# NOTE(review): no versions are pinned, so a fresh run may resolve different releases.
!pip install -q miceforest catboost scikit-learn optuna

[K     |████████████████████████████████| 69 kB 4.5 MB/s 
[K     |████████████████████████████████| 76.3 MB 1.1 MB/s 
[K     |████████████████████████████████| 308 kB 61.6 MB/s 
[K     |████████████████████████████████| 209 kB 80.6 MB/s 
[K     |████████████████████████████████| 80 kB 8.4 MB/s 
[K     |████████████████████████████████| 75 kB 3.7 MB/s 
[K     |████████████████████████████████| 149 kB 46.3 MB/s 
[K     |████████████████████████████████| 49 kB 5.1 MB/s 
[K     |████████████████████████████████| 112 kB 87.8 MB/s 
[?25h  Building wheel for pyperclip (setup.py) ... [?25l[?25hdone


In [32]:
# Install the Dask / Saturn Cloud client stack; dask, distributed, tornado and numpy are pinned.
# NOTE(review): dask-saturn itself is unpinned, and replacing numpy in a live runtime
# presumably needs a runtime restart to take effect — confirm.
!pip install dask-saturn dask==2.30.0 distributed==2.30.0 tornado==6.1 numpy==1.20




In [33]:
# import relevant packages

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import miceforest as mf

import requests
import io

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score


In [6]:
url = "https://getsubsalt.com/jobs/data-science/challenge/crime.csv"

# Bound the wait and fail loudly on an HTTP error; otherwise an error page
# would be written to disk and parsed as if it were the dataset.
response = requests.get(url, timeout=60)
response.raise_for_status()

# Keep a local copy so the file can be re-read without downloading again.
with open('crime.csv', 'wb') as f:
    f.write(response.content)

# low_memory=False makes pandas infer each column's dtype from the whole file at once.
crime_df = pd.read_csv('crime.csv', low_memory=False)
crime_df.head()


Unnamed: 0,date,primary_type,location_description,district,ward,domestic,arrest
0,2005-01-01 01:00:00 UTC,BATTERY,STREET,2.0,3.0,False,False
1,2005-01-01 01:00:00 UTC,WEAPONS VIOLATION,RESIDENCE,5.0,?,False,False
2,2005-01-01 01:00:00 UTC,CRIMINAL DAMAGE,?,2.0,3.0,False,False
3,2005-01-01 01:00:00 UTC,THEFT,DEPARTMENT STORE,1.0,42.0,False,True
4,2005-01-01 01:00:00 UTC,THEFT,BAR OR TAVERN,18.0,42.0,?,False


In [7]:
# Inspect the inferred dtype of every column.
crime_df.dtypes

date                    object
primary_type            object
location_description    object
district                object
ward                    object
domestic                object
arrest                  object
dtype: object

In [8]:
# Summary statistics per column (count / unique / top / freq).
crime_df.describe()

Unnamed: 0,date,primary_type,location_description,district,ward,domestic,arrest
count,1000000,1000000,999947,999970.0,999990.0,1000000,1000000
unique,409892,31,127,24.0,51.0,3,3
top,2008-01-01 12:01:00 UTC,THEFT,STREET,8.0,28.0,False,False
freq,178,195422,268317,71183.0,45617.0,884476,706651


In [9]:
# Number of NaN entries in each column.
crime_df.isnull().sum()

date                     0
primary_type             0
location_description    53
district                30
ward                    10
domestic                 0
arrest                   0
dtype: int64

In [10]:
# Count the literal '?' placeholders in each column; they are not picked up by isnull().
crime_df.isin(['?']).sum(axis=0)

date                    2
primary_type            2
location_description    2
district                2
ward                    2
domestic                2
arrest                  2
dtype: int64

In [24]:
# Turn the '?' placeholders into real missing values so pandas treats them as NaN.
# np.nan is the canonical spelling; the np.NaN alias no longer exists in NumPy 2.x.
crime_df_new = crime_df.replace('?', np.nan)
# Sanity check: no '?' should be left in any column.
crime_df_new.isin(['?']).sum(axis=0)


date                    0
primary_type            0
location_description    0
district                0
ward                    0
domestic                0
arrest                  0
dtype: int64

In [25]:
# compute percent of missing values by column

n_rows = len(crime_df)
null_counts = crime_df_new.isnull().sum()

# Share of missing entries per column, expressed in percent.
percent_missing = pd.DataFrame(null_counts / n_rows * 100, columns=['pct_missing'])
percent_missing

Unnamed: 0,pct_missing
date,0.0002
primary_type,0.0002
location_description,0.0055
district,0.0032
ward,0.0012
domestic,0.0002
arrest,0.0002


In [26]:
# Number of distinct non-missing values per column (NaN is excluded via dropna=True).
print(crime_df_new.nunique(dropna=True))

date                    409891
primary_type                30
location_description       126
district                    23
ward                        50
domestic                     2
arrest                       2
dtype: int64


In [28]:
# Split date column into year, month, day, weekday and hour

def split_datetime(df, column):
    """Expand a timestamp column into separate calendar features.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column``. It is left untouched; earlier versions
        converted ``column`` and added the new columns on the caller's frame.
    column : str
        Name of the column to parse with ``pd.to_datetime``.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``column`` and with ``year``, ``month``, ``day``,
        ``weekday`` and ``hour`` appended, in that order. Rows whose
        timestamp is missing get NaN in every derived column.
    """
    timestamps = pd.to_datetime(df[column])
    # drop() returns a new frame and assign() builds on that copy, so the
    # input frame is never mutated and the call can safely be repeated.
    return df.drop(column, axis=1).assign(
        year=timestamps.dt.year,
        month=timestamps.dt.month,
        day=timestamps.dt.day,
        weekday=timestamps.dt.weekday,
        hour=timestamps.dt.hour,
    )

In [29]:
# Replace 'date' with year / month / day / weekday / hour columns.
# NOTE(review): the result is bound back to the same name, so re-running this cell
# fails because 'date' no longer exists — re-run the cell that builds crime_df_new first.
crime_df_new = split_datetime(crime_df_new, 'date')

In [30]:
# Preview the frame after the date split.
crime_df_new.head()

Unnamed: 0,primary_type,location_description,district,ward,domestic,arrest,year,month,day,weekday,hour
0,BATTERY,STREET,2.0,3.0,False,False,2005.0,1.0,1.0,5.0,1.0
1,WEAPONS VIOLATION,RESIDENCE,5.0,,False,False,2005.0,1.0,1.0,5.0,1.0
2,CRIMINAL DAMAGE,,2.0,3.0,False,False,2005.0,1.0,1.0,5.0,1.0
3,THEFT,DEPARTMENT STORE,1.0,42.0,False,True,2005.0,1.0,1.0,5.0,1.0
4,THEFT,BAR OR TAVERN,18.0,42.0,,False,2005.0,1.0,1.0,5.0,1.0


In [31]:
# Distinct non-missing values per column, now including the derived date parts.
print(crime_df_new.nunique(dropna=True))

primary_type             30
location_description    126
district                 23
ward                     50
domestic                  2
arrest                    2
year                      4
month                    12
day                      31
weekday                   7
hour                     12
dtype: int64


In [33]:
# Cast every column (including the numeric date parts) to pandas 'category' dtype;
# the imputation kernel further down is created with categorical_feature='auto'.
crime_df_new = crime_df_new.astype("category")
# Confirm the conversion.
crime_df_new.dtypes

primary_type            category
location_description    category
district                category
ward                    category
domestic                category
arrest                  category
year                    category
month                   category
day                     category
weekday                 category
hour                    category
dtype: object

In [22]:
# Build LightGBM from source with the GPU option enabled.
# NOTE(review): the recorded output starts with a pip message raised from
# cmdoptions.check_install_build_global, i.e. pip complains about --install-option;
# newer pip releases may reject the flag outright — confirm before re-running.
!pip install lightgbm --install-option=--gpu

  cmdoptions.check_install_build_global(options)
Collecting lightgbm
  Downloading lightgbm-3.3.1.tar.gz (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 5.2 MB/s 
Skipping wheel build for lightgbm, due to binaries being disabled for it.
Installing collected packages: lightgbm
    Running setup.py install for lightgbm ... [?25l[?25hdone
Successfully installed lightgbm-3.3.1


In [35]:
%%time
# Multiple Imputation of missing values using miceforest

kernel = mf.ImputationKernel(
    crime_df_new,
    datasets=2,
    data_subset = 0.75,
    mean_match_candidates=5,
    categorical_feature='auto',
    save_all_iterations=True,
    random_state=21
)

kernel.mice(iterations=2,
            variable_parameters={'location_description': {'n_estimators': 25},
                                 'primary_type': {'n_estimators': 25},
                                 'ward': {'n_estimators': 25},
                                 'district': {'n_estimators': 25},
                                 'day': {'n_estimators': 25}},
            n_estimators=50,
            device_type= 'gpu',
            verbose=True
            )

print(kernel)

Dataset 0
1  | primary_type | domestic | arrest | year | month | day | weekday | hour | ward | district | location_description
2  | primary_type | domestic | arrest | year | month | day | weekday | hour | ward | district | location_description
Dataset 1
1  | primary_type | domestic | arrest | year | month | day | weekday | hour | ward | district | location_description
2  | primary_type | domestic | arrest | year | month | day | weekday | hour | ward | district | location_description
              Class: ImputationKernel
           Datasets: 2
         Iterations: 2
  Imputed Variables: 11
save_all_iterations: True
CPU times: user 57min 46s, sys: 52.1 s, total: 58min 38s
Wall time: 30min 2s


In [36]:
# Pull completed dataset 0 out of the kernel (inplace=False, so the result is returned).
imputed_df = kernel.complete_data(dataset=0, inplace=False)
# Verify that no missing values remain in any column.
print(imputed_df.isnull().sum(0))

primary_type            0
location_description    0
district                0
ward                    0
domestic                0
arrest                  0
year                    0
month                   0
day                     0
weekday                 0
hour                    0
dtype: int64


In [37]:
# Display the imputed frame (pandas truncates the middle rows).
imputed_df

Unnamed: 0,primary_type,location_description,district,ward,domestic,arrest,year,month,day,weekday,hour
0,BATTERY,STREET,2.0,3.0,False,False,2005.0,1.0,1.0,5.0,1.0
1,WEAPONS VIOLATION,RESIDENCE,5.0,9.0,False,False,2005.0,1.0,1.0,5.0,1.0
2,CRIMINAL DAMAGE,CHURCH / SYNAGOGUE / PLACE OF WORSHIP,2.0,3.0,False,False,2005.0,1.0,1.0,5.0,1.0
3,THEFT,DEPARTMENT STORE,1.0,42.0,False,True,2005.0,1.0,1.0,5.0,1.0
4,THEFT,BAR OR TAVERN,18.0,42.0,False,False,2005.0,1.0,1.0,5.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
999995,OTHER OFFENSE,STREET,2.0,3.0,False,False,2008.0,12.0,31.0,2.0,12.0
999996,BATTERY,STREET,1.0,42.0,True,False,2008.0,12.0,31.0,2.0,12.0
999997,MOTOR VEHICLE THEFT,STREET,7.0,17.0,False,False,2008.0,12.0,31.0,2.0,12.0
999998,BURGLARY,RESIDENCE,6.0,17.0,False,False,2008.0,12.0,31.0,2.0,12.0


In [3]:
# Connect the Google Colab notebook to Google Drive.
from google.colab import drive
drive.mount('/content/drive')

# One-off export of the imputed frame, presumably commented out once the file existed.
# NOTE(review): the next cell reads this CSV, so on a fresh Drive it has to be run once.
#imputed_df.to_csv('/content/drive/My Drive/Colab Notebooks/imputed_df.csv')

Mounted at /content/drive


In [4]:
# Reload the imputed frame from Drive instead of repeating the 30-minute imputation.
# NOTE(review): values round-trip through CSV, so dtypes may differ from the in-memory
# frame (e.g. 'arrest' / 'domestic' may come back as bool rather than str) — confirm.
imputed_df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/imputed_df.csv', index_col=0)


In [6]:
# Plot the imputation models' feature importances for dataset 0 on a 10x10 inch figure.
# NOTE(review): this needs the in-memory `kernel` from the imputation cell; the recorded
# run raised NameError, presumably because the runtime had been restarted — confirm.
f = plt.figure(figsize = (10, 10))
kernel.plot_feature_importance(dataset=0, annot=True,cmap="YlGnBu", vmin=0, vmax=1)

NameError: ignored

<Figure size 720x720 with 0 Axes>

In [34]:
# Display the reloaded frame to check that it matches the one produced by the kernel.
imputed_df

Unnamed: 0,primary_type,location_description,district,ward,domestic,arrest,year,month,day,weekday,hour
0,BATTERY,STREET,2.0,3.0,False,False,2005.0,1.0,1.0,5.0,1.0
1,WEAPONS VIOLATION,RESIDENCE,5.0,9.0,False,False,2005.0,1.0,1.0,5.0,1.0
2,CRIMINAL DAMAGE,CHURCH / SYNAGOGUE / PLACE OF WORSHIP,2.0,3.0,False,False,2005.0,1.0,1.0,5.0,1.0
3,THEFT,DEPARTMENT STORE,1.0,42.0,False,True,2005.0,1.0,1.0,5.0,1.0
4,THEFT,BAR OR TAVERN,18.0,42.0,False,False,2005.0,1.0,1.0,5.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
999995,OTHER OFFENSE,STREET,2.0,3.0,False,False,2008.0,12.0,31.0,2.0,12.0
999996,BATTERY,STREET,1.0,42.0,True,False,2008.0,12.0,31.0,2.0,12.0
999997,MOTOR VEHICLE THEFT,STREET,7.0,17.0,False,False,2008.0,12.0,31.0,2.0,12.0
999998,BURGLARY,RESIDENCE,6.0,17.0,False,False,2008.0,12.0,31.0,2.0,12.0


In [6]:
# Features: every column except the target.
X = imputed_df.drop('arrest', axis=1)

# Target: 1 for an arrest, 0 otherwise.
# 'arrest' holds the strings 'True'/'False' when it comes straight from the imputation
# kernel, but read_csv typically parses it back as real booleans. Comparing a bool with
# the string 'True' is always False and would silently yield an all-zero target, so
# normalise to str first; this handles str, bool and category dtypes alike.
y = imputed_df['arrest'].astype(str).eq('True').astype(int)

In [7]:
# Columns that hold whole numbers; cast each one to int64.
integer_columns = ['district', 'ward', 'year', 'month', 'day', 'weekday', 'hour']
for column_name in integer_columns:
    X[column_name] = X[column_name].astype(np.int64)

In [35]:
# Hold out 25% of the rows, keeping the class ratio of y in both parts; seeded for repeatability.
split_options = dict(test_size=0.25, stratify=y, random_state=21)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [3]:
# Size of the training target.
y_train.shape

NameError: ignored

In [9]:
from catboost import CatBoostClassifier, Pool


In [10]:
# Every feature is handed to CatBoost as categorical.
cat_cols = X.columns.tolist()

In [11]:
# Show the list of categorical feature names.
cat_cols

['primary_type',
 'location_description',
 'district',
 'ward',
 'domestic',
 'year',
 'month',
 'day',
 'weekday',
 'hour']

In [12]:

# Wrap the train and hold-out splits in CatBoost Pools with the same categorical columns.
train_data = Pool(X_train, label=y_train, cat_features=cat_cols)
eval_dataset = Pool(X_test, label=y_test, cat_features=cat_cols)


In [109]:
# Baseline CatBoost model trained on the GPU.
# devices: the nvidia-smi cell reports a single GPU with index 0, and '0.1' is not one
# of CatBoost's documented device formats ('0', '0:1', '0-3'), so name device 0 explicitly.
# random_seed: fixed to the same value used elsewhere in the notebook so reruns are comparable.
model = CatBoostClassifier(iterations=100,
                           task_type="GPU",
                           devices='0',
                           random_seed=21)

# The hold-out pool is only used for monitoring the loss per iteration.
model.fit(train_data,
          eval_set=eval_dataset,
          verbose=True)

Learning rate set to 0.105231
0:	learn: 0.6371158	test: 0.6373083	best: 0.6373083 (0)	total: 26.3ms	remaining: 2.6s
1:	learn: 0.5918764	test: 0.5922312	best: 0.5922312 (1)	total: 50.7ms	remaining: 2.49s
2:	learn: 0.5548523	test: 0.5553547	best: 0.5553547 (2)	total: 76.9ms	remaining: 2.49s
3:	learn: 0.5243096	test: 0.5248995	best: 0.5248995 (3)	total: 102ms	remaining: 2.45s
4:	learn: 0.4988896	test: 0.4995848	best: 0.4995848 (4)	total: 139ms	remaining: 2.64s
5:	learn: 0.4776286	test: 0.4783797	best: 0.4783797 (5)	total: 165ms	remaining: 2.59s
6:	learn: 0.4597865	test: 0.4606360	best: 0.4606360 (6)	total: 191ms	remaining: 2.54s
7:	learn: 0.4447565	test: 0.4456248	best: 0.4456248 (7)	total: 216ms	remaining: 2.49s
8:	learn: 0.4321362	test: 0.4329430	best: 0.4329430 (8)	total: 248ms	remaining: 2.5s
9:	learn: 0.4213968	test: 0.4222327	best: 0.4222327 (9)	total: 271ms	remaining: 2.44s
10:	learn: 0.4123522	test: 0.4132348	best: 0.4132348 (10)	total: 294ms	remaining: 2.38s
11:	learn: 0.4045301	

<catboost.core.CatBoostClassifier at 0x7f65c5436a90>

In [56]:
# List every parameter of the fitted model, including the ones CatBoost filled in itself.
model.get_all_params()

NameError: ignored

In [39]:
import os
from getpass import getpass

os.environ["SATURN_BASE_URL"] = "https://app.community.saturnenterprise.io"

# Never store the API token in the notebook: anyone with read access to the file
# (or its git history) can use it. Take it from the environment if it is already
# set, otherwise prompt for it without echoing. The token that used to be
# hard-coded here should be treated as leaked and revoked.
if not os.environ.get("SATURN_TOKEN"):
    os.environ["SATURN_TOKEN"] = getpass("Saturn Cloud API token: ")


In [45]:
from dask_saturn import SaturnCluster
from dask.distributed import Client

# Create the Saturn Cloud cluster handle and connect a Dask client to it.
# NOTE(review): SaturnCluster() takes no arguments here, so it presumably reads the
# SATURN_BASE_URL / SATURN_TOKEN environment variables set in the previous cell — confirm.
cluster = SaturnCluster()
client = Client(cluster)
# Display the client summary.
client

INFO:dask-saturn:Cluster is ready
INFO:dask-saturn:Registering default plugins
ERROR:asyncio:Task exception was never retrieved
future: <Task finished coro=<connect.<locals>._() done, defined at /usr/local/lib/python3.7/dist-packages/distributed/comm/core.py:288> exception=CommClosedError()>
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/distributed/comm/core.py", line 297, in _
    handshake = await asyncio.wait_for(comm.read(), 1)
  File "/usr/lib/python3.7/asyncio/tasks.py", line 435, in wait_for
    await waiter
concurrent.futures._base.CancelledError

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/distributed/comm/core.py", line 304, in _
    raise CommClosedError() from e
distributed.comm.core.CommClosedError
ERROR:asyncio:Task exception was never retrieved
future: <Task finished coro=<connect.<locals>._() done, defined at /usr/local/lib/pyth

OSError: ignored

In [41]:
import optuna
from optuna.samplers import TPESampler
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier

In [42]:
# Ship the training data to the cluster once. Scattering a list returns one future
# per element, so unpack them; `objective` needs both X_train_f and y_train_f.
X_train_f, y_train_f = client.scatter([X_train, y_train])

AttributeError: ignored

In [25]:
def fit_reg(train_idx, val_idx, train_x, train_y, cat_cols, params):
    """Fit CatBoost on one cross-validation fold and return the validation ROC AUC.

    Parameters
    ----------
    train_idx, val_idx : array-like of int
        Positional row indices of the training and validation part of the fold.
    train_x : pandas.DataFrame
        Feature matrix the indices refer to.
    train_y : pandas.Series
        Binary target aligned row-for-row with ``train_x``.
    cat_cols : list of str or None
        Columns passed to CatBoost as categorical features. ``None`` means
        every column of ``train_x``.
    params : dict
        Extra ``CatBoostClassifier`` keyword arguments, i.e. the
        hyper-parameters being evaluated.

    Returns
    -------
    float
        ROC AUC of the predicted positive-class probability on the
        validation rows.
    """
    # Slice the data that was passed in. Reading notebook globals instead would
    # ignore the arguments and break when the function runs on a remote worker.
    fold_train_x, fold_val_x = train_x.iloc[train_idx], train_x.iloc[val_idx]
    fold_train_y, fold_val_y = train_y.iloc[train_idx], train_y.iloc[val_idx]

    if cat_cols is None:
        cat_cols = fold_train_x.columns.tolist()

    train_pool = Pool(data=fold_train_x, label=fold_train_y, cat_features=cat_cols)
    val_pool = Pool(data=fold_val_x, label=fold_val_y, cat_features=cat_cols)

    model = CatBoostClassifier(
        **params,
        loss_function='Logloss',
        eval_metric='AUC',
        task_type='GPU',
        early_stopping_rounds=50,
        random_seed=21,
    )
    model.fit(train_pool, eval_set=val_pool, verbose=0)

    # ROC AUC is a ranking metric: score it on the positive-class probability.
    # Thresholded 0/1 labels collapse the ranking to a single operating point.
    val_scores = model.predict_proba(val_pool)[:, 1]
    return roc_auc_score(fold_val_y, val_scores)
  

In [28]:
# Objective function
def objective(trial):
    """Optuna objective: mean 10-fold ROC AUC of one sampled CatBoost configuration.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.

    Returns
    -------
    float
        Average of the per-fold validation scores returned by ``fit_reg``.
    """
    params = {
        'iterations': trial.suggest_int('iterations', 50, 500),
        'depth': trial.suggest_int('depth', 3, 10),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10),
        'max_ctr_complexity': trial.suggest_int('max_ctr_complexity', 0, 8),
        'random_strength': trial.suggest_int('random_strength', 0, 100),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.01, 100.00, log=True),
    }

    kf = KFold(n_splits=10, random_state=21, shuffle=True)
    feature_names = X_train.columns.tolist()

    # Submit every fold before waiting on any of them so the cluster can run them
    # concurrently. fit_reg needs all six arguments, including the categorical
    # columns and the sampled hyper-parameters.
    fold_futures = [
        client.submit(fit_reg, train_idx, val_idx, X_train_f, y_train_f, feature_names, params)
        for train_idx, val_idx in kf.split(X_train)
    ]

    # gather() returns the computed scores; averaging the Future objects
    # themselves would fail.
    scores = client.gather(fold_futures)
    return float(np.mean(scores))


In [31]:
%%time
# Create a study that maximises the objective; the TPE sampler is seeded for repeatability.
study = optuna.create_study(direction = "maximize", sampler = TPESampler(seed=21))

# Run the optimisation; only two trials are requested here.
study.optimize(objective, n_trials=2)

[32m[I 2021-12-04 21:53:41,022][0m A new study created in memory with name: no-name-00058f1c-d4cb-42e9-8fc9-d767394de3f1[0m
[33m[W 2021-12-04 21:53:43,035][0m Trial 0 failed because of the following error: ModuleNotFoundError("No module named 'catboost'")[0m
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/optuna/study/_optimize.py", line 213, in _run_trial
    value_or_values = func(trial)
  File "<ipython-input-28-3676a9511056>", line 18, in objective
    client.gather(score)
  File "/usr/local/lib/python3.7/dist-packages/distributed/client.py", line 1992, in gather
    asynchronous=asynchronous,
  File "/usr/local/lib/python3.7/dist-packages/distributed/client.py", line 833, in sync
    self.loop, func, *args, callback_timeout=callback_timeout, **kwargs
  File "/usr/local/lib/python3.7/dist-packages/distributed/utils.py", line 340, in sync
    raise exc.with_traceback(tb)
  File "/usr/local/lib/python3.7/dist-packages/distributed/utils.py", lin

ModuleNotFoundError: ignored

In [112]:
# Hyper-parameters of the best trial found by the study.
best_trial = study.best_trial
params = best_trial.params
print(params)

{'iterations': 349, 'depth': 5, 'learning_rate': 0.03833463003091129, 'l2_leaf_reg': 1.626138591513405, 'max_ctr_complexity': 7, 'random_strength': 13, 'bagging_temperature': 0.051582055717860724}


In [None]:
# Train the final model with the best hyper-parameters found by the study.
# Re-use the fixed settings from fit_reg so the tuned values are applied under the
# configuration they were selected for, and declare the categorical columns:
# without cat_features CatBoost cannot ingest the string-valued features.
model = CatBoostClassifier(
    **params,
    loss_function='Logloss',
    eval_metric='AUC',
    task_type='GPU',
    random_seed=21,
    cat_features=X_train.columns.tolist(),
)
model.fit(X_train, y_train)

# ROC AUC must be computed from the positive-class probability, not from 0/1 labels.
preds = model.predict_proba(X_test)[:, 1]
print(roc_auc_score(y_test, preds))