In [43]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from subprocess import check_output
from xgboost import XGBClassifier
import xgboost as xgb

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import precision_score
from sklearn.cross_validation import *

from catboost import Pool, CatBoostClassifier, cv, CatboostIpythonWidget

from auto_ml import Predictor
from auto_ml.utils_models import load_ml_model

In [2]:
test = pd.read_csv("./data/criminal_test.csv")
train= pd.read_csv("./data/criminal_train.csv")

In [3]:
featuers = train.columns[train.columns != "Criminal"]
X = train[featuers]
Y = train['Criminal']
list = ['PERID']

In [4]:
x = train.drop(list,axis=1)
y = test.drop(list,axis=1)

In [5]:
list1 = ['TROUBUND']
x = x.drop(list1,axis=1)
y = y.drop(list1,axis=1)
x = x.drop('Criminal',axis=1)

In [6]:
X_train, X_validation, y_train, y_validation = train_test_split(x, Y, train_size=0.85, random_state=1234)

In [40]:
xgtrain = xgb.DMatrix(x, label=Y)
clf = xgb.XGBClassifier(missing=9999999999,
                max_depth = 7,
                n_estimators=1000,
                learning_rate=0.1, 
                nthread=4,
                subsample=1.0,
                colsample_bytree=0.5,
                min_child_weight = 11,
                seed=1301)
xgb_param = clf.get_xgb_params()
#do cross validation
print ('Start cross validation')
cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=5000, nfold=15, metrics=['map'],
     early_stopping_rounds=50, stratified=True, seed=1301)
print('Best number of trees = {}'.format(cvresult.shape[0]))
# clf.set_params(n_estimators=cvresult.shape[0])
print('Fit on the trainingsdata')
clf.fit(x, Y, eval_metric='map')
print('Overall precision:', precision_score(Y, clf.predict_proba(x)[:,1].round()))
print('Predict the probabilities based on features in the test set')
pred = clf.predict_proba(y, ntree_limit=cvresult.shape[0])

Start cross validation
Best number of trees = 82
Fit on the trainingsdata
Overall precision: 0.8649229666786098
Predict the probabilities based on features in the test set


In [36]:
model = xgb.XGBClassifier(missing=9999999999,
                max_depth = 7,
                n_estimators=1000,
                learning_rate=0.1, 
                nthread=4,
                subsample=1.0,
                colsample_bytree=0.5,
                min_child_weight = 11,
                seed=1301)
model.fit(X_train, y_train)
# make predictions for test data
y_pred = model.predict(X_validation)
print('precision score ', precision_score(y_validation, y_pred))

precision score  0.6266968325791855


In [29]:
print('precision score ', precision_score(y_validation, y_pred))

precision score  0.6280623608017817


In [41]:
create_submission(pred[:,1], 'blends/xgb_sub_prob_1.csv')

In [37]:
create_submission(clf.predict(y, ntree_limit=cvresult.shape[0]), 'xgb_sub_6.csv')

In [7]:
# xgb grid search cv
xgb_model = xgb.XGBClassifier()
parameters = {'nthread':[4], #when use hyperthread, xgboost may become slower
              'objective':['binary:logistic'],
              'learning_rate': [0.1, 0.15], #so called `eta` value
              'max_depth': [8, 10, 12],
              'min_child_weight': [11],
              'subsample': [0.8, 1.0],
              'colsample_bytree': [0.7, 0.9],
              'n_estimators': [1500, 2000], #number of trees, change it to 1000 for better results
              'seed': [1337]}
clf = GridSearchCV(xgb_model, parameters, n_jobs=4, 
                   cv=StratifiedKFold(Y, n_folds=4, shuffle=True), 
                   scoring='precision',
                   verbose=3, refit=True)
clf.fit(x, Y)
#trust your CV!
best_parameters, score, _ = max(clf.grid_scores_, key=lambda x: x[1])
print('Raw precision score:', score)
for param_name in sorted(best_parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))
test_probs = clf.predict_proba(y)[:,1]

Fitting 4 folds for each of 48 candidates, totalling 192 fits


[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed: 17.1min
[Parallel(n_jobs=4)]: Done 120 tasks      | elapsed: 121.7min
[Parallel(n_jobs=4)]: Done 192 out of 192 | elapsed: 200.5min finished


Raw precision score: 0.6382330582114011
colsample_bytree: 0.7
learning_rate: 0.1
max_depth: 8
min_child_weight: 11
n_estimators: 1500
nthread: 4
objective: 'binary:logistic'
seed: 1337
subsample: 1.0


In [10]:
create_submission(clf.predict(y), 'xgb_sub_2.csv')

In [7]:
column_descriptions = {
  'Criminal': 'output'
}
training_params = {'n_estimators': 2000, 'learning_rate': 0.2, 'num_leaves': 14
                   , 'lambda_l2': 0.001, 'histogram_pool_size': 16384}
training_params2 = {'n_estimators': 2000, 'learning_rate': 0.2, 'num_leaves': 10, 'lambda_l2': 0.001, 'histogram_pool_size': 16384}
ml_predictor = Predictor(type_of_estimator='classifier'
                         , column_descriptions=column_descriptions)

ml_predictor.train(train, model_names=['LGBMClassifier'], training_params=training_params)

Welcome to auto_ml! We're about to go through and make sense of your data using machine learning, and give you a production-ready pipeline to get predictions with.

If you have any issues, or new feature ideas, let us know at http://auto.ml
Now using the model training_params that you passed in:
{'n_estimators': 2000, 'learning_rate': 0.2, 'num_leaves': 14, 'lambda_l2': 0.001, 'histogram_pool_size': 16384}
After overwriting our defaults with your values, here are the final params that will be used to initialize the model:
{'n_estimators': 2000, 'learning_rate': 0.2, 'num_leaves': 14, 'lambda_l2': 0.001, 'histogram_pool_size': 16384}
Running basic data cleaning
Fitting DataFrameVectorizer
Now using the model training_params that you passed in:
{'n_estimators': 2000, 'learning_rate': 0.2, 'num_leaves': 14, 'lambda_l2': 0.001, 'histogram_pool_size': 16384}
After overwriting our defaults with your values, here are the final params that will be used to initialize the model:
{'n_estimators':




About to fit the pipeline for the model LGBMClassifier to predict Criminal
Started at:
2018-02-14 11:09:28
[1]	random_holdout_set_from_training_data's binary_logloss: 0.537298
Training until validation scores don't improve for 100 rounds.
[2]	random_holdout_set_from_training_data's binary_logloss: 0.43279
[3]	random_holdout_set_from_training_data's binary_logloss: 0.358397
[4]	random_holdout_set_from_training_data's binary_logloss: 0.303341
[5]	random_holdout_set_from_training_data's binary_logloss: 0.261447
[6]	random_holdout_set_from_training_data's binary_logloss: 0.229257
[7]	random_holdout_set_from_training_data's binary_logloss: 0.204261
[8]	random_holdout_set_from_training_data's binary_logloss: 0.184633
[9]	random_holdout_set_from_training_data's binary_logloss: 0.168907
[10]	random_holdout_set_from_training_data's binary_logloss: 0.156355
[11]	random_holdout_set_from_training_data's binary_logloss: 0.146202
[12]	random_holdout_set_from_training_data's binary_logloss: 0.13817




The printed list will only contain at most the top 100 features.
+---------+----------------+--------------+---------------+-------------------+-------------------+-----------+-----------+-----------+-----------+
|         | Feature Name   |   Importance |         Delta |   FR_Decrementing |   FR_Incrementing |   FRD_abs |   FRI_abs |   FRD_MAD |   FRI_MAD |
|---------+----------------+--------------+---------------+-------------------+-------------------+-----------+-----------+-----------+-----------|
| 35.0000 | IIOTHHLT       |            0 |        1.3921 |            0.0000 |            0.0000 |    0.0000 |    0.0000 |    0.0000 |    0.0000 |
| 23.0000 | HLNVREF        |            0 |        6.4796 |            0.0000 |            0.0000 |    0.0000 |    0.0000 |    0.0000 |    0.0000 |
| 24.0000 | HLNVNEED       |            0 |        6.5268 |            0.0000 |            0.0000 |    0.0000 |    0.0000 |    0.0000 |    0.0000 |
| 25.0000 | HLNVSOR        |            0 |    

<auto_ml.predictor.Predictor at 0x1cb3c7afe80>

In [None]:
# test_score = ml_predictor.score(df_test, df_test.MEDV)

In [22]:
model_simple = CatBoostClassifier(
    eval_metric='Precision',
    depth=10,
    iterations=250,
    random_seed=42,
    use_best_model=False
)
model_simple.fit(
    X_train, y_train,
    cat_features=[],
    eval_set=(X_validation, y_validation),
)

0:	learn: 0.7023190	test: 0.6617284	best: 0.6617284 (0)	total: 219ms	remaining: 54.5s
1:	learn: 0.7045010	test: 0.6658228	best: 0.6658228 (1)	total: 287ms	remaining: 35.6s
2:	learn: 0.7045010	test: 0.6658228	best: 0.6658228 (2)	total: 378ms	remaining: 31.1s
3:	learn: 0.7114527	test: 0.6743590	best: 0.6743590 (3)	total: 447ms	remaining: 27.5s
4:	learn: 0.7095079	test: 0.6691729	best: 0.6743590 (3)	total: 497ms	remaining: 24.3s
5:	learn: 0.7168737	test: 0.6729223	best: 0.6743590 (3)	total: 513ms	remaining: 20.9s
6:	learn: 0.7177459	test: 0.6770833	best: 0.6770833 (6)	total: 599ms	remaining: 20.8s
7:	learn: 0.7135851	test: 0.6649616	best: 0.6770833 (6)	total: 690ms	remaining: 20.9s
8:	learn: 0.7105010	test: 0.6666667	best: 0.6770833 (6)	total: 799ms	remaining: 21.4s
9:	learn: 0.7181409	test: 0.6692913	best: 0.6770833 (6)	total: 816ms	remaining: 19.6s
10:	learn: 0.7172111	test: 0.6649746	best: 0.6770833 (6)	total: 931ms	remaining: 20.2s
11:	learn: 0.7148325	test: 0.6649874	best: 0.6770833 

96:	learn: 0.7250238	test: 0.6608040	best: 0.6770833 (6)	total: 7s	remaining: 11s
97:	learn: 0.7250238	test: 0.6608040	best: 0.6770833 (6)	total: 7.04s	remaining: 10.9s
98:	learn: 0.7245480	test: 0.6608040	best: 0.6770833 (6)	total: 7.12s	remaining: 10.9s
99:	learn: 0.7238593	test: 0.6608040	best: 0.6770833 (6)	total: 7.23s	remaining: 10.8s
100:	learn: 0.7242035	test: 0.6608040	best: 0.6770833 (6)	total: 7.27s	remaining: 10.7s
101:	learn: 0.7252852	test: 0.6608040	best: 0.6770833 (6)	total: 7.38s	remaining: 10.7s
102:	learn: 0.7249406	test: 0.6608040	best: 0.6770833 (6)	total: 7.43s	remaining: 10.6s
103:	learn: 0.7253321	test: 0.6608040	best: 0.6770833 (6)	total: 7.54s	remaining: 10.6s
104:	learn: 0.7256763	test: 0.6608040	best: 0.6770833 (6)	total: 7.59s	remaining: 10.5s
105:	learn: 0.7260209	test: 0.6591479	best: 0.6770833 (6)	total: 7.7s	remaining: 10.5s
106:	learn: 0.7260209	test: 0.6591479	best: 0.6770833 (6)	total: 7.72s	remaining: 10.3s
107:	learn: 0.7259365	test: 0.6600000	best

193:	learn: 0.7354597	test: 0.6650124	best: 0.6770833 (6)	total: 14.1s	remaining: 4.06s
194:	learn: 0.7366197	test: 0.6650124	best: 0.6770833 (6)	total: 14.2s	remaining: 4s
195:	learn: 0.7363977	test: 0.6641791	best: 0.6770833 (6)	total: 14.3s	remaining: 3.93s
196:	learn: 0.7360525	test: 0.6641791	best: 0.6770833 (6)	total: 14.3s	remaining: 3.85s
197:	learn: 0.7362998	test: 0.6650124	best: 0.6770833 (6)	total: 14.4s	remaining: 3.79s
198:	learn: 0.7371134	test: 0.6658416	best: 0.6770833 (6)	total: 14.5s	remaining: 3.71s
199:	learn: 0.7370145	test: 0.6658416	best: 0.6770833 (6)	total: 14.6s	remaining: 3.65s
200:	learn: 0.7371375	test: 0.6658416	best: 0.6770833 (6)	total: 14.7s	remaining: 3.57s
201:	learn: 0.7370145	test: 0.6658416	best: 0.6770833 (6)	total: 14.8s	remaining: 3.51s
202:	learn: 0.7373596	test: 0.6658416	best: 0.6770833 (6)	total: 14.9s	remaining: 3.45s
203:	learn: 0.7378505	test: 0.6641975	best: 0.6770833 (6)	total: 15s	remaining: 3.38s
204:	learn: 0.7378505	test: 0.6641975

<catboost.core.CatBoostClassifier at 0x1cb3cbc4978>

In [None]:
parameters = {'kernel':('poly', 'rbf'), 'C':[1, 10]}
svc = svm.SVC()
clf = GridSearchCV(svc, parameters)
# model = SVC(C=10, kernel='poly', degree=4, verbose=True)
clf.fit(X_train, y_train)
model = clf.estimator

[LibSVM]

In [55]:
model = ExtraTreesClassifier(n_estimators=50, criterion='gini', verbose=1)
model.fit(X_train, y_train)
y_pred = model.predict(X_validation)
print('precision score ', precision_score(y_validation, y_pred))
create_submission(model.predict(y), 'extra_tree_sub1.csv')

precision score  0.6161369193154034


[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    1.3s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.0s finished


In [24]:
predicted = model_simple.predict(y)

In [10]:
predicted = model.predict(y)

In [8]:
predicted = ml_predictor.predict(test)

In [23]:
# for predict_proba predictions
predicted = model_simple.predict_proba(test)
sub2 = pd.DataFrame({'PERID':test.PERID, 'Criminal':predicted[:,1]})
sub2 = sub2[['PERID', 'Criminal']]
sub2.to_csv('submissions/catboost_sub_1.csv', index=False)

In [9]:
def create_submission(predicted, filename):
    sub2 = pd.DataFrame({'PERID':test.PERID, 'Criminal':[int(i) for i in predicted]})
    sub2 = sub2[['PERID', 'Criminal']]
    filepath = 'submissions/'+filename
    sub2.to_csv(filepath, index=False)

In [28]:
predicted[:100]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 1., 0., 1.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [12]:
predicted.sum()

9