In [137]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
import os
import numpy as np
import pandas as pd
import random
import gc
import lightgbm as lgb
from sklearn.model_selection import cross_val_score 
import json

In [148]:
# Drop the existing `model_helper` binding before the class is (re)defined in the next cell.
# NOTE(review): `del` raises NameError when the name is not bound, so this cell fails on a
# fresh kernel (Restart & Run All) unless `model_helper` was defined earlier.
del model_helper

In [164]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import cross_val_score 

class model_helper(object):
    """Split a labelled frame by row position and fit boosting models on it.

    The first ``train_n`` rows are the training part, the following ``val_n``
    rows the validation part, and every remaining row the test part. Each
    ``*_work`` method scores *all* rows and stores the scores as a new column
    of ``self.data``, which is the very frame passed to the constructor
    (it is modified in place and also returned).
    """

    def __init__(self, data, train_n, val_n, label_col):
        """Build the float32 feature/label splits.

        Args:
            data: DataFrame holding the feature columns plus ``label_col``.
            train_n: number of leading rows used for training.
            val_n: number of rows, directly after the training rows, used
                for validation.
            label_col: name of the label column; it is removed from the
                feature matrices.
        """
        self.data = data
        self.train_data = data.drop(label_col, axis=1)
        # End of the labelled (train + validation) region, by row position.
        fit_end = train_n + val_n
        # `.iloc` makes the positional intent of the row slices explicit.
        self.train_x = self.train_data.iloc[:train_n].astype('float32')
        self.train_y = self.data.iloc[:train_n][label_col].astype('float32')
        self.val_x = self.train_data.iloc[train_n:fit_end].astype('float32')
        self.val_y = self.data.iloc[train_n:fit_end][label_col].astype('float32')
        self.all_train_x = self.train_data.iloc[:fit_end].astype('float32')
        self.all_train_y = self.data.iloc[:fit_end][label_col].astype('float32')
        self.test_x = self.train_data.iloc[fit_end:].astype('float32')
        self.x = self.train_data.astype('float32')

    def lgb_work(self, params, round, verbose_eval=20, early_stopping_rounds=100, use_best = True):
        """Train with ``lgb.train`` on the training split and score every row.

        Args:
            params: parameter dict forwarded to ``lgb.train``.
            round: third positional argument of ``lgb.train`` (the number of
                boosting rounds).
            verbose_eval: forwarded to ``lgb.train``.
            early_stopping_rounds: forwarded to ``lgb.train``.
            use_best: when true, predict with ``model.best_iteration``;
                otherwise predict with ``num_iteration=round - 1``.

        Returns:
            ``self.data`` with the scores written to the ``lgb_prob`` column.
        """
        lgb_train_data = lgb.Dataset(self.train_x, self.train_y)
        lgb_val_data = lgb.Dataset(self.val_x, self.val_y)
        # NOTE(review): `verbose_eval` / `early_stopping_rounds` as keyword
        # arguments depend on the installed LightGBM version; newer releases
        # expect callbacks instead -- confirm before upgrading.
        model = lgb.train(params,
                          lgb_train_data,
                          round,
                          valid_sets=[lgb_train_data, lgb_val_data],
                          verbose_eval=verbose_eval,
                          early_stopping_rounds=early_stopping_rounds)
        # Predict exactly once: the previous version always scored with
        # `round - 1` first and then threw that result away when use_best was set.
        num_iteration = model.best_iteration if use_best else round - 1
        prob = model.predict(self.x, num_iteration=num_iteration)
        self.data['lgb_prob'] = list(prob)
        return self.data

    def xgb_work(self, params, round, verbose_eval=20, early_stopping_rounds=100):
        """Train with ``xgb.train`` on the training split and score every row.

        The training and validation splits are both passed as ``evals``
        (named ``'train'`` and ``'eval'``).

        Returns:
            ``self.data`` with the scores written to the ``xgb_prob`` column.
        """
        xgb_train_data = xgb.DMatrix(self.train_x, self.train_y)
        xgb_val_data = xgb.DMatrix(self.val_x, self.val_y)
        model = xgb.train(params,
                          xgb_train_data,
                          round,
                          evals=[(xgb_train_data, 'train'), (xgb_val_data, 'eval')],
                          verbose_eval=verbose_eval,
                          early_stopping_rounds=early_stopping_rounds)
        xgb_data = xgb.DMatrix(self.x)
        prob = model.predict(xgb_data)
        self.data['xgb_prob'] = list(prob)
        return self.data

    def cv_lgb_cls_work(self, params):
        """Cross-validate an ``LGBMClassifier``, refit it, and score every row.

        Prints the mean 5-fold ``roc_auc`` over the train + validation rows,
        then fits the classifier on those same rows and stores column 1 of
        ``predict_proba`` for all rows.

        Args:
            params: keyword arguments for ``lgb.LGBMClassifier``.

        Returns:
            ``self.data`` with the scores written to the ``cv_lgb_prob`` column.
        """
        model = lgb.LGBMClassifier(**params)
        auc = cross_val_score(model, self.all_train_x, self.all_train_y, scoring='roc_auc', cv=5)
        print('[cv lgb cls][auc]', auc.mean())
        model.fit(self.all_train_x, self.all_train_y)
        prob = model.predict_proba(self.x)[:, 1]
        self.data['cv_lgb_prob'] = list(prob)
        return self.data

In [165]:
def check_null(data, type='None'):
    """Print, for every column, the fraction of entries matching a missing-value marker.

    One line ``<fraction> <column>`` is printed per column, ordered from the
    highest fraction to the lowest (ties keep the column order of ``data``).

    Args:
        data: DataFrame to inspect.
        type: which marker to count.
            ``'None'`` counts entries that are ``None`` or whose ``str()`` is ``'None'``;
            ``'NaN'`` counts entries that are not equal to themselves or whose
            ``str()`` is ``'NaN'``;
            any other string counts entries whose ``str()`` equals that string.

    Returns:
        None. The report is only printed.
    """
    n = len(data)

    def _matches(value):
        # Same decision table as the original if/elif chain.
        if type == 'None':
            return value is None or str(value) == 'None'
        if type == 'NaN':
            # `value != value` is true only for NaN-like values.
            return value != value or str(value) == 'NaN'
        return str(value) == type

    ratios = []
    for col in data.columns.tolist():
        hits = sum(1 for value in data[col].tolist() if _matches(value))
        # A frame with columns but no rows used to raise ZeroDivisionError;
        # report a fraction of 0.0 for every column instead.
        ratios.append((hits / n if n else 0.0, col))

    # sorted() is stable, so equal fractions keep their original column order.
    for ratio, col in sorted(ratios, key=lambda pair: pair[0], reverse=True):
        print(ratio, col)
        
def count_unique(data):
    """Print each column name followed by its number of distinct values.

    Missing values count as one distinct value. The previous
    ``len(set(column.tolist()))`` counted every NaN separately, because each
    NaN in the list is its own float object and NaN never compares equal to
    another NaN, so columns with gaps were over-counted.

    Args:
        data: DataFrame to inspect.

    Returns:
        None. One ``<column> <count>`` line is printed per column.
    """
    for col in data.columns.tolist():
        print(col, data[col].nunique(dropna=False))

In [166]:
# Fix the seeds of both Python's `random` module and NumPy's global RNG so
# that anything drawing from them is repeatable across runs.
seed = 42
random.seed(seed)
np.random.seed(seed)

In [167]:
def _read_json_frame(path):
    """Load a JSON file into a DataFrame and close the file handle afterwards."""
    # The previous `json.load(open(...))` calls never closed their handles.
    with open(path, 'r') as fh:
        return pd.DataFrame(json.load(fh))


train_data = _read_json_frame('train_data.json')
test_data = _read_json_frame('test_data.json')
train_data['label'] = _read_json_frame('train_labels.json')
# Test rows have no known label; -1.0 is used as a placeholder value.
test_data['label'] = -1.0
# The first `train_n` rows are used for training and the next `val_n` rows for validation.
train_n = 170
val_n = 30
# Each frame keeps its own index here, so index values repeat in `data`.
data = pd.concat([train_data, test_data], axis=0)

In [168]:
# Display the combined frame; rows whose label is -1.0 are the test rows added by the concat above.
data

Unnamed: 0,temperature,movement_events,water_usage,power_usage,label
0,23.701663,8.0,2.240068,300.897030,1.0
1,19.817218,3.0,1.892936,358.857170,0.0
2,25.900846,12.0,2.394110,465.751480,1.0
3,23.923732,7.0,10.959409,285.928408,1.0
4,27.761647,7.0,1.739654,108.935042,1.0
...,...,...,...,...,...
45,19.742782,4.0,2.362424,181.293118,-1.0
46,27.867251,10.0,1.531540,295.153566,-1.0
47,24.875744,14.0,5.818158,175.860574,-1.0
48,21.166608,5.0,0.338973,325.538068,-1.0


In [169]:
# Build the helper that holds the train / validation / test splits of `data`.
mh = model_helper(data=data, train_n=train_n, val_n=val_n, label_col='label')

In [170]:
# Keyword arguments for the LGBMClassifier used in the cross-validated run.
params = dict(
    max_depth=6,
    learning_rate=0.1,
    num_iterations=300,
    random_state=42,
)
# Last expression of the cell: the returned frame (with `cv_lgb_prob`) is displayed.
mh.cv_lgb_cls_work(params)



[cv lgb cls][auc] 0.9539511278195489




Unnamed: 0,temperature,movement_events,water_usage,power_usage,label,cv_lgb_prob
0,23.701663,8.0,2.240068,300.897030,1.0,9.988292e-01
1,19.817218,3.0,1.892936,358.857170,0.0,1.081002e-05
2,25.900846,12.0,2.394110,465.751480,1.0,1.000000e+00
3,23.923732,7.0,10.959409,285.928408,1.0,9.999984e-01
4,27.761647,7.0,1.739654,108.935042,1.0,9.999958e-01
...,...,...,...,...,...,...
45,19.742782,4.0,2.362424,181.293118,-1.0,5.771594e-06
46,27.867251,10.0,1.531540,295.153566,-1.0,9.999999e-01
47,24.875744,14.0,5.818158,175.860574,-1.0,1.000000e+00
48,21.166608,5.0,0.338973,325.538068,-1.0,6.031410e-07


In [118]:
# Parameters for the lgb.train run with early stopping on the validation split.
params = {
      'objective': 'binary',
      'max_depth': 3,
      'learning_rate': 0.05,
      "min_sum_hessian_in_leaf": 6,
      "boosting": "gbdt",
      "feature_fraction": 0.9,
      "bagging_freq": 1,
      "bagging_fraction": 0.8,
      "bagging_seed": 11,
      "lambda_l1": 0.01,
      'lambda_l2': 0.001,
      "verbosity": -1,
      "nthread": -1,
      # A list instead of a set: set iteration order depends on string hashing,
      # so the metric order could differ from one Python process to the next.
      'metric': ['binary_logloss', 'auc'],
      "random_state": 42
}
# The former `round = 5000` was never used (the call below passed 2000) and
# shadowed the builtin `round` for the rest of the notebook; it is replaced by
# a named constant holding the value that was actually passed.
num_boost_round = 2000
data = mh.lgb_work(params, num_boost_round)

Training until validation scores don't improve for 100 rounds
[20]	training's binary_logloss: 0.374238	training's auc: 0.964632	valid_1's binary_logloss: 0.47855	valid_1's auc: 0.918552
[40]	training's binary_logloss: 0.282221	training's auc: 0.968785	valid_1's binary_logloss: 0.426564	valid_1's auc: 0.932127
[60]	training's binary_logloss: 0.25099	training's auc: 0.972453	valid_1's binary_logloss: 0.398167	valid_1's auc: 0.936652
[80]	training's binary_logloss: 0.23735	training's auc: 0.971346	valid_1's binary_logloss: 0.39066	valid_1's auc: 0.941176
[100]	training's binary_logloss: 0.231401	training's auc: 0.970861	valid_1's binary_logloss: 0.384497	valid_1's auc: 0.945701
[120]	training's binary_logloss: 0.228587	training's auc: 0.970169	valid_1's binary_logloss: 0.384321	valid_1's auc: 0.945701
[140]	training's binary_logloss: 0.227496	training's auc: 0.970307	valid_1's binary_logloss: 0.380611	valid_1's auc: 0.945701
[160]	training's binary_logloss: 0.226929	training's auc: 0.9697



In [126]:
# Scores of the last 50 rows of `data`.
# NOTE(review): assumes these are exactly the test rows -- confirm the test set has 50 rows.
prob = data['lgb_prob'].iloc[-50:].tolist()

In [127]:
# One '0'/'1' flag per scored row; everything starts as '0'.
s = ['0'] * 50
# Positions that get flagged are collected here by the next cell.
tmp = []
# (score, position) pairs ordered from highest to lowest score. sorted() is
# stable, so equal scores keep ascending position order. The previous loop
# rebuilt `list(prob)` on every iteration, which was needlessly quadratic.
li = sorted(((p, pos) for pos, p in enumerate(prob)), key=lambda x: x[0], reverse=True)

In [128]:
# Flag the ten highest-scoring positions and remember them in rank order.
for _, row_pos in li[:10]:
    s[row_pos] = '1'
    tmp.append(row_pos)

In [129]:
# Render the per-row flags in `s` as one string of '0'/'1' characters.
''.join(s)

'00001000000001001001000000000000001010101000001100'

In [130]:
# Show the ten highest-ranked (score, position) pairs.
li[:10]

[(0.9655764165326977, 4),
 (0.9655764165326977, 16),
 (0.9655764165326977, 47),
 (0.9624235292158875, 13),
 (0.9620058071805437, 40),
 (0.9614229749785571, 36),
 (0.9563678973543924, 19),
 (0.9549735300812572, 38),
 (0.9533763750213244, 34),
 (0.9526583728709365, 46)]

In [131]:
# Show the positions that were flagged '1', in rank order.
tmp

[4, 16, 47, 13, 40, 36, 19, 38, 34, 46]

In [None]:
# Submission string: one '0'/'1' character per scored row (hard-coded copy of
# the joined flags displayed earlier in the notebook).
f = '00001000000001001001000000000000001010101000001100'
# NOTE(review): `requests` is not imported in any cell visible here -- confirm
# it is imported before this cell runs, otherwise both the call and the
# `except` clause raise NameError.
payload = {"submission": f, 'challenge_id':'murderbots'}
try:
    # The payload is passed through json.dumps before being handed to `json=`,
    # i.e. it is sent as an already-serialised string. Kept as-is; presumably
    # this is what the scoring endpoint expects -- TODO confirm.
    r = requests.post("http://murderbot.fly.dev:5000/score", json=json.dumps(payload))
    print(r.text)
except requests.exceptions.ConnectionError:
    print("Connection problems. Contact the CTF organizers.")