# 사전작업

## 라이브러리 로드

In [1]:
import numpy as np
import pandas as pd
import warnings
import gc
from tqdm import tqdm_notebook
import lightgbm as lgb
from scipy.sparse import vstack, csr_matrix, save_npz, load_npz
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import StratifiedKFold
warnings.filterwarnings("ignore")
gc.enable()

In [2]:
pd.set_option('max_rows', 150)
pd.set_option('max_colwidth', 500)
pd.set_option('max_columns', 500)

## 데이터 로드

In [3]:
dtypes = {
        'MachineIdentifier':                                    'category',
        'ProductName':                                          'category',
        'EngineVersion':                                        'category',
        'AppVersion':                                           'category',
        'AvSigVersion':                                         'category',
        'IsBeta':                                               'int8',
        'RtpStateBitfield':                                     'float16',
        'IsSxsPassiveMode':                                     'int8',
        'DefaultBrowsersIdentifier':                            'float16',
        'AVProductStatesIdentifier':                            'float32',
        'AVProductsInstalled':                                  'float16',
        'AVProductsEnabled':                                    'float16',
        'HasTpm':                                               'int8',
        'CountryIdentifier':                                    'int16',
        'CityIdentifier':                                       'float32',
        'OrganizationIdentifier':                               'float16',
        'GeoNameIdentifier':                                    'float16',
        'LocaleEnglishNameIdentifier':                          'int8',
        'Platform':                                             'category',
        'Processor':                                            'category',
        'OsVer':                                                'category',
        'OsBuild':                                              'int16',
        'OsSuite':                                              'int16',
        'OsPlatformSubRelease':                                 'category',
        'OsBuildLab':                                           'category',
        'SkuEdition':                                           'category',
        'IsProtected':                                          'float16',
        'AutoSampleOptIn':                                      'int8',
        'PuaMode':                                              'category',
        'SMode':                                                'float16',
        'IeVerIdentifier':                                      'float16',
        'SmartScreen':                                          'category',
        'Firewall':                                             'float16',
        'UacLuaenable':                                         'float32',
        'Census_MDC2FormFactor':                                'category',
        'Census_DeviceFamily':                                  'category',
        'Census_OEMNameIdentifier':                             'float16',
        'Census_OEMModelIdentifier':                            'float32',
        'Census_ProcessorCoreCount':                            'float16',
        'Census_ProcessorManufacturerIdentifier':               'float16',
        'Census_ProcessorModelIdentifier':                      'float16',
        'Census_ProcessorClass':                                'category',
        'Census_PrimaryDiskTotalCapacity':                      'float32',
        'Census_PrimaryDiskTypeName':                           'category',
        'Census_SystemVolumeTotalCapacity':                     'float32',
        'Census_HasOpticalDiskDrive':                           'int8',
        'Census_TotalPhysicalRAM':                              'float32',
        'Census_ChassisTypeName':                               'category',
        'Census_InternalPrimaryDiagonalDisplaySizeInInches':    'float16',
        'Census_InternalPrimaryDisplayResolutionHorizontal':    'float16',
        'Census_InternalPrimaryDisplayResolutionVertical':      'float16',
        'Census_PowerPlatformRoleName':                         'category',
        'Census_InternalBatteryType':                           'category',
        'Census_InternalBatteryNumberOfCharges':                'float32',
        'Census_OSVersion':                                     'category',
        'Census_OSArchitecture':                                'category',
        'Census_OSBranch':                                      'category',
        'Census_OSBuildNumber':                                 'int16',
        'Census_OSBuildRevision':                               'int32',
        'Census_OSEdition':                                     'category',
        'Census_OSSkuName':                                     'category',
        'Census_OSInstallTypeName':                             'category',
        'Census_OSInstallLanguageIdentifier':                   'float16',
        'Census_OSUILocaleIdentifier':                          'int16',
        'Census_OSWUAutoUpdateOptionsName':                     'category',
        'Census_IsPortableOperatingSystem':                     'int8',
        'Census_GenuineStateName':                              'category',
        'Census_ActivationChannel':                             'category',
        'Census_IsFlightingInternal':                           'float16',
        'Census_IsFlightsDisabled':                             'float16',
        'Census_FlightRing':                                    'category',
        'Census_ThresholdOptIn':                                'float16',
        'Census_FirmwareManufacturerIdentifier':                'float16',
        'Census_FirmwareVersionIdentifier':                     'float32',
        'Census_IsSecureBootEnabled':                           'int8',
        'Census_IsWIMBootEnabled':                              'float16',
        'Census_IsVirtualDevice':                               'float16',
        'Census_IsTouchEnabled':                                'int8',
        'Census_IsPenCapable':                                  'int8',
        'Census_IsAlwaysOnAlwaysConnectedCapable':              'float16',
        'Wdft_IsGamer':                                         'float16',
        'Wdft_RegionIdentifier':                                'float16',
        'HasDetections':                                        'int8'
        }

In [4]:
%%time
train = pd.read_csv('./data/train.csv', dtype=dtypes, low_memory=True)
test  = pd.read_csv('./data/test.csv',  dtype=dtypes, low_memory=True)

Wall time: 3min 51s


In [5]:
debug = False
if debug:
    train = train[:10000]
    test = test[:10000]

In [6]:
gc.collect()

201019

## New Feature

### 지역 관련 피쳐 추가

In [7]:
temp = pd.read_csv('./data_temp/new_feature_region.csv')

In [8]:
for v in tqdm_notebook(temp.columns):
    train[v] = temp[~temp.HasDetections.isna()][v]
    test[v] = temp[temp.HasDetections.isna()][v]

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))




In [9]:
del temp
gc.collect()

231

## 시간 관련 피쳐 제거

In [10]:
DROP_FEATURES_ = ['EngineVersion', 'AppVersion', 'AvSigVersion', 
                  'OsVer', 'OsBuild', 'OsPlatformSubRelease', 'OsBuildLab', 
                  'IeVerIdentifier',
                  'Census_OSVersion', 'Census_OSBranch', 'Census_OSBuildNumber', 'Census_OSBuildRevision', 'Census_FirmwareVersionIdentifier']

In [11]:
train = train.drop(DROP_FEATURES_, axis=1)
test = test.drop(DROP_FEATURES_, axis=1)

In [12]:
del DROP_FEATURES_
gc.collect()

126

## Feature Split

### drop feature

In [13]:
drop_feature = ['IsBeta', 'AutoSampleOptIn']

In [14]:
train = train.drop(drop_feature, axis=1)
test = test.drop(drop_feature, axis=1)

In [15]:
del drop_feature
gc.collect()

14

In [16]:
# except_feature = ['AVProductsInstalled', 'AVProductsEnabled', 'PuaMode', 
#                   'Census_ProcessorCoreCount', 'Census_ProcessorClass',
#                   'Census_PrimaryDiskTotalCapacity', 'Census_SystemVolumeTotalCapacity', 'Census_TotalPhysicalRAM']

## Feature Transform

In [21]:
train.MachineIdentifier = range(len(train))
train.reset_index(drop=True, inplace=True)

test.MachineIdentifier = range(len(test))
test.reset_index(drop=True, inplace=True)

In [22]:
train

Unnamed: 0,MachineIdentifier,ProductName,RtpStateBitfield,IsSxsPassiveMode,DefaultBrowsersIdentifier,AVProductStatesIdentifier,AVProductsInstalled,AVProductsEnabled,HasTpm,CountryIdentifier,CityIdentifier,OrganizationIdentifier,GeoNameIdentifier,LocaleEnglishNameIdentifier,Platform,Processor,OsSuite,SkuEdition,IsProtected,PuaMode,SMode,SmartScreen,Firewall,UacLuaenable,Census_MDC2FormFactor,Census_DeviceFamily,Census_OEMNameIdentifier,Census_OEMModelIdentifier,Census_ProcessorCoreCount,Census_ProcessorManufacturerIdentifier,Census_ProcessorModelIdentifier,Census_ProcessorClass,Census_PrimaryDiskTotalCapacity,Census_PrimaryDiskTypeName,Census_SystemVolumeTotalCapacity,Census_HasOpticalDiskDrive,Census_TotalPhysicalRAM,Census_ChassisTypeName,Census_InternalPrimaryDiagonalDisplaySizeInInches,Census_InternalPrimaryDisplayResolutionHorizontal,Census_InternalPrimaryDisplayResolutionVertical,Census_PowerPlatformRoleName,Census_InternalBatteryType,Census_InternalBatteryNumberOfCharges,Census_OSArchitecture,Census_OSEdition,Census_OSSkuName,Census_OSInstallTypeName,Census_OSInstallLanguageIdentifier,Census_OSUILocaleIdentifier,Census_OSWUAutoUpdateOptionsName,Census_IsPortableOperatingSystem,Census_GenuineStateName,Census_ActivationChannel,Census_IsFlightingInternal,Census_IsFlightsDisabled,Census_FlightRing,Census_ThresholdOptIn,Census_FirmwareManufacturerIdentifier,Census_IsSecureBootEnabled,Census_IsWIMBootEnabled,Census_IsVirtualDevice,Census_IsTouchEnabled,Census_IsPenCapable,Census_IsAlwaysOnAlwaysConnectedCapable,Wdft_IsGamer,Wdft_RegionIdentifier,HasDetections,CityIdx_CountryIdx_multi,CityCnt_Per_Country,CityCnt_Per_Country_Minmax,CityCnt_Per_Country_Log,DataCnt_Per_Country,DataCnt_Per_Country_Minmax,DataCnt_Per_Country_Log,Personal_User,NewOrganizationIdentifier
0,0,win8defender,7.0,0,,53447.0,1.0,1.0,1,29.0,128035.0,18.0,35.0,-85,windows10,x64,256,Pro,1.0,,0.0,,1.0,1.0,Desktop,Windows.Desktop,2668.0,9124.0,4.0,5.0,2340.0,,476940.0,HDD,299451.0,0,4096.0,Desktop,18.906250,1440.0,900.0,Desktop,,4.294967e+09,amd64,Professional,PROFESSIONAL,UUPUpgrade,26.0,119,UNKNOWN,0,IS_GENUINE,Retail,,0.0,Retail,,628.0,0,,0.0,0,0,0.0,0.0,10.0,0.0,0,5257,3,9,645836,8,13,0,15795.0
1,1,win8defender,7.0,0,,53447.0,1.0,1.0,1,93.0,1482.0,18.0,119.0,64,windows10,x64,256,Pro,1.0,,0.0,,1.0,1.0,Notebook,Windows.Desktop,2668.0,91656.0,4.0,5.0,2404.0,,476940.0,HDD,102385.0,0,4096.0,Notebook,13.898438,1366.0,768.0,Mobile,,1.000000e+00,amd64,Professional,PROFESSIONAL,IBSClean,8.0,31,UNKNOWN,0,OFFLINE,Retail,,0.0,NOT_SET,,628.0,0,,0.0,0,0,0.0,0.0,8.0,0.0,0,3227,2,8,523673,6,13,0,87501.0
2,2,win8defender,7.0,0,,53447.0,1.0,1.0,1,86.0,153579.0,18.0,64.0,49,windows10,x64,768,Home,1.0,,0.0,RequireAdmin,1.0,1.0,Desktop,Windows.Desktop,4908.0,317701.0,4.0,5.0,1972.0,,114473.0,SSD,113907.0,0,4096.0,Desktop,21.500000,1920.0,1080.0,Desktop,,4.294967e+09,amd64,Core,CORE,UUPUpgrade,7.0,30,FullAuto,0,IS_GENUINE,OEM:NONSLP,,0.0,Retail,,142.0,0,,0.0,0,0,0.0,0.0,3.0,0.0,0,398,0,6,76200,1,11,0,83918.0
3,3,win8defender,7.0,0,,53447.0,1.0,1.0,1,88.0,20710.0,,117.0,115,windows10,x64,256,Pro,1.0,,0.0,ExistsNotSet,1.0,1.0,Desktop,Windows.Desktop,1443.0,275890.0,4.0,5.0,2272.0,,238475.0,UNKNOWN,227116.0,0,4096.0,MiniTower,18.500000,1366.0,768.0,Desktop,,4.294967e+09,amd64,Professional,PROFESSIONAL,UUPUpgrade,17.0,64,FullAuto,0,IS_GENUINE,OEM:NONSLP,,0.0,Retail,,355.0,0,,0.0,0,0,0.0,0.0,3.0,1.0,1,1557,1,7,166379,2,12,1,
4,4,win8defender,7.0,0,,53447.0,1.0,1.0,1,18.0,37376.0,,277.0,75,windows10,x64,768,Home,1.0,,0.0,RequireAdmin,1.0,1.0,Notebook,Windows.Desktop,1443.0,331929.0,4.0,5.0,2500.0,,476940.0,HDD,101900.0,0,6144.0,Portable,14.000000,1366.0,768.0,Mobile,lion,0.000000e+00,amd64,Core,CORE,Update,8.0,31,FullAuto,0,IS_GENUINE,Retail,0.0,0.0,Retail,0.0,355.0,0,0.0,0.0,0,0,0.0,0.0,1.0,1.0,1,102,0,5,122662,1,12,1,
5,5,win8defender,7.0,0,,53447.0,1.0,1.0,1,97.0,13598.0,27.0,126.0,124,windows10,x64,256,Pro,1.0,,0.0,RequireAdmin,1.0,1.0,Desktop,Windows.Desktop,3800.0,340727.0,2.0,5.0,4324.0,,114473.0,SSD,113671.0,0,8192.0,Desktop,21.500000,1920.0,1080.0,Desktop,,4.294967e+09,amd64,Professional,PROFESSIONAL,UUPUpgrade,18.0,72,FullAuto,0,IS_GENUINE,Retail,0.0,0.0,Retail,0.0,93.0,0,0.0,0.0,0,0,0.0,0.0,15.0,1.0,0,8746,5,9,363893,4,13,0,92360.0
6,6,win8defender,7.0,0,,43927.0,2.0,1.0,1,78.0,81215.0,,89.0,88,windows10,x64,768,Home,1.0,,0.0,,1.0,1.0,Notebook,Windows.Desktop,3800.0,207404.0,2.0,1.0,657.0,,476940.0,HDD,458702.0,0,4096.0,Notebook,17.203125,1600.0,900.0,Mobile,,0.000000e+00,amd64,Core,CORE,IBSClean,14.0,49,FullAuto,0,IS_GENUINE,Retail,,0.0,Retail,,556.0,1,,0.0,0,0,0.0,0.0,10.0,1.0,0,27,0,3,15811,0,10,1,
7,7,win8defender,7.0,0,,53447.0,1.0,1.0,1,97.0,150323.0,27.0,126.0,124,windows10,x64,768,Home,1.0,,0.0,RequireAdmin,1.0,1.0,Notebook,Windows.Desktop,5680.0,338896.0,2.0,5.0,3380.0,,305245.0,HDD,290807.0,1,4096.0,Notebook,15.500000,1366.0,768.0,Mobile,lion,0.000000e+00,amd64,Core,CORE,Upgrade,18.0,72,FullAuto,0,IS_GENUINE,Retail,0.0,0.0,Retail,0.0,512.0,0,0.0,0.0,0,0,0.0,0.0,15.0,0.0,0,8746,5,9,363893,4,13,0,102801.0
8,8,win8defender,7.0,0,,53447.0,1.0,1.0,1,164.0,155006.0,27.0,205.0,-84,windows10,x64,256,Pro,1.0,,0.0,RequireAdmin,1.0,1.0,Notebook,Windows.Desktop,2206.0,240688.0,4.0,5.0,2836.0,,305245.0,HDD,303892.0,0,4096.0,Notebook,15.601562,1920.0,1080.0,Mobile,lion,0.000000e+00,amd64,Professional,PROFESSIONAL,Update,27.0,120,FullAuto,0,IS_GENUINE,Retail,,0.0,Retail,0.0,500.0,0,0.0,0.0,0,0,0.0,0.0,15.0,0.0,0,1588,1,7,194774,2,12,0,130147.0
9,9,win8defender,7.0,0,,46413.0,2.0,1.0,1,93.0,98572.0,27.0,119.0,64,windows10,x64,768,Home,1.0,,0.0,RequireAdmin,1.0,1.0,Notebook,Windows.Desktop,585.0,189457.0,4.0,5.0,2372.0,,953869.0,HDD,203252.0,1,8192.0,Notebook,15.500000,1366.0,768.0,Mobile,lion,0.000000e+00,amd64,CoreSingleLanguage,CORE_SINGLELANGUAGE,Upgrade,8.0,31,UNKNOWN,0,IS_GENUINE,OEM:DM,,0.0,Retail,0.0,556.0,1,0.0,0.0,0,0,0.0,1.0,8.0,1.0,0,3227,2,8,523673,6,13,0,89694.0


In [23]:
print('Transform some features to category.\n')
for usecol in tqdm_notebook([col for col in train.columns if col not in ['HasDetections', 'MachineIdentifier']]):

    train[usecol] = train[usecol].astype('str')
    test[usecol] = test[usecol].astype('str')
    
    #Fit LabelEncoder
    le = LabelEncoder().fit(
            np.unique(train[usecol].unique().tolist()+
                      test[usecol].unique().tolist()))

    #At the end 0 will be used for dropped values
    train[usecol] = le.transform(train[usecol])+1
    test[usecol]  = le.transform(test[usecol])+1

    agg_tr = (train
              .groupby([usecol])
              .aggregate({'MachineIdentifier':'count'})
              .reset_index()
              .rename({'MachineIdentifier':'Train'}, axis=1))
    
    agg_te = (test
              .groupby([usecol])
              .aggregate({'MachineIdentifier':'count'})
              .reset_index()
              .rename({'MachineIdentifier':'Test'}, axis=1))

    agg = pd.merge(agg_tr, agg_te, on=usecol, how='outer').replace(np.nan, 0)
    #Select values with more than 1000 observations
    agg = agg[(agg['Train'] > 1000)].reset_index(drop=True)
    agg['Total'] = agg['Train'] + agg['Test']
    #Drop unbalanced values
    agg = agg[(agg['Train'] / agg['Total'] > 0.2) & (agg['Train'] / agg['Total'] < 0.8)]
    agg[usecol+'Copy'] = agg[usecol]
    
    train[usecol] = (pd.merge(train[[usecol]], 
                              agg[[usecol, usecol+'Copy']], 
                              on=usecol, how='left')[usecol+'Copy']
                     .replace(np.nan, 0).astype('int').astype('category'))

    test[usecol]  = (pd.merge(test[[usecol]], 
                              agg[[usecol, usecol+'Copy']], 
                              on=usecol, how='left')[usecol+'Copy']
                     .replace(np.nan, 0).astype('int').astype('category'))

    del le, agg_tr, agg_te, agg, usecol
    gc.collect()

Transform some features to category.



HBox(children=(IntProgress(value=0, max=75), HTML(value='')))

In [32]:
%%time
train.to_csv('./data_temp/train_temp.csv', index=False)
test.to_csv('./data_temp/test_temp.csv', index=False)

Wall time: 4min 40s


In [33]:
y_train = np.array(train['HasDetections'])
train_ids = train.index
test_ids  = test.index

In [39]:
del train['HasDetections'], train['MachineIdentifier'], test['MachineIdentifier'], test['HasDetections']
gc.collect()

AttributeError: __delitem__

In [41]:
print("If you don't want use Sparse Matrix choose Kernel Version 2 to get simple solution.\n")

print('--------------------------------------------------------------------------------------------------------')
print('Transform Data to Sparse Matrix.')
print('Sparse Matrix can be used to fit a lot of models, eg. XGBoost, LightGBM, Random Forest, K-Means and etc.')
print('To concatenate Sparse Matrices by column use hstack()')
print('Read more about Sparse Matrix https://docs.scipy.org/doc/scipy/reference/sparse.html')
print('Good Luck!')
print('--------------------------------------------------------------------------------------------------------')


If you don't want use Sparse Matrix choose Kernel Version 2 to get simple solution.

--------------------------------------------------------------------------------------------------------
Transform Data to Sparse Matrix.
Sparse Matrix can be used to fit a lot of models, eg. XGBoost, LightGBM, Random Forest, K-Means and etc.
To concatenate Sparse Matrices by column use hstack()
Read more about Sparse Matrix https://docs.scipy.org/doc/scipy/reference/sparse.html
Good Luck!
--------------------------------------------------------------------------------------------------------


In [36]:
#Fit OneHotEncoder
ohe = OneHotEncoder(categories='auto', sparse=True, dtype='uint8').fit(train)

In [44]:
#Transform data using small groups to reduce memory usage
m = 100000
train = vstack([ohe.transform(train[i*m:(i+1)*m]) for i in range(train.shape[0] // m + 1)])
test  = vstack([ohe.transform(test[i*m:(i+1)*m])  for i in range(test.shape[0] // m +  1)])

In [17]:
save_npz('./data_temp/train.npz', train, compressed=True)
save_npz('./data_temp/test.npz',  test,  compressed=True)

In [18]:
del ohe, train, test
gc.collect()

574

In [47]:
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
skf.get_n_splits(train_ids, y_train)

lgb_test_result  = np.zeros(test_ids.shape[0])
lgb_train_result = np.zeros(train_ids.shape[0])
counter = 0

In [48]:
print('\nLightGBM\n')

for train_index, test_index in skf.split(train_ids, y_train):
    
    print('Fold {}\n'.format(counter + 1))
    
    X_fit = vstack([train[train_index[i*m:(i+1)*m]] for i in range(train_index.shape[0] // m + 1)])
    X_val = vstack([train[test_index[i*m:(i+1)*m]]  for i in range(test_index.shape[0] //  m + 1)])
    X_fit, X_val = csr_matrix(X_fit, dtype='float32'), csr_matrix(X_val, dtype='float32')
    y_fit, y_val = y_train[train_index], y_train[test_index]
    
    gc.collect()

    lgb_model = lgb.LGBMClassifier(max_depth=-1,
                                   n_estimators=1000,
                                   learning_rate=0.1,
                                   num_leaves=2**5-1,
                                   objective='binary', 
                                   boosting_type='gbdt',
                                   # overfitting handling
                                   # max_bin=120,
                                   # lambda_l1=6,
                                   # lambda_l2=2,
                                   save_binary=True,
                                   feature_fraction=0.8,
                                   feature_fraction_seed=42,
                                   n_jobs=-1)
    
    print("fitting")
    lgb_model.fit(X_fit, y_fit, eval_metric='auc', 
                  eval_set=[(X_val, y_val)], 
                  verbose=200, early_stopping_rounds=100)
    
    del X_fit, X_val, y_fit, y_val, train_index, test_index
    gc.collect()
    
    print("predicting")
    lgb_test_result += lgb_model.predict_proba(test)[:,1]
    counter += 1
    
    gc.collect()


LightGBM

Fold 1

fitting
Training until validation scores don't improve for 100 rounds.
[200]	valid_0's binary_logloss: 0.621836	valid_0's auc: 0.704438
[400]	valid_0's binary_logloss: 0.618924	valid_0's auc: 0.708837
[600]	valid_0's binary_logloss: 0.617574	valid_0's auc: 0.710755
[800]	valid_0's binary_logloss: 0.616796	valid_0's auc: 0.711858
[1000]	valid_0's binary_logloss: 0.616085	valid_0's auc: 0.712828
Did not meet early stopping. Best iteration is:
[1000]	valid_0's binary_logloss: 0.616085	valid_0's auc: 0.712828
predicting


TypeError: Expected np.float32 or np.float64, met type(uint8)

Fold 1

Training until validation scores don't improve for 100 rounds.
* [100]	valid_0's auc: 0.731814	valid_0's binary_logloss: 0.604756
* [200]	valid_0's auc: 0.737255	valid_0's binary_logloss: 0.598171
* [300]	valid_0's auc: 0.738762	valid_0's binary_logloss: 0.596577
* [400]	valid_0's auc: 0.73902	valid_0's binary_logloss: 0.596246
* [500]	valid_0's auc: 0.738941	valid_0's binary_logloss: 0.596295

Early stopping, best iteration is:
* [413]	valid_0's auc: 0.739032	valid_0's binary_logloss: 0.596234

Fold 2

Training until validation scores don't improve for 100 rounds.
* [100]	valid_0's auc: 0.732085	valid_0's binary_logloss: 0.604716
* [200]	valid_0's auc: 0.737355	valid_0's binary_logloss: 0.598296
* [300]	valid_0's auc: 0.738891	valid_0's binary_logloss: 0.596623
* [400]	valid_0's auc: 0.739114	valid_0's binary_logloss: 0.596321

Early stopping, best iteration is:
* [392]	valid_0's auc: 0.739125	valid_0's binary_logloss: 0.596318

Fold 3

Training until validation scores don't improve for 100 rounds.
* [100]	valid_0's auc: 0.731732	valid_0's binary_logloss: 0.604695
* [200]	valid_0's auc: 0.7373	valid_0's binary_logloss: 0.598301
* [300]	valid_0's auc: 0.739042	valid_0's binary_logloss: 0.596534
* [400]	valid_0's auc: 0.73933	valid_0's binary_logloss: 0.596197
* [500]	valid_0's auc: 0.739239	valid_0's binary_logloss: 0.596242

Early stopping, best iteration is:
* [403]	valid_0's auc: 0.739335	valid_0's binary_logloss: 0.596189

Fold 4

Training until validation scores don't improve for 100 rounds.
* [100]	valid_0's auc: 0.732696	valid_0's binary_logloss: 0.60421
* [200]	valid_0's auc: 0.738141	valid_0's binary_logloss: 0.597535
* [300]	valid_0's auc: 0.739715	valid_0's binary_logloss: 0.595869
* [400]	valid_0's auc: 0.739938	valid_0's binary_logloss: 0.595555

Early stopping, best iteration is:
* [350]	valid_0's auc: 0.739944	valid_0's binary_logloss: 0.595605

Fold 5

Training until validation scores don't improve for 100 rounds.
* [100]	valid_0's auc: 0.731629	valid_0's binary_logloss: 0.60482
* [200]	valid_0's auc: 0.737059	valid_0's binary_logloss: 0.598237
* [300]	valid_0's auc: 0.738603	valid_0's binary_logloss: 0.596627
* [400]	valid_0's auc: 0.738839	valid_0's binary_logloss: 0.596299

Early stopping, best iteration is:
* [396]	valid_0's auc: 0.73884	valid_0's binary_logloss: 0.596306


In [135]:
sub = pd.DataFrame({"MachineIdentifier":test.MachineIdentifier, "HasDetections": lgb_test_result / counter})

In [23]:
submission = pd.read_csv('./data/sample_submission.csv')

In [27]:
submission.to_csv('./data/submission_lgb_more_feature.csv', index=False)

In [25]:
submission.HasDetections = lgb_test_result / counter

In [118]:
t1 = set(range(len(submission.index)))

In [123]:
t2 = set(sub.index)

In [132]:
submission.iloc[list(t1.difference(t2))].append(sub).sort_values('MachineIdentifier').to_csv('./data/submission_split_av.csv', index=False)

In [108]:
# for machine_id in tqdm_notebook(sub.MachineIdentifier):
#     submission.loc[submission.MachineIdentifier == machine_id, 'HasDetections'] = sub[sub.MachineIdentifier == machine_id].HasDetections

HBox(children=(IntProgress(value=0, max=2344631), HTML(value='')))

KeyboardInterrupt: 

In [94]:
submission = pd.read_csv('./data/sample_submission.csv')
# submission['HasDetections'] = lgb_test_result / counter
# submission.to_csv('lgb_submission.csv', index=False)

In [116]:
submission['HasDetections'] = lgb_test_result / counter

In [106]:
submission.to_csv('./data/submission_temp.csv', index=False)

# 모델 블렌딩 테스트

In [108]:
sub2 = pd.read_csv('./data/nffm_submission.csv')

In [109]:
sub3 = pd.read_csv('./data/ms_malware.csv')

In [117]:
submission.HasDetections = (2*submission.HasDetections + 2*sub2.HasDetections + sub3.HasDetections) / 5

In [118]:
submission.to_csv('./data/submission_temp3.csv', index=False)