# 케글 커널 필사 3

* [커널](https://www.kaggle.com/guoday/nffm-baseline-0-690-on-lb) + [커널]()
* 위 커널 2개를 적절히 혼합

In [133]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
import lightgbm as lgb
from scipy.sparse import vstack, csr_matrix, save_npz, load_npz
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import StratifiedKFold
import gc
# Make sure automatic garbage collection is switched on; explicit
# gc.collect() calls are issued later after the large data-frame steps.
gc.enable()

In [134]:
import warnings
# Suppress every warning for the rest of the session. Note that this also
# hides deprecation notices and pandas SettingWithCopyWarning messages.
warnings.filterwarnings("ignore")

In [135]:
# Column name -> dtype string, handed to pd.read_csv(dtype=...) when the
# train and test CSV files are loaded. Narrow numeric types and 'category'
# are used to keep the in-memory frames small.
#
# NOTE(review): float16 stores integers exactly only up to 2048 and int8 only
# covers -128..127. If any identifier-like column typed that way holds larger
# values, distinct IDs could collapse or wrap on load -- confirm against the
# actual data ranges before relying on these columns as categories.
dtypes = {
    'MachineIdentifier': 'category',
    'ProductName': 'category',
    'EngineVersion': 'category',
    'AppVersion': 'category',
    'AvSigVersion': 'category',
    'IsBeta': 'int8',
    'RtpStateBitfield': 'float16',
    'IsSxsPassiveMode': 'int8',
    'DefaultBrowsersIdentifier': 'float16',
    'AVProductStatesIdentifier': 'float32',
    'AVProductsInstalled': 'float16',
    'AVProductsEnabled': 'float16',
    'HasTpm': 'int8',
    'CountryIdentifier': 'int16',
    'CityIdentifier': 'float32',
    'OrganizationIdentifier': 'float16',
    'GeoNameIdentifier': 'float16',
    'LocaleEnglishNameIdentifier': 'int8',
    'Platform': 'category',
    'Processor': 'category',
    'OsVer': 'category',
    'OsBuild': 'int16',
    'OsSuite': 'int16',
    'OsPlatformSubRelease': 'category',
    'OsBuildLab': 'category',
    'SkuEdition': 'category',
    'IsProtected': 'float16',
    'AutoSampleOptIn': 'int8',
    'PuaMode': 'category',
    'SMode': 'float16',
    'IeVerIdentifier': 'float16',
    'SmartScreen': 'category',
    'Firewall': 'float16',
    'UacLuaenable': 'float32',
    'Census_MDC2FormFactor': 'category',
    'Census_DeviceFamily': 'category',
    'Census_OEMNameIdentifier': 'float16',
    'Census_OEMModelIdentifier': 'float32',
    'Census_ProcessorCoreCount': 'float16',
    'Census_ProcessorManufacturerIdentifier': 'float16',
    'Census_ProcessorModelIdentifier': 'float16',
    'Census_ProcessorClass': 'category',
    'Census_PrimaryDiskTotalCapacity': 'float32',
    'Census_PrimaryDiskTypeName': 'category',
    'Census_SystemVolumeTotalCapacity': 'float32',
    'Census_HasOpticalDiskDrive': 'int8',
    'Census_TotalPhysicalRAM': 'float32',
    'Census_ChassisTypeName': 'category',
    'Census_InternalPrimaryDiagonalDisplaySizeInInches': 'float16',
    'Census_InternalPrimaryDisplayResolutionHorizontal': 'float16',
    'Census_InternalPrimaryDisplayResolutionVertical': 'float16',
    'Census_PowerPlatformRoleName': 'category',
    'Census_InternalBatteryType': 'category',
    'Census_InternalBatteryNumberOfCharges': 'float32',
    'Census_OSVersion': 'category',
    'Census_OSArchitecture': 'category',
    'Census_OSBranch': 'category',
    'Census_OSBuildNumber': 'int16',
    'Census_OSBuildRevision': 'int32',
    'Census_OSEdition': 'category',
    'Census_OSSkuName': 'category',
    'Census_OSInstallTypeName': 'category',
    'Census_OSInstallLanguageIdentifier': 'float16',
    'Census_OSUILocaleIdentifier': 'int16',
    'Census_OSWUAutoUpdateOptionsName': 'category',
    'Census_IsPortableOperatingSystem': 'int8',
    'Census_GenuineStateName': 'category',
    'Census_ActivationChannel': 'category',
    'Census_IsFlightingInternal': 'float16',
    'Census_IsFlightsDisabled': 'float16',
    'Census_FlightRing': 'category',
    'Census_ThresholdOptIn': 'float16',
    'Census_FirmwareManufacturerIdentifier': 'float16',
    'Census_FirmwareVersionIdentifier': 'float32',
    'Census_IsSecureBootEnabled': 'int8',
    'Census_IsWIMBootEnabled': 'float16',
    'Census_IsVirtualDevice': 'float16',
    'Census_IsTouchEnabled': 'int8',
    'Census_IsPenCapable': 'int8',
    'Census_IsAlwaysOnAlwaysConnectedCapable': 'float16',
    'Wdft_IsGamer': 'float16',
    'Wdft_RegionIdentifier': 'float16',
    'HasDetections': 'int8',
}

In [136]:
from ctrnet import ctrNet
from ctrnet.src import misc_utils as utils
import tensorflow as tf

In [137]:
%%time
def _read_split(csv_path):
    """Read one competition CSV with the shared `dtypes` mapping.

    The textual 'MachineIdentifier' column is overwritten with the row
    position (as uint32), so each row keeps a compact unique id.
    """
    frame = pd.read_csv(csv_path, dtype=dtypes, low_memory=True)
    frame['MachineIdentifier'] = frame.index.astype('uint32')
    return frame


print('Download Train and Test Data.\n')
train = _read_split('./data/train.csv')
test = _read_split('./data/test.csv')

# Give the test frame a placeholder label column (all zeros) so that it has
# the same trailing 'HasDetections' column as the training frame.
test['HasDetections'] = 0

Download Train and Test Data.

Wall time: 4min 9s


In [138]:
# Force a garbage-collection pass after the large CSV load; the value shown
# as the cell output is gc.collect()'s return value.
gc.collect()

201068

In [139]:
def _rows_per_value(frame, column, label):
    """Count the rows of `frame` for each value of `column`.

    Returns a two-column frame: `column` and the row count named `label`.
    """
    return (frame
            .groupby([column])
            .aggregate({'MachineIdentifier': 'count'})
            .reset_index()
            .rename({'MachineIdentifier': label}, axis=1))


def _keep_or_zero(frame, column, lookup):
    """Map `frame[column]` through `lookup`, sending unmatched values to 0.

    `lookup` holds the retained codes in `column` and a duplicate of them in
    `column + 'Copy'`; after the left join, rows without a match are NaN in
    the copy column and are replaced by 0 before the categorical cast.
    """
    joined = pd.merge(frame[[column]], lookup, on=column, how='left')
    return (joined[column + 'Copy']
            .replace(np.nan, 0)
            .astype('int')
            .astype('category'))


print('Transform all features to category.\n')

# Every column except the first (MachineIdentifier) and the last (HasDetections).
feature_names = train.columns.tolist()[1:-1]

for usecol in tqdm_notebook(feature_names):

    # Work on string representations so all columns are encoded uniformly.
    train[usecol] = train[usecol].astype('str')
    test[usecol] = test[usecol].astype('str')

    # One encoder fitted on the union of train and test values.
    vocabulary = np.unique(train[usecol].unique().tolist() +
                           test[usecol].unique().tolist())
    encoder = LabelEncoder().fit(vocabulary)

    # Shift codes by one: 0 is reserved for values that get dropped below.
    train[usecol] = encoder.transform(train[usecol]) + 1
    test[usecol] = encoder.transform(test[usecol]) + 1

    # Per-value row counts in train and test (0 where a value is absent).
    counts = pd.merge(_rows_per_value(train, usecol, 'Train'),
                      _rows_per_value(test, usecol, 'Test'),
                      on=usecol, how='outer').replace(np.nan, 0)

    # Keep only values seen more than 1000 times in train ...
    counts = counts[counts['Train'] > 1000].reset_index(drop=True)
    counts['Total'] = counts['Train'] + counts['Test']

    # ... whose train share of all occurrences lies strictly between 20% and 80%.
    train_share = counts['Train'] / counts['Total']
    kept = counts[(train_share > 0.2) & (train_share < 0.8)]
    lookup = kept[[usecol]].assign(**{usecol + 'Copy': kept[usecol]})

    train[usecol] = _keep_or_zero(train, usecol, lookup)
    test[usecol] = _keep_or_zero(test, usecol, lookup)

    # Release the per-column temporaries before moving to the next column.
    del encoder, vocabulary, counts, train_share, kept, lookup, usecol
    gc.collect()

Transform all features to category.



HBox(children=(IntProgress(value=0, max=81), HTML(value='')))




In [140]:
# Detach the labels into a standalone NumPy array (an explicit copy), so they
# stay available once the column is removed from the training frame.
y_train = train['HasDetections'].values.copy()

In [141]:
# Remove the label and the row id from the training frame, and the row id
# from the test frame. The test frame keeps its placeholder 'HasDetections'.
train.drop(['HasDetections', 'MachineIdentifier'], axis=1, inplace=True)
test.drop(['MachineIdentifier'], axis=1, inplace=True)
gc.collect()

21

In [142]:
# Settings for the ctrNet model, collected in a plain dict first and then
# wrapped into a TensorFlow HParams object.
nffm_settings = {
    'model': 'nffm',
    'norm': True,
    'batch_norm_decay': 0.9,
    'hidden_size': [128, 128],
    'k': 8,
    'hash_ids': int(2e5),
    'batch_size': 1024,
    'optimizer': "adam",
    'learning_rate': 0.001,
    'num_display_steps': 1000,
    'num_eval_steps': 1000,
    'epoch': 1,
    'metric': 'auc',
    'init_method': 'uniform',
    'init_value': 0.1,
    # One input field per remaining column of the training frame.
    'feature_nums': train.shape[1],
    'kfold': 4,
}
hparam = tf.contrib.training.HParams(**nffm_settings)

In [143]:
# Print the configured hyper-parameters as name=value lines for the record.
utils.print_hparams(hparam)

  batch_norm_decay=0.9
  batch_size=1024
  epoch=1
  feature_nums=81
  hash_ids=200000
  hidden_size=[128, 128]
  init_method=uniform
  init_value=0.1
  k=8
  kfold=4
  learning_rate=0.001
  metric=auc
  model=nffm
  norm=True
  num_display_steps=1000
  num_eval_steps=1000
  optimizer=adam


In [157]:
# Shuffled, seeded stratified splitter shared by the cross-validation loops.
# NOTE(review): this yields 5 folds while hparam.kfold is set to 4 -- keep the
# two in sync wherever fold averaging relies on hparam.kfold.
cv_settings = {'n_splits': 5, 'shuffle': True, 'random_state': 42}
skf = StratifiedKFold(**cv_settings)

In [158]:
print('\nLightGBM\n')

# The test frame still carries the placeholder 'HasDetections' column, which
# is not among the training features; strip it once so that the model sees
# the same columns at prediction time as at fit time.
test_features = test.drop(['HasDetections'], axis=1, errors='ignore')

# Accumulator for the test-set probabilities. It must exist before the first
# `+=`. It holds the SUM over folds; divide by skf.get_n_splits() for the mean.
lgb_test_result = np.zeros(len(test_features))

for i, (train_index, test_index) in enumerate(skf.split(train.index, y_train)):

    print('Fold {}\n'.format(i + 1))

    # skf.split yields positional indices, so rows are selected by position.
    x_trn = train.iloc[train_index]
    y_trn = y_train[train_index]

    x_val = train.iloc[test_index]
    y_val = y_train[test_index]

    lgb_model = lgb.LGBMClassifier(max_depth=-1,
                                   n_estimators=30000,
                                   learning_rate=0.05,
                                   num_leaves=2**12-1,
                                   colsample_bytree=0.28,
                                   objective='binary',
                                   n_jobs=-1)

    # Early stopping on the held-out fold bounds the 30000-tree budget.
    lgb_model.fit(x_trn, y_trn, eval_metric='auc',
                  eval_set=[(x_val, y_val)],
                  verbose=100, early_stopping_rounds=100)

    # Column 1 of predict_proba is the probability of the positive class.
    lgb_test_result += lgb_model.predict_proba(test_features)[:, 1]


LightGBM

Fold 1

Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.608471	valid_0's auc: 0.729571


KeyboardInterrupt: 

In [145]:
# Number of folds actually produced by the splitter. The per-fold predictions
# are averaged with this value rather than hparam.kfold, so the result is a
# true mean even when the two settings differ.
n_folds = skf.get_n_splits()

for i, (train_index, test_index) in enumerate(skf.split(train.index, y_train)):

    print('Fold {}\n'.format(i + 1))

    # skf.split yields positional indices, so rows are selected by position.
    x_trn = train.iloc[train_index]
    y_trn = y_train[train_index]

    x_val = train.iloc[test_index]
    y_val = y_train[test_index]

    # A fresh model is built and trained for every fold.
    model = ctrNet.build_model(hparam)
    model.train(train_data=(x_trn, y_trn), dev_data=(x_val, y_val))
    print("Training Done! Inference...")

    # Inference takes (features, labels); the test labels are the placeholder
    # 'HasDetections' column and the features are everything else.
    fold_preds = model.infer(
        dev_data=(test.drop(['HasDetections'], axis=1), test['HasDetections'])
    ) / n_folds

    if i == 0:
        preds = fold_preds
    else:
        preds += fold_preds

Fold 1

# Trainable variables
  emb_v1:0, (200000, 1), 
  emb_v2:0, (200000, 81, 8), 
  Variable:0, (3240, 128), 
  norm_0/beta:0, (128,), 
  norm_0/gamma:0, (128,), 
  Variable_1:0, (128, 128), 
  norm_1/beta:0, (128,), 
  norm_1/gamma:0, (128,), 
  Variable_2:0, (128, 1), 
  Variable_3:0, (), 
  epoch 0 step 1000 lr 0.001 logloss 0.654539 gN 0.29, Mon Feb 11 03:53:19 2019
# Epcho-time 2191.74s Eval AUC 0.719400. Best AUC 0.719400.
  epoch 0 step 2000 lr 0.001 logloss 0.608625 gN 0.18, Mon Feb 11 04:52:50 2019
# Epcho-time 5762.65s Eval AUC 0.724763. Best AUC 0.724763.
  epoch 0 step 3000 lr 0.001 logloss 0.606413 gN 0.17, Mon Feb 11 05:52:11 2019
# Epcho-time 9323.49s Eval AUC 0.728711. Best AUC 0.728711.
  epoch 0 step 4000 lr 0.001 logloss 0.603810 gN 0.16, Mon Feb 11 06:51:32 2019
# Epcho-time 12884.71s Eval AUC 0.730823. Best AUC 0.730823.
  epoch 0 step 5000 lr 0.001 logloss 0.602947 gN 0.15, Mon Feb 11 07:50:53 2019
# Epcho-time 16445.50s Eval AUC 0.730886. Best AUC 0.730886.
 

KeyboardInterrupt: 