In [1]:
from fastai.learner import *
from fastai.column_data import *
from fastai.structured import *

  from numpy.core.umath_tests import inner1d


In [2]:
import gc

In [3]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` in place to the narrowest dtype that fits.

    For every column whose dtype is one of int16/int32/int64/float16/float32/
    float64, the column's min and max are compared against the representable
    range of progressively wider NumPy types, and the column is cast to the
    first type that holds both extremes. All other columns are left alone.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place and also returned.
    verbose : bool, default True
        When true, print the final memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.

    Notes
    -----
    Range checks are inclusive, so a column whose extremes sit exactly on a
    type's limits (e.g. -128..127) still gets that type.
    Float downcasting only checks the value range, not precision: values cast
    to float16/float32 may be rounded.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_targets = (np.int8, np.int16, np.int32, np.int64)
    float_targets = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Try the integer types from narrowest to widest. If min/max are
            # NaN (e.g. an empty column) no comparison succeeds and the column
            # keeps its dtype.
            for target in int_targets:
                limits = np.iinfo(target)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(target)
                    break
        else:
            for target in float_targets:
                limits = np.finfo(target)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(target)
                    break
            else:
                # Out of float32 range, or min/max not comparable (NaN/inf).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-byte starting footprint.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [4]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.model_selection import KFold
import warnings
import gc
import time
import sys
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.metrics import mean_squared_error
warnings.simplefilter(action='ignore', category=FutureWarning)
from sklearn import metrics
# Plotly library
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
from plotly import tools
# Switch plotly's offline mode on for this notebook session.
init_notebook_mode(connected=True)
# Let pandas show up to 500 columns before truncating wide frames.
pd.set_option('display.max_columns', 500)

In [5]:
# Explicit dtype for each CSV column; passed to pd.read_csv(dtype=dtypes) and
# later split into numeric vs. non-numeric column lists by dtype name.
# NOTE(review): several *Identifier columns are typed float16, which cannot
# represent every integer above 2048 exactly, so distinct ids may collapse
# onto the same value. Confirm this loss is acceptable.
dtypes = {
        'MachineIdentifier':                                    'category',
        'ProductName':                                          'category',
        'EngineVersion':                                        'category',
        'AppVersion':                                           'category',
        'AvSigVersion':                                         'category',
        'IsBeta':                                               'int8',
        'RtpStateBitfield':                                     'float16',
        'IsSxsPassiveMode':                                     'int8',
        'DefaultBrowsersIdentifier':                            'float16',
        'AVProductStatesIdentifier':                            'float32',
        'AVProductsInstalled':                                  'float16',
        'AVProductsEnabled':                                    'float16',
        'HasTpm':                                               'int8',
        'CountryIdentifier':                                    'int16',
        'CityIdentifier':                                       'float32',
        'OrganizationIdentifier':                               'float16',
        'GeoNameIdentifier':                                    'float16',
        'LocaleEnglishNameIdentifier':                          'int8',
        'Platform':                                             'category',
        'Processor':                                            'category',
        'OsVer':                                                'category',
        'OsBuild':                                              'int16',
        'OsSuite':                                              'int16',
        'OsPlatformSubRelease':                                 'category',
        'OsBuildLab':                                           'category',
        'SkuEdition':                                           'category',
        'IsProtected':                                          'float16',
        'AutoSampleOptIn':                                      'int8',
        'PuaMode':                                              'category',
        'SMode':                                                'float16',
        'IeVerIdentifier':                                      'float16',
        'SmartScreen':                                          'category',
        'Firewall':                                             'float16',
        'UacLuaenable':                                         'float32',
        'Census_MDC2FormFactor':                                'category',
        'Census_DeviceFamily':                                  'category',
        'Census_OEMNameIdentifier':                             'float16',
        'Census_OEMModelIdentifier':                            'float32',
        'Census_ProcessorCoreCount':                            'float16',
        'Census_ProcessorManufacturerIdentifier':               'float16',
        'Census_ProcessorModelIdentifier':                      'float16',
        'Census_ProcessorClass':                                'category',
        'Census_PrimaryDiskTotalCapacity':                      'float32',
        'Census_PrimaryDiskTypeName':                           'category',
        'Census_SystemVolumeTotalCapacity':                     'float32',
        'Census_HasOpticalDiskDrive':                           'int8',
        'Census_TotalPhysicalRAM':                              'float32',
        'Census_ChassisTypeName':                               'category',
        'Census_InternalPrimaryDiagonalDisplaySizeInInches':    'float16',
        'Census_InternalPrimaryDisplayResolutionHorizontal':    'float16',
        'Census_InternalPrimaryDisplayResolutionVertical':      'float16',
        'Census_PowerPlatformRoleName':                         'category',
        'Census_InternalBatteryType':                           'category',
        'Census_InternalBatteryNumberOfCharges':                'float32',
        'Census_OSVersion':                                     'category',
        'Census_OSArchitecture':                                'category',
        'Census_OSBranch':                                      'category',
        'Census_OSBuildNumber':                                 'int16',
        'Census_OSBuildRevision':                               'int32',
        'Census_OSEdition':                                     'category',
        'Census_OSSkuName':                                     'category',
        'Census_OSInstallTypeName':                             'category',
        'Census_OSInstallLanguageIdentifier':                   'float16',
        'Census_OSUILocaleIdentifier':                          'int16',
        'Census_OSWUAutoUpdateOptionsName':                     'category',
        'Census_IsPortableOperatingSystem':                     'int8',
        'Census_GenuineStateName':                              'category',
        'Census_ActivationChannel':                             'category',
        'Census_IsFlightingInternal':                           'float16',
        'Census_IsFlightsDisabled':                             'float16',
        'Census_FlightRing':                                    'category',
        'Census_ThresholdOptIn':                                'float16',
        'Census_FirmwareManufacturerIdentifier':                'float16',
        'Census_FirmwareVersionIdentifier':                     'float32',
        'Census_IsSecureBootEnabled':                           'int8',
        'Census_IsWIMBootEnabled':                              'float16',
        'Census_IsVirtualDevice':                               'float16',
        'Census_IsTouchEnabled':                                'int8',
        'Census_IsPenCapable':                                  'int8',
        'Census_IsAlwaysOnAlwaysConnectedCapable':              'float16',
        'Wdft_IsGamer':                                         'float16',
        'Wdft_RegionIdentifier':                                'float16',
        'HasDetections':                                        'int8'
        }

In [6]:
# Partition the declared columns by the dtype name recorded in `dtypes`:
# anything that is not an int/float type counts as categorical here.
numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerical_columns = [name for name in dtypes if dtypes[name] in numerics]
categorical_columns = [name for name in dtypes if dtypes[name] not in numerics]

In [7]:
# Row cap for quick experiments. It is currently unused: the `nrows` argument
# of the training read is commented out, so the full file is loaded.
nrows = 1000000
#_______________________________________________________________________________
# Training read: every column that has an entry in `dtypes`.
retained_columns = numerical_columns + categorical_columns
train = pd.read_csv('data/malware/train.csv',
#                     nrows = nrows,
                    usecols = retained_columns,
                    dtype = dtypes)
#_______________________________________________________________
# Test read: the same columns minus the label. 'MachineIdentifier' is already
# in the list (it is typed 'category' in `dtypes`), so it is only appended when
# missing instead of handing `usecols` a duplicated name.
retained_columns = [c for c in retained_columns if c != 'HasDetections']
if 'MachineIdentifier' not in retained_columns:
    retained_columns.append('MachineIdentifier')
test = pd.read_csv('data/malware/test.csv',
                   usecols = retained_columns,
                   dtype = dtypes)

In [8]:
# Downcast numeric columns of both frames. reduce_mem_usage modifies the frame
# it is given and returns that same object, so the re-assignment is cosmetic.
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

Mem. usage decreased to 1673.25 Mb (0.0% reduction)
Mem. usage decreased to 1503.79 Mb (0.0% reduction)


In [9]:
# Columns kept as continuous model inputs. Every other feature column is
# treated as categorical (or binary) further down, even when its dtype is numeric.
true_numerical_columns = [
    'Census_ProcessorCoreCount',
    'Census_PrimaryDiskTotalCapacity',
    'Census_SystemVolumeTotalCapacity',
    'Census_TotalPhysicalRAM',
    'Census_InternalPrimaryDiagonalDisplaySizeInInches',
    'Census_InternalPrimaryDisplayResolutionHorizontal',
    'Census_InternalPrimaryDisplayResolutionVertical',
    'Census_InternalBatteryNumberOfCharges'
]

In [10]:
# Columns of `train` with exactly two distinct values (nunique's default
# ignores missing values), listed in column order.
binary_variables = list(train.columns[(train.nunique() == 2).to_numpy()])

In [11]:
# Everything that is neither a continuous column nor a two-valued column is
# handled as categorical. Logical `and` replaces the bitwise `&` that was being
# applied to plain booleans.
categorical_columns = [
    c for c in train.columns
    if c not in true_numerical_columns and c not in binary_variables
]

In [25]:
# Inspect the two column groups built above.
print(categorical_columns)
print(binary_variables)

['ProductName', 'EngineVersion', 'AppVersion', 'AvSigVersion', 'RtpStateBitfield', 'DefaultBrowsersIdentifier', 'AVProductStatesIdentifier', 'AVProductsInstalled', 'AVProductsEnabled', 'CountryIdentifier', 'CityIdentifier', 'OrganizationIdentifier', 'GeoNameIdentifier', 'LocaleEnglishNameIdentifier', 'Platform', 'Processor', 'OsVer', 'OsBuild', 'OsSuite', 'OsPlatformSubRelease', 'OsBuildLab', 'SkuEdition', 'IeVerIdentifier', 'SmartScreen', 'UacLuaenable', 'Census_MDC2FormFactor', 'Census_DeviceFamily', 'Census_OEMNameIdentifier', 'Census_OEMModelIdentifier', 'Census_ProcessorManufacturerIdentifier', 'Census_ProcessorModelIdentifier', 'Census_ProcessorClass', 'Census_PrimaryDiskTypeName', 'Census_ChassisTypeName', 'Census_PowerPlatformRoleName', 'Census_InternalBatteryType', 'Census_OSVersion', 'Census_OSArchitecture', 'Census_OSBranch', 'Census_OSBuildNumber', 'Census_OSBuildRevision', 'Census_OSEdition', 'Census_OSSkuName', 'Census_OSInstallTypeName', 'Census_OSInstallLanguageIdentifi

In [12]:
# Remove the target from the binary feature list by name. Slicing off the last
# element only works while 'HasDetections' happens to be the final two-valued
# column, and re-running the cell would silently discard a real feature.
binary_variables = [c for c in binary_variables if c != 'HasDetections']

In [13]:
# Remove the row id from the categorical features by name. Dropping index 0
# relies on 'MachineIdentifier' being the first column and is not safe to re-run.
categorical_columns = [c for c in categorical_columns if c != 'MachineIdentifier']

In [14]:
# Treat the binary features as categoricals too. Only names not already present
# are appended, so re-running the cell does not duplicate columns.
categorical_columns = categorical_columns + [
    c for c in binary_variables if c not in categorical_columns
]

In [15]:
# Name of the target column.
dep = 'HasDetections'
# Working copy of the training data: categorical features first, then the
# continuous ones, then the target.
df_train = train[categorical_columns+true_numerical_columns+[dep]].copy()

In [16]:
# The test file is read without the target column, so add a placeholder to give
# the test frame the same feature/target layout as df_train.
test[dep] = 0.0
# Same column order as df_train, plus the row id needed for the submission file.
df_test = test[categorical_columns+true_numerical_columns+[dep, 'MachineIdentifier']].copy()

In [17]:
# Convert every categorical feature of the training frame to an ordered pandas categorical.
for v in categorical_columns: df_train[v] = df_train[v].astype('category').cat.as_ordered()
# apply_cats comes from the fastai star imports; presumably it re-encodes
# df_test in place using the category sets of df_train. TODO confirm.
apply_cats(df_test, df_train)

In [18]:
# The raw frames are no longer needed (df_train / df_test are copies), so drop
# the references and ask the garbage collector to run.
del train
del test
gc.collect()

201229

In [20]:
# Number of training rows; used below to size the train/validation split.
samp_size = len(df_train)
# proc_df comes from the fastai star imports; here it yields the feature frame,
# the target and `nas`, which is passed back in as `na_dict` for the test frame.
# NOTE(review): do_scale=False presumably leaves the continuous columns on their
# raw scale. Confirm that is intended for a neural network.
df, y, nas = proc_df(df_train, dep, do_scale=False)

In [22]:
# Process the test frame with the `nas` mapping obtained from the training frame,
# leaving the row id out of the features. Note that `nas` is rebound to the
# value returned by this call.
df_test1, _, nas = proc_df(df_test, dep, do_scale=False,skip_flds=['MachineIdentifier'], na_dict=nas)

In [23]:
# Hold out the tail of the processed training frame for validation: the first
# `train_size` rows train the model, the remaining row positions form `val_idx`.
train_ratio = 0.75  # alternative tried: 0.9
train_size = int(train_ratio * samp_size)
val_idx = list(range(train_size, len(df)))

In [24]:
def exp_rmse(y_pred, targ):
    """Return the root-mean-squared error between `targ` and `y_pred`.

    Both arguments must support element-wise subtraction and power, and the
    result of those must offer `.mean()` (e.g. NumPy arrays). A plain Python
    float is returned.
    """
    residual = targ - y_pred
    mean_squared = (residual ** 2).mean()
    return math.sqrt(mean_squared)

In [25]:
# Data directory, relative to the notebook's working directory; handed to the
# model data object and used as the submission output folder.
PATH=Path('data/malware')

In [26]:
# Build the fastai model-data object from the processed frames: `val_idx` marks
# the validation rows, the target is cast to float32, the categorical fields are
# named explicitly and the batch size is 512. The processed test frame is attached
# so predictions can be produced for it later.
md = ColumnarModelData.from_data_frame(PATH, val_idx, df, y.astype(np.float32), cat_flds=categorical_columns, bs=512,
                                       test_df=df_test1)

In [27]:
# (column name, number of categories + 1) for each categorical feature.
# The extra slot is presumably reserved for missing/unknown values. TODO confirm.
cat_sz = [(name, 1 + len(df_train[name].cat.categories)) for name in categorical_columns]
cat_sz

[('ProductName', 7),
 ('EngineVersion', 71),
 ('AppVersion', 111),
 ('AvSigVersion', 8532),
 ('RtpStateBitfield', 8),
 ('DefaultBrowsersIdentifier', 1731),
 ('AVProductStatesIdentifier', 28971),
 ('AVProductsInstalled', 9),
 ('AVProductsEnabled', 7),
 ('CountryIdentifier', 223),
 ('CityIdentifier', 107367),
 ('OrganizationIdentifier', 50),
 ('GeoNameIdentifier', 293),
 ('LocaleEnglishNameIdentifier', 253),
 ('Platform', 5),
 ('Processor', 4),
 ('OsVer', 59),
 ('OsBuild', 77),
 ('OsSuite', 15),
 ('OsPlatformSubRelease', 10),
 ('OsBuildLab', 664),
 ('SkuEdition', 9),
 ('IeVerIdentifier', 304),
 ('SmartScreen', 22),
 ('UacLuaenable', 12),
 ('Census_MDC2FormFactor', 14),
 ('Census_DeviceFamily', 4),
 ('Census_OEMNameIdentifier', 2565),
 ('Census_OEMModelIdentifier', 175366),
 ('Census_ProcessorManufacturerIdentifier', 8),
 ('Census_ProcessorModelIdentifier', 2584),
 ('Census_ProcessorClass', 4),
 ('Census_PrimaryDiskTypeName', 5),
 ('Census_ChassisTypeName', 53),
 ('Census_PowerPlatformRol

In [28]:
# (cardinality, embedding width) per categorical feature: the width is half the
# cardinality rounded up, capped at 50.
emb_szs = [(card, min(50, (card + 1) // 2)) for card in (size for _, size in cat_sz)]
emb_szs

[(7, 4),
 (71, 36),
 (111, 50),
 (8532, 50),
 (8, 4),
 (1731, 50),
 (28971, 50),
 (9, 5),
 (7, 4),
 (223, 50),
 (107367, 50),
 (50, 25),
 (293, 50),
 (253, 50),
 (5, 3),
 (4, 2),
 (59, 30),
 (77, 39),
 (15, 8),
 (10, 5),
 (664, 50),
 (9, 5),
 (304, 50),
 (22, 11),
 (12, 6),
 (14, 7),
 (4, 2),
 (2565, 50),
 (175366, 50),
 (8, 4),
 (2584, 50),
 (4, 2),
 (5, 3),
 (53, 27),
 (11, 6),
 (79, 40),
 (470, 50),
 (4, 2),
 (33, 17),
 (166, 50),
 (286, 50),
 (34, 17),
 (31, 16),
 (10, 5),
 (40, 20),
 (148, 50),
 (7, 4),
 (6, 3),
 (7, 4),
 (11, 6),
 (713, 50),
 (50495, 50),
 (16, 8),
 (3, 2),
 (3, 2),
 (3, 2),
 (3, 2),
 (3, 2),
 (3, 2),
 (3, 2),
 (3, 2),
 (3, 2),
 (3, 2),
 (3, 2),
 (3, 2),
 (3, 2),
 (3, 2),
 (3, 2),
 (3, 2),
 (3, 2),
 (3, 2),
 (3, 2),
 (3, 2)]

In [29]:
# Create the learner. The second argument is the number of non-categorical
# feature columns. The remaining positional values are presumably embedding
# dropout (0.04), output size (1), hidden layer sizes ([1000, 500]) and
# per-layer dropout ([0.001, 0.01]). TODO confirm against fastai's get_learner.
m = md.get_learner(emb_szs,len(df.columns)-len(categorical_columns),0.04,1,[1000,500],[0.001,0.01])

In [30]:
# Train with learning rate 1e-3 for 3 cycles of length 1, reporting exp_rmse.
# NOTE(review): the recorded val_loss of ~0.25 (RMSE ~0.5) is what a constant
# 0.5 prediction scores on a balanced 0/1 target, so the model may not be
# learning anything. Verify before trusting the submission.
m.fit(1e-3, 3,metrics=[exp_rmse],cycle_len=1)

HBox(children=(IntProgress(value=0, description='Epoch', max=3), HTML(value='')))

epoch      trn_loss   val_loss   exp_rmse                                                                              
    0      0.252224   0.249141   0.49914   
    1      0.24934    0.249229   0.499228                                                                              
    2      0.249838   0.249779   0.499779                                                                              



[array([0.24978]), 0.4997787873351206]

In [31]:
# Score the learner's predictions against the targets it returns alongside them.
# Dedicated names are used so the training target `y` produced by proc_df is not
# overwritten; rebinding it would break a re-run of the model-data cell.
val_preds, val_targs = m.predict_with_targs()
exp_rmse(val_preds, val_targs)

0.4997789967393764

In [32]:
# Predict with the learner; the positional True presumably selects the attached
# test set. TODO confirm against fastai's Learner.predict.
pred_test=m.predict(True)
# pred_test = np.exp(pred_test)
# Overwrite the placeholder target in the test frame with the predictions.
df_test[dep]=pred_test

In [33]:
# Write the submission (row id + predicted target). The output path is built
# with pathlib instead of a hard-coded backslash: '\s' is an invalid escape
# sequence, and outside Windows it would create a file literally named
# 'malware\submissionv3.csv' instead of writing into the data folder.
df_test.to_csv(PATH/'submissionv3.csv',columns =['MachineIdentifier','HasDetections'],index=False)