# EDA - Clustering Feature - Machine Info
* 멀웨어 소프트웨어가 아닌 하드웨어 특성에 대한 EDA

## 라이브러리

In [1]:
import numpy as np
import pandas as pd
import warnings
import gc
warnings.filterwarnings("ignore")

In [2]:
# Widen pandas' display limits so long value_counts / groupby outputs are not
# truncated. Fully-qualified option names are required: the short forms
# 'max_rows' and 'max_columns' also match the 'styler.render.*' options on
# pandas >= 1.4 and raise OptionError ("Pattern matched multiple keys").
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_colwidth', 500)
pd.set_option('display.max_columns', 500)

## 데이터 로드

In [3]:
# Column name -> dtype map passed to pd.read_csv so each column is parsed
# straight into a compact type.
# NOTE(review): float16 represents integers exactly only up to 2048. If any
# *Identifier column stored as float16 carries larger codes, distinct ids
# would collapse onto the same value -- confirm against the data.
dtypes = {
    'MachineIdentifier': 'object',
    'ProductName': 'category',
    'EngineVersion': 'category',
    'AppVersion': 'category',
    'AvSigVersion': 'category',
    'IsBeta': 'int8',
    'RtpStateBitfield': 'float16',
    'IsSxsPassiveMode': 'int8',
    'DefaultBrowsersIdentifier': 'float16',
    'AVProductStatesIdentifier': 'float32',
    'AVProductsInstalled': 'float16',
    'AVProductsEnabled': 'float16',
    'HasTpm': 'int8',
    'CountryIdentifier': 'int16',
    'CityIdentifier': 'float32',
    'OrganizationIdentifier': 'float16',
    'GeoNameIdentifier': 'float16',
    'LocaleEnglishNameIdentifier': 'int8',
    'Platform': 'category',
    'Processor': 'category',
    'OsVer': 'category',
    'OsBuild': 'int16',
    'OsSuite': 'int16',
    'OsPlatformSubRelease': 'category',
    'OsBuildLab': 'category',
    'SkuEdition': 'category',
    'IsProtected': 'float16',
    'AutoSampleOptIn': 'int8',
    'PuaMode': 'category',
    'SMode': 'float16',
    'IeVerIdentifier': 'float16',
    'SmartScreen': 'category',
    'Firewall': 'float16',
    'UacLuaenable': 'float32',
    'Census_MDC2FormFactor': 'category',
    'Census_DeviceFamily': 'category',
    'Census_OEMNameIdentifier': 'float16',
    'Census_OEMModelIdentifier': 'float32',
    'Census_ProcessorCoreCount': 'float16',
    'Census_ProcessorManufacturerIdentifier': 'float16',
    'Census_ProcessorModelIdentifier': 'float16',
    'Census_ProcessorClass': 'category',
    'Census_PrimaryDiskTotalCapacity': 'float32',
    'Census_PrimaryDiskTypeName': 'category',
    'Census_SystemVolumeTotalCapacity': 'float32',
    'Census_HasOpticalDiskDrive': 'int8',
    'Census_TotalPhysicalRAM': 'float32',
    'Census_ChassisTypeName': 'category',
    'Census_InternalPrimaryDiagonalDisplaySizeInInches': 'float16',
    'Census_InternalPrimaryDisplayResolutionHorizontal': 'float16',
    'Census_InternalPrimaryDisplayResolutionVertical': 'float16',
    'Census_PowerPlatformRoleName': 'category',
    'Census_InternalBatteryType': 'category',
    'Census_InternalBatteryNumberOfCharges': 'float32',
    'Census_OSVersion': 'category',
    'Census_OSArchitecture': 'category',
    'Census_OSBranch': 'category',
    'Census_OSBuildNumber': 'int16',
    'Census_OSBuildRevision': 'int32',
    'Census_OSEdition': 'category',
    'Census_OSSkuName': 'category',
    'Census_OSInstallTypeName': 'category',
    'Census_OSInstallLanguageIdentifier': 'float16',
    'Census_OSUILocaleIdentifier': 'int16',
    'Census_OSWUAutoUpdateOptionsName': 'category',
    'Census_IsPortableOperatingSystem': 'int8',
    'Census_GenuineStateName': 'category',
    'Census_ActivationChannel': 'category',
    'Census_IsFlightingInternal': 'float16',
    'Census_IsFlightsDisabled': 'float16',
    'Census_FlightRing': 'category',
    'Census_ThresholdOptIn': 'float16',
    'Census_FirmwareManufacturerIdentifier': 'float16',
    'Census_FirmwareVersionIdentifier': 'float32',
    'Census_IsSecureBootEnabled': 'int8',
    'Census_IsWIMBootEnabled': 'float16',
    'Census_IsVirtualDevice': 'float16',
    'Census_IsTouchEnabled': 'int8',
    'Census_IsPenCapable': 'int8',
    'Census_IsAlwaysOnAlwaysConnectedCapable': 'float16',
    'Wdft_IsGamer': 'float16',
    'Wdft_RegionIdentifier': 'float16',
    'HasDetections': 'int8',
}

In [4]:
%%time
# (%%time is an IPython cell magic and has to stay on the first line of the cell.)
# Read the train and test CSVs, forcing the column types declared in `dtypes`.
# The two reads are kept as separate statements so `train` is already bound
# if reading the test file fails.
train = pd.read_csv('./data/train.csv', dtype=dtypes)
test = pd.read_csv('./data/test.csv', dtype=dtypes)

Wall time: 8min 45s


In [5]:
# Stack train and test row-wise into one frame used by the EDA cells below.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# with default arguments is the equivalent call (original index labels are
# kept, columns are not re-sorted).
data = pd.concat([train, test])

## Machine Info 관련 피처

* Processor
* Census_MDC2FormFactor
* Census_DeviceFamily
* Census_ProcessorCoreCount
* Census_ProcessorManufacturerIdentifier
* Census_ProcessorModelIdentifier
* Census_ProcessorClass
* Census_PrimaryDiskTotalCapacity
* Census_PrimaryDiskTypeName
* Census_SystemVolumeTotalCapacity
* Census_HasOpticalDiskDrive
* Census_TotalPhysicalRAM
* Census_ChassisTypeName
* Census_InternalPrimaryDiagonalDisplaySizeInInches
* Census_InternalPrimaryDisplayResolutionHorizontal
* Census_InternalPrimaryDisplayResolutionVertical
* Census_PowerPlatformRoleName
* Census_InternalBatteryType
* Census_InternalBatteryNumberOfCharges
* Census_OSArchitecture (삭제: 아래 분포에서 보듯 Processor와 사실상 동일한 정보라 중복)
* Census_IsTouchEnabled
* Census_IsPenCapable
* Census_IsAlwaysOnAlwaysConnectedCapable

### Processor

In [23]:
# Frequency of each Processor value in the combined frame; NaN is counted too.
data['Processor'].value_counts(dropna=False)

x64      15278795
x86       1495384
arm64         557
Name: Processor, dtype: int64

### Census_OSArchitecture

In [24]:
# Frequency of each Census_OSArchitecture value in the combined frame; NaN is counted too.
data['Census_OSArchitecture'].value_counts(dropna=False)

amd64    15280552
x86       1493627
arm64         557
Name: Census_OSArchitecture, dtype: int64

### Census_MDC2FormFactor

In [25]:
# Frequency of each Census_MDC2FormFactor value in the combined frame; NaN is counted too.
data['Census_MDC2FormFactor'].value_counts(dropna=False)

Notebook        10652855
Desktop          3813472
Convertible       744141
AllInOne          547632
Detachable        544171
PCOther           275408
LargeTablet       121325
SmallTablet        52491
SmallServer        15563
MediumServer        6169
LargeServer         1453
ServerOther           53
IoTOther               2
Other                  1
Name: Census_MDC2FormFactor, dtype: int64

### Census_DeviceFamily
* 건수가 매우 적은 `Windows` 값(45건)은 `Windows.Desktop`으로 치환

In [27]:
# Frequency of each Census_DeviceFamily value in the combined frame; NaN is counted too.
data['Census_DeviceFamily'].value_counts(dropna=False)

Windows.Desktop    16749228
Windows.Server        25463
Windows                  45
Name: Census_DeviceFamily, dtype: int64

In [28]:
# Row count for every (Census_DeviceFamily, Census_MDC2FormFactor) pair.
data.groupby(
    by=['Census_DeviceFamily', 'Census_MDC2FormFactor'],
).size()

Census_DeviceFamily  Census_MDC2FormFactor
Windows.Desktop      AllInOne                   547618
                     Convertible                744132
                     Desktop                   3808557
                     Detachable                 544170
                     IoTOther                        2
                     LargeServer                    15
                     LargeTablet                121323
                     MediumServer                  969
                     Notebook                 10652495
                     Other                           1
                     PCOther                    275402
                     SmallServer                  2053
                     SmallTablet                 52491
Windows.Server       AllInOne                       12
                     Convertible                     7
                     Desktop                      4899
                     LargeServer                  1438
                     M

In [35]:
# Sum of HasDetections per form factor, restricted to the train rows whose
# Census_DeviceFamily is 'Windows.Server'.
is_windows_server = train['Census_DeviceFamily'] == 'Windows.Server'
train.loc[is_windows_server].groupby('Census_MDC2FormFactor')['HasDetections'].sum()

Census_MDC2FormFactor
AllInOne           3.0
Convertible        2.0
Desktop         1240.0
Detachable         0.0
LargeServer      219.0
LargeTablet        0.0
MediumServer     857.0
Notebook          77.0
PCOther            0.0
SmallServer     2627.0
SmallTablet        0.0
ServerOther        9.0
IoTOther           0.0
Name: HasDetections, dtype: float64

In [38]:
# Per form factor, for 'Windows.Server' train rows only: sum of HasDetections
# divided by the group's row count, scaled to a percentage. Groups without
# any rows come out as NaN (0 / 0).
server_by_form_factor = train[train.Census_DeviceFamily == 'Windows.Server'].groupby('Census_MDC2FormFactor')
100 * (server_by_form_factor['HasDetections'].sum() / server_by_form_factor.size())

Census_MDC2FormFactor
AllInOne        60.000000
Convertible     66.666667
Desktop         45.074518
Detachable            NaN
LargeServer     25.259516
LargeTablet           NaN
MediumServer    29.369431
Notebook        39.285714
PCOther          0.000000
SmallServer     34.393820
SmallTablet           NaN
ServerOther     30.000000
IoTOther              NaN
dtype: float64