# EDA - All
* 이 커널에서는 모든 피처를 하나씩 살펴보며 데이터가 어떤 양상을 띠고 있는지 검토합니다.

## 라이브러리

In [1]:
import numpy as np
import pandas as pd
import warnings
import gc
# Suppress every warning category for the rest of the session
# (presumably to keep the cell outputs uncluttered; note this also hides deprecation warnings).
warnings.filterwarnings("ignore")

In [2]:
# Widen the notebook display limits so large frequency tables are not truncated.
# Fully qualified option names are used on purpose: short names are regex-matched
# against every registered option and raise OptionError as soon as more than one
# key matches (e.g. 'max_rows' also matches 'styler.render.max_rows' in recent pandas).
pd.set_option('display.max_rows', 150)
pd.set_option('display.max_colwidth', 500)
pd.set_option('display.max_columns', 500)

## 데이터 로드

In [3]:
# Explicit dtype for every CSV column; this mapping is handed to pd.read_csv
# for both the train and test files instead of letting pandas infer the types.
# NOTE(review): float16 represents integers exactly only up to 2048 and int8
# tops out at 127, so identifier columns stored with these widths could merge
# or wrap distinct IDs if their values exceed those limits — TODO confirm the
# actual value ranges before relying on these columns as identifiers.
dtypes = {
        'MachineIdentifier':                                    'object',
        'ProductName':                                          'category',
        'EngineVersion':                                        'category',
        'AppVersion':                                           'category',
        'AvSigVersion':                                         'category',
        'IsBeta':                                               'int8',
        'RtpStateBitfield':                                     'float16',
        'IsSxsPassiveMode':                                     'int8',
        'DefaultBrowsersIdentifier':                            'float16',
        'AVProductStatesIdentifier':                            'float32',
        'AVProductsInstalled':                                  'float16',
        'AVProductsEnabled':                                    'float16',
        'HasTpm':                                               'int8',
        'CountryIdentifier':                                    'int16',
        'CityIdentifier':                                       'float32',
        'OrganizationIdentifier':                               'float16',
        'GeoNameIdentifier':                                    'float16',
        'LocaleEnglishNameIdentifier':                          'int8',
        'Platform':                                             'category',
        'Processor':                                            'category',
        'OsVer':                                                'category',
        'OsBuild':                                              'int16',
        'OsSuite':                                              'int16',
        'OsPlatformSubRelease':                                 'category',
        'OsBuildLab':                                           'category',
        'SkuEdition':                                           'category',
        'IsProtected':                                          'float16',
        'AutoSampleOptIn':                                      'int8',
        'PuaMode':                                              'category',
        'SMode':                                                'float16',
        'IeVerIdentifier':                                      'float16',
        'SmartScreen':                                          'category',
        'Firewall':                                             'float16',
        'UacLuaenable':                                         'float32',
        'Census_MDC2FormFactor':                                'category',
        'Census_DeviceFamily':                                  'category',
        'Census_OEMNameIdentifier':                             'float16',
        'Census_OEMModelIdentifier':                            'float32',
        'Census_ProcessorCoreCount':                            'float16',
        'Census_ProcessorManufacturerIdentifier':               'float16',
        'Census_ProcessorModelIdentifier':                      'float16',
        'Census_ProcessorClass':                                'category',
        'Census_PrimaryDiskTotalCapacity':                      'float32',
        'Census_PrimaryDiskTypeName':                           'category',
        'Census_SystemVolumeTotalCapacity':                     'float32',
        'Census_HasOpticalDiskDrive':                           'int8',
        'Census_TotalPhysicalRAM':                              'float32',
        'Census_ChassisTypeName':                               'category',
        'Census_InternalPrimaryDiagonalDisplaySizeInInches':    'float16',
        'Census_InternalPrimaryDisplayResolutionHorizontal':    'float16',
        'Census_InternalPrimaryDisplayResolutionVertical':      'float16',
        'Census_PowerPlatformRoleName':                         'category',
        'Census_InternalBatteryType':                           'category',
        'Census_InternalBatteryNumberOfCharges':                'float32',
        'Census_OSVersion':                                     'category',
        'Census_OSArchitecture':                                'category',
        'Census_OSBranch':                                      'category',
        'Census_OSBuildNumber':                                 'int16',
        'Census_OSBuildRevision':                               'int32',
        'Census_OSEdition':                                     'category',
        'Census_OSSkuName':                                     'category',
        'Census_OSInstallTypeName':                             'category',
        'Census_OSInstallLanguageIdentifier':                   'float16',
        'Census_OSUILocaleIdentifier':                          'int16',
        'Census_OSWUAutoUpdateOptionsName':                     'category',
        'Census_IsPortableOperatingSystem':                     'int8',
        'Census_GenuineStateName':                              'category',
        'Census_ActivationChannel':                             'category',
        'Census_IsFlightingInternal':                           'float16',
        'Census_IsFlightsDisabled':                             'float16',
        'Census_FlightRing':                                    'category',
        'Census_ThresholdOptIn':                                'float16',
        'Census_FirmwareManufacturerIdentifier':                'float16',
        'Census_FirmwareVersionIdentifier':                     'float32',
        'Census_IsSecureBootEnabled':                           'int8',
        'Census_IsWIMBootEnabled':                              'float16',
        'Census_IsVirtualDevice':                               'float16',
        'Census_IsTouchEnabled':                                'int8',
        'Census_IsPenCapable':                                  'int8',
        'Census_IsAlwaysOnAlwaysConnectedCapable':              'float16',
        'Wdft_IsGamer':                                         'float16',
        'Wdft_RegionIdentifier':                                'float16',
        # Label column that the per-feature detection tables sum and count.
        'HasDetections':                                        'int8'
        }

In [4]:
# Read both CSV files using the explicit `dtypes` column-type mapping.
train_csv, test_csv = './data/train.csv', './data/test.csv'
train = pd.read_csv(train_csv, dtype=dtypes)
test = pd.read_csv(test_csv, dtype=dtypes)

## 컬럼 분석

### Census_OSInstallLanguageIdentifier

In [84]:
# Feature inspected by the following cells (train/test frequency tables and detection rate).
col = 'Census_OSInstallLanguageIdentifier'

In [85]:
print("[train]")
# Count every distinct value (NaN included) once and reuse the result for the share column.
counts = train[col].value_counts(dropna=False)
temp = counts.to_frame()
temp['rate'] = np.around(100 * counts / train.shape[0], 2)
# Stable sort so values whose rounded rates tie keep value_counts' count-descending order.
temp.sort_values('rate', ascending=False, kind='mergesort')

[train]


Unnamed: 0,Census_OSInstallLanguageIdentifier,rate
8.0,3179262,35.64
9.0,1034201,11.59
7.0,512753,5.75
29.0,492267,5.52
14.0,432503,4.85
37.0,403190,4.52
10.0,366636,4.11
26.0,334766,3.75
5.0,252887,2.83
35.0,204832,2.3


In [86]:
print("[test]")
# Count every distinct value (NaN included) once and reuse the result for the share column.
counts = test[col].value_counts(dropna=False)
temp = counts.to_frame()
temp['rate'] = np.around(100 * counts / test.shape[0], 2)
# Stable sort so values whose rounded rates tie keep value_counts' count-descending order.
temp.sort_values('rate', ascending=False, kind='mergesort')

[test]


Unnamed: 0,Census_OSInstallLanguageIdentifier,rate
8.0,2805320,35.72
9.0,828830,10.55
29.0,543163,6.92
37.0,458146,5.83
7.0,423065,5.39
14.0,356466,4.54
10.0,320350,4.08
26.0,289091,3.68
35.0,200274,2.55
5.0,190338,2.42


In [87]:
# Per distinct value of `col` in train: detections, row count and detection rate (%).
# Rows whose `col` is NaN get no entry, since groupby excludes NaN keys by default.
detections = train.groupby(col)['HasDetections']
table = detections.sum().to_frame()
table['cnt'] = detections.count()
table['rate'] = np.around(table['HasDetections'].div(table['cnt']).mul(100), 2)
table.sort_values(by='HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
Census_OSInstallLanguageIdentifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
8.0,1612174.0,3179262,50.71
9.0,529033.0,1034201,51.15
29.0,249820.0,492267,50.75
7.0,249160.0,512753,48.59
37.0,208475.0,403190,51.71
14.0,203234.0,432503,46.99
10.0,197462.0,366636,53.86
26.0,170360.0,334766,50.89
5.0,111350.0,252887,44.03
35.0,106313.0,204832,51.9


### Census_OSUILocaleIdentifier

In [88]:
# Feature inspected by the following cells (train/test frequency tables and detection rate).
col = 'Census_OSUILocaleIdentifier'

In [89]:
print("[train]")
# Count every distinct value (NaN included) once and reuse the result for the share column.
counts = train[col].value_counts(dropna=False)
temp = counts.to_frame()
temp['rate'] = np.around(100 * counts / train.shape[0], 2)
# Stable sort so values whose rounded rates tie keep value_counts' count-descending order.
temp.sort_values('rate', ascending=False, kind='mergesort')

[train]


Unnamed: 0,Census_OSUILocaleIdentifier,rate
31,3170824,35.54
34,1040042,11.66
30,513995,5.76
125,498236,5.58
49,436691,4.89
158,408118,4.57
35,377071,4.23
119,337988,3.79
26,252863,2.83
148,206854,2.32


In [90]:
print("[test]")
# Count every distinct value (NaN included) once and reuse the result for the share column.
counts = test[col].value_counts(dropna=False)
temp = counts.to_frame()
temp['rate'] = np.around(100 * counts / test.shape[0], 2)
# Stable sort so values whose rounded rates tie keep value_counts' count-descending order.
temp.sort_values('rate', ascending=False, kind='mergesort')

[test]


Unnamed: 0,Census_OSUILocaleIdentifier,rate
31,2800526,35.66
34,833576,10.61
125,549829,7.0
158,463741,5.91
30,424469,5.41
49,360291,4.59
35,327934,4.18
119,291802,3.72
148,202707,2.58
26,190217,2.42


In [91]:
# Per distinct value of `col` in train: detections, row count and detection rate (%).
# Rows whose `col` is NaN get no entry, since groupby excludes NaN keys by default.
detections = train.groupby(col)['HasDetections']
table = detections.sum().to_frame()
table['cnt'] = detections.count()
table['rate'] = np.around(table['HasDetections'].div(table['cnt']).mul(100), 2)
table.sort_values(by='HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
Census_OSUILocaleIdentifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
31,1606877.0,3170824,50.68
34,532015.0,1040042,51.15
125,253204.0,498236,50.82
30,249447.0,513995,48.53
158,211376.0,408118,51.79
49,205293.0,436691,47.01
35,203579.0,377071,53.99
119,171996.0,337988,50.89
26,111208.0,252863,43.98
148,107373.0,206854,51.91


### Census_OSWUAutoUpdateOptionsName

In [92]:
# Feature inspected by the following cells (train/test frequency tables and detection rate).
col = 'Census_OSWUAutoUpdateOptionsName'

In [93]:
print("[train]")
# Count every distinct value (NaN included) once and reuse the result for the share column.
counts = train[col].value_counts(dropna=False)
temp = counts.to_frame()
temp['rate'] = np.around(100 * counts / train.shape[0], 2)
# Stable sort so values whose rounded rates tie keep value_counts' count-descending order.
temp.sort_values('rate', ascending=False, kind='mergesort')

[train]


Unnamed: 0,Census_OSWUAutoUpdateOptionsName,rate
FullAuto,3954497,44.33
UNKNOWN,2519925,28.25
Notify,2034254,22.8
AutoInstallAndRebootAtMaintenanceTime,371475,4.16
Off,26961,0.3
DownloadNotify,14371,0.16


In [94]:
print("[test]")
# Count every distinct value (NaN included) once and reuse the result for the share column.
counts = test[col].value_counts(dropna=False)
temp = counts.to_frame()
temp['rate'] = np.around(100 * counts / test.shape[0], 2)
# Stable sort so values whose rounded rates tie keep value_counts' count-descending order.
temp.sort_values('rate', ascending=False, kind='mergesort')

[test]


Unnamed: 0,Census_OSWUAutoUpdateOptionsName,rate
FullAuto,4113612,52.38
UNKNOWN,2299918,29.29
Notify,1194458,15.21
AutoInstallAndRebootAtMaintenanceTime,213891,2.72
Off,20388,0.26
DownloadNotify,10986,0.14


In [95]:
# Per distinct value of `col` in train: detections, row count and detection rate (%).
# Rows whose `col` is NaN get no entry, since groupby excludes NaN keys by default.
detections = train.groupby(col)['HasDetections']
table = detections.sum().to_frame()
table['cnt'] = detections.count()
table['rate'] = np.around(table['HasDetections'].div(table['cnt']).mul(100), 2)
table.sort_values(by='HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
Census_OSWUAutoUpdateOptionsName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
FullAuto,2040783.0,3954497,51.61
UNKNOWN,1252056.0,2519925,49.69
Notify,967026.0,2034254,47.54
AutoInstallAndRebootAtMaintenanceTime,181112.0,371475,48.75
Off,12891.0,26961,47.81
DownloadNotify,5024.0,14371,34.96


### Census_IsPortableOperatingSystem

In [96]:
# Feature inspected by the following cells (train/test frequency tables and detection rate).
col = 'Census_IsPortableOperatingSystem'

In [97]:
print("[train]")
# Count every distinct value (NaN included) once and reuse the result for the share column.
counts = train[col].value_counts(dropna=False)
temp = counts.to_frame()
temp['rate'] = np.around(100 * counts / train.shape[0], 2)
# Stable sort so values whose rounded rates tie keep value_counts' count-descending order.
temp.sort_values('rate', ascending=False, kind='mergesort')

[train]


Unnamed: 0,Census_IsPortableOperatingSystem,rate
0,8916619,99.95
1,4864,0.05


In [98]:
print("[test]")
# Count every distinct value (NaN included) once and reuse the result for the share column.
counts = test[col].value_counts(dropna=False)
temp = counts.to_frame()
temp['rate'] = np.around(100 * counts / test.shape[0], 2)
# Stable sort so values whose rounded rates tie keep value_counts' count-descending order.
temp.sort_values('rate', ascending=False, kind='mergesort')

[test]


Unnamed: 0,Census_IsPortableOperatingSystem,rate
0,7848659,99.94
1,4594,0.06


In [99]:
# Per distinct value of `col` in train: detections, row count and detection rate (%).
# Rows whose `col` is NaN get no entry, since groupby excludes NaN keys by default.
detections = train.groupby(col)['HasDetections']
table = detections.sum().to_frame()
table['cnt'] = detections.count()
table['rate'] = np.around(table['HasDetections'].div(table['cnt']).mul(100), 2)
table.sort_values(by='HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
Census_IsPortableOperatingSystem,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,4456201.0,8916619,49.98
1,2691.0,4864,55.32


### Census_GenuineStateName

In [100]:
# Feature inspected by the following cells (train/test frequency tables and detection rate).
col = 'Census_GenuineStateName'

In [101]:
print("[train]")
# Count every distinct value (NaN included) once and reuse the result for the share column.
counts = train[col].value_counts(dropna=False)
temp = counts.to_frame()
temp['rate'] = np.around(100 * counts / train.shape[0], 2)
# Stable sort so values whose rounded rates tie keep value_counts' count-descending order.
temp.sort_values('rate', ascending=False, kind='mergesort')

[train]


Unnamed: 0,Census_GenuineStateName,rate
IS_GENUINE,7877597,88.3
INVALID_LICENSE,801692,8.99
OFFLINE,228366,2.56
UNKNOWN,13826,0.15
TAMPERED,2,0.0


In [102]:
print("[test]")
# Count every distinct value (NaN included) once and reuse the result for the share column.
counts = test[col].value_counts(dropna=False)
temp = counts.to_frame()
temp['rate'] = np.around(100 * counts / test.shape[0], 2)
# Stable sort so values whose rounded rates tie keep value_counts' count-descending order.
temp.sort_values('rate', ascending=False, kind='mergesort')

[test]


Unnamed: 0,Census_GenuineStateName,rate
IS_GENUINE,6729893,85.7
INVALID_LICENSE,875327,11.15
OFFLINE,232931,2.97
UNKNOWN,15100,0.19
,1,0.0
TAMPERED,1,0.0


In [103]:
# Per distinct value of `col` in train: detections, row count and detection rate (%).
# Rows whose `col` is NaN get no entry, since groupby excludes NaN keys by default.
detections = train.groupby(col)['HasDetections']
table = detections.sum().to_frame()
table['cnt'] = detections.count()
table['rate'] = np.around(table['HasDetections'].div(table['cnt']).mul(100), 2)
table.sort_values(by='HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
Census_GenuineStateName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
IS_GENUINE,3935480.0,7877597,49.96
INVALID_LICENSE,395483.0,801692,49.33
OFFLINE,123740.0,228366,54.18
UNKNOWN,4188.0,13826,30.29
TAMPERED,1.0,2,50.0


### Census_ActivationChannel

In [104]:
# Feature inspected by the following cells (train/test frequency tables and detection rate).
col = 'Census_ActivationChannel'

In [105]:
print("[train]")
# Count every distinct value (NaN included) once and reuse the result for the share column.
counts = train[col].value_counts(dropna=False)
temp = counts.to_frame()
temp['rate'] = np.around(100 * counts / train.shape[0], 2)
# Stable sort so values whose rounded rates tie keep value_counts' count-descending order.
temp.sort_values('rate', ascending=False, kind='mergesort')

[train]


Unnamed: 0,Census_ActivationChannel,rate
Retail,4727589,52.99
OEM:DM,3413350,38.26
Volume:GVLK,450954,5.05
OEM:NONSLP,317980,3.56
Volume:MAK,8028,0.09
Retail:TB:Eval,3582,0.04


In [106]:
print("[test]")
# Count every distinct value (NaN included) once and reuse the result for the share column.
counts = test[col].value_counts(dropna=False)
temp = counts.to_frame()
temp['rate'] = np.around(100 * counts / test.shape[0], 2)
# Stable sort so values whose rounded rates tie keep value_counts' count-descending order.
temp.sort_values('rate', ascending=False, kind='mergesort')

[test]


Unnamed: 0,Census_ActivationChannel,rate
Retail,4051234,51.59
OEM:DM,2815395,35.85
Volume:GVLK,710466,9.05
OEM:NONSLP,261287,3.33
Volume:MAK,12512,0.16
Retail:TB:Eval,2359,0.03


In [107]:
# Per distinct value of `col` in train: detections, row count and detection rate (%).
# Rows whose `col` is NaN get no entry, since groupby excludes NaN keys by default.
detections = train.groupby(col)['HasDetections']
table = detections.sum().to_frame()
table['cnt'] = detections.count()
table['rate'] = np.around(table['HasDetections'].div(table['cnt']).mul(100), 2)
table.sort_values(by='HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
Census_ActivationChannel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Retail,2325719.0,4727589,49.19
OEM:DM,1695147.0,3413350,49.66
Volume:GVLK,268182.0,450954,59.47
OEM:NONSLP,165240.0,317980,51.97
Volume:MAK,3622.0,8028,45.12
Retail:TB:Eval,982.0,3582,27.41


### Census_IsFlightingInternal

In [108]:
# Feature inspected by the following cells (train/test frequency tables and detection rate).
col = 'Census_IsFlightingInternal'

In [109]:
print("[train]")
# Count every distinct value (NaN included) once and reuse the result for the share column.
counts = train[col].value_counts(dropna=False)
temp = counts.to_frame()
temp['rate'] = np.around(100 * counts / train.shape[0], 2)
# Stable sort so values whose rounded rates tie keep value_counts' count-descending order.
temp.sort_values('rate', ascending=False, kind='mergesort')

[train]


Unnamed: 0,Census_IsFlightingInternal,rate
,7408759,83.04
0.0,1512703,16.96
1.0,21,0.0


In [110]:
print("[test]")
# Count every distinct value (NaN included) once and reuse the result for the share column.
counts = test[col].value_counts(dropna=False)
temp = counts.to_frame()
temp['rate'] = np.around(100 * counts / test.shape[0], 2)
# Stable sort so values whose rounded rates tie keep value_counts' count-descending order.
temp.sort_values('rate', ascending=False, kind='mergesort')

[test]


Unnamed: 0,Census_IsFlightingInternal,rate
,6673962,84.98
0.0,1179278,15.02
1.0,13,0.0


In [111]:
# Per distinct value of `col` in train: detections, row count and detection rate (%).
# Rows whose `col` is NaN get no entry, since groupby excludes NaN keys by default.
detections = train.groupby(col)['HasDetections']
table = detections.sum().to_frame()
table['cnt'] = detections.count()
table['rate'] = np.around(table['HasDetections'].div(table['cnt']).mul(100), 2)
table.sort_values(by='HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
Census_IsFlightingInternal,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,737583.0,1512703,48.76
1.0,13.0,21,61.9


### Census_IsFlightsDisabled

In [112]:
# Feature inspected by the following cells (train/test frequency tables and detection rate).
col = 'Census_IsFlightsDisabled'

In [113]:
print("[train]")
# Count every distinct value (NaN included) once and reuse the result for the share column.
counts = train[col].value_counts(dropna=False)
temp = counts.to_frame()
temp['rate'] = np.around(100 * counts / train.shape[0], 2)
# Stable sort so values whose rounded rates tie keep value_counts' count-descending order.
temp.sort_values('rate', ascending=False, kind='mergesort')

[train]


Unnamed: 0,Census_IsFlightsDisabled,rate
0.0,8760872,98.2
,160523,1.8
1.0,88,0.0


In [114]:
print("[test]")
# Count every distinct value (NaN included) once and reuse the result for the share column.
counts = test[col].value_counts(dropna=False)
temp = counts.to_frame()
temp['rate'] = np.around(100 * counts / test.shape[0], 2)
# Stable sort so values whose rounded rates tie keep value_counts' count-descending order.
temp.sort_values('rate', ascending=False, kind='mergesort')

[test]


Unnamed: 0,Census_IsFlightsDisabled,rate
0.0,7727414,98.4
,125801,1.6
1.0,38,0.0


In [115]:
# Per distinct value of `col` in train: detections, row count and detection rate (%).
# Rows whose `col` is NaN get no entry, since groupby excludes NaN keys by default.
detections = train.groupby(col)['HasDetections']
table = detections.sum().to_frame()
table['cnt'] = detections.count()
table['rate'] = np.around(table['HasDetections'].div(table['cnt']).mul(100), 2)
table.sort_values(by='HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
Census_IsFlightsDisabled,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,4377413.0,8760872,49.97
1.0,5.0,88,5.68


### Census_FlightRing

In [116]:
# Feature inspected by the following cells (train/test frequency tables and detection rate).
col = 'Census_FlightRing'

In [117]:
print("[train]")
# Count every distinct value (NaN included) once and reuse the result for the share column.
counts = train[col].value_counts(dropna=False)
temp = counts.to_frame()
temp['rate'] = np.around(100 * counts / train.shape[0], 2)
# Stable sort so values whose rounded rates tie keep value_counts' count-descending order.
temp.sort_values('rate', ascending=False, kind='mergesort')

[train]


Unnamed: 0,Census_FlightRing,rate
Retail,8355679,93.66
NOT_SET,287803,3.23
Unknown,243438,2.73
WIS,10648,0.12
WIF,10322,0.12
RP,9860,0.11
Disabled,3722,0.04
OSG,7,0.0
Canary,3,0.0
Invalid,1,0.0


In [118]:
print("[test]")
# Count every distinct value (NaN included) once and reuse the result for the share column.
counts = test[col].value_counts(dropna=False)
temp = counts.to_frame()
temp['rate'] = np.around(100 * counts / test.shape[0], 2)
# Stable sort so values whose rounded rates tie keep value_counts' count-descending order.
temp.sort_values('rate', ascending=False, kind='mergesort')

[test]


Unnamed: 0,Census_FlightRing,rate
Retail,7340769,93.47
NOT_SET,270817,3.45
Unknown,201243,2.56
RP,16909,0.22
WIF,10113,0.13
WIS,10015,0.13
Disabled,3379,0.04
OSG,4,0.0
Canary,2,0.0
Invalid,1,0.0


In [119]:
# Per distinct value of `col` in train: detections, row count and detection rate (%).
# Rows whose `col` is NaN get no entry, since groupby excludes NaN keys by default.
detections = train.groupby(col)['HasDetections']
table = detections.sum().to_frame()
table['cnt'] = detections.count()
table['rate'] = np.around(table['HasDetections'].div(table['cnt']).mul(100), 2)
table.sort_values(by='HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
Census_FlightRing,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Retail,4185842.0,8355679,50.1
NOT_SET,143854.0,287803,49.98
Unknown,113261.0,243438,46.53
RP,4876.0,9860,49.45
WIF,4635.0,10322,44.9
WIS,4547.0,10648,42.7
Disabled,1875.0,3722,50.38
Canary,1.0,3,33.33
Invalid,1.0,1,100.0
OSG,0.0,7,0.0


### Census_ThresholdOptIn

In [120]:
# Feature inspected by the following cells (train/test frequency tables and detection rate).
col = 'Census_ThresholdOptIn'

In [121]:
print("[train]")
# Count every distinct value (NaN included) once and reuse the result for the share column.
counts = train[col].value_counts(dropna=False)
temp = counts.to_frame()
temp['rate'] = np.around(100 * counts / train.shape[0], 2)
# Stable sort so values whose rounded rates tie keep value_counts' count-descending order.
temp.sort_values('rate', ascending=False, kind='mergesort')

[train]


Unnamed: 0,Census_ThresholdOptIn,rate
,5667325,63.52
0.0,3253342,36.47
1.0,816,0.01


In [122]:
print("[test]")
# Count every distinct value (NaN included) once and reuse the result for the share column.
counts = test[col].value_counts(dropna=False)
temp = counts.to_frame()
temp['rate'] = np.around(100 * counts / test.shape[0], 2)
# Stable sort so values whose rounded rates tie keep value_counts' count-descending order.
temp.sort_values('rate', ascending=False, kind='mergesort')

[test]


Unnamed: 0,Census_ThresholdOptIn,rate
,5529515,70.41
0.0,2323086,29.58
1.0,652,0.01


In [123]:
# Per distinct value of `col` in train: detections, row count and detection rate (%).
# Rows whose `col` is NaN get no entry, since groupby excludes NaN keys by default.
detections = train.groupby(col)['HasDetections']
table = detections.sum().to_frame()
table['cnt'] = detections.count()
table['rate'] = np.around(table['HasDetections'].div(table['cnt']).mul(100), 2)
table.sort_values(by='HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
Census_ThresholdOptIn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,1600755.0,3253342,49.2
1.0,382.0,816,46.81


### Census_FirmwareManufacturerIdentifier

In [124]:
# Feature inspected by the following cells (train/test frequency tables and detection rate).
col = 'Census_FirmwareManufacturerIdentifier'

In [125]:
print("[train]")
# Count every distinct value (NaN included) once and reuse the result for the share column.
counts = train[col].value_counts(dropna=False)
temp = counts.to_frame()
temp['rate'] = np.around(100 * counts / train.shape[0], 2)
# Stable sort so values whose rounded rates tie keep value_counts' count-descending order.
temp.sort_values('rate', ascending=False, kind='mergesort')

[train]


Unnamed: 0,Census_FirmwareManufacturerIdentifier,rate
142.0,2699078,30.25
628.0,1229140,13.78
554.0,1175137,13.17
355.0,941793,10.56
556.0,800536,8.97
500.0,385327,4.32
93.0,192481,2.16
,183257,2.05
807.0,157172,1.76
513.0,142388,1.60


In [126]:
print("[test]")
# Count every distinct value (NaN included) once and reuse the result for the share column.
counts = test[col].value_counts(dropna=False)
temp = counts.to_frame()
temp['rate'] = np.around(100 * counts / test.shape[0], 2)
# Stable sort so values whose rounded rates tie keep value_counts' count-descending order.
temp.sort_values('rate', ascending=False, kind='mergesort')

[test]


Unnamed: 0,Census_FirmwareManufacturerIdentifier,rate
142.0,2442080,31.10
628.0,1071331,13.64
554.0,982632,12.51
355.0,847687,10.79
556.0,650852,8.29
500.0,345645,4.40
,198634,2.53
93.0,171689,2.19
807.0,128554,1.64
513.0,113513,1.45


In [127]:
# Per distinct value of `col` in train: detections, row count and detection rate (%).
# Rows whose `col` is NaN get no entry, since groupby excludes NaN keys by default.
detections = train.groupby(col)['HasDetections']
table = detections.sum().to_frame()
table['cnt'] = detections.count()
table['rate'] = np.around(table['HasDetections'].div(table['cnt']).mul(100), 2)
table.sort_values(by='HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
Census_FirmwareManufacturerIdentifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
142.0,1400255.0,2699078,51.88
628.0,616527.0,1229140,50.16
554.0,599526.0,1175137,51.02
355.0,482124.0,941793,51.19
556.0,404023.0,800536,50.47
500.0,180328.0,385327,46.80
93.0,101111.0,192481,52.53
807.0,74714.0,157172,47.54
513.0,57228.0,142388,40.19
127.0,55494.0,123667,44.87


### Census_FirmwareVersionIdentifier

In [128]:
# Feature inspected by the following cells (train/test frequency tables and detection rate).
col = 'Census_FirmwareVersionIdentifier'

In [129]:
print("[train]")
# Count every distinct value (NaN included) once and reuse the result for the share column.
counts = train[col].value_counts(dropna=False)
temp = counts.to_frame()
temp['rate'] = np.around(100 * counts / train.shape[0], 2)
# Stable sort so values whose rounded rates tie keep value_counts' count-descending order.
temp.sort_values('rate', ascending=False, kind='mergesort')

[train]


Unnamed: 0,Census_FirmwareVersionIdentifier,rate
,160133,1.79
33105.0,89611,1.00
33111.0,61583,0.69
33054.0,56626,0.63
33108.0,55040,0.62
11778.0,53785,0.60
63175.0,52504,0.59
33115.0,50972,0.57
63155.0,49074,0.55
33060.0,48587,0.54


In [130]:
print("[test]")
# Count every distinct value (NaN included) once and reuse the result for the share column.
counts = test[col].value_counts(dropna=False)
temp = counts.to_frame()
temp['rate'] = np.around(100 * counts / test.shape[0], 2)
# Stable sort so values whose rounded rates tie keep value_counts' count-descending order.
temp.sort_values('rate', ascending=False, kind='mergesort')

[test]


Unnamed: 0,Census_FirmwareVersionIdentifier,rate
,177624,2.26
33105.0,72448,0.92
33054.0,52437,0.67
11778.0,51806,0.66
33111.0,46768,0.60
33108.0,43862,0.56
63175.0,41789,0.53
33000.0,41648,0.53
19970.0,40291,0.51
33060.0,40257,0.51


In [131]:
# Per distinct value of `col` in train: detections, row count and detection rate (%).
# Rows whose `col` is NaN get no entry, since groupby excludes NaN keys by default.
detections = train.groupby(col)['HasDetections']
table = detections.sum().to_frame()
table['cnt'] = detections.count()
table['rate'] = np.around(table['HasDetections'].div(table['cnt']).mul(100), 2)
table.sort_values(by='HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
Census_FirmwareVersionIdentifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
33105.0,45370.0,89611,50.63
33111.0,31624.0,61583,51.35
33054.0,28930.0,56626,51.09
33108.0,28068.0,55040,51.00
11778.0,28027.0,53785,52.11
33115.0,26119.0,50972,51.24
63175.0,25437.0,52504,48.45
33060.0,23869.0,48587,49.13
19970.0,23713.0,46679,50.80
63155.0,23689.0,49074,48.27


### Census_IsSecureBootEnabled

In [132]:
# Feature inspected by the following cells (train/test frequency tables and detection rate).
col = 'Census_IsSecureBootEnabled'

In [133]:
print("[train]")
# Count every distinct value (NaN included) once and reuse the result for the share column.
counts = train[col].value_counts(dropna=False)
temp = counts.to_frame()
temp['rate'] = np.around(100 * counts / train.shape[0], 2)
# Stable sort so values whose rounded rates tie keep value_counts' count-descending order.
temp.sort_values('rate', ascending=False, kind='mergesort')

[train]


Unnamed: 0,Census_IsSecureBootEnabled,rate
0,4585438,51.4
1,4336045,48.6


In [134]:
print("[test]")
# Count every distinct value (NaN included) once and reuse the result for the share column.
counts = test[col].value_counts(dropna=False)
temp = counts.to_frame()
temp['rate'] = np.around(100 * counts / test.shape[0], 2)
# Stable sort so values whose rounded rates tie keep value_counts' count-descending order.
temp.sort_values('rate', ascending=False, kind='mergesort')

[test]


Unnamed: 0,Census_IsSecureBootEnabled,rate
0,4308612,54.86
1,3544641,45.14


In [135]:
# Per distinct value of `col` in train: detections, row count and detection rate (%).
# Rows whose `col` is NaN get no entry, since groupby excludes NaN keys by default.
detections = train.groupby(col)['HasDetections']
table = detections.sum().to_frame()
table['cnt'] = detections.count()
table['rate'] = np.around(table['HasDetections'].div(table['cnt']).mul(100), 2)
table.sort_values(by='HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
Census_IsSecureBootEnabled,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2295583.0,4585438,50.06
1,2163309.0,4336045,49.89


### Census_IsWIMBootEnabled

In [136]:
# Feature inspected by the following cells (train/test frequency tables and detection rate).
col = 'Census_IsWIMBootEnabled'

In [137]:
print("[train]")
# Count every distinct value (NaN included) once and reuse the result for the share column.
counts = train[col].value_counts(dropna=False)
temp = counts.to_frame()
temp['rate'] = np.around(100 * counts / train.shape[0], 2)
# Stable sort so values whose rounded rates tie keep value_counts' count-descending order.
temp.sort_values('rate', ascending=False, kind='mergesort')

[train]


Unnamed: 0,Census_IsWIMBootEnabled,rate
,5659703,63.44
0.0,3261779,36.56
1.0,1,0.0


In [138]:
print("[test]")
# Count every distinct value (NaN included) once and reuse the result for the share column.
counts = test[col].value_counts(dropna=False)
temp = counts.to_frame()
temp['rate'] = np.around(100 * counts / test.shape[0], 2)
# Stable sort so values whose rounded rates tie keep value_counts' count-descending order.
temp.sort_values('rate', ascending=False, kind='mergesort')

[test]


Unnamed: 0,Census_IsWIMBootEnabled,rate
,5522707,70.32
0.0,2330545,29.68
1.0,1,0.0


In [139]:
# Per distinct value of `col` in train: detections, row count and detection rate (%).
# Rows whose `col` is NaN get no entry, since groupby excludes NaN keys by default.
detections = train.groupby(col)['HasDetections']
table = detections.sum().to_frame()
table['cnt'] = detections.count()
table['rate'] = np.around(table['HasDetections'].div(table['cnt']).mul(100), 2)
table.sort_values(by='HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
Census_IsWIMBootEnabled,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,1604572.0,3261779,49.19
1.0,0.0,1,0.0


### Census_IsVirtualDevice

In [140]:
# Feature inspected by the following cells (train/test frequency tables and detection rate).
col = 'Census_IsVirtualDevice'

In [141]:
print("[train]")
# Count every distinct value (NaN included) once and reuse the result for the share column.
counts = train[col].value_counts(dropna=False)
temp = counts.to_frame()
temp['rate'] = np.around(100 * counts / train.shape[0], 2)
# Stable sort so values whose rounded rates tie keep value_counts' count-descending order.
temp.sort_values('rate', ascending=False, kind='mergesort')

[train]


Unnamed: 0,Census_IsVirtualDevice,rate
0.0,8842840,99.12
1.0,62690,0.7
,15953,0.18


In [142]:
print("[test]")
# Count every distinct value (NaN included) once and reuse the result for the share column.
counts = test[col].value_counts(dropna=False)
temp = counts.to_frame()
temp['rate'] = np.around(100 * counts / test.shape[0], 2)
# Stable sort so values whose rounded rates tie keep value_counts' count-descending order.
temp.sort_values('rate', ascending=False, kind='mergesort')

[test]


Unnamed: 0,Census_IsVirtualDevice,rate
0.0,7771328,98.96
1.0,64634,0.82
,17291,0.22


In [143]:
# Per distinct value of `col` in train: detections, row count and detection rate (%).
# Rows whose `col` is NaN get no entry, since groupby excludes NaN keys by default.
detections = train.groupby(col)['HasDetections']
table = detections.sum().to_frame()
table['cnt'] = detections.count()
table['rate'] = np.around(table['HasDetections'].div(table['cnt']).mul(100), 2)
table.sort_values(by='HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
Census_IsVirtualDevice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,4438599.0,8842840,50.19
1.0,12172.0,62690,19.42


### Census_IsTouchEnabled

In [144]:
# Feature inspected by the following cells (train/test frequency tables and detection rate).
col = 'Census_IsTouchEnabled'

In [145]:
print("[train]")
# Count every distinct value (NaN included) once and reuse the result for the share column.
counts = train[col].value_counts(dropna=False)
temp = counts.to_frame()
temp['rate'] = np.around(100 * counts / train.shape[0], 2)
# Stable sort so values whose rounded rates tie keep value_counts' count-descending order.
temp.sort_values('rate', ascending=False, kind='mergesort')

[train]


Unnamed: 0,Census_IsTouchEnabled,rate
0,7801452,87.45
1,1120031,12.55


In [146]:
print("[test]")
# Count every distinct value (NaN included) once and reuse the result for the share column.
counts = test[col].value_counts(dropna=False)
temp = counts.to_frame()
temp['rate'] = np.around(100 * counts / test.shape[0], 2)
# Stable sort so values whose rounded rates tie keep value_counts' count-descending order.
temp.sort_values('rate', ascending=False, kind='mergesort')

[test]


Unnamed: 0,Census_IsTouchEnabled,rate
0,6924286,88.17
1,928967,11.83


In [147]:
# Per distinct value of `col` in train: detections, row count and detection rate (%).
# Rows whose `col` is NaN get no entry, since groupby excludes NaN keys by default.
detections = train.groupby(col)['HasDetections']
table = detections.sum().to_frame()
table['cnt'] = detections.count()
table['rate'] = np.around(table['HasDetections'].div(table['cnt']).mul(100), 2)
table.sort_values(by='HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
Census_IsTouchEnabled,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,3958835.0,7801452,50.74
1,500057.0,1120031,44.65


### Census_IsPenCapable

In [148]:
# Feature inspected by the following cells (train/test frequency tables and detection rate).
col = 'Census_IsPenCapable'

In [149]:
print("[train]")
# Count every distinct value (NaN included) once and reuse the result for the share column.
counts = train[col].value_counts(dropna=False)
temp = counts.to_frame()
temp['rate'] = np.around(100 * counts / train.shape[0], 2)
# Stable sort so values whose rounded rates tie keep value_counts' count-descending order.
temp.sort_values('rate', ascending=False, kind='mergesort')

[train]


Unnamed: 0,Census_IsPenCapable,rate
0,8581834,96.19
1,339649,3.81


In [150]:
print("[test]")
# Count every distinct value (NaN included) once and reuse the result for the share column.
counts = test[col].value_counts(dropna=False)
temp = counts.to_frame()
temp['rate'] = np.around(100 * counts / test.shape[0], 2)
# Stable sort so values whose rounded rates tie keep value_counts' count-descending order.
temp.sort_values('rate', ascending=False, kind='mergesort')

[test]


Unnamed: 0,Census_IsPenCapable,rate
0,7555266,96.21
1,297987,3.79


In [151]:
# Per distinct value of `col` in train: detections, row count and detection rate (%).
# Rows whose `col` is NaN get no entry, since groupby excludes NaN keys by default.
detections = train.groupby(col)['HasDetections']
table = detections.sum().to_frame()
table['cnt'] = detections.count()
table['rate'] = np.around(table['HasDetections'].div(table['cnt']).mul(100), 2)
table.sort_values(by='HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
Census_IsPenCapable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,4303801.0,8581834,50.15
1,155091.0,339649,45.66


### Census_IsAlwaysOnAlwaysConnectedCapable

In [152]:
# Feature inspected by the following cells (train/test frequency tables and detection rate).
col = 'Census_IsAlwaysOnAlwaysConnectedCapable'

In [153]:
print("[train]")
# Count every distinct value (NaN included) once and reuse the result for the share column.
counts = train[col].value_counts(dropna=False)
temp = counts.to_frame()
temp['rate'] = np.around(100 * counts / train.shape[0], 2)
# Stable sort so values whose rounded rates tie keep value_counts' count-descending order.
temp.sort_values('rate', ascending=False, kind='mergesort')

[train]


Unnamed: 0,Census_IsAlwaysOnAlwaysConnectedCapable,rate
0.0,8341972,93.5
1.0,508168,5.7
,71343,0.8


In [154]:
print("[test]")
# Count every distinct value (NaN included) once and reuse the result for the share column.
counts = test[col].value_counts(dropna=False)
temp = counts.to_frame()
temp['rate'] = np.around(100 * counts / test.shape[0], 2)
# Stable sort so values whose rounded rates tie keep value_counts' count-descending order.
temp.sort_values('rate', ascending=False, kind='mergesort')

[test]


Unnamed: 0,Census_IsAlwaysOnAlwaysConnectedCapable,rate
0.0,7340131,93.47
1.0,422970,5.39
,90152,1.15


In [155]:
# Per distinct value of `col` in train: detections, row count and detection rate (%).
# Rows whose `col` is NaN get no entry, since groupby excludes NaN keys by default.
detections = train.groupby(col)['HasDetections']
table = detections.sum().to_frame()
table['cnt'] = detections.count()
table['rate'] = np.around(table['HasDetections'].div(table['cnt']).mul(100), 2)
table.sort_values(by='HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
Census_IsAlwaysOnAlwaysConnectedCapable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,4232858.0,8341972,50.74
1.0,189287.0,508168,37.25


### Wdft_IsGamer

In [156]:
# Feature inspected by the following cells (train/test frequency tables and detection rate).
col = 'Wdft_IsGamer'

In [157]:
print("[train]")
# Count every distinct value (NaN included) once and reuse the result for the share column.
counts = train[col].value_counts(dropna=False)
temp = counts.to_frame()
temp['rate'] = np.around(100 * counts / train.shape[0], 2)
# Stable sort so values whose rounded rates tie keep value_counts' count-descending order.
temp.sort_values('rate', ascending=False, kind='mergesort')

[train]


Unnamed: 0,Wdft_IsGamer,rate
0.0,6174143,69.21
1.0,2443889,27.39
,303451,3.4


In [158]:
print("[test]")
# Count every distinct value (NaN included) once and reuse the result for the share column.
counts = test[col].value_counts(dropna=False)
temp = counts.to_frame()
temp['rate'] = np.around(100 * counts / test.shape[0], 2)
# Stable sort so values whose rounded rates tie keep value_counts' count-descending order.
temp.sort_values('rate', ascending=False, kind='mergesort')

[test]


Unnamed: 0,Wdft_IsGamer,rate
0.0,5302612,67.52
1.0,2247784,28.62
,302857,3.86


In [159]:
# Per distinct value of `col` in train: detections, row count and detection rate (%).
# Rows whose `col` is NaN get no entry, since groupby excludes NaN keys by default.
detections = train.groupby(col)['HasDetections']
table = detections.sum().to_frame()
table['cnt'] = detections.count()
table['rate'] = np.around(table['HasDetections'].div(table['cnt']).mul(100), 2)
table.sort_values(by='HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
Wdft_IsGamer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,2976639.0,6174143,48.21
1.0,1324330.0,2443889,54.19


### Wdft_RegionIdentifier

In [160]:
# Feature inspected by the following cells (train/test frequency tables and detection rate).
col = 'Wdft_RegionIdentifier'

In [161]:
print("[train]")
# Count every distinct value (NaN included) once and reuse the result for the share column.
counts = train[col].value_counts(dropna=False)
temp = counts.to_frame()
temp['rate'] = np.around(100 * counts / train.shape[0], 2)
# Stable sort so values whose rounded rates tie keep value_counts' count-descending order.
temp.sort_values('rate', ascending=False, kind='mergesort')

[train]


Unnamed: 0,Wdft_RegionIdentifier,rate
10.0,1800105,20.18
11.0,1347828,15.11
3.0,1295892,14.53
1.0,1232258,13.81
15.0,1017591,11.41
7.0,597297,6.7
,303451,3.4
8.0,276029,3.09
13.0,225130,2.52
5.0,205372,2.3


In [162]:
print("[test]")
# Count every distinct value (NaN included) once and reuse the result for the share column.
counts = test[col].value_counts(dropna=False)
temp = counts.to_frame()
temp['rate'] = np.around(100 * counts / test.shape[0], 2)
# Stable sort so values whose rounded rates tie keep value_counts' count-descending order.
temp.sort_values('rate', ascending=False, kind='mergesort')

[test]


Unnamed: 0,Wdft_RegionIdentifier,rate
10.0,1451816,18.49
11.0,1264670,16.1
3.0,1253395,15.96
1.0,1109642,14.13
15.0,787767,10.03
7.0,601077,7.65
,302857,3.86
8.0,228934,2.92
13.0,199668,2.54
5.0,163548,2.08


In [163]:
# Per distinct value of `col` in train: detections, row count and detection rate (%).
# Rows whose `col` is NaN get no entry, since groupby excludes NaN keys by default.
detections = train.groupby(col)['HasDetections']
table = detections.sum().to_frame()
table['cnt'] = detections.count()
table['rate'] = np.around(table['HasDetections'].div(table['cnt']).mul(100), 2)
table.sort_values(by='HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
Wdft_RegionIdentifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10.0,916641.0,1800105,50.92
11.0,678230.0,1347828,50.32
1.0,659826.0,1232258,53.55
3.0,638119.0,1295892,49.24
15.0,474349.0,1017591,46.61
7.0,299335.0,597297,50.11
8.0,135824.0,276029,49.21
13.0,107481.0,225130,47.74
5.0,99113.0,205372,48.26
12.0,77151.0,163711,47.13
