# EDA - All
* 이 커널에서는 모든 피처를 하나하나씩 살펴보며 어떤 데이터 양상을 띄고 있는 지 검토할 것입니다.

## 라이브러리

In [1]:
import numpy as np
import pandas as pd
import warnings
import gc
warnings.filterwarnings("ignore")

In [2]:
pd.set_option('max_rows', 150)
pd.set_option('max_colwidth', 500)
pd.set_option('max_columns', 500)

## 데이터 로드

In [3]:
dtypes = {
        'MachineIdentifier':                                    'object',
        'ProductName':                                          'category',
        'EngineVersion':                                        'category',
        'AppVersion':                                           'category',
        'AvSigVersion':                                         'category',
        'IsBeta':                                               'int8',
        'RtpStateBitfield':                                     'float16',
        'IsSxsPassiveMode':                                     'int8',
        'DefaultBrowsersIdentifier':                            'float16',
        'AVProductStatesIdentifier':                            'float32',
        'AVProductsInstalled':                                  'float16',
        'AVProductsEnabled':                                    'float16',
        'HasTpm':                                               'int8',
        'CountryIdentifier':                                    'int16',
        'CityIdentifier':                                       'float32',
        'OrganizationIdentifier':                               'float16',
        'GeoNameIdentifier':                                    'float16',
        'LocaleEnglishNameIdentifier':                          'int8',
        'Platform':                                             'category',
        'Processor':                                            'category',
        'OsVer':                                                'category',
        'OsBuild':                                              'int16',
        'OsSuite':                                              'int16',
        'OsPlatformSubRelease':                                 'category',
        'OsBuildLab':                                           'category',
        'SkuEdition':                                           'category',
        'IsProtected':                                          'float16',
        'AutoSampleOptIn':                                      'int8',
        'PuaMode':                                              'category',
        'SMode':                                                'float16',
        'IeVerIdentifier':                                      'float16',
        'SmartScreen':                                          'category',
        'Firewall':                                             'float16',
        'UacLuaenable':                                         'float32',
        'Census_MDC2FormFactor':                                'category',
        'Census_DeviceFamily':                                  'category',
        'Census_OEMNameIdentifier':                             'float16',
        'Census_OEMModelIdentifier':                            'float32',
        'Census_ProcessorCoreCount':                            'float16',
        'Census_ProcessorManufacturerIdentifier':               'float16',
        'Census_ProcessorModelIdentifier':                      'float16',
        'Census_ProcessorClass':                                'category',
        'Census_PrimaryDiskTotalCapacity':                      'float32',
        'Census_PrimaryDiskTypeName':                           'category',
        'Census_SystemVolumeTotalCapacity':                     'float32',
        'Census_HasOpticalDiskDrive':                           'int8',
        'Census_TotalPhysicalRAM':                              'float32',
        'Census_ChassisTypeName':                               'category',
        'Census_InternalPrimaryDiagonalDisplaySizeInInches':    'float16',
        'Census_InternalPrimaryDisplayResolutionHorizontal':    'float16',
        'Census_InternalPrimaryDisplayResolutionVertical':      'float16',
        'Census_PowerPlatformRoleName':                         'category',
        'Census_InternalBatteryType':                           'category',
        'Census_InternalBatteryNumberOfCharges':                'float32',
        'Census_OSVersion':                                     'category',
        'Census_OSArchitecture':                                'category',
        'Census_OSBranch':                                      'category',
        'Census_OSBuildNumber':                                 'int16',
        'Census_OSBuildRevision':                               'int32',
        'Census_OSEdition':                                     'category',
        'Census_OSSkuName':                                     'category',
        'Census_OSInstallTypeName':                             'category',
        'Census_OSInstallLanguageIdentifier':                   'float16',
        'Census_OSUILocaleIdentifier':                          'int16',
        'Census_OSWUAutoUpdateOptionsName':                     'category',
        'Census_IsPortableOperatingSystem':                     'int8',
        'Census_GenuineStateName':                              'category',
        'Census_ActivationChannel':                             'category',
        'Census_IsFlightingInternal':                           'float16',
        'Census_IsFlightsDisabled':                             'float16',
        'Census_FlightRing':                                    'category',
        'Census_ThresholdOptIn':                                'float16',
        'Census_FirmwareManufacturerIdentifier':                'float16',
        'Census_FirmwareVersionIdentifier':                     'float32',
        'Census_IsSecureBootEnabled':                           'int8',
        'Census_IsWIMBootEnabled':                              'float16',
        'Census_IsVirtualDevice':                               'float16',
        'Census_IsTouchEnabled':                                'int8',
        'Census_IsPenCapable':                                  'int8',
        'Census_IsAlwaysOnAlwaysConnectedCapable':              'float16',
        'Wdft_IsGamer':                                         'float16',
        'Wdft_RegionIdentifier':                                'float16',
        'HasDetections':                                        'int8'
        }

In [4]:
train = pd.read_csv('./data/train.csv', dtype=dtypes)
test = pd.read_csv('./data/test.csv', dtype=dtypes)

## 컬럼 분석

### Firewall

In [5]:
col = 'Firewall'

In [6]:
print("[train]")
temp = train[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * train[col].value_counts(dropna=False) / train.shape[0], 2)
temp.sort_values('rate', ascending=False)

[train]


Unnamed: 0,Firewall,rate
1.0,8641014,96.86
0.0,189119,2.12
,91350,1.02


In [7]:
print("[test]")
temp = test[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * test[col].value_counts(dropna=False) / test.shape[0], 2)
temp.sort_values('rate', ascending=False)

[test]


Unnamed: 0,Firewall,rate
1.0,7629355,97.15
0.0,165426,2.11
,58472,0.74


In [8]:
table = train.groupby(col)['HasDetections'].sum().to_frame()
table['cnt'] = train.groupby(col)['HasDetections'].count()
table['rate'] = np.around((table.HasDetections / table.cnt) * 100, 2)
table.sort_values('HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
Firewall,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,4321114.0,8641014,50.01
0.0,92590.0,189119,48.96


### UacLuaenable

In [203]:
col = 'UacLuaenable'

In [204]:
print("[train]")
temp = train[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * train[col].value_counts(dropna=False) / train.shape[0], 2)
temp.sort_values('rate', ascending=False)

[train]


Unnamed: 0,UacLuaenable,rate
1.0,8856517,99.27
0.0,53851,0.6
,10838,0.12
48.0,206,0.0
2.0,30,0.0
49.0,17,0.0
6357062.0,13,0.0
3.0,6,0.0
5.0,2,0.0
16777216.0,1,0.0


In [205]:
print("[test]")
temp = test[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * test[col].value_counts(dropna=False) / test.shape[0], 2)
temp.sort_values('rate', ascending=False)

[test]


Unnamed: 0,UacLuaenable,rate
1.0,7785557,99.14
0.0,59605,0.76
,7865,0.1
48.0,167,0.0
2.0,20,0.0
49.0,15,0.0
6357062.0,12,0.0
3.0,9,0.0
808482880.0,1,0.0
537591872.0,1,0.0


In [206]:
table = train.groupby(col)['HasDetections'].sum().to_frame()
table['cnt'] = train.groupby(col)['HasDetections'].count()
table['rate'] = np.around((table.HasDetections / table.cnt) * 100, 2)
table.sort_values('HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
UacLuaenable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,4425785.0,8856517,49.97
0.0,27028.0,53851,50.19
48.0,110.0,206,53.4
2.0,10.0,30,33.33
6357062.0,6.0,13,46.15
3.0,4.0,6,66.67
49.0,4.0,17,23.53
5.0,2.0,2,100.0
255.0,1.0,1,100.0
7798884.0,0.0,1,0.0


### Census_MDC2FormFactor

In [207]:
col = 'Census_MDC2FormFactor'

In [208]:
print("[train]")
temp = train[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * train[col].value_counts(dropna=False) / train.shape[0], 2)
temp.sort_values('rate', ascending=False)

[train]


Unnamed: 0,Census_MDC2FormFactor,rate
Notebook,5723319,64.15
Desktop,1951086,21.87
Convertible,405378,4.54
Detachable,298233,3.34
AllInOne,292077,3.27
PCOther,139955,1.57
LargeTablet,67121,0.75
SmallTablet,31393,0.35
SmallServer,8630,0.1
MediumServer,3385,0.04


In [209]:
print("[test]")
temp = test[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * test[col].value_counts(dropna=False) / test.shape[0], 2)
temp.sort_values('rate', ascending=False)

[test]


Unnamed: 0,Census_MDC2FormFactor,rate
Notebook,4929536,62.77
Desktop,1862386,23.71
Convertible,338763,4.31
AllInOne,255555,3.25
Detachable,245938,3.13
PCOther,135453,1.72
LargeTablet,54204,0.69
SmallTablet,21098,0.27
SmallServer,6933,0.09
MediumServer,2784,0.04


In [210]:
table = train.groupby(col)['HasDetections'].sum().to_frame()
table['cnt'] = train.groupby(col)['HasDetections'].count()
table['rate'] = np.around((table.HasDetections / table.cnt) * 100, 2)
table.sort_values('HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
Census_MDC2FormFactor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Notebook,2876153.0,5723319,50.25
Desktop,1019851.0,1951086,52.27
Convertible,201394.0,405378,49.68
AllInOne,150581.0,292077,51.56
Detachable,111996.0,298233,37.55
PCOther,67932.0,139955,48.54
LargeTablet,20886.0,67121,31.12
SmallTablet,5875.0,31393,18.71
SmallServer,2996.0,8630,34.72
MediumServer,998.0,3385,29.48


### Census_DeviceFamily

In [211]:
col = 'Census_DeviceFamily'

In [212]:
print("[train]")
temp = train[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * train[col].value_counts(dropna=False) / train.shape[0], 2)
temp.sort_values('rate', ascending=False)

[train]


Unnamed: 0,Census_DeviceFamily,rate
Windows.Desktop,8907053,99.84
Windows.Server,14410,0.16
Windows,20,0.0


In [213]:
print("[test]")
temp = test[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * test[col].value_counts(dropna=False) / test.shape[0], 2)
temp.sort_values('rate', ascending=False)

[test]


Unnamed: 0,Census_DeviceFamily,rate
Windows.Desktop,7842175,99.86
Windows.Server,11053,0.14
Windows,25,0.0


In [214]:
table = train.groupby(col)['HasDetections'].sum().to_frame()
table['cnt'] = train.groupby(col)['HasDetections'].count()
table['rate'] = np.around((table.HasDetections / table.cnt) * 100, 2)
table.sort_values('HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
Census_DeviceFamily,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Windows.Desktop,4453851.0,8907053,50.0
Windows.Server,5034.0,14410,34.93
Windows,7.0,20,35.0


### Census_OEMNameIdentifier

In [215]:
col = 'Census_OEMNameIdentifier'

In [216]:
print("[train]")
temp = train[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * train[col].value_counts(dropna=False) / train.shape[0], 2)
temp.sort_values('rate', ascending=False)

[train]


Unnamed: 0,Census_OEMNameIdentifier,rate
2668.0,1287276,14.43
2102.0,1038567,11.64
1443.0,949531,10.64
2206.0,924349,10.36
585.0,895452,10.04
525.0,842364,9.44
4588.0,310702,3.48
4728.0,304104,3.41
1980.0,285600,3.20
4144.0,203228,2.28


In [217]:
print("[test]")
temp = test[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * test[col].value_counts(dropna=False) / test.shape[0], 2)
temp.sort_values('rate', ascending=False)

[test]


Unnamed: 0,Census_OEMNameIdentifier,rate
2668.0,1122056,14.29
2102.0,910343,11.59
1443.0,856664,10.91
2206.0,778399,9.91
525.0,737427,9.39
585.0,734379,9.35
4588.0,307504,3.92
1980.0,288160,3.67
4728.0,245689,3.13
4144.0,166328,2.12


In [218]:
table = train.groupby(col)['HasDetections'].sum().to_frame()
table['cnt'] = train.groupby(col)['HasDetections'].count()
table['rate'] = np.around((table.HasDetections / table.cnt) * 100, 2)
table.sort_values('HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
Census_OEMNameIdentifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2668.0,645476.0,1287276,50.14
2102.0,536642.0,1038567,51.67
1443.0,485746.0,949531,51.16
2206.0,451165.0,924349,48.81
585.0,441907.0,895452,49.35
525.0,438743.0,842364,52.08
4588.0,171858.0,310702,55.31
1980.0,159172.0,285600,55.73
4728.0,144612.0,304104,47.55
4144.0,95733.0,203228,47.11


### Census_OEMModelIdentifier

In [219]:
col = 'Census_OEMModelIdentifier'

In [220]:
print("[train]")
temp = train[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * train[col].value_counts(dropna=False) / train.shape[0], 2)
temp.sort_values('rate', ascending=False)

[train]


Unnamed: 0,Census_OEMModelIdentifier,rate
313586.0,304782,3.42
242491.0,263382,2.95
317701.0,139035,1.56
317708.0,115257,1.29
,102233,1.15
228975.0,79878,0.90
188345.0,70950,0.80
245824.0,69529,0.78
241876.0,68263,0.77
244755.0,49630,0.56


In [221]:
print("[test]")
temp = test[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * test[col].value_counts(dropna=False) / test.shape[0], 2)
temp.sort_values('rate', ascending=False)

[test]


Unnamed: 0,Census_OEMModelIdentifier,rate
313586.0,301547,3.84
242491.0,194558,2.48
317701.0,137959,1.76
317708.0,111161,1.42
,95935,1.22
188345.0,66218,0.84
228975.0,62349,0.79
241876.0,59926,0.76
245824.0,50606,0.64
244755.0,39115,0.50


In [222]:
table = train.groupby(col)['HasDetections'].sum().to_frame()
table['cnt'] = train.groupby(col)['HasDetections'].count()
table['rate'] = np.around((table.HasDetections / table.cnt) * 100, 2)
table.sort_values('HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
Census_OEMModelIdentifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
313586.0,168849.0,304782,55.40
242491.0,136780.0,263382,51.93
317701.0,74769.0,139035,53.78
317708.0,61099.0,115257,53.01
188345.0,40943.0,70950,57.71
228975.0,39709.0,79878,49.71
245824.0,39300.0,69529,56.52
241876.0,35503.0,68263,52.01
244755.0,26024.0,49630,52.44
248045.0,21096.0,43757,48.21


### Census_ProcessorCoreCount

In [223]:
col = 'Census_ProcessorCoreCount'

In [224]:
print("[train]")
temp = train[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * train[col].value_counts(dropna=False) / train.shape[0], 2)
temp.sort_values('rate', ascending=False)

[train]


Unnamed: 0,Census_ProcessorCoreCount,rate
4.0,5430193,60.87
2.0,2311969,25.91
8.0,865004,9.7
12.0,92702,1.04
1.0,70390,0.79
6.0,69910,0.78
,41306,0.46
16.0,18551,0.21
3.0,13580,0.15
20.0,1781,0.02


In [225]:
print("[test]")
temp = test[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * test[col].value_counts(dropna=False) / test.shape[0], 2)
temp.sort_values('rate', ascending=False)

[test]


Unnamed: 0,Census_ProcessorCoreCount,rate
4.0,4646102,59.16
2.0,1995987,25.42
8.0,843530,10.74
12.0,120673,1.54
6.0,80310,1.02
1.0,62056,0.79
,61277,0.78
16.0,22954,0.29
3.0,12313,0.16
24.0,1979,0.03


In [226]:
table = train.groupby(col)['HasDetections'].sum().to_frame()
table['cnt'] = train.groupby(col)['HasDetections'].count()
table['rate'] = np.around((table.HasDetections / table.cnt) * 100, 2)
table.sort_values('HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
Census_ProcessorCoreCount,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4.0,2758076.0,5430193,50.79
2.0,1063217.0,2311969,45.99
8.0,480084.0,865004,55.5
12.0,54202.0,92702,58.47
6.0,39597.0,69910,56.64
1.0,20768.0,70390,29.5
16.0,10418.0,18551,56.16
3.0,6193.0,13580,45.6
32.0,1065.0,2136,49.86
24.0,930.0,1847,50.35


### Census_ProcessorManufacturerIdentifier

In [229]:
col = 'Census_ProcessorManufacturerIdentifier'

In [230]:
print("[train]")
temp = train[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * train[col].value_counts(dropna=False) / train.shape[0], 2)
temp.sort_values('rate', ascending=False)

[train]


Unnamed: 0,Census_ProcessorManufacturerIdentifier,rate
5.0,7839318,87.87
1.0,1040292,11.66
,41313,0.46
10.0,339,0.0
3.0,218,0.0
9.0,1,0.0
7.0,1,0.0
4.0,1,0.0


In [231]:
print("[test]")
temp = test[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * test[col].value_counts(dropna=False) / test.shape[0], 2)
temp.sort_values('rate', ascending=False)

[test]


Unnamed: 0,Census_ProcessorManufacturerIdentifier,rate
5.0,6878134,87.58
1.0,913445,11.63
,61281,0.78
10.0,207,0.0
3.0,183,0.0
8.0,1,0.0
6.0,1,0.0
2.0,1,0.0


In [232]:
table = train.groupby(col)['HasDetections'].sum().to_frame()
table['cnt'] = train.groupby(col)['HasDetections'].count()
table['rate'] = np.around((table.HasDetections / table.cnt) * 100, 2)
table.sort_values('HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
Census_ProcessorManufacturerIdentifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5.0,3926620.0,7839318,50.09
1.0,509719.0,1040292,49.0
3.0,51.0,218,23.39
10.0,5.0,339,1.47
4.0,0.0,1,0.0
7.0,0.0,1,0.0
9.0,0.0,1,0.0


### Census_ProcessorModelIdentifier

In [235]:
col = 'Census_ProcessorModelIdentifier'

In [236]:
print("[train]")
temp = train[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * train[col].value_counts(dropna=False) / train.shape[0], 2)
temp.sort_values('rate', ascending=False)

[train]


Unnamed: 0,Census_ProcessorModelIdentifier,rate
2696.0,289284,3.24
1998.0,267397,3.00
2660.0,195737,2.19
2372.0,175412,1.97
1992.0,171728,1.92
2382.0,170651,1.91
2640.0,153720,1.72
2524.0,142527,1.60
1985.0,135682,1.52
2096.0,134925,1.51


In [237]:
print("[test]")
temp = test[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * test[col].value_counts(dropna=False) / test.shape[0], 2)
temp.sort_values('rate', ascending=False)

[test]


Unnamed: 0,Census_ProcessorModelIdentifier,rate
2696.0,243488,3.10
1998.0,226179,2.88
2382.0,154082,1.96
2736.0,150526,1.92
2660.0,150496,1.92
1992.0,140564,1.79
2372.0,134711,1.72
2096.0,131970,1.68
2640.0,121723,1.55
2524.0,121378,1.55


In [234]:
table = train.groupby(col)['HasDetections'].sum().to_frame()
table['cnt'] = train.groupby(col)['HasDetections'].count()
table['rate'] = np.around((table.HasDetections / table.cnt) * 100, 2)
table.sort_values('HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
Census_ProcessorModelIdentifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2696.0,156253.0,289284,54.01
1998.0,127970.0,267397,47.86
2660.0,108303.0,195737,55.33
2372.0,93329.0,175412,53.21
2382.0,90172.0,170651,52.84
2640.0,84370.0,153720,54.89
1992.0,80394.0,171728,46.81
2524.0,73295.0,142527,51.43
2096.0,65052.0,134925,48.21
1985.0,65007.0,135682,47.91


### Census_ProcessorClass

In [238]:
col = 'Census_ProcessorClass'

In [239]:
print("[train]")
temp = train[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * train[col].value_counts(dropna=False) / train.shape[0], 2)
temp.sort_values('rate', ascending=False)

[train]


Unnamed: 0,Census_ProcessorClass,rate
,8884852,99.59
mid,20914,0.23
low,9621,0.11
high,6096,0.07


In [240]:
print("[test]")
temp = test[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * test[col].value_counts(dropna=False) / test.shape[0], 2)
temp.sort_values('rate', ascending=False)

[test]


Unnamed: 0,Census_ProcessorClass,rate
,7835022,99.77
mid,10476,0.13
low,4682,0.06
high,3073,0.04


In [241]:
table = train.groupby(col)['HasDetections'].sum().to_frame()
table['cnt'] = train.groupby(col)['HasDetections'].count()
table['rate'] = np.around((table.HasDetections / table.cnt) * 100, 2)
table.sort_values('HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
Census_ProcessorClass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
mid,11566.0,20914,55.3
low,4816.0,9621,50.06
high,3635.0,6096,59.63


In [5]:
train.head(3)

Unnamed: 0,MachineIdentifier,ProductName,EngineVersion,AppVersion,AvSigVersion,IsBeta,RtpStateBitfield,IsSxsPassiveMode,DefaultBrowsersIdentifier,AVProductStatesIdentifier,AVProductsInstalled,AVProductsEnabled,HasTpm,CountryIdentifier,CityIdentifier,OrganizationIdentifier,GeoNameIdentifier,LocaleEnglishNameIdentifier,Platform,Processor,OsVer,OsBuild,OsSuite,OsPlatformSubRelease,OsBuildLab,SkuEdition,IsProtected,AutoSampleOptIn,PuaMode,SMode,IeVerIdentifier,SmartScreen,Firewall,UacLuaenable,Census_MDC2FormFactor,Census_DeviceFamily,Census_OEMNameIdentifier,Census_OEMModelIdentifier,Census_ProcessorCoreCount,Census_ProcessorManufacturerIdentifier,Census_ProcessorModelIdentifier,Census_ProcessorClass,Census_PrimaryDiskTotalCapacity,Census_PrimaryDiskTypeName,Census_SystemVolumeTotalCapacity,Census_HasOpticalDiskDrive,Census_TotalPhysicalRAM,Census_ChassisTypeName,Census_InternalPrimaryDiagonalDisplaySizeInInches,Census_InternalPrimaryDisplayResolutionHorizontal,Census_InternalPrimaryDisplayResolutionVertical,Census_PowerPlatformRoleName,Census_InternalBatteryType,Census_InternalBatteryNumberOfCharges,Census_OSVersion,Census_OSArchitecture,Census_OSBranch,Census_OSBuildNumber,Census_OSBuildRevision,Census_OSEdition,Census_OSSkuName,Census_OSInstallTypeName,Census_OSInstallLanguageIdentifier,Census_OSUILocaleIdentifier,Census_OSWUAutoUpdateOptionsName,Census_IsPortableOperatingSystem,Census_GenuineStateName,Census_ActivationChannel,Census_IsFlightingInternal,Census_IsFlightsDisabled,Census_FlightRing,Census_ThresholdOptIn,Census_FirmwareManufacturerIdentifier,Census_FirmwareVersionIdentifier,Census_IsSecureBootEnabled,Census_IsWIMBootEnabled,Census_IsVirtualDevice,Census_IsTouchEnabled,Census_IsPenCapable,Census_IsAlwaysOnAlwaysConnectedCapable,Wdft_IsGamer,Wdft_RegionIdentifier,HasDetections
0,0000028988387b115f69f31a3bf04f09,win8defender,1.1.15100.1,4.18.1807.18075,1.273.1735.0,0,7.0,0,,53447.0,1.0,1.0,1,29,128035.0,18.0,35.0,-85,windows10,x64,10.0.0.0,17134,256,rs4,17134.1.amd64fre.rs4_release.180410-1804,Pro,1.0,0,,0.0,137.0,,1.0,1.0,Desktop,Windows.Desktop,2668.0,9124.0,4.0,5.0,2340.0,,476940.0,HDD,299451.0,0,4096.0,Desktop,18.90625,1440.0,900.0,Desktop,,4294967000.0,10.0.17134.165,amd64,rs4_release,17134,165,Professional,PROFESSIONAL,UUPUpgrade,26.0,119,UNKNOWN,0,IS_GENUINE,Retail,,0.0,Retail,,628.0,36144.0,0,,0.0,0,0,0.0,0.0,10.0,0
1,000007535c3f730efa9ea0b7ef1bd645,win8defender,1.1.14600.4,4.13.17134.1,1.263.48.0,0,7.0,0,,53447.0,1.0,1.0,1,93,1482.0,18.0,119.0,64,windows10,x64,10.0.0.0,17134,256,rs4,17134.1.amd64fre.rs4_release.180410-1804,Pro,1.0,0,,0.0,137.0,,1.0,1.0,Notebook,Windows.Desktop,2668.0,91656.0,4.0,5.0,2404.0,,476940.0,HDD,102385.0,0,4096.0,Notebook,13.898438,1366.0,768.0,Mobile,,1.0,10.0.17134.1,amd64,rs4_release,17134,1,Professional,PROFESSIONAL,IBSClean,8.0,31,UNKNOWN,0,OFFLINE,Retail,,0.0,NOT_SET,,628.0,57858.0,0,,0.0,0,0,0.0,0.0,8.0,0
2,000007905a28d863f6d0d597892cd692,win8defender,1.1.15100.1,4.18.1807.18075,1.273.1341.0,0,7.0,0,,53447.0,1.0,1.0,1,86,153579.0,18.0,64.0,49,windows10,x64,10.0.0.0,17134,768,rs4,17134.1.amd64fre.rs4_release.180410-1804,Home,1.0,0,,0.0,137.0,RequireAdmin,1.0,1.0,Desktop,Windows.Desktop,4908.0,317701.0,4.0,5.0,1972.0,,114473.0,SSD,113907.0,0,4096.0,Desktop,21.5,1920.0,1080.0,Desktop,,4294967000.0,10.0.17134.165,amd64,rs4_release,17134,165,Core,CORE,UUPUpgrade,7.0,30,FullAuto,0,IS_GENUINE,OEM:NONSLP,,0.0,Retail,,142.0,52682.0,0,,0.0,0,0,0.0,0.0,3.0,0


### Census_PrimaryDiskTotalCapacity

In [5]:
col = 'Census_PrimaryDiskTotalCapacity'

In [6]:
print("[train]")
temp = train[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * train[col].value_counts(dropna=False) / train.shape[0], 2)
temp.sort_values('rate', ascending=False)

[train]


Unnamed: 0,Census_PrimaryDiskTotalCapacity,rate
476940.0,2841530,31.85
953869.0,2175780,24.39
305245.0,474616,5.32
122104.0,469060,5.26
244198.0,452284,5.07
238475.0,312093,3.50
29820.0,303493,3.40
114473.0,259542,2.91
715404.0,245946,2.76
228936.0,159491,1.79


In [7]:
print("[test]")
temp = test[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * test[col].value_counts(dropna=False) / test.shape[0], 2)
temp.sort_values('rate', ascending=False)

[test]


Unnamed: 0,Census_PrimaryDiskTotalCapacity,rate
476940.0,2389604,30.43
953869.0,1851450,23.58
122104.0,440396,5.61
244198.0,420386,5.35
305245.0,412019,5.25
238475.0,288226,3.67
29820.0,274337,3.49
114473.0,266451,3.39
715404.0,194669,2.48
228936.0,169523,2.16


In [8]:
table = train.groupby(col)['HasDetections'].sum().to_frame()
table['cnt'] = train.groupby(col)['HasDetections'].count()
table['rate'] = np.around((table.HasDetections / table.cnt) * 100, 2)
table.sort_values('HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
Census_PrimaryDiskTotalCapacity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
476940.0,1421497.0,2841530,50.03
953869.0,1168194.0,2175780,53.69
122104.0,247566.0,469060,52.78
244198.0,240056.0,452284,53.08
305245.0,208895.0,474616,44.01
238475.0,150875.0,312093,48.34
114473.0,141545.0,259542,54.54
29820.0,128719.0,303493,42.41
715404.0,124696.0,245946,50.70
228936.0,92078.0,159491,57.73


### Census_PrimaryDiskTypeName

In [9]:
col = 'Census_PrimaryDiskTypeName'

In [10]:
print("[train]")
temp = train[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * train[col].value_counts(dropna=False) / train.shape[0], 2)
temp.sort_values('rate', ascending=False)

[train]


Unnamed: 0,Census_PrimaryDiskTypeName,rate
HDD,5806804,65.09
SSD,2466808,27.65
UNKNOWN,358251,4.02
Unspecified,276776,3.1
,12844,0.14


In [11]:
print("[test]")
temp = test[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * test[col].value_counts(dropna=False) / test.shape[0], 2)
temp.sort_values('rate', ascending=False)

[test]


Unnamed: 0,Census_PrimaryDiskTypeName,rate
HDD,4894312,62.32
SSD,2360847,30.06
UNKNOWN,385419,4.91
Unspecified,205868,2.62
,6807,0.09


In [12]:
table = train.groupby(col)['HasDetections'].sum().to_frame()
table['cnt'] = train.groupby(col)['HasDetections'].count()
table['rate'] = np.around((table.HasDetections / table.cnt) * 100, 2)
table.sort_values('HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
Census_PrimaryDiskTypeName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
HDD,2941049.0,5806804,50.65
SSD,1226622.0,2466808,49.73
UNKNOWN,161091.0,358251,44.97
Unspecified,123941.0,276776,44.78


### Census_SystemVolumeTotalCapacity

In [16]:
col = 'Census_SystemVolumeTotalCapacity'

In [17]:
print("[train]")
temp = train[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * train[col].value_counts(dropna=False) / train.shape[0], 2)
temp.sort_values('rate', ascending=False)

[train]


Unnamed: 0,Census_SystemVolumeTotalCapacity,rate
,53002,0.59
28542.0,51998,0.58
926992.0,50430,0.57
476389.0,44435,0.50
953253.0,41572,0.47
102400.0,41257,0.46
476324.0,40925,0.46
952728.0,38251,0.43
476323.0,35410,0.40
475799.0,33723,0.38


In [18]:
print("[test]")
temp = test[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * test[col].value_counts(dropna=False) / test.shape[0], 2)
temp.sort_values('rate', ascending=False)

[test]


Unnamed: 0,Census_SystemVolumeTotalCapacity,rate
,74690,0.95
476389.0,56295,0.72
476324.0,51439,0.66
953253.0,48934,0.62
28542.0,48045,0.61
926992.0,42048,0.54
102400.0,38558,0.49
121488.0,35070,0.45
243582.0,34878,0.44
113922.0,34580,0.44


In [19]:
table = train.groupby(col)['HasDetections'].sum().to_frame()
table['cnt'] = train.groupby(col)['HasDetections'].count()
table['rate'] = np.around((table.HasDetections / table.cnt) * 100, 2)
table.sort_values('HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
Census_SystemVolumeTotalCapacity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
926992.0,24928.0,50430,49.43
28542.0,22459.0,51998,43.19
476389.0,22088.0,44435,49.71
953253.0,21279.0,41572,51.19
476324.0,20636.0,40925,50.42
102400.0,19443.0,41257,47.13
952728.0,18745.0,38251,49.01
476323.0,16588.0,35410,46.85
475799.0,16341.0,33723,48.46
952727.0,16322.0,33714,48.41


### Census_HasOpticalDiskDrive

In [20]:
col = 'Census_HasOpticalDiskDrive'

In [21]:
print("[train]")
temp = train[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * train[col].value_counts(dropna=False) / train.shape[0], 2)
temp.sort_values('rate', ascending=False)

[train]


Unnamed: 0,Census_HasOpticalDiskDrive,rate
0,8232858,92.28
1,688625,7.72


In [22]:
print("[test]")
temp = test[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * test[col].value_counts(dropna=False) / test.shape[0], 2)
temp.sort_values('rate', ascending=False)

[test]


Unnamed: 0,Census_HasOpticalDiskDrive,rate
0,7321932,93.23
1,531321,6.77


In [23]:
table = train.groupby(col)['HasDetections'].sum().to_frame()
table['cnt'] = train.groupby(col)['HasDetections'].count()
table['rate'] = np.around((table.HasDetections / table.cnt) * 100, 2)
table.sort_values('HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
Census_HasOpticalDiskDrive,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,4089910.0,8232858,49.68
1,368982.0,688625,53.58


### Census_TotalPhysicalRAM

In [24]:
col = 'Census_TotalPhysicalRAM'

In [25]:
print("[train]")
temp = train[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * train[col].value_counts(dropna=False) / train.shape[0], 2)
temp.sort_values('rate', ascending=False)

[train]


Unnamed: 0,Census_TotalPhysicalRAM,rate
4096.0,4094512,45.89
8192.0,2196505,24.62
2048.0,1097474,12.30
16384.0,531558,5.96
6144.0,398671,4.47
12288.0,159894,1.79
3072.0,152070,1.70
,80533,0.90
1024.0,66054,0.74
32768.0,58107,0.65


In [26]:
print("[test]")
temp = test[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * test[col].value_counts(dropna=False) / test.shape[0], 2)
temp.sort_values('rate', ascending=False)

[test]


Unnamed: 0,Census_TotalPhysicalRAM,rate
4096.0,3529277,44.94
8192.0,1995041,25.40
2048.0,932422,11.87
16384.0,505916,6.44
6144.0,331597,4.22
12288.0,140042,1.78
3072.0,132044,1.68
,95051,1.21
32768.0,55477,0.71
1024.0,49950,0.64


In [27]:
table = train.groupby(col)['HasDetections'].sum().to_frame()
table['cnt'] = train.groupby(col)['HasDetections'].count()
table['rate'] = np.around((table.HasDetections / table.cnt) * 100, 2)
table.sort_values('HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
Census_TotalPhysicalRAM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4096.0,2037300.0,4094512,49.76
8192.0,1183643.0,2196505,53.89
2048.0,447292.0,1097474,40.76
16384.0,302810.0,531558,56.97
6144.0,208311.0,398671,52.25
12288.0,90632.0,159894,56.68
3072.0,65255.0,152070,42.91
32768.0,32315.0,58107,55.61
1024.0,16594.0,66054,25.12
24576.0,6668.0,11572,57.62
