# EDA - All
* 이 커널에서는 모든 피처를 하나하나씩 살펴보며 어떤 데이터 양상을 띄고 있는 지 검토할 것입니다.

## 라이브러리

In [1]:
import numpy as np
import pandas as pd
import warnings
import gc
warnings.filterwarnings("ignore")

In [2]:
pd.set_option('max_rows', 150)
pd.set_option('max_colwidth', 500)
pd.set_option('max_columns', 500)

## 데이터 로드

In [3]:
dtypes = {
        'MachineIdentifier':                                    'object',
        'ProductName':                                          'category',
        'EngineVersion':                                        'category',
        'AppVersion':                                           'category',
        'AvSigVersion':                                         'category',
        'IsBeta':                                               'int8',
        'RtpStateBitfield':                                     'float16',
        'IsSxsPassiveMode':                                     'int8',
        'DefaultBrowsersIdentifier':                            'float16',
        'AVProductStatesIdentifier':                            'float32',
        'AVProductsInstalled':                                  'float16',
        'AVProductsEnabled':                                    'float16',
        'HasTpm':                                               'int8',
        'CountryIdentifier':                                    'int16',
        'CityIdentifier':                                       'float32',
        'OrganizationIdentifier':                               'float16',
        'GeoNameIdentifier':                                    'float16',
        'LocaleEnglishNameIdentifier':                          'int8',
        'Platform':                                             'category',
        'Processor':                                            'category',
        'OsVer':                                                'category',
        'OsBuild':                                              'int16',
        'OsSuite':                                              'int16',
        'OsPlatformSubRelease':                                 'category',
        'OsBuildLab':                                           'category',
        'SkuEdition':                                           'category',
        'IsProtected':                                          'float16',
        'AutoSampleOptIn':                                      'int8',
        'PuaMode':                                              'category',
        'SMode':                                                'float16',
        'IeVerIdentifier':                                      'float16',
        'SmartScreen':                                          'category',
        'Firewall':                                             'float16',
        'UacLuaenable':                                         'float32',
        'Census_MDC2FormFactor':                                'category',
        'Census_DeviceFamily':                                  'category',
        'Census_OEMNameIdentifier':                             'float16',
        'Census_OEMModelIdentifier':                            'float32',
        'Census_ProcessorCoreCount':                            'float16',
        'Census_ProcessorManufacturerIdentifier':               'float16',
        'Census_ProcessorModelIdentifier':                      'float16',
        'Census_ProcessorClass':                                'category',
        'Census_PrimaryDiskTotalCapacity':                      'float32',
        'Census_PrimaryDiskTypeName':                           'category',
        'Census_SystemVolumeTotalCapacity':                     'float32',
        'Census_HasOpticalDiskDrive':                           'int8',
        'Census_TotalPhysicalRAM':                              'float32',
        'Census_ChassisTypeName':                               'category',
        'Census_InternalPrimaryDiagonalDisplaySizeInInches':    'float16',
        'Census_InternalPrimaryDisplayResolutionHorizontal':    'float16',
        'Census_InternalPrimaryDisplayResolutionVertical':      'float16',
        'Census_PowerPlatformRoleName':                         'category',
        'Census_InternalBatteryType':                           'category',
        'Census_InternalBatteryNumberOfCharges':                'float32',
        'Census_OSVersion':                                     'category',
        'Census_OSArchitecture':                                'category',
        'Census_OSBranch':                                      'category',
        'Census_OSBuildNumber':                                 'int16',
        'Census_OSBuildRevision':                               'int32',
        'Census_OSEdition':                                     'category',
        'Census_OSSkuName':                                     'category',
        'Census_OSInstallTypeName':                             'category',
        'Census_OSInstallLanguageIdentifier':                   'float16',
        'Census_OSUILocaleIdentifier':                          'int16',
        'Census_OSWUAutoUpdateOptionsName':                     'category',
        'Census_IsPortableOperatingSystem':                     'int8',
        'Census_GenuineStateName':                              'category',
        'Census_ActivationChannel':                             'category',
        'Census_IsFlightingInternal':                           'float16',
        'Census_IsFlightsDisabled':                             'float16',
        'Census_FlightRing':                                    'category',
        'Census_ThresholdOptIn':                                'float16',
        'Census_FirmwareManufacturerIdentifier':                'float16',
        'Census_FirmwareVersionIdentifier':                     'float32',
        'Census_IsSecureBootEnabled':                           'int8',
        'Census_IsWIMBootEnabled':                              'float16',
        'Census_IsVirtualDevice':                               'float16',
        'Census_IsTouchEnabled':                                'int8',
        'Census_IsPenCapable':                                  'int8',
        'Census_IsAlwaysOnAlwaysConnectedCapable':              'float16',
        'Wdft_IsGamer':                                         'float16',
        'Wdft_RegionIdentifier':                                'float16',
        'HasDetections':                                        'int8'
        }

In [4]:
train = pd.read_csv('./data/train.csv', dtype=dtypes)
test = pd.read_csv('./data/test.csv', dtype=dtypes)

## 컬럼 분석

### Census_ChassisTypeName

In [5]:
col = 'Census_ChassisTypeName'

In [6]:
print("[train]")
temp = train[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * train[col].value_counts(dropna=False) / train.shape[0], 2)
temp.sort_values('rate', ascending=False)

[train]


Unnamed: 0,Census_ChassisTypeName,rate
Notebook,5248812,58.83
Desktop,1872125,20.98
Laptop,685581,7.68
Portable,360903,4.05
AllinOne,204295,2.29
MiniTower,85127,0.95
Convertible,84472,0.95
Other,75782,0.85
UNKNOWN,67212,0.75
Detachable,51466,0.58


In [7]:
print("[test]")
temp = test[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * test[col].value_counts(dropna=False) / test.shape[0], 2)
temp.sort_values('rate', ascending=False)

[test]


Unnamed: 0,Census_ChassisTypeName,rate
Notebook,4498390,57.28
Desktop,1777669,22.64
Laptop,601196,7.66
Portable,307035,3.91
AllinOne,176207,2.24
MiniTower,81411,1.04
Convertible,78658,1.0
Other,73580,0.94
UNKNOWN,52760,0.67
LowProfileDesktop,46903,0.6


In [8]:
table = train.groupby(col)['HasDetections'].sum().to_frame()
table['cnt'] = train.groupby(col)['HasDetections'].count()
table['rate'] = np.around((table.HasDetections / table.cnt) * 100, 2)
table.sort_values('HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
Census_ChassisTypeName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Notebook,2629216.0,5248812,50.09
Desktop,982113.0,1872125,52.46
Laptop,337884.0,685581,49.28
Portable,173541.0,360903,48.09
AllinOne,105402.0,204295,51.59
MiniTower,45022.0,85127,52.89
Convertible,40399.0,84472,47.83
UNKNOWN,29881.0,67212,44.46
LowProfileDesktop,25386.0,50072,50.7
Other,20043.0,75782,26.45


### Census_InternalPrimaryDiagonalDisplaySizeInInches

In [9]:
col = 'Census_InternalPrimaryDiagonalDisplaySizeInInches'

In [10]:
print("[train]")
temp = train[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * train[col].value_counts(dropna=False) / train.shape[0], 2)
temp.sort_values('rate', ascending=False)

[train]


Unnamed: 0,Census_InternalPrimaryDiagonalDisplaySizeInInches,rate
15.500000,3047431,34.16
13.898438,952078,10.67
14.000000,542450,6.08
11.601562,319376,3.58
21.500000,275337,3.09
13.296875,243794,2.73
18.500000,213119,2.39
13.203125,206897,2.32
23.000000,190656,2.14
15.601562,180426,2.02


In [11]:
print("[test]")
temp = test[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * test[col].value_counts(dropna=False) / test.shape[0], 2)
temp.sort_values('rate', ascending=False)

[test]


Unnamed: 0,Census_InternalPrimaryDiagonalDisplaySizeInInches,rate
15.500000,2642431,33.65
13.898438,826328,10.52
14.000000,455304,5.80
11.601562,281024,3.58
21.500000,260248,3.31
13.296875,242137,3.08
18.500000,203954,2.60
23.000000,177022,2.25
13.203125,169462,2.16
15.601562,148116,1.89


In [12]:
table = train.groupby(col)['HasDetections'].sum().to_frame()
table['cnt'] = train.groupby(col)['HasDetections'].count()
table['rate'] = np.around((table.HasDetections / table.cnt) * 100, 2)
table.sort_values('HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
Census_InternalPrimaryDiagonalDisplaySizeInInches,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
15.500000,1557621.0,3047431,51.11
13.898438,497360.0,952078,52.24
14.000000,275052.0,542450,50.71
21.500000,148617.0,275337,53.98
11.601562,143641.0,319376,44.98
18.500000,114494.0,213119,53.72
13.296875,110649.0,243794,45.39
13.203125,103163.0,206897,49.86
23.000000,100231.0,190656,52.57
15.601562,92607.0,180426,51.33


### Census_InternalPrimaryDisplayResolutionHorizontal

In [13]:
col = 'Census_InternalPrimaryDisplayResolutionHorizontal'

In [14]:
print("[train]")
temp = train[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * train[col].value_counts(dropna=False) / train.shape[0], 2)
temp.sort_values('rate', ascending=False)

[train]


Unnamed: 0,Census_InternalPrimaryDisplayResolutionHorizontal,rate
1366.0,4515064,50.61
1920.0,2220648,24.89
1280.0,527430,5.91
1600.0,501288,5.62
1024.0,342620,3.84
1440.0,167131,1.87
1360.0,128010,1.43
1680.0,109716,1.23
2560.0,70124,0.79
2736.0,57640,0.65


In [15]:
print("[test]")
temp = test[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * test[col].value_counts(dropna=False) / test.shape[0], 2)
temp.sort_values('rate', ascending=False)

[test]


Unnamed: 0,Census_InternalPrimaryDisplayResolutionHorizontal,rate
1366.0,3836293,48.85
1920.0,2058589,26.21
1280.0,449206,5.72
1600.0,430751,5.49
1024.0,338101,4.31
1440.0,158096,2.01
1360.0,109580,1.40
1680.0,96049,1.22
2560.0,72092,0.92
2736.0,48956,0.62


In [16]:
table = train.groupby(col)['HasDetections'].sum().to_frame()
table['cnt'] = train.groupby(col)['HasDetections'].count()
table['rate'] = np.around((table.HasDetections / table.cnt) * 100, 2)
table.sort_values('HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
Census_InternalPrimaryDisplayResolutionHorizontal,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1366.0,2260904.0,4515064,50.07
1920.0,1181868.0,2220648,53.22
1600.0,251423.0,501288,50.16
1280.0,217139.0,527430,41.17
1024.0,154247.0,342620,45.02
1440.0,82433.0,167131,49.32
1360.0,66305.0,128010,51.80
1680.0,56473.0,109716,51.47
2560.0,35670.0,70124,50.87
2736.0,28425.0,57640,49.31


### Census_InternalPrimaryDisplayResolutionVertical

In [17]:
col = 'Census_InternalPrimaryDisplayResolutionVertical'

In [18]:
print("[train]")
temp = train[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * train[col].value_counts(dropna=False) / train.shape[0], 2)
temp.sort_values('rate', ascending=False)

[train]


Unnamed: 0,Census_InternalPrimaryDisplayResolutionVertical,rate
768.0,4973621,55.75
1080.0,2148402,24.08
900.0,655155,7.34
800.0,262058,2.94
1024.0,186322,2.09
1050.0,112220,1.26
1440.0,81251,0.91
1200.0,79397,0.89
600.0,63806,0.72
1824.0,57630,0.65


In [19]:
print("[test]")
temp = test[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * test[col].value_counts(dropna=False) / test.shape[0], 2)
temp.sort_values('rate', ascending=False)

[test]


Unnamed: 0,Census_InternalPrimaryDisplayResolutionVertical,rate
768.0,4275531,54.44
1080.0,1999745,25.46
900.0,576686,7.34
800.0,213129,2.71
1024.0,171969,2.19
1050.0,98336,1.25
1440.0,78213,1.00
1200.0,71323,0.91
600.0,50735,0.65
1824.0,48947,0.62


In [20]:
table = train.groupby(col)['HasDetections'].sum().to_frame()
table['cnt'] = train.groupby(col)['HasDetections'].count()
table['rate'] = np.around((table.HasDetections / table.cnt) * 100, 2)
table.sort_values('HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
Census_InternalPrimaryDisplayResolutionVertical,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
768.0,2482859.0,4973621,49.92
1080.0,1156298.0,2148402,53.82
900.0,327660.0,655155,50.01
1024.0,90614.0,186322,48.63
800.0,88240.0,262058,33.67
1050.0,57650.0,112220,51.37
1440.0,40031.0,81251,49.27
1200.0,33106.0,79397,41.70
1824.0,28425.0,57630,49.32
2160.0,24774.0,46777,52.96


### Census_PowerPlatformRoleName

In [21]:
col = 'Census_PowerPlatformRoleName'

In [22]:
print("[train]")
temp = train[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * train[col].value_counts(dropna=False) / train.shape[0], 2)
temp.sort_values('rate', ascending=False)

[train]


Unnamed: 0,Census_PowerPlatformRoleName,rate
Mobile,6182908,69.3
Desktop,2066620,23.16
Slate,492537,5.52
Workstation,109683,1.23
SOHOServer,37841,0.42
UNKNOWN,20628,0.23
EnterpriseServer,7094,0.08
AppliancePC,4015,0.05
PerformanceServer,97,0.0
,55,0.0


In [23]:
print("[test]")
temp = test[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * test[col].value_counts(dropna=False) / test.shape[0], 2)
temp.sort_values('rate', ascending=False)

[test]


Unnamed: 0,Census_PowerPlatformRoleName,rate
Mobile,5323917,67.79
Desktop,1962019,24.98
Slate,402960,5.13
Workstation,101269,1.29
SOHOServer,34793,0.44
UNKNOWN,18482,0.24
EnterpriseServer,6523,0.08
AppliancePC,3175,0.04
PerformanceServer,75,0.0
,26,0.0


In [24]:
table = train.groupby(col)['HasDetections'].sum().to_frame()
table['cnt'] = train.groupby(col)['HasDetections'].count()
table['rate'] = np.around((table.HasDetections / table.cnt) * 100, 2)
table.sort_values('HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
Census_PowerPlatformRoleName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Mobile,3104529.0,6182908,50.21
Desktop,1083709.0,2066620,52.44
Slate,181293.0,492537,36.81
Workstation,57188.0,109683,52.14
SOHOServer,18311.0,37841,48.39
UNKNOWN,9965.0,20628,48.31
EnterpriseServer,2823.0,7094,39.79
AppliancePC,1004.0,4015,25.01
PerformanceServer,41.0,97,42.27
Unspecified,3.0,5,60.0


### Census_InternalBatteryType

In [25]:
col = 'Census_InternalBatteryType'

In [26]:
print("[train]")
temp = train[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * train[col].value_counts(dropna=False) / train.shape[0], 2)
temp.sort_values('rate', ascending=False)

[train]


Unnamed: 0,Census_InternalBatteryType,rate
,6338429,71.05
lion,2028256,22.73
li-i,245617,2.75
#,183998,2.06
lip,62099,0.7
liio,32635,0.37
li p,8383,0.09
li,6708,0.08
nimh,4614,0.05
pbac,2274,0.03


In [27]:
print("[test]")
temp = test[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * test[col].value_counts(dropna=False) / test.shape[0], 2)
temp.sort_values('rate', ascending=False)

[test]


Unnamed: 0,Census_InternalBatteryType,rate
,5979844,76.14
lion,1428882,18.19
#,175307,2.23
li-i,174765,2.23
lip,47080,0.6
liio,24624,0.31
li p,6300,0.08
li,4840,0.06
nimh,3428,0.04
real,2115,0.03


In [28]:
table = train.groupby(col)['HasDetections'].sum().to_frame()
table['cnt'] = train.groupby(col)['HasDetections'].count()
table['rate'] = np.around((table.HasDetections / table.cnt) * 100, 2)
table.sort_values('HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
Census_InternalBatteryType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
lion,979374.0,2028256,48.29
li-i,119485.0,245617,48.65
#,95183.0,183998,51.73
lip,32999.0,62099,53.14
liio,15860.0,32635,48.6
li p,4190.0,8383,49.98
li,2998.0,6708,44.69
nimh,1927.0,4614,41.76
real,1522.0,2744,55.47
pbac,1294.0,2274,56.9


### Census_InternalBatteryNumberOfCharges

In [29]:
col = 'Census_InternalBatteryNumberOfCharges'

In [30]:
print("[train]")
temp = train[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * train[col].value_counts(dropna=False) / train.shape[0], 2)
temp.sort_values('rate', ascending=False)

[train]


Unnamed: 0,Census_InternalBatteryNumberOfCharges,rate
0.000000e+00,5053404,56.64
4.294967e+09,2263993,25.38
,268755,3.01
1.000000e+00,53810,0.60
2.000000e+00,28128,0.32
1.600000e+01,27348,0.31
3.000000e+00,21537,0.24
3.300000e+01,19723,0.22
4.000000e+00,18020,0.20
5.000000e+00,16190,0.18


In [31]:
print("[test]")
temp = test[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * test[col].value_counts(dropna=False) / test.shape[0], 2)
temp.sort_values('rate', ascending=False)

[test]


Unnamed: 0,Census_InternalBatteryNumberOfCharges,rate
0.000000e+00,4335051,55.20
4.294967e+09,2153981,27.43
,239233,3.05
1.000000e+00,49750,0.63
2.000000e+00,26035,0.33
1.600000e+01,22695,0.29
3.000000e+00,19993,0.25
3.300000e+01,17714,0.23
4.000000e+00,16654,0.21
5.000000e+00,14412,0.18


In [32]:
table = train.groupby(col)['HasDetections'].sum().to_frame()
table['cnt'] = train.groupby(col)['HasDetections'].count()
table['rate'] = np.around((table.HasDetections / table.cnt) * 100, 2)
table.sort_values('HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
Census_InternalBatteryNumberOfCharges,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.000000e+00,2511817.0,5053404,49.71
4.294967e+09,1167305.0,2263993,51.56
1.000000e+00,22607.0,53810,42.01
1.600000e+01,12456.0,27348,45.55
2.000000e+00,11673.0,28128,41.50
3.300000e+01,11060.0,19723,56.08
3.000000e+00,8957.0,21537,41.59
4.000000e+00,7405.0,18020,41.09
5.000000e+00,6645.0,16190,41.04
6.000000e+00,5971.0,14472,41.26


### Census_OSVersion

In [33]:
col = 'Census_OSVersion'

In [34]:
print("[train]")
temp = train[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * train[col].value_counts(dropna=False) / train.shape[0], 2)
temp.sort_values('rate', ascending=False)

[train]


Unnamed: 0,Census_OSVersion,rate
10.0.17134.228,1413627,15.85
10.0.17134.165,899711,10.08
10.0.16299.431,546546,6.13
10.0.17134.285,470280,5.27
10.0.16299.547,346853,3.89
10.0.17134.112,346410,3.88
10.0.16299.371,325267,3.65
10.0.17134.191,228254,2.56
10.0.14393.2189,223775,2.51
10.0.16299.611,216776,2.43


In [35]:
print("[test]")
temp = test[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * test[col].value_counts(dropna=False) / test.shape[0], 2)
temp.sort_values('rate', ascending=False)

[test]


Unnamed: 0,Census_OSVersion,rate
10.0.17134.345,1377565,17.54
10.0.17134.285,669674,8.53
10.0.17134.407,520122,6.62
10.0.17134.286,365793,4.66
10.0.16299.431,283978,3.62
10.0.17134.112,227059,2.89
10.0.10240.17443,195916,2.49
10.0.16299.371,195793,2.49
10.0.14393.2189,191389,2.44
10.0.10586.1176,164920,2.10


In [36]:
table = train.groupby(col)['HasDetections'].sum().to_frame()
table['cnt'] = train.groupby(col)['HasDetections'].count()
table['rate'] = np.around((table.HasDetections / table.cnt) * 100, 2)
table.sort_values('HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
Census_OSVersion,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10.0.17134.228,754499.0,1413627,53.37
10.0.17134.165,507414.0,899711,56.40
10.0.16299.431,272300.0,546546,49.82
10.0.17134.285,236785.0,470280,50.35
10.0.16299.547,182365.0,346853,52.58
10.0.17134.112,170981.0,346410,49.36
10.0.16299.371,147289.0,325267,45.28
10.0.16299.611,115024.0,216776,53.06
10.0.17134.191,110461.0,228254,48.39
10.0.14393.2189,108951.0,223775,48.69


### Census_OSArchitecture

In [37]:
col = 'Census_OSArchitecture'

In [38]:
print("[train]")
temp = train[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * train[col].value_counts(dropna=False) / train.shape[0], 2)
temp.sort_values('rate', ascending=False)

[train]


Unnamed: 0,Census_OSArchitecture,rate
amd64,8105885,90.86
x86,815252,9.14
arm64,346,0.0


In [39]:
print("[test]")
temp = test[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * test[col].value_counts(dropna=False) / test.shape[0], 2)
temp.sort_values('rate', ascending=False)

[test]


Unnamed: 0,Census_OSArchitecture,rate
amd64,7174667,91.36
x86,678375,8.64
arm64,211,0.0


In [40]:
table = train.groupby(col)['HasDetections'].sum().to_frame()
table['cnt'] = train.groupby(col)['HasDetections'].count()
table['rate'] = np.around((table.HasDetections / table.cnt) * 100, 2)
table.sort_values('HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
Census_OSArchitecture,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
amd64,4144868.0,8105885,51.13
x86,314019.0,815252,38.52
arm64,5.0,346,1.45


### Census_OSBranch

In [60]:
col = 'Census_OSBranch'

In [61]:
print("[train]")
temp = train[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * train[col].value_counts(dropna=False) / train.shape[0], 2)
temp.sort_values('rate', ascending=False)

[train]


Unnamed: 0,Census_OSBranch,rate
rs4_release,4009158,44.94
rs3_release,1237321,13.87
rs3_release_svc_escrow,1199767,13.45
rs2_release,797066,8.93
rs1_release,785534,8.8
th2_release,326655,3.66
th2_release_sec,266882,2.99
th1_st1,195840,2.2
th1,75764,0.85
rs5_release,15324,0.17


In [62]:
print("[test]")
temp = test[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * test[col].value_counts(dropna=False) / test.shape[0], 2)
temp.sort_values('rate', ascending=False)

[test]


Unnamed: 0,Census_OSBranch,rate
rs4_release,3979144,50.67
rs3_release,1127648,14.36
rs2_release,626698,7.98
rs1_release,603919,7.69
rs3_release_svc_escrow,496021,6.32
rs5_release,290608,3.7
th2_release,240049,3.06
th2_release_sec,224555,2.86
th1_st1,183721,2.34
th1,67214,0.86


In [63]:
table = train.groupby(col)['HasDetections'].sum().to_frame()
table['cnt'] = train.groupby(col)['HasDetections'].count()
table['rate'] = np.around((table.HasDetections / table.cnt) * 100, 2)
table.sort_values('HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
Census_OSBranch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
rs4_release,2092879.0,4009158,52.2
rs3_release_svc_escrow,619778.0,1199767,51.66
rs3_release,575677.0,1237321,46.53
rs2_release,381620.0,797066,47.88
rs1_release,364050.0,785534,46.34
th2_release,161993.0,326655,49.59
th2_release_sec,119134.0,266882,44.64
th1_st1,99089.0,195840,50.6
th1,33014.0,75764,43.57
rs5_release,6579.0,15324,42.93


### Census_OSBuildNumber

In [64]:
col = 'Census_OSBuildNumber'

In [65]:
print("[train]")
temp = train[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * train[col].value_counts(dropna=False) / train.shape[0], 2)
temp.sort_values('rate', ascending=False)

[train]


Unnamed: 0,Census_OSBuildNumber,rate
17134,4008881,44.94
16299,2443249,27.39
15063,797049,8.93
14393,785450,8.80
10586,593527,6.65
10240,271604,3.04
17692,3096,0.03
17738,3062,0.03
17744,2372,0.03
17758,1703,0.02


In [66]:
print("[test]")
temp = test[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * test[col].value_counts(dropna=False) / test.shape[0], 2)
temp.sort_values('rate', ascending=False)

[test]


Unnamed: 0,Census_OSBuildNumber,rate
17134,3978878,50.67
16299,1627934,20.73
15063,626684,7.98
14393,603871,7.69
10586,464599,5.92
17763,285400,3.63
10240,250935,3.20
18252,3299,0.04
17758,2984,0.04
18282,1214,0.02


In [67]:
table = train.groupby(col)['HasDetections'].sum().to_frame()
table['cnt'] = train.groupby(col)['HasDetections'].count()
table['rate'] = np.around((table.HasDetections / table.cnt) * 100, 2)
table.sort_values('HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
Census_OSBuildNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
17134,2092768.0,4008881,52.20
16299,1198018.0,2443249,49.03
15063,381610.0,797049,47.88
14393,364004.0,785450,46.34
10586,281120.0,593527,47.36
10240,132103.0,271604,48.64
17692,1407.0,3096,45.45
17738,1142.0,3062,37.30
17744,1062.0,2372,44.77
17758,747.0,1703,43.86


### Census_OSBuildRevision

In [68]:
col = 'Census_OSBuildRevision'

In [69]:
print("[train]")
temp = train[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * train[col].value_counts(dropna=False) / train.shape[0], 2)
temp.sort_values('rate', ascending=False)

[train]


Unnamed: 0,Census_OSBuildRevision,rate
228,1413633,15.85
165,899712,10.08
431,546548,6.13
285,470280,5.27
547,346853,3.89
112,346488,3.88
371,325267,3.65
191,228255,2.56
2189,223775,2.51
611,216776,2.43


In [70]:
print("[test]")
temp = test[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * test[col].value_counts(dropna=False) / test.shape[0], 2)
temp.sort_values('rate', ascending=False)

[test]


Unnamed: 0,Census_OSBuildRevision,rate
345,1377568,17.54
285,669675,8.53
407,520122,6.62
286,365795,4.66
431,283978,3.62
112,227122,2.89
17443,195916,2.49
371,195793,2.49
2189,191389,2.44
1,179005,2.28


In [71]:
table = train.groupby(col)['HasDetections'].sum().to_frame()
table['cnt'] = train.groupby(col)['HasDetections'].count()
table['rate'] = np.around((table.HasDetections / table.cnt) * 100, 2)
table.sort_values('HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
Census_OSBuildRevision,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
228,754503.0,1413633,53.37
165,507414.0,899712,56.40
431,272301.0,546548,49.82
285,236785.0,470280,50.35
547,182365.0,346853,52.58
112,171022.0,346488,49.36
371,147289.0,325267,45.28
611,115024.0,216776,53.06
191,110462.0,228255,48.39
2189,108951.0,223775,48.69


### Census_OSEdition

In [72]:
col = 'Census_OSEdition'

In [73]:
print("[train]")
temp = train[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * train[col].value_counts(dropna=False) / train.shape[0], 2)
temp.sort_values('rate', ascending=False)

[train]


Unnamed: 0,Census_OSEdition,rate
Core,3469991,38.89
Professional,3130566,35.09
CoreSingleLanguage,1945461,21.81
CoreCountrySpecific,166100,1.86
ProfessionalEducation,56698,0.64
Education,40704,0.46
Enterprise,35603,0.4
ProfessionalN,28341,0.32
EnterpriseS,20020,0.22
ServerStandard,10128,0.11


In [74]:
print("[test]")
temp = test[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * test[col].value_counts(dropna=False) / test.shape[0], 2)
temp.sort_values('rate', ascending=False)

[test]


Unnamed: 0,Census_OSEdition,rate
Professional,3101956,39.5
Core,2773165,35.31
CoreSingleLanguage,1596966,20.34
CoreCountrySpecific,177496,2.26
ProfessionalEducation,54834,0.7
Education,37910,0.48
Enterprise,34282,0.44
ProfessionalN,32466,0.41
EnterpriseS,20000,0.25
ServerStandard,8050,0.1


In [75]:
table = train.groupby(col)['HasDetections'].sum().to_frame()
table['cnt'] = train.groupby(col)['HasDetections'].count()
table['rate'] = np.around((table.HasDetections / table.cnt) * 100, 2)
table.sort_values('HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
Census_OSEdition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Core,1637162.0,3469991,47.18
Professional,1601859.0,3130566,51.17
CoreSingleLanguage,1029318.0,1945461,52.91
CoreCountrySpecific,83182.0,166100,50.08
ProfessionalEducation,31083.0,56698,54.82
Education,21623.0,40704,53.12
Enterprise,18404.0,35603,51.69
ProfessionalN,14891.0,28341,52.54
EnterpriseS,10321.0,20020,51.55
ServerStandard,3758.0,10128,37.11


### Census_OSSkuName

In [76]:
col = 'Census_OSSkuName'

In [77]:
print("[train]")
temp = train[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * train[col].value_counts(dropna=False) / train.shape[0], 2)
temp.sort_values('rate', ascending=False)

[train]


Unnamed: 0,Census_OSSkuName,rate
CORE,3469869,38.89
PROFESSIONAL,3187913,35.73
CORE_SINGLELANGUAGE,1945133,21.8
CORE_COUNTRYSPECIFIC,165886,1.86
EDUCATION,40827,0.46
ENTERPRISE,35602,0.4
PROFESSIONAL_N,28522,0.32
ENTERPRISE_S,20022,0.22
STANDARD_SERVER,10128,0.11
CLOUD,6167,0.07


In [78]:
print("[test]")
temp = test[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * test[col].value_counts(dropna=False) / test.shape[0], 2)
temp.sort_values('rate', ascending=False)

[test]


Unnamed: 0,Census_OSSkuName,rate
PROFESSIONAL,3157300,40.2
CORE,2773064,35.31
CORE_SINGLELANGUAGE,1596754,20.33
CORE_COUNTRYSPECIFIC,177269,2.26
EDUCATION,38152,0.49
ENTERPRISE,34135,0.43
PROFESSIONAL_N,32693,0.42
ENTERPRISE_S,20003,0.25
STANDARD_SERVER,8045,0.1
CLOUD,6024,0.08


In [79]:
table = train.groupby(col)['HasDetections'].sum().to_frame()
table['cnt'] = train.groupby(col)['HasDetections'].count()
table['rate'] = np.around((table.HasDetections / table.cnt) * 100, 2)
table.sort_values('HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
Census_OSSkuName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CORE,1637109.0,3469869,47.18
PROFESSIONAL,1633312.0,3187913,51.23
CORE_SINGLELANGUAGE,1029117.0,1945133,52.91
CORE_COUNTRYSPECIFIC,83054.0,165886,50.07
EDUCATION,21703.0,40827,53.16
ENTERPRISE,18390.0,35602,51.65
PROFESSIONAL_N,14983.0,28522,52.53
ENTERPRISE_S,10321.0,20022,51.55
STANDARD_SERVER,3761.0,10128,37.13
CLOUD,2466.0,6167,39.99


### Census_OSInstallTypeName

In [80]:
col = 'Census_OSInstallTypeName'

In [81]:
print("[train]")
temp = train[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * train[col].value_counts(dropna=False) / train.shape[0], 2)
temp.sort_values('rate', ascending=False)

[train]


Unnamed: 0,Census_OSInstallTypeName,rate
UUPUpgrade,2608037,29.23
IBSClean,1650733,18.5
Update,1593308,17.86
Upgrade,1251559,14.03
Other,840121,9.42
Reset,649201,7.28
Refresh,205842,2.31
Clean,69073,0.77
CleanPCRefresh,53609,0.6


In [82]:
print("[test]")
temp = test[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * test[col].value_counts(dropna=False) / test.shape[0], 2)
temp.sort_values('rate', ascending=False)

[test]


Unnamed: 0,Census_OSInstallTypeName,rate
IBSClean,2110259,26.87
UUPUpgrade,1847191,23.52
Update,1109763,14.13
Upgrade,942760,12.0
Other,809406,10.31
Reset,630962,8.03
Refresh,256050,3.26
Clean,74920,0.95
CleanPCRefresh,71942,0.92


In [83]:
table = train.groupby(col)['HasDetections'].sum().to_frame()
table['cnt'] = train.groupby(col)['HasDetections'].count()
table['rate'] = np.around((table.HasDetections / table.cnt) * 100, 2)
table.sort_values('HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
Census_OSInstallTypeName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
UUPUpgrade,1373135.0,2608037,52.65
IBSClean,862680.0,1650733,52.26
Update,761228.0,1593308,47.78
Upgrade,592433.0,1251559,47.34
Other,403435.0,840121,48.02
Reset,311696.0,649201,48.01
Refresh,93885.0,205842,45.61
Clean,35649.0,69073,51.61
CleanPCRefresh,24751.0,53609,46.17
