# EDA - All
* 이 커널에서는 모든 피처를 하나하나씩 살펴보며 어떤 데이터 양상을 띄고 있는 지 검토할 것입니다.

## 라이브러리

In [1]:
import numpy as np
import pandas as pd
import warnings
import gc
warnings.filterwarnings("ignore")

In [2]:
pd.set_option('max_rows', 150)
pd.set_option('max_colwidth', 500)
pd.set_option('max_columns', 500)

## 데이터 로드

In [3]:
dtypes = {
        'MachineIdentifier':                                    'object',
        'ProductName':                                          'category',
        'EngineVersion':                                        'category',
        'AppVersion':                                           'category',
        'AvSigVersion':                                         'category',
        'IsBeta':                                               'int8',
        'RtpStateBitfield':                                     'float16',
        'IsSxsPassiveMode':                                     'int8',
        'DefaultBrowsersIdentifier':                            'float16',
        'AVProductStatesIdentifier':                            'float32',
        'AVProductsInstalled':                                  'float16',
        'AVProductsEnabled':                                    'float16',
        'HasTpm':                                               'int8',
        'CountryIdentifier':                                    'int16',
        'CityIdentifier':                                       'float32',
        'OrganizationIdentifier':                               'float16',
        'GeoNameIdentifier':                                    'float16',
        'LocaleEnglishNameIdentifier':                          'int8',
        'Platform':                                             'category',
        'Processor':                                            'category',
        'OsVer':                                                'category',
        'OsBuild':                                              'int16',
        'OsSuite':                                              'int16',
        'OsPlatformSubRelease':                                 'category',
        'OsBuildLab':                                           'category',
        'SkuEdition':                                           'category',
        'IsProtected':                                          'float16',
        'AutoSampleOptIn':                                      'int8',
        'PuaMode':                                              'category',
        'SMode':                                                'float16',
        'IeVerIdentifier':                                      'float16',
        'SmartScreen':                                          'category',
        'Firewall':                                             'float16',
        'UacLuaenable':                                         'float32',
        'Census_MDC2FormFactor':                                'category',
        'Census_DeviceFamily':                                  'category',
        'Census_OEMNameIdentifier':                             'float16',
        'Census_OEMModelIdentifier':                            'float32',
        'Census_ProcessorCoreCount':                            'float16',
        'Census_ProcessorManufacturerIdentifier':               'float16',
        'Census_ProcessorModelIdentifier':                      'float16',
        'Census_ProcessorClass':                                'category',
        'Census_PrimaryDiskTotalCapacity':                      'float32',
        'Census_PrimaryDiskTypeName':                           'category',
        'Census_SystemVolumeTotalCapacity':                     'float32',
        'Census_HasOpticalDiskDrive':                           'int8',
        'Census_TotalPhysicalRAM':                              'float32',
        'Census_ChassisTypeName':                               'category',
        'Census_InternalPrimaryDiagonalDisplaySizeInInches':    'float16',
        'Census_InternalPrimaryDisplayResolutionHorizontal':    'float16',
        'Census_InternalPrimaryDisplayResolutionVertical':      'float16',
        'Census_PowerPlatformRoleName':                         'category',
        'Census_InternalBatteryType':                           'category',
        'Census_InternalBatteryNumberOfCharges':                'float32',
        'Census_OSVersion':                                     'category',
        'Census_OSArchitecture':                                'category',
        'Census_OSBranch':                                      'category',
        'Census_OSBuildNumber':                                 'int16',
        'Census_OSBuildRevision':                               'int32',
        'Census_OSEdition':                                     'category',
        'Census_OSSkuName':                                     'category',
        'Census_OSInstallTypeName':                             'category',
        'Census_OSInstallLanguageIdentifier':                   'float16',
        'Census_OSUILocaleIdentifier':                          'int16',
        'Census_OSWUAutoUpdateOptionsName':                     'category',
        'Census_IsPortableOperatingSystem':                     'int8',
        'Census_GenuineStateName':                              'category',
        'Census_ActivationChannel':                             'category',
        'Census_IsFlightingInternal':                           'float16',
        'Census_IsFlightsDisabled':                             'float16',
        'Census_FlightRing':                                    'category',
        'Census_ThresholdOptIn':                                'float16',
        'Census_FirmwareManufacturerIdentifier':                'float16',
        'Census_FirmwareVersionIdentifier':                     'float32',
        'Census_IsSecureBootEnabled':                           'int8',
        'Census_IsWIMBootEnabled':                              'float16',
        'Census_IsVirtualDevice':                               'float16',
        'Census_IsTouchEnabled':                                'int8',
        'Census_IsPenCapable':                                  'int8',
        'Census_IsAlwaysOnAlwaysConnectedCapable':              'float16',
        'Wdft_IsGamer':                                         'float16',
        'Wdft_RegionIdentifier':                                'float16',
        'HasDetections':                                        'int8'
        }

In [4]:
train = pd.read_csv('./data/train.csv', dtype=dtypes)
test = pd.read_csv('./data/test.csv', dtype=dtypes)

## 컬럼 분석

### GeoNameIdentifier

In [138]:
col = 'GeoNameIdentifier'

In [139]:
print("[train]")
temp = train[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * train[col].value_counts(dropna=False) / train.shape[0], 2)
temp.sort_values('rate', ascending=False)

[train]


Unnamed: 0,GeoNameIdentifier,rate
277.0,1531929,17.17
211.0,423166,4.74
53.0,408807,4.58
89.0,360798,4.04
240.0,346568,3.88
35.0,345904,3.88
167.0,339845,3.81
276.0,296774,3.33
267.0,215812,2.42
126.0,198021,2.22


In [140]:
print("[test]")
temp = test[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * test[col].value_counts(dropna=False) / test.shape[0], 2)
temp.sort_values('rate', ascending=False)

[test]


Unnamed: 0,GeoNameIdentifier,rate
277.0,1407739,17.93
53.0,464823,5.92
211.0,462082,5.88
89.0,303780,3.87
35.0,297959,3.79
240.0,296945,3.78
167.0,295600,3.76
276.0,240133,3.06
267.0,211515,2.69
126.0,170329,2.17


In [141]:
table = train.groupby(col)['HasDetections'].sum().to_frame()
table['cnt'] = train.groupby(col)['HasDetections'].count()
table['rate'] = np.around((table.HasDetections / table.cnt) * 100, 2)
table.sort_values('HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
GeoNameIdentifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
277.0,784129.0,1531929,51.19
211.0,212070.0,423166,50.12
53.0,211714.0,408807,51.79
167.0,177251.0,339845,52.16
35.0,175448.0,345904,50.72
240.0,172596.0,346568,49.80
89.0,170922.0,360798,47.37
276.0,138394.0,296774,46.63
267.0,111478.0,215812,51.66
126.0,95856.0,198021,48.41


### LocaleEnglishNameIdentifier

In [142]:
col = 'LocaleEnglishNameIdentifier'

In [143]:
print("[train]")
temp = train[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * train[col].value_counts(dropna=False) / train.shape[0], 2)
temp.sort_values('rate', ascending=False)

[train]


Unnamed: 0,LocaleEnglishNameIdentifier,rate
75,2094585,23.48
-74,450088,5.04
74,411056,4.61
42,409616,4.59
88,375223,4.21
-85,343615,3.85
-23,341279,3.83
-29,333102,3.73
-5,211841,2.37
124,195088,2.19


In [144]:
print("[test]")
temp = test[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * test[col].value_counts(dropna=False) / test.shape[0], 2)
temp.sort_values('rate', ascending=False)

[test]


Unnamed: 0,LocaleEnglishNameIdentifier,rate
75,1979128,25.20
-74,486382,6.19
42,465224,5.92
74,335619,4.27
88,312996,3.99
-85,295457,3.76
-23,291967,3.72
-29,289261,3.68
-5,206585,2.63
124,167661,2.13


In [145]:
table = train.groupby(col)['HasDetections'].sum().to_frame()
table['cnt'] = train.groupby(col)['HasDetections'].count()
table['rate'] = np.around((table.HasDetections / table.cnt) * 100, 2)
table.sort_values('HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
LocaleEnglishNameIdentifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
75,1082101.0,2094585,51.66
-74,225819.0,450088,50.17
42,212304.0,409616,51.83
74,192348.0,411056,46.79
88,176729.0,375223,47.10
-85,174588.0,343615,50.81
-29,174223.0,333102,52.30
-23,170432.0,341279,49.94
-5,109727.0,211841,51.80
124,94665.0,195088,48.52


### Platform

In [146]:
col = 'Platform'

In [147]:
print("[train]")
temp = train[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * train[col].value_counts(dropna=False) / train.shape[0], 2)
temp.sort_values('rate', ascending=False)

[train]


Unnamed: 0,Platform,rate
windows10,8618715,96.61
windows8,194508,2.18
windows7,93889,1.05
windows2016,14371,0.16


In [148]:
print("[test]")
temp = test[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * test[col].value_counts(dropna=False) / test.shape[0], 2)
temp.sort_values('rate', ascending=False)

[test]


Unnamed: 0,Platform,rate
windows10,7675480,97.74
windows8,111547,1.42
windows7,55240,0.7
windows2016,10986,0.14


In [149]:
table = train.groupby(col)['HasDetections'].sum().to_frame()
table['cnt'] = train.groupby(col)['HasDetections'].count()
table['rate'] = np.around((table.HasDetections / table.cnt) * 100, 2)
table.sort_values('HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
Platform,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
windows10,4309629.0,8618715,50.0
windows8,98561.0,194508,50.67
windows7,45678.0,93889,48.65
windows2016,5024.0,14371,34.96


### Processor

In [150]:
col = 'Processor'

In [151]:
print("[train]")
temp = train[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * train[col].value_counts(dropna=False) / train.shape[0], 2)
temp.sort_values('rate', ascending=False)

[train]


Unnamed: 0,Processor,rate
x64,8105435,90.85
x86,815702,9.14
arm64,346,0.0


In [152]:
print("[test]")
temp = test[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * test[col].value_counts(dropna=False) / test.shape[0], 2)
temp.sort_values('rate', ascending=False)

[test]


Unnamed: 0,Processor,rate
x64,7173360,91.34
x86,679682,8.65
arm64,211,0.0


In [153]:
table = train.groupby(col)['HasDetections'].sum().to_frame()
table['cnt'] = train.groupby(col)['HasDetections'].count()
table['rate'] = np.around((table.HasDetections / table.cnt) * 100, 2)
table.sort_values('HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
Processor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
x64,4145493.0,8105435,51.14
x86,313394.0,815702,38.42
arm64,5.0,346,1.45


### OsVer

In [154]:
col = 'OsVer'

In [155]:
print("[train]")
temp = train[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * train[col].value_counts(dropna=False) / train.shape[0], 2)
temp.sort_values('rate', ascending=False)

[train]


Unnamed: 0,OsVer,rate
10.0.0.0,8632545,96.76
6.3.0.0,194447,2.18
6.1.1.0,93268,1.05
6.1.0.0,582,0.01
10.0.21.0,1,0.0
10.0.19.80,1,0.0
6.3.7.0,1,0.0
6.1.0.112,1,0.0
6.3.80.0,1,0.0
6.3.0.112,1,0.0


In [156]:
print("[test]")
temp = test[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * test[col].value_counts(dropna=False) / test.shape[0], 2)
temp.sort_values('rate', ascending=False)

[test]


Unnamed: 0,OsVer,rate
10.0.0.0,7686083,97.87
6.3.0.0,111520,1.42
6.1.1.0,54653,0.7
6.1.0.0,572,0.01
10.0.1.44,1,0.0
10.0.0.112,2,0.0
10.0.7.80,2,0.0
6.1.2.0,1,0.0
10.0.32.0,1,0.0
6.3.153.153,1,0.0


In [157]:
table = train.groupby(col)['HasDetections'].sum().to_frame()
table['cnt'] = train.groupby(col)['HasDetections'].count()
table['rate'] = np.around((table.HasDetections / table.cnt) * 100, 2)
table.sort_values('HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
OsVer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10.0.0.0,4314355.0,8632545,49.98
6.3.0.0,98525.0,194447,50.67
6.1.1.0,45358.0,93268,48.63
6.1.0.0,289.0,582,49.66
10.0.3.0,130.0,225,57.78
10.0.1.0,79.0,141,56.03
6.1.3.0,25.0,30,83.33
10.0.2.0,17.0,30,56.67
6.3.3.0,14.0,24,58.33
6.3.1.0,14.0,22,63.64


### OsBuild

In [158]:
col = 'OsBuild'

In [159]:
print("[train]")
temp = train[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * train[col].value_counts(dropna=False) / train.shape[0], 2)
temp.sort_values('rate', ascending=False)

[train]


Unnamed: 0,OsBuild,rate
17134,3915521,43.89
16299,2503681,28.06
15063,780270,8.75
14393,730819,8.19
10586,411606,4.61
10240,270192,3.03
9600,194508,2.18
7601,93306,1.05
17692,3184,0.04
17744,2290,0.03


In [160]:
print("[test]")
temp = test[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * test[col].value_counts(dropna=False) / test.shape[0], 2)
temp.sort_values('rate', ascending=False)

[test]


Unnamed: 0,OsBuild,rate
17134,3893188,49.57
16299,1690582,21.53
15063,616089,7.85
14393,575569,7.33
10586,367350,4.68
17763,280705,3.57
10240,248346,3.16
9600,111547,1.42
7601,54667,0.7
18252,3246,0.04


In [161]:
table = train.groupby(col)['HasDetections'].sum().to_frame()
table['cnt'] = train.groupby(col)['HasDetections'].count()
table['rate'] = np.around((table.HasDetections / table.cnt) * 100, 2)
table.sort_values('HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
OsBuild,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
17134,2038918.0,3915521,52.07
16299,1232131.0,2503681,49.21
15063,373652.0,780270,47.89
14393,337835.0,730819,46.23
10586,191739.0,411606,46.58
10240,131471.0,270192,48.66
9600,98561.0,194508,50.67
7601,45387.0,93306,48.64
17692,1412.0,3184,44.35
17738,1103.0,2478,44.51


### OsSuite

In [162]:
col = 'OsSuite'

In [163]:
print("[train]")
temp = train[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * train[col].value_counts(dropna=False) / train.shape[0], 2)
temp.sort_values('rate', ascending=False)

[train]


Unnamed: 0,OsSuite,rate
768,5560661,62.33
256,3346251,37.51
272,12092,0.14
400,793,0.01
16,731,0.01
305,662,0.01
784,198,0.0
274,39,0.0
144,34,0.0
49,17,0.0


In [164]:
print("[test]")
temp = test[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * test[col].value_counts(dropna=False) / test.shape[0], 2)
temp.sort_values('rate', ascending=False)

[test]


Unnamed: 0,OsSuite,rate
768,4539020,57.8
256,3303044,42.06
272,9106,0.12
16,662,0.01
305,595,0.01
400,556,0.01
784,202,0.0
274,23,0.0
144,20,0.0
49,18,0.0


In [165]:
table = train.groupby(col)['HasDetections'].sum().to_frame()
table['cnt'] = train.groupby(col)['HasDetections'].count()
table['rate'] = np.around((table.HasDetections / table.cnt) * 100, 2)
table.sort_values('HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
OsSuite,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
768,2735465.0,5560661,49.19
256,1718312.0,3346251,51.35
272,4056.0,12092,33.54
16,440.0,731,60.19
305,279.0,662,42.15
400,210.0,793,26.48
784,89.0,198,44.95
144,18.0,34,52.94
274,11.0,39,28.21
49,9.0,17,52.94


### OsPlatformSubRelease

In [166]:
col = 'OsPlatformSubRelease'

In [167]:
print("[train]")
temp = train[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * train[col].value_counts(dropna=False) / train.shape[0], 2)
temp.sort_values('rate', ascending=False)

[train]


Unnamed: 0,OsPlatformSubRelease,rate
rs4,3915526,43.89
rs3,2503681,28.06
rs2,780270,8.75
rs1,730819,8.19
th2,411606,4.61
th1,270192,3.03
windows8.1,194508,2.18
windows7,93889,1.05
prers5,20992,0.24


In [168]:
print("[test]")
temp = test[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * test[col].value_counts(dropna=False) / test.shape[0], 2)
temp.sort_values('rate', ascending=False)

[test]


Unnamed: 0,OsPlatformSubRelease,rate
rs4,3893189,49.57
rs3,1690582,21.53
rs2,616089,7.85
rs1,575569,7.33
th2,367350,4.68
prers5,295341,3.76
th1,248346,3.16
windows8.1,111547,1.42
windows7,55240,0.7


In [169]:
table = train.groupby(col)['HasDetections'].sum().to_frame()
table['cnt'] = train.groupby(col)['HasDetections'].count()
table['rate'] = np.around((table.HasDetections / table.cnt) * 100, 2)
table.sort_values('HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
OsPlatformSubRelease,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
rs4,2038921.0,3915526,52.07
rs3,1232131.0,2503681,49.21
rs2,373652.0,780270,47.89
rs1,337835.0,730819,46.23
th2,191739.0,411606,46.58
th1,131471.0,270192,48.66
windows8.1,98561.0,194508,50.67
windows7,45678.0,93889,48.65
prers5,8904.0,20992,42.42


### SkuEdition

In [170]:
col = 'SkuEdition'

In [171]:
print("[train]")
temp = train[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * train[col].value_counts(dropna=False) / train.shape[0], 2)
temp.sort_values('rate', ascending=False)

[train]


Unnamed: 0,SkuEdition,rate
Home,5514341,61.81
Pro,3224164,36.14
Invalid,78054,0.87
Education,40694,0.46
Enterprise,34357,0.39
Enterprise LTSB,20702,0.23
Cloud,5589,0.06
Server,3582,0.04


In [172]:
print("[test]")
temp = test[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * test[col].value_counts(dropna=False) / test.shape[0], 2)
temp.sort_values('rate', ascending=False)

[test]


Unnamed: 0,SkuEdition,rate
Home,4513114,57.47
Pro,3195422,40.69
Invalid,48013,0.61
Education,36874,0.47
Enterprise,31700,0.4
Enterprise LTSB,20132,0.26
Cloud,5639,0.07
Server,2359,0.03


In [173]:
table = train.groupby(col)['HasDetections'].sum().to_frame()
table['cnt'] = train.groupby(col)['HasDetections'].count()
table['rate'] = np.around((table.HasDetections / table.cnt) * 100, 2)
table.sort_values('HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
SkuEdition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Home,2714523.0,5514341,49.23
Pro,1654726.0,3224164,51.32
Invalid,36500.0,78054,46.76
Education,21445.0,40694,52.7
Enterprise,17753.0,34357,51.67
Enterprise LTSB,10820.0,20702,52.27
Cloud,2143.0,5589,38.34
Server,982.0,3582,27.41


### IsProtected

In [175]:
col = 'IsProtected'

In [176]:
print("[train]")
temp = train[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * train[col].value_counts(dropna=False) / train.shape[0], 2)
temp.sort_values('rate', ascending=False)

[train]


Unnamed: 0,IsProtected,rate
1.0,8402282,94.18
0.0,483157,5.42
,36044,0.4


In [177]:
print("[test]")
temp = test[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * test[col].value_counts(dropna=False) / test.shape[0], 2)
temp.sort_values('rate', ascending=False)

[test]


Unnamed: 0,IsProtected,rate
1.0,7396943,94.19
0.0,432661,5.51
,23649,0.3


In [178]:
table = train.groupby(col)['HasDetections'].sum().to_frame()
table['cnt'] = train.groupby(col)['HasDetections'].count()
table['rate'] = np.around((table.HasDetections / table.cnt) * 100, 2)
table.sort_values('HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
IsProtected,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,4261098.0,8402282,50.71
0.0,184253.0,483157,38.14


### AutoSampleOptIn

In [183]:
col = 'AutoSampleOptIn'

In [184]:
print("[train]")
temp = train[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * train[col].value_counts(dropna=False) / train.shape[0], 2)
temp.sort_values('rate', ascending=False)

[train]


Unnamed: 0,AutoSampleOptIn,rate
0,8921225,100.0
1,258,0.0


In [185]:
print("[test]")
temp = test[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * test[col].value_counts(dropna=False) / test.shape[0], 2)
temp.sort_values('rate', ascending=False)

[test]


Unnamed: 0,AutoSampleOptIn,rate
0,7853091,100.0
1,162,0.0


In [186]:
table = train.groupby(col)['HasDetections'].sum().to_frame()
table['cnt'] = train.groupby(col)['HasDetections'].count()
table['rate'] = np.around((table.HasDetections / table.cnt) * 100, 2)
table.sort_values('HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
AutoSampleOptIn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,4458751.0,8921225,49.98
1,141.0,258,54.65


### PuaMode

In [187]:
col = 'PuaMode'

In [188]:
print("[train]")
temp = train[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * train[col].value_counts(dropna=False) / train.shape[0], 2)
temp.sort_values('rate', ascending=False)

[train]


Unnamed: 0,PuaMode,rate
,8919174,99.97
on,2307,0.03
audit,2,0.0


In [189]:
print("[test]")
temp = test[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * test[col].value_counts(dropna=False) / test.shape[0], 2)
temp.sort_values('rate', ascending=False)

[test]


Unnamed: 0,PuaMode,rate
,7851065,99.97
on,2178,0.03
audit,10,0.0


In [190]:
table = train.groupby(col)['HasDetections'].sum().to_frame()
table['cnt'] = train.groupby(col)['HasDetections'].count()
table['rate'] = np.around((table.HasDetections / table.cnt) * 100, 2)
table.sort_values('HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
PuaMode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
on,1699.0,2307,73.65
audit,2.0,2,100.0


### SMode

In [191]:
col = 'SMode'

In [192]:
print("[train]")
temp = train[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * train[col].value_counts(dropna=False) / train.shape[0], 2)
temp.sort_values('rate', ascending=False)

[train]


Unnamed: 0,SMode,rate
0.0,8379843,93.93
,537759,6.03
1.0,3881,0.04


In [193]:
print("[test]")
temp = test[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * test[col].value_counts(dropna=False) / test.shape[0], 2)
temp.sort_values('rate', ascending=False)

[test]


Unnamed: 0,SMode,rate
,5831272,74.25
0.0,2015562,25.67
1.0,6419,0.08


In [194]:
table = train.groupby(col)['HasDetections'].sum().to_frame()
table['cnt'] = train.groupby(col)['HasDetections'].count()
table['rate'] = np.around((table.HasDetections / table.cnt) * 100, 2)
table.sort_values('HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
SMode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,4234781.0,8379843,50.54
1.0,650.0,3881,16.75


### IeVerIdentifier

In [195]:
col = 'IeVerIdentifier'

In [196]:
print("[train]")
temp = train[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * train[col].value_counts(dropna=False) / train.shape[0], 2)
temp.sort_values('rate', ascending=False)

[train]


Unnamed: 0,IeVerIdentifier,rate
137.0,3885842,43.56
117.0,1767931,19.82
108.0,474390,5.32
111.0,467828,5.24
98.0,354411,3.97
135.0,217458,2.44
53.0,204952,2.30
74.0,202542,2.27
94.0,173593,1.95
105.0,173448,1.94


In [197]:
print("[test]")
temp = test[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * test[col].value_counts(dropna=False) / test.shape[0], 2)
temp.sort_values('rate', ascending=False)

[test]


Unnamed: 0,IeVerIdentifier,rate
137.0,3867603,49.25
117.0,1089823,13.88
111.0,394754,5.03
108.0,372054,4.74
98.0,291669,3.71
205.0,275721,3.51
53.0,192580,2.45
74.0,175374,2.23
135.0,172538,2.20
105.0,160115,2.04


In [198]:
table = train.groupby(col)['HasDetections'].sum().to_frame()
table['cnt'] = train.groupby(col)['HasDetections'].count()
table['rate'] = np.around((table.HasDetections / table.cnt) * 100, 2)
table.sort_values('HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
IeVerIdentifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
137.0,2023823.0,3885842,52.08
117.0,873124.0,1767931,49.39
111.0,236175.0,467828,50.48
108.0,226004.0,474390,47.64
98.0,168671.0,354411,47.59
135.0,102612.0,217458,47.19
53.0,99644.0,204952,48.62
74.0,89873.0,202542,44.37
105.0,88005.0,173448,50.74
333.0,82663.0,156391,52.86


### SmartScreen

In [199]:
col = 'SmartScreen'

In [200]:
print("[train]")
temp = train[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * train[col].value_counts(dropna=False) / train.shape[0], 2)
temp.sort_values('rate', ascending=False)

[train]


Unnamed: 0,SmartScreen,rate
RequireAdmin,4316183,48.38
,3177011,35.61
ExistsNotSet,1046183,11.73
Off,186553,2.09
Warn,135483,1.52
Prompt,34533,0.39
Block,22533,0.25
off,1350,0.02
On,731,0.01
Promt,2,0.0


In [201]:
print("[test]")
temp = test[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * test[col].value_counts(dropna=False) / test.shape[0], 2)
temp.sort_values('rate', ascending=False)

[test]


Unnamed: 0,SmartScreen,rate
,3498402,44.55
RequireAdmin,3413560,43.47
ExistsNotSet,600446,7.65
Off,161371,2.05
Warn,125925,1.6
Prompt,28889,0.37
Block,21241,0.27
off,1783,0.02
On,805,0.01
&#x02;,404,0.01


In [202]:
table = train.groupby(col)['HasDetections'].sum().to_frame()
table['cnt'] = train.groupby(col)['HasDetections'].count()
table['rate'] = np.around((table.HasDetections / table.cnt) * 100, 2)
table.sort_values('HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
SmartScreen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
RequireAdmin,1887923.0,4316183,43.74
ExistsNotSet,844650.0,1046183,80.74
Off,91959.0,186553,49.29
Warn,78101.0,135483,57.65
Prompt,16444.0,34533,47.62
Block,11698.0,22533,51.91
off,816.0,1350,60.44
On,422.0,731,57.73
&#x02;,219.0,416,52.64
&#x01;,199.0,335,59.4
