# EDA - All
* 이 커널에서는 모든 피처를 하나하나씩 살펴보며 어떤 데이터 양상을 띄고 있는 지 검토할 것입니다.

## 라이브러리

In [1]:
import numpy as np
import pandas as pd
import warnings
import gc
warnings.filterwarnings("ignore")

In [2]:
pd.set_option('max_rows', 150)
pd.set_option('max_colwidth', 500)
pd.set_option('max_columns', 500)

## 데이터 로드

In [3]:
dtypes = {
        'MachineIdentifier':                                    'object',
        'ProductName':                                          'category',
        'EngineVersion':                                        'category',
        'AppVersion':                                           'category',
        'AvSigVersion':                                         'category',
        'IsBeta':                                               'int8',
        'RtpStateBitfield':                                     'float16',
        'IsSxsPassiveMode':                                     'int8',
        'DefaultBrowsersIdentifier':                            'float16',
        'AVProductStatesIdentifier':                            'float32',
        'AVProductsInstalled':                                  'float16',
        'AVProductsEnabled':                                    'float16',
        'HasTpm':                                               'int8',
        'CountryIdentifier':                                    'int16',
        'CityIdentifier':                                       'float32',
        'OrganizationIdentifier':                               'float16',
        'GeoNameIdentifier':                                    'float16',
        'LocaleEnglishNameIdentifier':                          'int8',
        'Platform':                                             'category',
        'Processor':                                            'category',
        'OsVer':                                                'category',
        'OsBuild':                                              'int16',
        'OsSuite':                                              'int16',
        'OsPlatformSubRelease':                                 'category',
        'OsBuildLab':                                           'category',
        'SkuEdition':                                           'category',
        'IsProtected':                                          'float16',
        'AutoSampleOptIn':                                      'int8',
        'PuaMode':                                              'category',
        'SMode':                                                'float16',
        'IeVerIdentifier':                                      'float16',
        'SmartScreen':                                          'category',
        'Firewall':                                             'float16',
        'UacLuaenable':                                         'float32',
        'Census_MDC2FormFactor':                                'category',
        'Census_DeviceFamily':                                  'category',
        'Census_OEMNameIdentifier':                             'float16',
        'Census_OEMModelIdentifier':                            'float32',
        'Census_ProcessorCoreCount':                            'float16',
        'Census_ProcessorManufacturerIdentifier':               'float16',
        'Census_ProcessorModelIdentifier':                      'float16',
        'Census_ProcessorClass':                                'category',
        'Census_PrimaryDiskTotalCapacity':                      'float32',
        'Census_PrimaryDiskTypeName':                           'category',
        'Census_SystemVolumeTotalCapacity':                     'float32',
        'Census_HasOpticalDiskDrive':                           'int8',
        'Census_TotalPhysicalRAM':                              'float32',
        'Census_ChassisTypeName':                               'category',
        'Census_InternalPrimaryDiagonalDisplaySizeInInches':    'float16',
        'Census_InternalPrimaryDisplayResolutionHorizontal':    'float16',
        'Census_InternalPrimaryDisplayResolutionVertical':      'float16',
        'Census_PowerPlatformRoleName':                         'category',
        'Census_InternalBatteryType':                           'category',
        'Census_InternalBatteryNumberOfCharges':                'float32',
        'Census_OSVersion':                                     'category',
        'Census_OSArchitecture':                                'category',
        'Census_OSBranch':                                      'category',
        'Census_OSBuildNumber':                                 'int16',
        'Census_OSBuildRevision':                               'int32',
        'Census_OSEdition':                                     'category',
        'Census_OSSkuName':                                     'category',
        'Census_OSInstallTypeName':                             'category',
        'Census_OSInstallLanguageIdentifier':                   'float16',
        'Census_OSUILocaleIdentifier':                          'int16',
        'Census_OSWUAutoUpdateOptionsName':                     'category',
        'Census_IsPortableOperatingSystem':                     'int8',
        'Census_GenuineStateName':                              'category',
        'Census_ActivationChannel':                             'category',
        'Census_IsFlightingInternal':                           'float16',
        'Census_IsFlightsDisabled':                             'float16',
        'Census_FlightRing':                                    'category',
        'Census_ThresholdOptIn':                                'float16',
        'Census_FirmwareManufacturerIdentifier':                'float16',
        'Census_FirmwareVersionIdentifier':                     'float32',
        'Census_IsSecureBootEnabled':                           'int8',
        'Census_IsWIMBootEnabled':                              'float16',
        'Census_IsVirtualDevice':                               'float16',
        'Census_IsTouchEnabled':                                'int8',
        'Census_IsPenCapable':                                  'int8',
        'Census_IsAlwaysOnAlwaysConnectedCapable':              'float16',
        'Wdft_IsGamer':                                         'float16',
        'Wdft_RegionIdentifier':                                'float16',
        'HasDetections':                                        'int8'
        }

In [4]:
train = pd.read_csv('./data/train.csv', dtype=dtypes)
test = pd.read_csv('./data/test.csv', dtype=dtypes)

## 컬럼 분석

### ProductName

In [74]:
col = 'ProductName'

In [75]:
print("[train]")
temp = train[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * train[col].value_counts(dropna=False) / train.shape[0], 2)
temp.sort_values('rate', ascending=False)

[train]


Unnamed: 0,ProductName,rate
win8defender,8826520,98.94
mse,94873,1.06
mseprerelease,53,0.0
scep,22,0.0
windowsintune,8,0.0
fep,7,0.0


In [76]:
print("[test]")
temp = test[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * test[col].value_counts(dropna=False) / test.shape[0], 2)
temp.sort_values('rate', ascending=False)

[test]


Unnamed: 0,ProductName,rate
win8defender,7797245,99.29
mse,55946,0.71
mseprerelease,34,0.0
scep,16,0.0
fep,7,0.0
windowsintune,5,0.0


In [77]:
table = train.groupby(col)['HasDetections'].sum().to_frame()
table['cnt'] = train.groupby(col)['HasDetections'].count()
table['rate'] = np.around((table.HasDetections / table.cnt) * 100, 2)
table.sort_values('HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
ProductName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
win8defender,4412891.0,8826520,50.0
mse,45961.0,94873,48.44
mseprerelease,26.0,53,49.06
scep,10.0,22,45.45
fep,3.0,7,42.86
windowsintune,1.0,8,12.5


### EngineVersion

In [82]:
col = 'EngineVersion'

In [79]:
print("[train]")
temp = train[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * train[col].value_counts(dropna=False) / train.shape[0], 2)
temp.sort_values('rate', ascending=False)

[train]


Unnamed: 0,EngineVersion,rate
1.1.15200.1,3845067,43.1
1.1.15100.1,3675915,41.2
1.1.15000.2,265218,2.97
1.1.14901.4,212408,2.38
1.1.14600.4,160585,1.8
1.1.14800.3,136476,1.53
1.1.15300.6,120295,1.35
1.1.14104.0,93926,1.05
1.1.13504.0,70645,0.79
1.1.15300.5,68716,0.77


In [80]:
print("[test]")
temp = test[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * test[col].value_counts(dropna=False) / test.shape[0], 2)
temp.sort_values('rate', ascending=False)

[test]


Unnamed: 0,EngineVersion,rate
1.1.15300.6,3101305,39.49
1.1.15400.4,2106236,26.82
1.1.15400.5,1491273,18.99
1.1.15200.1,366085,4.66
1.1.15100.1,158036,2.01
1.1.14600.4,138514,1.76
1.1.14901.4,75808,0.97
1.1.14104.0,72795,0.93
1.1.15000.2,66904,0.85
1.1.14800.3,55992,0.71


In [81]:
table = train.groupby(col)['HasDetections'].sum().to_frame()
table['cnt'] = train.groupby(col)['HasDetections'].count()
table['rate'] = np.around((table.HasDetections / table.cnt) * 100, 2)
table.sort_values('HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
EngineVersion,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.1.15100.1,2031651.0,3675915,55.27
1.1.15200.1,1890743.0,3845067,49.17
1.1.15000.2,91183.0,265218,34.38
1.1.14600.4,70201.0,160585,43.72
1.1.14901.4,64245.0,212408,30.25
1.1.15300.6,59240.0,120295,49.25
1.1.14800.3,42273.0,136476,30.97
1.1.14104.0,41295.0,93926,43.97
1.1.15300.5,36099.0,68716,52.53
1.1.13504.0,30807.0,70645,43.61


### AppVersion

In [83]:
col = 'AppVersion'

In [84]:
print("[train]")
temp = train[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * train[col].value_counts(dropna=False) / train.shape[0], 2)
temp.sort_values('rate', ascending=False)

[train]


Unnamed: 0,AppVersion,rate
4.18.1807.18075,5139224,57.61
4.18.1806.18062,850929,9.54
4.12.16299.15,359871,4.03
4.10.209.0,272455,3.05
4.13.17134.1,257270,2.88
4.16.17656.18052,235032,2.63
4.13.17134.228,226501,2.54
4.8.10240.17443,205480,2.3
4.9.10586.1106,203525,2.28
4.14.17639.18041,194699,2.18


In [85]:
print("[test]")
temp = test[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * test[col].value_counts(dropna=False) / test.shape[0], 2)
temp.sort_values('rate', ascending=False)

[test]


Unnamed: 0,AppVersion,rate
4.18.1809.2,2738721,34.87
4.18.1810.5,2129928,27.12
4.18.1807.18075,685600,8.73
4.12.16299.15,267102,3.4
4.13.17134.1,231117,2.94
4.8.10240.17443,193085,2.46
4.9.10586.1106,176442,2.25
4.13.17134.320,169995,2.16
4.10.209.0,155962,1.99
4.13.17134.228,118351,1.51


In [86]:
table = train.groupby(col)['HasDetections'].sum().to_frame()
table['cnt'] = train.groupby(col)['HasDetections'].count()
table['rate'] = np.around((table.HasDetections / table.cnt) * 100, 2)
table.sort_values('HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
AppVersion,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4.18.1807.18075,2725768.0,5139224,53.04
4.18.1806.18062,412883.0,850929,48.52
4.12.16299.15,164577.0,359871,45.73
4.10.209.0,136731.0,272455,50.18
4.13.17134.1,134242.0,257270,52.18
4.8.10240.17443,99926.0,205480,48.63
4.13.17134.228,97089.0,226501,42.86
4.9.10586.1106,90466.0,203525,44.45
4.16.17656.18052,76175.0,235032,32.41
4.14.17639.18041,61717.0,194699,31.7


### AvSigVersion

In [88]:
col = 'AvSigVersion'

In [89]:
print("[train]")
temp = train[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * train[col].value_counts(dropna=False) / train.shape[0], 2)
temp.sort_values('rate', ascending=False)

[train]


Unnamed: 0,AvSigVersion,rate
1.273.1420.0,102317,1.15
1.263.48.0,98024,1.10
1.275.1140.0,97232,1.09
1.275.727.0,92448,1.04
1.273.371.0,86967,0.97
1.273.1826.0,86013,0.96
1.275.1244.0,78902,0.88
1.251.42.0,76837,0.86
1.275.1209.0,66393,0.74
1.273.810.0,65895,0.74


In [90]:
print("[test]")
temp = test[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * test[col].value_counts(dropna=False) / test.shape[0], 2)
temp.sort_values('rate', ascending=False)

[test]


Unnamed: 0,AvSigVersion,rate
1.263.48.0,132624,1.69
1.277.515.0,80393,1.02
1.251.42.0,73723,0.94
1.279.102.0,73108,0.93
1.279.32.0,67893,0.86
1.277.96.0,57377,0.73
1.277.1044.0,54981,0.70
1.277.1102.0,52147,0.66
1.237.0.0,47158,0.60
1.281.261.0,46284,0.59


In [91]:
table = train.groupby(col)['HasDetections'].sum().to_frame()
table['cnt'] = train.groupby(col)['HasDetections'].count()
table['rate'] = np.around((table.HasDetections / table.cnt) * 100, 2)
table.sort_values('HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
AvSigVersion,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.273.1420.0,57252.0,102317,55.96
1.273.1826.0,48004.0,86013,55.81
1.275.727.0,46108.0,92448,49.87
1.263.48.0,45767.0,98024,46.69
1.273.371.0,45375.0,86967,52.17
1.275.1140.0,43632.0,97232,44.87
1.273.1749.0,37227.0,65381,56.94
1.275.1244.0,37165.0,78902,47.10
1.273.810.0,36365.0,65895,55.19
1.251.42.0,34479.0,76837,44.87


### IsBeta

In [93]:
col = 'IsBeta'

In [94]:
print("[train]")
temp = train[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * train[col].value_counts(dropna=False) / train.shape[0], 2)
temp.sort_values('rate', ascending=False)

[train]


Unnamed: 0,IsBeta,rate
0,8921416,100.0
1,67,0.0


In [95]:
print("[test]")
temp = test[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * test[col].value_counts(dropna=False) / test.shape[0], 2)
temp.sort_values('rate', ascending=False)

[test]


Unnamed: 0,IsBeta,rate
0,7853207,100.0
1,46,0.0


In [96]:
table = train.groupby(col)['HasDetections'].sum().to_frame()
table['cnt'] = train.groupby(col)['HasDetections'].count()
table['rate'] = np.around((table.HasDetections / table.cnt) * 100, 2)
table.sort_values('HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
IsBeta,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,4458859.0,8921416,49.98
1,33.0,67,49.25


### RtpStateBitfield

In [97]:
col = 'RtpStateBitfield'

In [98]:
print("[train]")
temp = train[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * train[col].value_counts(dropna=False) / train.shape[0], 2)
temp.sort_values('rate', ascending=False)

[train]


Unnamed: 0,RtpStateBitfield,rate
7.0,8651487,96.97
0.0,190701,2.14
,32318,0.36
8.0,21974,0.25
5.0,20328,0.23
3.0,3029,0.03
1.0,1625,0.02
35.0,21,0.0


In [99]:
print("[test]")
temp = test[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * test[col].value_counts(dropna=False) / test.shape[0], 2)
temp.sort_values('rate', ascending=False)

[test]


Unnamed: 0,RtpStateBitfield,rate
7.0,7619055,97.02
0.0,162531,2.07
,32222,0.41
8.0,23229,0.3
5.0,12845,0.16
3.0,1921,0.02
1.0,1425,0.02
35.0,24,0.0
40.0,1,0.0


In [100]:
table = train.groupby(col)['HasDetections'].sum().to_frame()
table['cnt'] = train.groupby(col)['HasDetections'].count()
table['rate'] = np.around((table.HasDetections / table.cnt) * 100, 2)
table.sort_values('HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
RtpStateBitfield,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
7.0,4351523.0,8651487,50.3
0.0,71641.0,190701,37.57
8.0,16426.0,21974,74.75
5.0,3069.0,20328,15.1
3.0,907.0,3029,29.94
1.0,649.0,1625,39.94
35.0,19.0,21,90.48


### IsSxsPassiveMode

In [102]:
col = 'IsSxsPassiveMode'

In [103]:
print("[train]")
temp = train[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * train[col].value_counts(dropna=False) / train.shape[0], 2)
temp.sort_values('rate', ascending=False)

[train]


Unnamed: 0,IsSxsPassiveMode,rate
0,8766840,98.27
1,154643,1.73


In [104]:
print("[test]")
temp = test[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * test[col].value_counts(dropna=False) / test.shape[0], 2)
temp.sort_values('rate', ascending=False)

[test]


Unnamed: 0,IsSxsPassiveMode,rate
0,7728637,98.41
1,124616,1.59


In [105]:
table = train.groupby(col)['HasDetections'].sum().to_frame()
table['cnt'] = train.groupby(col)['HasDetections'].count()
table['rate'] = np.around((table.HasDetections / table.cnt) * 100, 2)
table.sort_values('HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
IsSxsPassiveMode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,4402017.0,8766840,50.21
1,56875.0,154643,36.78


### DefaultBrowsersIdentifier

In [106]:
col = 'DefaultBrowsersIdentifier'

In [107]:
print("[train]")
temp = train[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * train[col].value_counts(dropna=False) / train.shape[0], 2)
temp.sort_values('rate', ascending=False)

[train]


Unnamed: 0,DefaultBrowsersIdentifier,rate
,8488045,95.14
239.0,46056,0.52
3196.0,42694,0.48
1632.0,28751,0.32
3176.0,24222,0.27
146.0,20756,0.23
1910.0,19416,0.22
1727.0,17393,0.19
2724.0,14538,0.16
2064.0,13993,0.16


In [108]:
print("[test]")
temp = test[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * test[col].value_counts(dropna=False) / test.shape[0], 2)
temp.sort_values('rate', ascending=False)

[test]


Unnamed: 0,DefaultBrowsersIdentifier,rate
,7546134,96.09
239.0,48488,0.62
3196.0,37181,0.47
1632.0,16461,0.21
146.0,13494,0.17
3176.0,13135,0.17
1910.0,11825,0.15
1727.0,11781,0.15
2064.0,11304,0.14
1623.0,8246,0.11


In [109]:
table = train.groupby(col)['HasDetections'].sum().to_frame()
table['cnt'] = train.groupby(col)['HasDetections'].count()
table['rate'] = np.around((table.HasDetections / table.cnt) * 100, 2)
table.sort_values('HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
DefaultBrowsersIdentifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
239.0,26221.0,46056,56.93
3196.0,23438.0,42694,54.90
1632.0,12134.0,28751,42.20
3176.0,9590.0,24222,39.59
1727.0,8088.0,17393,46.50
146.0,7092.0,20756,34.17
1910.0,6605.0,19416,34.02
1160.0,6374.0,12594,50.61
2064.0,5949.0,13993,42.51
2724.0,5920.0,14538,40.72


### AVProductStatesIdentifier

In [110]:
col = 'AVProductStatesIdentifier'

In [111]:
print("[train]")
temp = train[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * train[col].value_counts(dropna=False) / train.shape[0], 2)
temp.sort_values('rate', ascending=False)

[train]


Unnamed: 0,AVProductStatesIdentifier,rate
53447.0,5824565,65.29
7945.0,475897,5.33
47238.0,327656,3.67
62773.0,266764,2.99
46413.0,112878,1.27
23657.0,100455,1.13
49480.0,99899,1.12
41571.0,97659,1.09
29199.0,73550,0.82
51954.0,67616,0.76


In [112]:
print("[test]")
temp = test[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * test[col].value_counts(dropna=False) / test.shape[0], 2)
temp.sort_values('rate', ascending=False)

[test]


Unnamed: 0,AVProductStatesIdentifier,rate
53447.0,5508622,70.14
47238.0,301455,3.84
7945.0,295413,3.76
62773.0,255324,3.25
49480.0,81463,1.04
46413.0,78563,1.00
51954.0,64358,0.82
23657.0,60000,0.76
41571.0,53657,0.68
29199.0,43483,0.55


In [113]:
table = train.groupby(col)['HasDetections'].sum().to_frame()
table['cnt'] = train.groupby(col)['HasDetections'].count()
table['rate'] = np.around((table.HasDetections / table.cnt) * 100, 2)
table.sort_values('HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
AVProductStatesIdentifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
53447.0,3240582.0,5824565,55.64
7945.0,165771.0,475897,34.83
47238.0,152550.0,327656,46.56
62773.0,101939.0,266764,38.21
46413.0,55708.0,112878,49.35
23657.0,41380.0,100455,41.19
49480.0,39753.0,99899,39.79
29199.0,35973.0,73550,48.91
51954.0,35240.0,67616,52.12
41571.0,28277.0,97659,28.95


### AVProductsInstalled

In [114]:
col = 'AVProductsInstalled'

In [115]:
print("[train]")
temp = train[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * train[col].value_counts(dropna=False) / train.shape[0], 2)
temp.sort_values('rate', ascending=False)

[train]


Unnamed: 0,AVProductsInstalled,rate
1.0,6208893,69.59
2.0,2459008,27.56
3.0,208103,2.33
,36221,0.41
4.0,8757,0.1
5.0,471,0.01
6.0,28,0.0
7.0,1,0.0
0.0,1,0.0


In [116]:
print("[test]")
temp = test[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * test[col].value_counts(dropna=False) / test.shape[0], 2)
temp.sort_values('rate', ascending=False)

[test]


Unnamed: 0,AVProductsInstalled,rate
1.0,5849609,74.49
2.0,1825605,23.25
3.0,147704,1.88
,23767,0.3
4.0,6218,0.08
5.0,329,0.0
6.0,21,0.0


In [117]:
table = train.groupby(col)['HasDetections'].sum().to_frame()
table['cnt'] = train.groupby(col)['HasDetections'].count()
table['rate'] = np.around((table.HasDetections / table.cnt) * 100, 2)
table.sort_values('HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
AVProductsInstalled,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,3406078.0,6208893,54.86
2.0,975996.0,2459008,39.69
3.0,60682.0,208103,29.16
4.0,2371.0,8757,27.08
5.0,125.0,471,26.54
6.0,6.0,28,21.43
7.0,1.0,1,100.0
0.0,0.0,1,0.0


### AVProductsEnabled

In [179]:
col = 'AVProductsEnabled'

In [180]:
print("[train]")
temp = train[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * train[col].value_counts(dropna=False) / train.shape[0], 2)
temp.sort_values('rate', ascending=False)

[train]


Unnamed: 0,AVProductsEnabled,rate
1.0,8654101,97.0
2.0,198652,2.23
,36221,0.41
0.0,25958,0.29
3.0,6075,0.07
4.0,453,0.01
5.0,23,0.0


In [181]:
print("[test]")
temp = test[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * test[col].value_counts(dropna=False) / test.shape[0], 2)
temp.sort_values('rate', ascending=False)

[test]


Unnamed: 0,AVProductsEnabled,rate
1.0,7657407,97.51
2.0,148869,1.9
,23767,0.3
0.0,17603,0.22
3.0,5193,0.07
4.0,396,0.01
5.0,18,0.0


In [182]:
table = train.groupby(col)['HasDetections'].sum().to_frame()
table['cnt'] = train.groupby(col)['HasDetections'].count()
table['rate'] = np.around((table.HasDetections / table.cnt) * 100, 2)
table.sort_values('HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
AVProductsEnabled,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,4367169.0,8654101,50.46
2.0,66831.0,198652,33.64
0.0,9184.0,25958,35.38
3.0,1924.0,6075,31.67
4.0,149.0,453,32.89
5.0,2.0,23,8.7


### HasTpm

In [121]:
col = 'HasTpm'

In [122]:
print("[train]")
temp = train[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * train[col].value_counts(dropna=False) / train.shape[0], 2)
temp.sort_values('rate', ascending=False)

[train]


Unnamed: 0,HasTpm,rate
1,8814167,98.8
0,107316,1.2


In [123]:
print("[test]")
temp = test[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * test[col].value_counts(dropna=False) / test.shape[0], 2)
temp.sort_values('rate', ascending=False)

[test]


Unnamed: 0,HasTpm,rate
1,7788201,99.17
0,65052,0.83


In [124]:
table = train.groupby(col)['HasDetections'].sum().to_frame()
table['cnt'] = train.groupby(col)['HasDetections'].count()
table['rate'] = np.around((table.HasDetections / table.cnt) * 100, 2)
table.sort_values('HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
HasTpm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,4407926.0,8814167,50.01
0,50966.0,107316,47.49


### CountryIdentifier

In [125]:
col = 'CountryIdentifier'

In [126]:
print("[train]")
temp = train[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * train[col].value_counts(dropna=False) / train.shape[0], 2)
temp.sort_values('rate', ascending=False)

[train]


Unnamed: 0,CountryIdentifier,rate
43,397172,4.45
29,347991,3.90
141,333411,3.74
93,283625,3.18
171,280572,3.14
60,231981,2.60
201,220622,2.47
207,211645,2.37
66,208579,2.34
89,200516,2.25


In [127]:
print("[test]")
temp = test[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * test[col].value_counts(dropna=False) / test.shape[0], 2)
temp.sort_values('rate', ascending=False)

[test]


Unnamed: 0,CountryIdentifier,rate
43,457783,5.83
171,310506,3.95
29,299643,3.82
141,292722,3.73
93,237214,3.02
201,216462,2.76
60,208370,2.65
89,204003,2.60
214,191298,2.44
207,191072,2.43


In [128]:
table = train.groupby(col)['HasDetections'].sum().to_frame()
table['cnt'] = train.groupby(col)['HasDetections'].count()
table['rate'] = np.around((table.HasDetections / table.cnt) * 100, 2)
table.sort_values('HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
CountryIdentifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
43,203364.0,397172,51.20
29,175846.0,347991,50.53
141,173263.0,333411,51.97
93,139601.0,283625,49.22
171,139257.0,280572,49.63
89,118148.0,200516,58.92
214,116083.0,191269,60.69
60,114256.0,231981,49.25
201,112664.0,220622,51.07
207,102423.0,211645,48.39


### CityIdentifier

In [129]:
col = 'CityIdentifier'

In [130]:
print("[train]")
temp = train[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * train[col].value_counts(dropna=False) / train.shape[0], 2)
temp.sort_values('rate', ascending=False)

[train]


Unnamed: 0,CityIdentifier,rate
,325409,3.65
130775.0,94812,1.06
16668.0,84780,0.95
82373.0,83312,0.93
10222.0,71814,0.80
61668.0,66845,0.75
143782.0,64650,0.72
66202.0,64286,0.72
58607.0,60598,0.68
66953.0,57874,0.65


In [131]:
print("[test]")
temp = test[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * test[col].value_counts(dropna=False) / test.shape[0], 2)
temp.sort_values('rate', ascending=False)

[test]


Unnamed: 0,CityIdentifier,rate
,191962,2.44
61668.0,69029,0.88
82373.0,69449,0.88
130775.0,67830,0.86
22656.0,65737,0.84
66202.0,64695,0.82
10222.0,63694,0.81
58607.0,60751,0.77
16668.0,60433,0.77
66953.0,57009,0.73


In [132]:
table = train.groupby(col)['HasDetections'].sum().to_frame()
table['cnt'] = train.groupby(col)['HasDetections'].count()
table['rate'] = np.around((table.HasDetections / table.cnt) * 100, 2)
table.sort_values('HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
CityIdentifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
130775.0,46553.0,94812,49.10
82373.0,44514.0,83312,53.43
16668.0,44472.0,84780,52.46
61668.0,40456.0,66845,60.52
58607.0,37841.0,60598,62.45
10222.0,37731.0,71814,52.54
66953.0,32261.0,57874,55.74
66202.0,31582.0,64286,49.13
22656.0,31456.0,55269,56.91
143782.0,30401.0,64650,47.02


### OrganizationIdentifier

In [133]:
col = 'OrganizationIdentifier'

In [134]:
print("[train]")
temp = train[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * train[col].value_counts(dropna=False) / train.shape[0], 2)
temp.sort_values('rate', ascending=False)

[train]


Unnamed: 0,OrganizationIdentifier,rate
27.0,4196457,47.04
,2751518,30.84
18.0,1764175,19.77
48.0,63845,0.72
50.0,45502,0.51
11.0,19436,0.22
37.0,19398,0.22
49.0,13627,0.15
46.0,10974,0.12
14.0,4713,0.05


In [135]:
print("[test]")
temp = test[col].value_counts(dropna=False).to_frame()
temp['rate'] = np.around(100 * test[col].value_counts(dropna=False) / test.shape[0], 2)
temp.sort_values('rate', ascending=False)

[test]


Unnamed: 0,OrganizationIdentifier,rate
27.0,3605244,45.91
,2482129,31.61
18.0,1552377,19.77
48.0,77154,0.98
50.0,45856,0.58
11.0,16871,0.21
37.0,16743,0.21
49.0,12060,0.15
46.0,9658,0.12
32.0,3790,0.05


In [136]:
table = train.groupby(col)['HasDetections'].sum().to_frame()
table['cnt'] = train.groupby(col)['HasDetections'].count()
table['rate'] = np.around((table.HasDetections / table.cnt) * 100, 2)
table.sort_values('HasDetections', ascending=False)

Unnamed: 0_level_0,HasDetections,cnt,rate
OrganizationIdentifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
27.0,2088092.0,4196457,49.76
18.0,892153.0,1764175,50.57
48.0,33211.0,63845,52.02
50.0,27752.0,45502,60.99
11.0,10043.0,19436,51.67
37.0,9033.0,19398,46.57
49.0,6221.0,13627,45.65
46.0,5643.0,10974,51.42
36.0,2133.0,3909,54.57
32.0,2083.0,4045,51.5
