# 사전작업

## 라이브러리

In [1]:
import numpy as np
import pandas as pd
import warnings
import gc
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
warnings.filterwarnings("ignore")

In [2]:
# Widen the notebook display limits so wide frames and long cell values are
# shown in full. The fully qualified ``display.*`` keys are used because the
# short forms ('max_rows', 'max_columns') are matched as patterns and become
# ambiguous once other option groups define keys with the same suffix.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_colwidth', 500)
pd.set_option('display.max_columns', 500)

## 데이터 로드

In [3]:
# Column name -> dtype mapping handed to pd.read_csv for both the train and
# test files. Narrow integer/float widths and 'category' are used instead of
# pandas' default 64-bit / object inference.
# NOTE(review): float16 represents integers exactly only up to 2048, so any
# *Identifier column stored as float16 would merge distinct ids above that
# value — confirm the value range of each float16 identifier column.
# NOTE(review): int8 holds -128..127; confirm every int8 column (e.g.
# LocaleEnglishNameIdentifier) stays inside that range.
dtypes = {
        'MachineIdentifier':                                    'object',
        'ProductName':                                          'category',
        'EngineVersion':                                        'category',
        'AppVersion':                                           'category',
        'AvSigVersion':                                         'category',
        'IsBeta':                                               'int8',
        'RtpStateBitfield':                                     'float16',
        'IsSxsPassiveMode':                                     'int8',
        'DefaultBrowsersIdentifier':                            'float16',
        'AVProductStatesIdentifier':                            'float32',
        'AVProductsInstalled':                                  'float16',
        'AVProductsEnabled':                                    'float16',
        'HasTpm':                                               'int8',
        'CountryIdentifier':                                    'int16',
        'CityIdentifier':                                       'float32',
        'OrganizationIdentifier':                               'float16',
        'GeoNameIdentifier':                                    'float16',
        'LocaleEnglishNameIdentifier':                          'int8',
        'Platform':                                             'category',
        'Processor':                                            'category',
        'OsVer':                                                'category',
        'OsBuild':                                              'int16',
        'OsSuite':                                              'int16',
        'OsPlatformSubRelease':                                 'category',
        'OsBuildLab':                                           'category',
        'SkuEdition':                                           'category',
        'IsProtected':                                          'float16',
        'AutoSampleOptIn':                                      'int8',
        'PuaMode':                                              'category',
        'SMode':                                                'float16',
        'IeVerIdentifier':                                      'float16',
        'SmartScreen':                                          'category',
        'Firewall':                                             'float16',
        'UacLuaenable':                                         'float32',
        'Census_MDC2FormFactor':                                'category',
        'Census_DeviceFamily':                                  'category',
        'Census_OEMNameIdentifier':                             'float16',
        'Census_OEMModelIdentifier':                            'float32',
        'Census_ProcessorCoreCount':                            'float16',
        'Census_ProcessorManufacturerIdentifier':               'float16',
        'Census_ProcessorModelIdentifier':                      'float16',
        'Census_ProcessorClass':                                'category',
        'Census_PrimaryDiskTotalCapacity':                      'float32',
        'Census_PrimaryDiskTypeName':                           'category',
        'Census_SystemVolumeTotalCapacity':                     'float32',
        'Census_HasOpticalDiskDrive':                           'int8',
        'Census_TotalPhysicalRAM':                              'float32',
        'Census_ChassisTypeName':                               'category',
        'Census_InternalPrimaryDiagonalDisplaySizeInInches':    'float16',
        'Census_InternalPrimaryDisplayResolutionHorizontal':    'float16',
        'Census_InternalPrimaryDisplayResolutionVertical':      'float16',
        'Census_PowerPlatformRoleName':                         'category',
        'Census_InternalBatteryType':                           'category',
        'Census_InternalBatteryNumberOfCharges':                'float32',
        'Census_OSVersion':                                     'category',
        'Census_OSArchitecture':                                'category',
        'Census_OSBranch':                                      'category',
        'Census_OSBuildNumber':                                 'int16',
        'Census_OSBuildRevision':                               'int32',
        'Census_OSEdition':                                     'category',
        'Census_OSSkuName':                                     'category',
        'Census_OSInstallTypeName':                             'category',
        'Census_OSInstallLanguageIdentifier':                   'float16',
        'Census_OSUILocaleIdentifier':                          'int16',
        'Census_OSWUAutoUpdateOptionsName':                     'category',
        'Census_IsPortableOperatingSystem':                     'int8',
        'Census_GenuineStateName':                              'category',
        'Census_ActivationChannel':                             'category',
        'Census_IsFlightingInternal':                           'float16',
        'Census_IsFlightsDisabled':                             'float16',
        'Census_FlightRing':                                    'category',
        'Census_ThresholdOptIn':                                'float16',
        'Census_FirmwareManufacturerIdentifier':                'float16',
        'Census_FirmwareVersionIdentifier':                     'float32',
        'Census_IsSecureBootEnabled':                           'int8',
        'Census_IsWIMBootEnabled':                              'float16',
        'Census_IsVirtualDevice':                               'float16',
        'Census_IsTouchEnabled':                                'int8',
        'Census_IsPenCapable':                                  'int8',
        'Census_IsAlwaysOnAlwaysConnectedCapable':              'float16',
        'Wdft_IsGamer':                                         'float16',
        'Wdft_RegionIdentifier':                                'float16',
        'HasDetections':                                        'int8'
        }

In [4]:
%%time
# Read the train and test CSVs (in that order) with the explicit per-column
# dtypes declared in `dtypes`.
train, test = (
    pd.read_csv(csv_path, dtype=dtypes)
    for csv_path in ('./data/train.csv', './data/test.csv')
)

Wall time: 3min 22s


In [5]:
# Stack the train rows followed by the test rows into a single frame; the
# later cells compute their group statistics over this combined frame.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the supported equivalent. Row labels are left untouched here.
data = pd.concat([train, test])

In [6]:
# Replace the row labels with a fresh 0..N-1 range, discarding the old ones.
data.reset_index(drop=True, inplace=True)

## 임시 데이터 로드

In [7]:
# Reload the checkpoint CSV of region features (the same path the
# commented-out `to_csv` calls later in this notebook write to).
temp = pd.read_csv('./data_temp/new_feature_region.csv')

In [8]:
# Copy every checkpointed column onto `data`; a column that already exists is
# overwritten, and values are aligned on the row index.
for name, values in temp.items():
    data[name] = values

# 지역 관련 피처

- CountryIdentifier
- CityIdentifier
- OrganizationIdentifier
- GeoNameIdentifier
- LocaleEnglishNameIdentifier
- Census_OSInstallLanguageIdentifier
- Census_OSUILocaleIdentifier
- Wdft_RegionIdentifier
- Census_OEMNameIdentifier
- Census_OEMModelIdentifier

# Combined

In [9]:
# Running list of columns to checkpoint; starts with the two base columns and
# is extended after each feature section below.
new_cols = ['MachineIdentifier', 'HasDetections']

## Country + City

* 한 도시에 나라가 여러 개 존재하는 경우
* 나라당 도시의 개수

### 한 도시에 나라가 여러개 존재하는 경우

In [6]:
# Row count for every observed (city, country) pair, flattened to a frame
# with columns CityIdentifier, CountryIdentifier and the count (named 0).
city = (
    data
    .groupby(['CityIdentifier', 'CountryIdentifier'])
    .size()
    .reset_index()
)

In [9]:
# Give the unnamed count column (0) a readable name.
city = city.rename(columns={0: 'size'})

In [13]:
# Build {CityIdentifier: CountryIdentifier} for every city that is recorded
# under more than one country, choosing the country with the largest row count.
# `city` has one row per (city, country) pair, so a CityIdentifier that occurs
# in more than one row is paired with several countries.
ambiguous = city[city.duplicated(subset='CityIdentifier', keep=False)]
# A stable sort by pair count keeps ties deterministic; the last row per city
# after sorting is the pair with the highest count.
dominant = (
    ambiguous
    .sort_values('size', kind='mergesort')
    .drop_duplicates(subset='CityIdentifier', keep='last')
)
dic = dict(zip(dominant['CityIdentifier'].tolist(),
               dominant['CountryIdentifier'].tolist()))

HBox(children=(IntProgress(value=0, max=130523), HTML(value='')))




In [None]:
# Flag column, initialised to 0 for every row; it is set to 1 below for rows
# whose city was recorded under more than one country.
data['CityIdx_CountryIdx_multi'] = 0

In [22]:
# For every row whose city appears in `dic`, overwrite CountryIdentifier with
# the mapped country and raise the multi-country flag.
# One isin/map pass replaces a pair of full-frame scans per dictionary entry.
multi_mask = data['CityIdentifier'].isin(list(dic))
# Cast back to the column's own dtype so the assignment does not widen it;
# `.values` assigns positionally, so row labels need not be unique.
data.loc[multi_mask, 'CountryIdentifier'] = (
    data.loc[multi_mask, 'CityIdentifier']
    .map(dic)
    .astype(data['CountryIdentifier'].dtype)
    .values
)
data.loc[multi_mask, 'CityIdx_CountryIdx_multi'] = 1

HBox(children=(IntProgress(value=0, max=4235), HTML(value='')))




##### 임시저장

In [10]:
# Register the columns touched in this section for the checkpoint column list.
new_cols = [*new_cols, 'CountryIdentifier', 'CityIdx_CountryIdx_multi']

In [11]:
# data[new_cols].to_csv('./data_temp/new_feature_region.csv', index=False)

### 나라별 도시의 개수
* 나라별 도시의 개수
* 나라별 도시의 개수 grade - log, min-max

In [101]:
# Number of distinct cities per country.
# `temp` has one entry per observed (country, city) pair, so the number of
# entries under a country is that country's city count.
temp = data.groupby(['CountryIdentifier', 'CityIdentifier']).size()
city_counts = temp.groupby(level='CountryIdentifier').size()
# A country whose rows all lack a CityIdentifier never appears in `temp`;
# give it an explicit count of 0 instead of failing the lookup.
city_counts = city_counts.reindex(data['CountryIdentifier'].unique(), fill_value=0)
dic = city_counts.to_dict()

HBox(children=(IntProgress(value=0, max=222), HTML(value='')))




In [97]:
# Attach each row's country-level city count, then derive two coarse grades.
city_cnt = data['CountryIdentifier'].map(dic)
data['CityCnt_Per_Country'] = city_cnt

# Min-max scale the count to 0..10 and round to an integer grade.
cnt_low, cnt_high = city_cnt.min(), city_cnt.max()
minmax_scaled = (city_cnt - cnt_low) / (cnt_high - cnt_low) * 10
data['CityCnt_Per_Country_Minmax'] = np.around(minmax_scaled).astype(int)

# Natural log of the count, rounded to an integer grade.
data['CityCnt_Per_Country_Log'] = np.around(np.log(city_cnt)).astype(int)

### 나라별 데이터의 개수
* 나라별 데이터의 개수
* 나라별 데이터의 개수 grade - log, min-max

In [102]:
# Number of rows per country, indexed by CountryIdentifier.
temp = data.groupby(by='CountryIdentifier').size()

In [104]:
# Attach each row's country-level row count, then derive two coarse grades.
row_cnt = data['CountryIdentifier'].map(temp)
data['DataCnt_Per_Country'] = row_cnt

# Min-max scale the count to 0..10 and round to an integer grade.
row_low, row_high = row_cnt.min(), row_cnt.max()
row_scaled = (row_cnt - row_low) / (row_high - row_low) * 10
data['DataCnt_Per_Country_Minmax'] = np.around(row_scaled).astype(int)

# Natural log of the count, rounded to an integer grade.
data['DataCnt_Per_Country_Log'] = np.around(np.log(row_cnt)).astype(int)

##### 임시저장

In [12]:
# Register the six per-country count features for the checkpoint column list.
new_cols = [
    *new_cols,
    'CityCnt_Per_Country', 'CityCnt_Per_Country_Minmax', 'CityCnt_Per_Country_Log',
    'DataCnt_Per_Country', 'DataCnt_Per_Country_Minmax', 'DataCnt_Per_Country_Log',
]

In [13]:
# data[new_cols].to_csv('./data_temp/new_feature_region.csv', index=False)

## CountryIdentifier + CityIdentifier + OrganizationIdentifier

### 개인 사용자 vs 기관 사용자

In [14]:
# 1 when the row has no OrganizationIdentifier, 0 otherwise (boolean mask
# multiplied by 1 to get an integer flag).
data['Personal_User'] = data['OrganizationIdentifier'].isnull().mul(1)

### 특정 나라의 특정 도시의 특정 기관

In [15]:
# Lookup table with one row per observed (country, city, organization)
# combination, labelled 1..N in the order the grouped index lists them.
org_index = data.groupby(
    ['CountryIdentifier', 'CityIdentifier', 'OrganizationIdentifier']
).size().index
temp = pd.DataFrame(index=org_index)
temp['label'] = range(1, len(temp) + 1)

In [16]:
def get_org(data):
    """Return the organization label for a single row.

    Args:
        data: One row (for example the Series passed by
            ``DataFrame.apply(..., axis=1)``) exposing ``CountryIdentifier``,
            ``CityIdentifier`` and ``OrganizationIdentifier``. The parameter
            name shadows the notebook-level frame of the same name.

    Returns:
        The scalar ``label`` stored in the module-level ``temp`` table for the
        row's (country, city, organization) combination, or ``np.nan`` when
        any of the three identifiers is missing or the combination is not
        present in ``temp``.
    """
    key = (data.CountryIdentifier, data.CityIdentifier, data.OrganizationIdentifier)
    # A missing identifier cannot be looked up; answer NaN without touching
    # the index.
    if any(pd.isna(part) for part in key):
        return np.nan
    try:
        # Select the 'label' column explicitly so a scalar is returned rather
        # than a one-element row Series.
        return temp.loc[key, 'label']
    except (KeyError, TypeError):
        # Only lookup failures mean "no label"; anything else should surface.
        return np.nan

In [29]:
# Label every row with the 1-based number of its (country, city, organization)
# group, NaN when any of the three identifiers is missing.
# ngroup() numbers the groups 0..N-1 in sorted key order, the same order in
# which `temp['label']` numbers them 1..N, so this gives the same labels as a
# per-row lookup in `temp` without a Python-level call per row.
org_keys = ['CountryIdentifier', 'CityIdentifier', 'OrganizationIdentifier']
group_no = data.groupby(org_keys).ngroup()
# Rows excluded from grouping (null key) come back as -1 or NaN depending on
# the pandas version; both fail `>= 0` and are turned into NaN here.
data['NewOrganizationIdentifier'] = (group_no + 1).where(group_no >= 0)

##### 임시저장

In [30]:
# Register the two organization features for the checkpoint column list.
new_cols = [*new_cols, 'Personal_User', 'NewOrganizationIdentifier']

In [31]:
# data[new_cols].to_csv('./data_temp/new_feature_region.csv', index=False)