# Pre-procesamiento y selección de características

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
%matplotlib inline

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_regression
from sklearn.feature_selection import VarianceThreshold

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split

import sklearn
from sklearn import metrics, model_selection, tree
from sklearn.preprocessing import MinMaxScaler
import joblib
from collections import Counter

In [2]:
dtypes = {
    'MachineIdentifier':                                    'category',
    'ProductName':                                          'category',
    'EngineVersion':                                        'category',
    'AppVersion':                                           'category',
    'AvSigVersion':                                         'category',
    'IsBeta':                                               'int8',
    'RtpStateBitfield':                                     'float16',
    'IsSxsPassiveMode':                                     'int8',
    'DefaultBrowsersIdentifier':                            'float32',
    'AVProductStatesIdentifier':                            'float32',
    'AVProductsInstalled':                                  'float16',
    'AVProductsEnabled':                                    'float16',
    'HasTpm':                                               'int8',
    'CountryIdentifier':                                    'int16',
    'CityIdentifier':                                       'float32',
    'OrganizationIdentifier':                               'float16',
    'GeoNameIdentifier':                                    'float16',
    'LocaleEnglishNameIdentifier':                          'int16',
    'Platform':                                             'category',
    'Processor':                                            'category',
    'OsVer':                                                'category',
    'OsBuild':                                              'int16',
    'OsSuite':                                              'int16',
    'OsPlatformSubRelease':                                 'category',
    'OsBuildLab':                                           'category',
    'SkuEdition':                                           'category',
    'IsProtected':                                          'float16',
    'AutoSampleOptIn':                                      'int8',
    'PuaMode':                                              'category',
    'SMode':                                                'float16',
    'IeVerIdentifier':                                      'float16',
    'SmartScreen':                                          'category',
    'Firewall':                                             'float16',
    'UacLuaenable':                                         'float64',
    'Census_MDC2FormFactor':                                'category',
    'Census_DeviceFamily':                                  'category',
    'Census_OEMNameIdentifier':                             'float32',
    'Census_OEMModelIdentifier':                            'float32',
    'Census_ProcessorCoreCount':                            'float16',
    'Census_ProcessorManufacturerIdentifier':               'float16',
    'Census_ProcessorModelIdentifier':                      'float32',
    'Census_ProcessorClass':                                'category',
    'Census_PrimaryDiskTotalCapacity':                      'float64',
    'Census_PrimaryDiskTypeName':                           'category',
    'Census_SystemVolumeTotalCapacity':                     'float64',
    'Census_HasOpticalDiskDrive':                           'int8',
    'Census_TotalPhysicalRAM':                              'float32',
    'Census_ChassisTypeName':                               'category',
    'Census_InternalPrimaryDiagonalDisplaySizeInInches':    'float32',
    'Census_InternalPrimaryDisplayResolutionHorizontal':    'float32',
    'Census_InternalPrimaryDisplayResolutionVertical':      'float32',
    'Census_PowerPlatformRoleName':                         'category',
    'Census_InternalBatteryType':                           'category',
    'Census_InternalBatteryNumberOfCharges':                'float64',
    'Census_OSVersion':                                     'category',
    'Census_OSArchitecture':                                'category',
    'Census_OSBranch':                                      'category',
    'Census_OSBuildNumber':                                 'int16',
    'Census_OSBuildRevision':                               'int32',
    'Census_OSEdition':                                     'category',
    'Census_OSSkuName':                                     'category',
    'Census_OSInstallTypeName':                             'category',
    'Census_OSInstallLanguageIdentifier':                   'float16',
    'Census_OSUILocaleIdentifier':                          'int16',
    'Census_OSWUAutoUpdateOptionsName':                     'category',
    'Census_IsPortableOperatingSystem':                     'int8',
    'Census_GenuineStateName':                              'category',
    'Census_ActivationChannel':                             'category',
    'Census_IsFlightingInternal':                           'float16',
    'Census_IsFlightsDisabled':                             'float16',
    'Census_FlightRing':                                    'category',
    'Census_ThresholdOptIn':                                'float16',
    'Census_FirmwareManufacturerIdentifier':                'float16',
    'Census_FirmwareVersionIdentifier':                     'float32',
    'Census_IsSecureBootEnabled':                           'int8',
    'Census_IsWIMBootEnabled':                              'float16',
    'Census_IsVirtualDevice':                               'float16',
    'Census_IsTouchEnabled':                                'int8',
    'Census_IsPenCapable':                                  'int8',
    'Census_IsAlwaysOnAlwaysConnectedCapable':              'float16',
    'Wdft_IsGamer':                                         'float16',
    'Wdft_RegionIdentifier':                                'float16',
    'HasDetections':                                        'int8'
}

In [3]:
# https://www.kaggle.com/code/rinnqd/reduce-memory-usage/notebook
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [4]:
train_df = pd.read_csv('train.csv', dtype = dtypes)

In [5]:
train_df = reduce_mem_usage(train_df)

Memory usage after optimization is: 1619.69 MB
Decreased by 12.8%


In [6]:
test_df = pd.read_csv('test.csv', dtype = dtypes)

In [7]:
test_df = reduce_mem_usage(test_df)

Memory usage after optimization is: 1449.21 MB
Decreased by 12.6%


In [8]:
train_df.head()

Unnamed: 0,MachineIdentifier,ProductName,EngineVersion,AppVersion,AvSigVersion,IsBeta,RtpStateBitfield,IsSxsPassiveMode,DefaultBrowsersIdentifier,AVProductStatesIdentifier,...,Census_FirmwareVersionIdentifier,Census_IsSecureBootEnabled,Census_IsWIMBootEnabled,Census_IsVirtualDevice,Census_IsTouchEnabled,Census_IsPenCapable,Census_IsAlwaysOnAlwaysConnectedCapable,Wdft_IsGamer,Wdft_RegionIdentifier,HasDetections
0,0000028988387b115f69f31a3bf04f09,win8defender,1.1.15100.1,4.18.1807.18075,1.273.1735.0,0,7.0,0,,53447.0,...,36144.0,0,,0.0,0,0,0.0,0.0,10.0,0
1,000007535c3f730efa9ea0b7ef1bd645,win8defender,1.1.14600.4,4.13.17134.1,1.263.48.0,0,7.0,0,,53447.0,...,57858.0,0,,0.0,0,0,0.0,0.0,8.0,0
2,000007905a28d863f6d0d597892cd692,win8defender,1.1.15100.1,4.18.1807.18075,1.273.1341.0,0,7.0,0,,53447.0,...,52682.0,0,,0.0,0,0,0.0,0.0,3.0,0
3,00000b11598a75ea8ba1beea8459149f,win8defender,1.1.15100.1,4.18.1807.18075,1.273.1527.0,0,7.0,0,,53447.0,...,20050.0,0,,0.0,0,0,0.0,0.0,3.0,1
4,000014a5f00daa18e76b81417eeb99fc,win8defender,1.1.15100.1,4.18.1807.18075,1.273.1379.0,0,7.0,0,,53447.0,...,19844.0,0,0.0,0.0,0,0,0.0,0.0,1.0,1


In [9]:
test_df.head()

Unnamed: 0,MachineIdentifier,ProductName,EngineVersion,AppVersion,AvSigVersion,IsBeta,RtpStateBitfield,IsSxsPassiveMode,DefaultBrowsersIdentifier,AVProductStatesIdentifier,...,Census_FirmwareManufacturerIdentifier,Census_FirmwareVersionIdentifier,Census_IsSecureBootEnabled,Census_IsWIMBootEnabled,Census_IsVirtualDevice,Census_IsTouchEnabled,Census_IsPenCapable,Census_IsAlwaysOnAlwaysConnectedCapable,Wdft_IsGamer,Wdft_RegionIdentifier
0,0000010489e3af074adeac69c53e555e,win8defender,1.1.15400.5,4.18.1810.5,1.281.501.0,0,7.0,0,,53447.0,...,807.0,8554.0,1,,0.0,0,0,0.0,0.0,7.0
1,00000176ac758d54827acd545b6315a5,win8defender,1.1.15400.4,4.18.1809.2,1.279.301.0,0,7.0,0,,53447.0,...,554.0,33105.0,1,,0.0,0,0,0.0,1.0,12.0
2,0000019dcefc128c2d4387c1273dae1d,win8defender,1.1.15300.6,4.18.1809.2,1.277.230.0,0,7.0,0,,49480.0,...,556.0,63396.0,1,,0.0,0,0,0.0,1.0,11.0
3,0000055553dc51b1295785415f1a224d,win8defender,1.1.15400.5,4.18.1810.5,1.281.664.0,0,7.0,0,,42160.0,...,628.0,26320.0,1,0.0,0.0,0,0,0.0,0.0,10.0
4,00000574cefffeca83ec8adf9285b2bf,win8defender,1.1.15400.4,4.18.1809.2,1.279.236.0,0,7.0,0,,53447.0,...,556.0,63269.0,1,,0.0,0,0,0.0,1.0,3.0


In [10]:
train_df.shape

(8921483, 83)

In [11]:
test_df.shape

(7853253, 82)

In [12]:
train_df.describe()

Unnamed: 0,IsBeta,RtpStateBitfield,IsSxsPassiveMode,DefaultBrowsersIdentifier,AVProductStatesIdentifier,AVProductsInstalled,AVProductsEnabled,HasTpm,CountryIdentifier,CityIdentifier,...,Census_FirmwareVersionIdentifier,Census_IsSecureBootEnabled,Census_IsWIMBootEnabled,Census_IsVirtualDevice,Census_IsTouchEnabled,Census_IsPenCapable,Census_IsAlwaysOnAlwaysConnectedCapable,Wdft_IsGamer,Wdft_RegionIdentifier,HasDetections
count,8921483.0,8889165.0,8921483.0,433438.0,8885262.0,8885262.0,8885262.0,8921483.0,8921483.0,8596074.0,...,8761350.0,8921483.0,3261780.0,8905530.0,8921483.0,8921483.0,8850140.0,8618032.0,8618032.0,8921483.0
mean,7.509962e-06,,0.01733378,,47840.02,,,0.9879711,108.049,81266.5,...,33027.93,0.4860229,0.0,0.0,0.1255431,0.03807091,,,,0.4997927
std,0.002740421,0.0,0.1305118,,14032.37,0.0,0.0,0.1090149,63.04706,48923.39,...,21206.91,0.4998046,0.0,0.0,0.3313338,0.1913675,0.0,0.0,0.0,0.5
min,0.0,0.0,0.0,1.0,3.0,0.0,0.0,0.0,1.0,5.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
25%,0.0,7.0,0.0,788.0,49480.0,1.0,1.0,1.0,51.0,36825.0,...,13156.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0
50%,0.0,7.0,0.0,1632.0,53447.0,1.0,1.0,1.0,97.0,82373.0,...,33070.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0
75%,0.0,7.0,0.0,2372.0,53447.0,2.0,1.0,1.0,162.0,123700.0,...,52436.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,11.0,1.0
max,1.0,35.0,1.0,3212.0,70507.0,7.0,5.0,1.0,222.0,167962.0,...,72105.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,15.0,1.0


In [13]:
nans = []
pcts = []
for cols in train_df:
    nans.append(train_df[cols].isnull().sum())
    pcts.append(train_df[cols].isnull().sum() / train_df.shape[0] * 100)
    
missing_data = pd.DataFrame({'Column': train_df.columns, 'Missing Values': nans, 'Missing [%]': pcts}).sort_values('Missing Values', ascending = False)
missing_data[missing_data['Missing Values'] != 0].reset_index(drop = True)

Unnamed: 0,Column,Missing Values,Missing [%]
0,PuaMode,8919174,99.974119
1,Census_ProcessorClass,8884852,99.589407
2,DefaultBrowsersIdentifier,8488045,95.141637
3,Census_IsFlightingInternal,7408759,83.04403
4,Census_InternalBatteryType,6338429,71.046809
5,Census_ThresholdOptIn,5667325,63.524472
6,Census_IsWIMBootEnabled,5659703,63.439038
7,SmartScreen,3177011,35.610795
8,OrganizationIdentifier,2751518,30.841487
9,SMode,537759,6.027686


In [14]:
nans = []
pcts = []
for cols in test_df:
    nans.append(test_df[cols].isnull().sum())
    pcts.append(test_df[cols].isnull().sum() / test_df.shape[0] * 100)
    
missing_data2 = pd.DataFrame({'Column': test_df.columns, 'Missing Values': nans, 'Missing [%]': pcts}).sort_values('Missing Values', ascending = False)
missing_data2[missing_data2['Missing Values'] != 0].reset_index(drop = True)

Unnamed: 0,Column,Missing Values,Missing [%]
0,PuaMode,7851065,99.972139
1,Census_ProcessorClass,7835022,99.767854
2,DefaultBrowsersIdentifier,7546134,96.089277
3,Census_IsFlightingInternal,6673962,84.983408
4,Census_InternalBatteryType,5979844,76.144803
5,SMode,5831272,74.25295
6,Census_ThresholdOptIn,5529515,70.410504
7,Census_IsWIMBootEnabled,5522707,70.323814
8,SmartScreen,3498402,44.54717
9,OrganizationIdentifier,2482129,31.60638


Descartamos las columnas con un porcentaje mayor al 30% de valores nulos, ya que se consideró que no están aportando valor al modelo

In [15]:
train_df.drop(['PuaMode', 'Census_ProcessorClass', 'DefaultBrowsersIdentifier', 'Census_IsFlightingInternal', 'Census_InternalBatteryType', 
               'Census_ThresholdOptIn', 'Census_IsWIMBootEnabled', 'SmartScreen', 'OrganizationIdentifier'
              ], axis=1, inplace=True)

In [16]:
test_df.drop(['PuaMode', 'Census_ProcessorClass', 'DefaultBrowsersIdentifier', 'Census_IsFlightingInternal', 'Census_InternalBatteryType', 
               'Census_ThresholdOptIn', 'Census_IsWIMBootEnabled', 'SmartScreen', 'OrganizationIdentifier'
              ], axis=1, inplace=True)

In [17]:
def segregate_features():
    binary = [col for col in train_df.columns if train_df[col].nunique() == 2]
    numerical_floats = ['Census_ProcessorCoreCount',
                        'Census_PrimaryDiskTotalCapacity',
                        'Census_SystemVolumeTotalCapacity',
                        'Census_TotalPhysicalRAM',
                        'Census_InternalPrimaryDiagonalDisplaySizeInInches',
                        'Census_InternalPrimaryDisplayResolutionHorizontal',
                        'Census_InternalPrimaryDisplayResolutionVertical',
                        'Census_InternalBatteryNumberOfCharges']
    categorical = [col for col in train_df.columns if (col not in numerical_floats) & (col not in binary)]
    return binary, numerical_floats, categorical
    
binary_columns, numerical_float_columns, categorical_columns = segregate_features()

In [18]:
high_cardinality_cols = [col for col in categorical_columns if train_df[col].nunique() > 500] 
print('Columns with High Cardinality: \n')
high_cardinality_cols

Columns with High Cardinality: 



['MachineIdentifier',
 'AvSigVersion',
 'AVProductStatesIdentifier',
 'CityIdentifier',
 'OsBuildLab',
 'Census_OEMNameIdentifier',
 'Census_OEMModelIdentifier',
 'Census_ProcessorModelIdentifier',
 'Census_FirmwareManufacturerIdentifier',
 'Census_FirmwareVersionIdentifier']

Eliminar columnas con alta cardinalidad y el identificador del equipo

In [19]:
train_df.drop(high_cardinality_cols, axis = 1, inplace = True)
test_df.drop(high_cardinality_cols, axis = 1, inplace = True)

Eliminar filas con valores nulos

In [20]:
train_df.dropna(subset = numerical_float_columns, inplace=True)
test_df.dropna(subset = numerical_float_columns, inplace=True)

Quitamos los valores infinitos

In [21]:
train_df.replace([np.inf, -np.inf], np.nan, inplace = True)
test_df.replace([np.inf, -np.inf], np.nan, inplace = True)

In [22]:
binary_columns, numerical_float_columns, categorical_columns = segregate_features()

Encode de variables categoricas

In [23]:
label_encoder = {}
for col in categorical_columns:
    _, label_encoder[col] = pd.factorize(train_df[col])
    
for col in categorical_columns:
    train_df[col] = label_encoder[col].get_indexer(train_df[col])
    test_df[col] = label_encoder[col].get_indexer(test_df[col])

In [24]:
train_df.head()

Unnamed: 0,ProductName,EngineVersion,AppVersion,IsBeta,RtpStateBitfield,IsSxsPassiveMode,AVProductsInstalled,AVProductsEnabled,HasTpm,CountryIdentifier,...,Census_IsFlightsDisabled,Census_FlightRing,Census_IsSecureBootEnabled,Census_IsVirtualDevice,Census_IsTouchEnabled,Census_IsPenCapable,Census_IsAlwaysOnAlwaysConnectedCapable,Wdft_IsGamer,Wdft_RegionIdentifier,HasDetections
0,0,0,0,0,0,0,0,0,1,0,...,0.0,0,0,0.0,0,0,0.0,0.0,0,0
1,0,1,1,0,0,0,0,0,1,1,...,0.0,1,0,0.0,0,0,0.0,0.0,1,0
2,0,0,0,0,0,0,0,0,1,2,...,0.0,0,0,0.0,0,0,0.0,0.0,2,0
3,0,0,0,0,0,0,0,0,1,3,...,0.0,0,0,0.0,0,0,0.0,0.0,2,1
4,0,0,0,0,0,0,0,0,1,4,...,0.0,0,0,0.0,0,0,0.0,0.0,3,1


In [25]:
x = train_df.drop(['HasDetections'], axis=1)
y = train_df['HasDetections']

Evaluamos las caracteristicas para ver cuales son las mas apropiadas a utilizar

In [26]:
select = VarianceThreshold(threshold = (.8 * (1 - .8)))
z = select.fit_transform(x, y)

In [27]:
filter = select.get_support()
features = list(x.columns)

In [28]:
selected_features = []

for i in range(0, len(features)):
    if filter[i]:
        print(features[i])
        selected_features.append(features[i])

EngineVersion
AppVersion
AVProductsInstalled
CountryIdentifier
GeoNameIdentifier
LocaleEnglishNameIdentifier
Processor
OsBuild
OsSuite
OsPlatformSubRelease
SkuEdition
IeVerIdentifier
Census_MDC2FormFactor
Census_ProcessorCoreCount
Census_PrimaryDiskTotalCapacity
Census_PrimaryDiskTypeName
Census_SystemVolumeTotalCapacity
Census_TotalPhysicalRAM
Census_ChassisTypeName
Census_InternalPrimaryDiagonalDisplaySizeInInches
Census_InternalPrimaryDisplayResolutionHorizontal
Census_InternalPrimaryDisplayResolutionVertical
Census_PowerPlatformRoleName
Census_InternalBatteryNumberOfCharges
Census_OSVersion
Census_OSArchitecture
Census_OSBranch
Census_OSBuildNumber
Census_OSBuildRevision
Census_OSEdition
Census_OSSkuName
Census_OSInstallTypeName
Census_OSInstallLanguageIdentifier
Census_OSUILocaleIdentifier
Census_OSWUAutoUpdateOptionsName
Census_GenuineStateName
Census_ActivationChannel
Census_FlightRing
Census_IsSecureBootEnabled
Wdft_IsGamer
Wdft_RegionIdentifier


In [29]:
x = x[selected_features]

Tomar un porcentaje del dataset

In [30]:
samples_train = []
samples_test = []
train_split = np.array_split(train_df, 10)
test_split = np.array_split(test_df, 10)

for chunk in train_split:
    chunk1 = chunk.sample(frac = .20, random_state = 33)
        
    samples_train.append(chunk1)
    
for chunk in test_split:
    chunk1 = chunk.sample(frac = .07, random_state = 33)
        
    samples_test.append(chunk1)

In [31]:
df_train = pd.concat(samples_train)
df_test = pd.concat(samples_test)

In [32]:
X = df_train[selected_features]
y = df_train['HasDetections']

Dataset ya suficientemente balanceado

In [33]:
print('Dataset shape:', Counter(y))

Dataset shape: Counter({0: 859951, 1: 857189})


Dividir en train, test y validation

In [34]:
X_train, X_test_total, y_train, y_test_total = model_selection.train_test_split(X, y, test_size = 0.45, random_state = 31)

X_test, X_validation, y_test, y_validation = model_selection.train_test_split(X_test_total, y_test_total, test_size = 0.33, random_state = 31)

In [35]:
test_final = df_test[selected_features]

In [36]:
scaler = MinMaxScaler()

In [37]:
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_validation = scaler.transform(X_validation)
test_final = scaler.transform(test_final)

In [38]:
pd.DataFrame(X_train).to_csv('x_train.csv', index = False)
pd.DataFrame(y_train).to_csv('y_train.csv', index = False)

In [39]:
pd.DataFrame(X_test).to_csv('x_test.csv', index = False)
pd.DataFrame(y_test).to_csv('y_test.csv', index = False)

In [40]:
pd.DataFrame(X_validation).to_csv('x_validation.csv', index = False)
pd.DataFrame(y_validation).to_csv('y_validation.csv', index = False)

In [41]:
pd.DataFrame(test_final).to_csv('test_final.csv', index = False)