In [None]:
ls ../input/

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gc
from tqdm import tqdm
from numba import jit
from sklearn.preprocessing import StandardScaler, RobustScaler
import warnings
warnings.filterwarnings("ignore")

sns.set()

In [None]:
# Source of the dtype table: https://www.kaggle.com/theoviel/load-the-totality-of-the-data
# Column name -> dtype string. The mapping is passed to pd.read_csv by load_datasets and is
# also used further down to split the columns into numerical_columns (numeric dtype strings)
# and categorical_columns (everything else, i.e. 'category').
# Entries annotated "# was '...'" carry a wider dtype than the one named in the annotation
# (presumably widened relative to the linked kernel — TODO confirm).
dtypes = {
        'MachineIdentifier':                                    'category',
        'ProductName':                                          'category',
        'EngineVersion':                                        'category',
        'AppVersion':                                           'category',
        'AvSigVersion':                                         'category',
        'IsBeta':                                               'int8',
        'RtpStateBitfield':                                     'float16',
        'IsSxsPassiveMode':                                     'int8',
        'DefaultBrowsersIdentifier':                            'float32',
        'AVProductStatesIdentifier':                            'float32',
        'AVProductsInstalled':                                  'float16',
        'AVProductsEnabled':                                    'float16',
        'HasTpm':                                               'int8',
        'CountryIdentifier':                                    'int16',
        'CityIdentifier':                                       'float32',
        'OrganizationIdentifier':                               'float16',
        'GeoNameIdentifier':                                    'float16',
        'LocaleEnglishNameIdentifier':                          'int16',
        'Platform':                                             'category',
        'Processor':                                            'category',
        'OsVer':                                                'category',
        'OsBuild':                                              'int16',
        'OsSuite':                                              'int16',
        'OsPlatformSubRelease':                                 'category',
        'OsBuildLab':                                           'category',
        'SkuEdition':                                           'category',
        'IsProtected':                                          'float16',
        'AutoSampleOptIn':                                      'int8',
        'PuaMode':                                              'category',
        'SMode':                                                'float16',
        'IeVerIdentifier':                                      'float16',
        'SmartScreen':                                          'category',
        'Firewall':                                             'float16',
        'UacLuaenable':                                         'float64', # was 'float32'
        'Census_MDC2FormFactor':                                'category',
        'Census_DeviceFamily':                                  'category',
        'Census_OEMNameIdentifier':                             'float32', # was 'float16'
        'Census_OEMModelIdentifier':                            'float32',
        'Census_ProcessorCoreCount':                            'float16',
        'Census_ProcessorManufacturerIdentifier':               'float16',
        'Census_ProcessorModelIdentifier':                      'float32', # was 'float16'
        'Census_ProcessorClass':                                'category',
        'Census_PrimaryDiskTotalCapacity':                      'float64', # was 'float32'
        'Census_PrimaryDiskTypeName':                           'category',
        'Census_SystemVolumeTotalCapacity':                     'float64', # was 'float32'
        'Census_HasOpticalDiskDrive':                           'int8',
        'Census_TotalPhysicalRAM':                              'float32',
        'Census_ChassisTypeName':                               'category',
        'Census_InternalPrimaryDiagonalDisplaySizeInInches':    'float32', # was 'float16'
        'Census_InternalPrimaryDisplayResolutionHorizontal':    'float32', # was 'float16'
        'Census_InternalPrimaryDisplayResolutionVertical':      'float32', # was 'float16'
        'Census_PowerPlatformRoleName':                         'category',
        'Census_InternalBatteryType':                           'category',
        'Census_InternalBatteryNumberOfCharges':                'float64', # was 'float32'
        'Census_OSVersion':                                     'category',
        'Census_OSArchitecture':                                'category',
        'Census_OSBranch':                                      'category',
        'Census_OSBuildNumber':                                 'int16',
        'Census_OSBuildRevision':                               'int32',
        'Census_OSEdition':                                     'category',
        'Census_OSSkuName':                                     'category',
        'Census_OSInstallTypeName':                             'category',
        'Census_OSInstallLanguageIdentifier':                   'float16',
        'Census_OSUILocaleIdentifier':                          'int16',
        'Census_OSWUAutoUpdateOptionsName':                     'category',
        'Census_IsPortableOperatingSystem':                     'int8',
        'Census_GenuineStateName':                              'category',
        'Census_ActivationChannel':                             'category',
        'Census_IsFlightingInternal':                           'float16',
        'Census_IsFlightsDisabled':                             'float16',
        'Census_FlightRing':                                    'category',
        'Census_ThresholdOptIn':                                'float16',
        'Census_FirmwareManufacturerIdentifier':                'float16',
        'Census_FirmwareVersionIdentifier':                     'float32',
        'Census_IsSecureBootEnabled':                           'int8',
        'Census_IsWIMBootEnabled':                              'float16',
        'Census_IsVirtualDevice':                               'float16',
        'Census_IsTouchEnabled':                                'int8',
        'Census_IsPenCapable':                                  'int8',
        'Census_IsAlwaysOnAlwaysConnectedCapable':              'float16',
        'Wdft_IsGamer':                                         'float16',
        'Wdft_RegionIdentifier':                                'float16',
        'HasDetections':                                        'int8'
        }

def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place to narrower dtypes.

    Integer columns move to the smallest of int8/int16/int32/int64 whose
    (inclusive) range contains the column's min and max.

    Float columns move to the smallest of float16/float32 whose range contains
    the min and max, falling back to float64. A float column holding only
    whole numbers (identifiers or counts stored as float because of NaN) is
    narrowed only to a width that reproduces every value exactly; otherwise
    distinct identifiers above 2048 (float16) or 2**24 (float32) would be
    rounded onto each other. Columns with fractional values keep the plain
    range-based downcast and may therefore be rounded.

    Columns whose dtype is not one of int16/int32/int64/float16/float32/float64
    are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; its columns are reassigned in place.
    verbose : bool, default True
        Print the resulting size and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage(deep=True).sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the type's min/max still fits.
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            values = df[col].values
            finite = values[np.isfinite(values)]
            whole_numbers = bool(np.all(finite == np.floor(finite)))
            # An all-NaN or infinite column fails every range test (comparisons
            # with NaN/inf are False) and ends up float64.
            target = np.float64
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if not (c_min >= limits.min and c_max <= limits.max):
                    continue
                # Whole-number columns must survive the cast unchanged.
                if whole_numbers and not np.array_equal(finite.astype(candidate), finite):
                    continue
                target = candidate
                break
            df[col] = df[col].astype(target)
    end_mem = df.memory_usage(deep=True).sum() / 1024**2
    # Guard the percentage against a zero-sized starting frame.
    reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df


def plot_dist(dist,freq):
    """Plot the HasDetections == 1 share per value next to the value frequencies.

    Left panel: ``dist[1].values / freq`` drawn as diamond markers with the
    y axis fixed to [0, 1]. The division is element-wise by position, not by
    index label. Right panel: ``freq`` on a logarithmic y axis.

    Returns the x values (index of ``dist[1]``) and the float32 ratio array.
    """
    target = 1
    legend_names = ("HD = 0", "HD = 1")
    positives = dist[target]

    x = positives.index
    y = np.array(positives.values/freq, dtype=np.float32)

    fig = plt.figure(figsize=(16,6))

    share_ax = fig.add_subplot(1,2,1)
    share_ax.plot(x, y, "d", linestyle="", label=legend_names[target])
    share_ax.set_ylim(0,1)
    share_ax.legend()

    freq_ax = fig.add_subplot(1,2,2)
    freq_ax.plot(freq, "--o", label="freq")
    freq_ax.set_yscale("log")
    freq_ax.legend()

    plt.show()
    return x,y

def get_dist_freq(train,col):
    """Count the values of ``col`` per HasDetections class and overall.

    Prints the head of the HasDetections == 1 counts, then returns the
    per-class counts (a Series keyed by class and value) together with the
    overall counts as a plain array.
    """
    overall_counts = train[col].value_counts().values
    per_class_counts = train.groupby("HasDetections")[col].value_counts()
    print(per_class_counts[1].head())
    return per_class_counts, overall_counts

def load_datasets(dtypes):
    """Read ../input/train.csv and ../input/test.csv with the given dtypes.

    The test file is read with a copy of ``dtypes`` from which the
    "HasDetections" entry has been removed; ``dtypes`` itself is not modified.
    Returns ``(train, test)``.
    """
    train = pd.read_csv('../input/train.csv', dtype=dtypes)
    test_dtypes = dtypes.copy()
    del test_dtypes["HasDetections"]
    return train, pd.read_csv('../input/test.csv', dtype=test_dtypes)

In [None]:
# Split the dtype table by kind: numeric dtype strings vs. everything else.
numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerical_columns = [name for name in dtypes if dtypes[name] in numerics]
categorical_columns = [name for name in dtypes if dtypes[name] not in numerics]

# Declare mode

In [None]:
# Run-mode flag. No cell visible in this notebook reads it — TODO confirm whether it is still used.
mode = "debbug"

In [None]:
%%time
# Read both CSVs with the dtype table; the cell magic above reports how long the load takes.
train,test = load_datasets(dtypes)
# data = pd.concat([train, test], ignore_index=True)

# Feature selection

In [None]:
# Columns kept for the rest of the notebook: every test column except the names listed below
# (the inline comments give the stated reason for each group).
# NOTE(review): "train" is excluded by name, but no visible cell adds such a column — confirm it is still needed.
used_cols = [i for i in test.columns if i not in ["MachineIdentifier", #                   "HasDetections",
                                                "PuaMode", "Census_ProcessorClass",  # mostly missing
                                                "Census_IsWIMBootEnabled","IsBeta",
                                                "Census_IsFlightsDisabled","Census_IsFlightingInternal",
                                                "AutoSampleOptIn","Census_ThresholdOptIn",
                                                "SMode","Census_IsPortableOperatingSystem",
                                                "Census_DeviceFamily","UacLuaenable", "Census_IsVirtualDevice",  # too skewed columns
                                                "Census_OSSkuName",    # highly-correlated features
                                                "Processor", "Census_OSInstallLanguageIdentifier", "train"]]

In [None]:
# Restrict both frames to the selected columns; train additionally keeps the target column.
train = train[used_cols + ["HasDetections"]]
test = test[used_cols]
# Ask the garbage collector to release the frames that were just replaced.
gc.collect()

In [None]:
# Recompute the numeric / categorical split, this time restricted to the selected columns.
numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerical_columns = [name for name, kind in dtypes.items()
                     if kind in numerics and name in used_cols]
categorical_columns = [name for name, kind in dtypes.items()
                       if kind not in numerics and name in used_cols]

In [None]:
from sklearn.preprocessing import LabelEncoder


def basic_features(train,test,col,column_unique_train,column_unique_test, treshold=100):
    """Build a per-value statistics table for one column of train/test.

    The table is indexed by the union of ``column_unique_train`` and
    ``column_unique_test`` and holds these columns:

    - ``abs_freq_train`` / ``abs_freq_test``: value counts in each frame.
    - ``abs_proba_2HD``: count of train rows with HasDetections == 1 per value.
    - ``proba_2HD``: ``abs_proba_2HD`` divided by the train count; values whose
      train count is below ``treshold`` all receive one pooled ratio, and any
      remaining NaN becomes 0.5.
    - ``freq_train`` / ``freq_test``: counts divided by their column sum.
    - ``abs_freq_all``, ``norm_freq_all`` (percent of all rows), ``log_freq_all``.
    - ``diff``: train relative frequency divided by test relative frequency,
      with ``inf`` replaced by 200.
    - ``labels``: integer codes; values with ``diff`` <= 0.2, ``diff`` >= 4 or
      ``norm_freq_all`` <= 0.005 share code 0 before the final re-encoding.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames containing ``col``; ``train`` must also contain "HasDetections".
    col : str
        Column to summarise.
    column_unique_train, column_unique_test : set
        Distinct values of ``col`` in each frame.
    treshold : int, default 100
        Train-count limit below which values are pooled for ``proba_2HD``.
        (The parameter name is spelled this way in the signature.)

    Returns
    -------
    pandas.DataFrame
        The statistics table described above.
    """

    uni = {f"{col}_unique": list(column_unique_train | column_unique_test)}

    data = pd.DataFrame(uni)
    data = data.set_index(f"{col}_unique")

    freq_train = train[col].value_counts()
    freq_test = test[col].value_counts()
    # [1] selects the counts of the HasDetections == 1 group.
    abs_proba_2HD = train.groupby("HasDetections")[col].value_counts()[1]
    # Column-wise concat aligns the three count series on the value index.
    data = pd.concat([data, freq_train, freq_test, abs_proba_2HD], axis=1)
    data.columns = ["abs_freq_train", "abs_freq_test", "abs_proba_2HD"]
    ################# Target encoding ############################
    data.abs_proba_2HD = data.abs_proba_2HD.fillna(0)
    data["proba_2HD"] = data.abs_proba_2HD/freq_train
    # Values seen fewer than `treshold` times in train share one pooled positive rate.
    rest = (data.abs_freq_train<treshold)
    data.loc[rest,"proba_2HD"] = data.abs_proba_2HD[rest].sum()/data.abs_freq_train[rest].sum()
    data.proba_2HD = data.proba_2HD.fillna(0.5)
    ################### Frequency encoding #######################
    data["freq_train"] = data["abs_freq_train"]/data["abs_freq_train"].sum()
    data["freq_test"] = data["abs_freq_test"]/data["abs_freq_test"].sum()
    zero_cols = ["abs_freq_train", "abs_freq_test", "freq_train", "freq_test"]
    data[zero_cols]=data[zero_cols].fillna(0)
    data["abs_freq_all"] = data.abs_freq_test+data.abs_freq_train
    data["norm_freq_all"] = 100*data.abs_freq_all/data.abs_freq_all.sum()
    # NOTE(review): a value that is in the unique sets but has no counts (e.g. NaN, which
    # value_counts skips by default) would have norm_freq_all == 0 and yield -inf here — confirm.
    data["log_freq_all"] = np.log(data.norm_freq_all.values)
    ################### More information about test and train dataset #######################
    data["diff"] = (data.abs_freq_train/data.abs_freq_train.sum())/(data.abs_freq_test/data.abs_freq_test.sum())
    # A value absent from test divides by zero; the resulting inf is capped at 200.
    data["diff"] = data["diff"].replace(np.inf, 200)
    ################### Label encoding ###########################
    encoder = LabelEncoder()
    # +1 keeps code 0 free for the pooled group assigned just below.
    data["labels"] = encoder.fit_transform(data.index.values)+1
    conditions = ((data["diff"]<=0.2) | (data["diff"]>=4) | (data["norm_freq_all"]<=0.005))
    data.loc[conditions, "labels"] = 0
    # Second pass re-encodes the remaining codes after pooling.
    data["labels"] = encoder.fit_transform(data["labels"].values)
    
    return data

In [None]:
# Downcast the numeric columns of both frames before the encoding loops run.
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

gc.collect()

In [None]:
# Replace every numeric column by the integer labels computed in basic_features.
for n,col in enumerate(numerical_columns):

    # Fill missing values when EITHER frame has them. Checking train alone would leave
    # NaN in test for a column that is complete in train, and those rows would then be
    # encoded differently from the sentinel used everywhere else.
    if train[col].isna().any() or test[col].isna().any():
        train[col] = train[col].fillna(-np.float16(np.pi))
        test[col] = test[col].fillna(-np.float16(np.pi))

    column_unique_train = set(train[col].unique())
    column_unique_test = set(test[col].unique())
    
    data = basic_features(train,test,col,column_unique_train,column_unique_test)
    
    # Map each raw value to its label from the per-value table.
    train[col] = train[col].map(data["labels"])
    test[col] = test[col].map(data["labels"])
    print(n,col)

In [None]:
# The numeric columns now hold label codes; downcast them again.
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

# +++++++++++++++++++++++++++++++++++++++++++

# Numerical columns are done. Next: the categorical columns

In [None]:
col = "OsBuildLab"
print(col)
# Register the placeholder category, fill missing values with it, and assign the result
# back. Calling fillna(..., inplace=True) on train[col] is chained assignment: under
# pandas Copy-on-Write it modifies a temporary and leaves the frame unchanged.
train[col] = train[col].cat.add_categories(["0.0.0.0.0-0"]).fillna("0.0.0.0.0-0")
test[col] = test[col].cat.add_categories(["0.0.0.0.0-0"]).fillna("0.0.0.0.0-0")

# The five derived columns are encoded together with the other categorical columns.
for b in list("xyzwa"):
    categorical_columns.append("Os_BuildLab_"+b)

def f(x, n):
    """Return the n-th dot-separated part of a build-lab string.

    Strings with fewer than five parts are replaced by a fixed default build
    before indexing. The placeholder "0.0.0.0.0-0" has exactly five parts.
    """
    x = x.split(".")
    if len(x)<5:
        x = ['17134', '1', 'amd64fre', 'rs4_release', '180410-1804']
    return  x[n]

# One new column per part, suffixed x, y, z, w, a in that order.
for n,b in enumerate(list("xyzwa")):
    train[f"Os_BuildLab_{b}"] = train[col].apply(f, args=(n,))
    test[f"Os_BuildLab_{b}"] = test[col].apply(f, args=(n,))
print("Done.")

In [None]:
col = "SmartScreen"#categorical_columns[12]
# fillna: assign the filled column back instead of fillna(..., inplace=True) on train[col],
# which is chained assignment and leaves the frame untouched under pandas Copy-on-Write.
train[col] = train[col].cat.add_categories(["NaN"]).fillna("NaN")
test[col] = test[col].cat.add_categories(["NaN"]).fillna("NaN")

# lower the strings features
train[col] = train[col].map(lambda x: x.lower())
test[col] = test[col].map(lambda x: x.lower())

# Combine values reasonable: spelling variants share a code.
# NOTE(review): "nan" shares code 9 with "&#x03;", and "promt"/"promprt" get 7 while
# "prompt" gets 6 — confirm both groupings are intended.
# Values missing from this table become NaN after the map below.
map_x = {"block": 1,
        "requireadmin": 2,
        "warn": 3,
        "00000000": 4,
        "of": 5,
        "prompt": 6,
        "promt": 7,
        "enabled": 8,
        "&#x03;": 9,
        "promprt": 7,
        "requiredadmin": 2,
        "&#x02;": 10,
        "existsnotset": 11,
        "on": 8,
        "off": 5,
        "deny": 1,
        "nan": 9,
        "&#x01;": 12,
        "0": 4}

train[col] = train[col].map(map_x)
test[col] = test[col].map(map_x)

In [None]:
col = "Census_FlightRing"#categorical_columns[29]
def rename_f(x):
    """Fold "CBCanary" into "Canary"; every other value is returned unchanged."""
    return "Canary" if x == "CBCanary" else x

train[col] = train[col].map(rename_f)
test[col] = test[col].map(rename_f)

In [None]:
col = "Census_OSEdition"

# Assign the filled column back instead of fillna(..., inplace=True) on train[col], which
# is chained assignment and leaves the frame untouched under pandas Copy-on-Write.
train[col] = train[col].cat.add_categories(["NaN"]).fillna("NaN")
test[col] = test[col].cat.add_categories(["NaN"]).fillna("NaN")

# lower the strings features (the "NaN" placeholder becomes "nan")
train[col] = train[col].map(lambda x: x.lower())
test[col] = test[col].map(lambda x: x.lower())

def rename_f(x):   
    """Merge edition spellings and send the junk values "#" and the OEM key to "nan"."""
    if x == "homebasic":
        x = "home"
    elif x == "enterpriseg":
        x = "enterprise"
    elif x == "window 10 enterprise":
        x = "enterprise"
    elif x == "#":
        x = "nan"
    elif x == "00426-oem-8992662-00006":
        x = "nan"
    return x

train[col] = train[col].map(rename_f)
test[col] = test[col].map(rename_f)

In [None]:
col = "Census_OSBranch"#categorical_columns[22]

def rename_f(x):   
    """Collapse related OS branch names onto one representative name.

    Exact matches are looked up first; any other name containing
    "rs_onecore" becomes "rs_onecore"; everything else is returned unchanged.
    """
    exact_renames = {
        "rs1_release_inmarket": "rs1_release",
        "rs2_release_svc_d": "rs2_release_svc",
        "rs_edge": "rs5_release_edge",
        "rsmaster": "rs_shell",
        "win7sp1_gdr": "win7sp1_ldr",
    }
    if x in exact_renames:
        return exact_renames[x]
    if "rs_onecore" in x:
        return "rs_onecore"
    return x

train[col] = train[col].map(rename_f)
test[col] = test[col].map(rename_f)

In [None]:
col = "Census_InternalBatteryType"
# Assign the filled column back instead of fillna(..., inplace=True) on train[col], which
# is chained assignment and leaves the frame untouched under pandas Copy-on-Write.
train[col] = train[col].cat.add_categories(["NaN"]).fillna("NaN")
test[col] = test[col].cat.add_categories(["NaN"]).fillna("NaN")

# lower the strings features
train[col] = train[col].map(lambda x: x.lower())
test[col] = test[col].map(lambda x: x.lower())

def group_battery(x):
    """Return 1 when the lower-cased battery type contains "li", otherwise 0."""
    x = x.lower()
    if 'li' in x:
        return 1
    else:
        return 0
train[col] = train[col].map(group_battery).astype("int8")
test[col] = test[col].map(group_battery).astype("int8")

col = "Census_ChassisTypeName"#categorical_columns[17]
# fillna (same assign-back form as above)
train[col] = train[col].cat.add_categories(["NaN"]).fillna("NaN")
test[col] = test[col].cat.add_categories(["NaN"]).fillna("NaN")


# lower the strings features
train[col] = train[col].map(lambda x: x.lower())
test[col] = test[col].map(lambda x: x.lower())

In [None]:
# These three columns only need their missing values replaced by a "NaN" category.
# The filled column is assigned back: fillna(..., inplace=True) on train[col] is chained
# assignment and leaves the frame untouched under pandas Copy-on-Write.
for col in ("Census_PrimaryDiskTypeName",      # categorical_columns[16]
            "Census_PowerPlatformRoleName",    # categorical_columns[18]
            "Census_GenuineStateName"):        # categorical_columns[27]
    train[col] = train[col].cat.add_categories(["NaN"]).fillna("NaN")
    test[col] = test[col].cat.add_categories(["NaN"]).fillna("NaN")

In [None]:
# Replace every categorical column by the integer labels computed in basic_features.
for n, col in enumerate(categorical_columns):
    column_unique_train = set(train[col].unique())
    column_unique_test = set(test[col].unique())

    data = basic_features(
        train,
        test,
        col,
        column_unique_train,
        column_unique_test,
    )

    # Look each value up in the "labels" column of the per-value table.
    train[col] = train[col].map(data["labels"])
    test[col] = test[col].map(data["labels"])
    print(n, col)

In [None]:
# The categorical columns now hold label codes as well; downcast both frames again.
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

In [None]:
gc.collect()

In [None]:
# Add a standardised log-frequency feature for every column with at least 20 distinct
# values. (A stray no-op expression, `20%10`, used to open this cell and was removed.)
cols_2use = ["log_freq_all"]
scaler = StandardScaler()

for m, col in enumerate(test.columns):
    print(m, col, flush=True, end="")
    
    column_unique_train = set(train[col].unique())
    column_unique_test = set(test[col].unique())

    data = basic_features(train,test,col,column_unique_train,column_unique_test)

    if len(data.index) >= 20:
        
        # The scaler is refitted per column on that column's per-value table.
        X = scaler.fit_transform(data[cols_2use])
        for n,c in enumerate(cols_2use):
            data[f"{col}_new_{c}"] = X[:,n]
            train[f"{col}_new_{c}"] = train[col].map(data[f"{col}_new_{c}"])
            test[f"{col}_new_{c}"] = test[col].map(data[f"{col}_new_{c}"])
    
    # Downcast every 20th column so the newly added float columns do not pile up.
    if m%20 ==0:
        train = reduce_mem_usage(train)
        test = reduce_mem_usage(test)



# Now it's time to save the new datasets

In [None]:
gc.collect()

In [None]:
train.shape

In [None]:
# Final downcast; the feature loop above only reduces on every 20th column.
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

In [None]:
gc.collect()

In [None]:
# Write the transformed train frame as a pickle file in the working directory.
train.to_pickle("train_basic.pkl")

In [None]:
# Write the transformed test frame as a pickle file in the working directory.
test.to_pickle("test_basic.pkl")

In [None]:
ls